mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 15:20:36 +08:00
feature: introduce heapless state init in L0
Related-To: NEO-7824 Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
8ef6cdbabd
commit
4eae28bd64
@@ -346,6 +346,10 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
return heaplessModeEnabled;
|
||||
}
|
||||
|
||||
// Returns the cached heaplessStateInitEnabled flag of this command list.
// The flag defaults to false and is a plain member read; no state is modified.
bool isHeaplessStateInitEnabled() const {
|
||||
return heaplessStateInitEnabled;
|
||||
}
|
||||
|
||||
virtual bool skipInOrderNonWalkerSignalingAllowed(ze_event_handle_t signalEvent) const { return false; }
|
||||
|
||||
bool getCmdListBatchBufferFlag() const {
|
||||
@@ -436,6 +440,7 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
bool copyThroughLockedPtrEnabled = false;
|
||||
bool useOnlyGlobalTimestamps = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
bool heaplessStateInitEnabled = false;
|
||||
};
|
||||
|
||||
using CommandListAllocatorFn = CommandList *(*)(uint32_t);
|
||||
|
||||
@@ -86,11 +86,13 @@ CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::postInitComputeSetup() {
|
||||
if (!this->stateBaseAddressTracking) {
|
||||
|
||||
if (!this->stateBaseAddressTracking && !this->heaplessStateInitEnabled) {
|
||||
if (!this->isFlushTaskSubmissionEnabled) {
|
||||
programStateBaseAddress(commandContainer, false);
|
||||
}
|
||||
}
|
||||
|
||||
commandContainer.setDirtyStateForAllHeaps(false);
|
||||
|
||||
setStreamPropertiesDefaultSettings(requiredStreamState);
|
||||
@@ -235,6 +237,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
|
||||
this->useOnlyGlobalTimestamps = gfxCoreHelper.useOnlyGlobalTimestamps();
|
||||
this->maxFillPaternSizeForCopyEngine = gfxCoreHelper.getMaxFillPaternSizeForCopyEngine();
|
||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
|
||||
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled();
|
||||
this->requiredStreamState.initSupport(rootDeviceEnvironment);
|
||||
this->finalStreamState.initSupport(rootDeviceEnvironment);
|
||||
this->duplicatedInOrderCounterStorageEnabled = gfxCoreHelper.duplicatedInOrderCounterStorageEnabled(rootDeviceEnvironment);
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "level_zero/core/source/cmdlist/cmdlist_hw.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
|
||||
namespace NEO {
|
||||
struct SvmAllocationData;
|
||||
@@ -219,6 +220,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
|
||||
|
||||
MOCKABLE_VIRTUAL void checkAssert();
|
||||
ComputeFlushMethodType computeFlushMethod = nullptr;
|
||||
std::function<NEO::CompletionStamp(NEO::LinearStream &, size_t, NEO::ImmediateDispatchFlags &, NEO::Device &)> flushImmediateTaskMethod;
|
||||
std::atomic<bool> dependenciesPresent{false};
|
||||
bool latestFlushIsHostVisible = false;
|
||||
};
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include "encode_surface_state_args.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
|
||||
namespace L0 {
|
||||
|
||||
@@ -209,11 +210,10 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmedia
|
||||
};
|
||||
CommandListImp::storeReferenceTsToMappedEvents(true);
|
||||
|
||||
return this->csr->flushImmediateTask(
|
||||
cmdStreamTask,
|
||||
taskStartOffset,
|
||||
dispatchFlags,
|
||||
*(this->device->getNEODevice()));
|
||||
return this->flushImmediateTaskMethod(cmdStreamTask,
|
||||
taskStartOffset,
|
||||
dispatchFlags,
|
||||
*(this->device->getNEODevice()));
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -1363,6 +1363,12 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
// Chooses the flush entry points used by this immediate command list.
// Only acts when L0GfxCoreHelper::useImmediateComputeFlushTask() returns true for the
// given root device environment; otherwise neither computeFlushMethod nor
// flushImmediateTaskMethod is assigned by this function.
// NOTE(review): flushImmediateTaskMethod is a std::function; if it is left unassigned and
// later invoked it would be empty - confirm every caller is guarded by the same condition.
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
if (L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment)) {
|
||||
this->computeFlushMethod = &CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediateRegularTask;
|
||||
|
||||
// Bind the CSR member function matching the heapless-state-init mode.
// std::bind copies the current value of this->csr, so the csr must already be set
// when this runs; a later change of this->csr is not seen by the bound callable.
if (this->isHeaplessStateInitEnabled()) {
|
||||
this->flushImmediateTaskMethod = std::bind(&NEO::CommandStreamReceiver::flushImmediateTaskStateless, this->csr, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
|
||||
} else {
|
||||
this->flushImmediateTaskMethod = std::bind(&NEO::CommandStreamReceiver::flushImmediateTask, this->csr, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -276,7 +276,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
bool uncachedMocsKernel = isKernelUncachedMocsRequired(kernelImp->getKernelRequiresUncachedMocs());
|
||||
this->requiresQueueUncachedMocs |= kernelImp->getKernelRequiresQueueUncachedMocs();
|
||||
|
||||
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
|
||||
if (this->heaplessStateInitEnabled == false) {
|
||||
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
|
||||
}
|
||||
|
||||
auto localMemSize = static_cast<uint32_t>(neoDevice->getDeviceInfo().localMemSize);
|
||||
auto slmTotalSize = kernelImp->getSlmTotalSize();
|
||||
|
||||
@@ -101,6 +101,7 @@ ze_result_t CommandQueueImp::initialize(bool copyOnly, bool isInternal, bool imm
|
||||
this->dispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, !immediateCmdListQueue);
|
||||
auto &compilerProductHelper = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>();
|
||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled();
|
||||
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled();
|
||||
}
|
||||
return returnValue;
|
||||
}
|
||||
|
||||
@@ -93,6 +93,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
||||
bool dispatchCmdListBatchBufferAsPrimary = false;
|
||||
bool internalQueueForImmediateCommandList = false;
|
||||
bool heaplessModeEnabled = false;
|
||||
bool heaplessStateInitEnabled = false;
|
||||
};
|
||||
|
||||
using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr,
|
||||
|
||||
@@ -23,7 +23,6 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
struct CommandQueueHw : public CommandQueueImp {
|
||||
using CommandQueueImp::CommandQueueImp;
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
|
||||
ze_result_t createFence(const ze_fence_desc_t *desc, ze_fence_handle_t *phFence) override;
|
||||
ze_result_t executeCommandLists(uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *phCommandLists,
|
||||
@@ -117,6 +116,13 @@ struct CommandQueueHw : public CommandQueueImp {
|
||||
bool lockScratchController = false;
|
||||
};
|
||||
|
||||
ze_result_t executeCommandListsRegularHeapless(CommandListExecutionContext &ctx,
|
||||
uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *commandListHandles,
|
||||
ze_fence_handle_t hFence,
|
||||
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
|
||||
ze_event_handle_t *phWaitEvents);
|
||||
|
||||
MOCKABLE_VIRTUAL ze_result_t executeCommandListsRegular(CommandListExecutionContext &ctx,
|
||||
uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *commandListHandles,
|
||||
@@ -139,6 +145,11 @@ struct CommandQueueHw : public CommandQueueImp {
|
||||
ze_fence_handle_t hFence);
|
||||
MOCKABLE_VIRTUAL bool isDispatchTaskCountPostSyncRequired(ze_fence_handle_t hFence, bool containsAnyRegularCmdList) const;
|
||||
inline size_t estimateLinearStreamSizeInitial(CommandListExecutionContext &ctx);
|
||||
size_t estimateStreamSizeForExecuteCommandListsRegularHeapless(CommandListExecutionContext &ctx,
|
||||
uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *commandListHandles,
|
||||
bool instructionCacheFlushRequired,
|
||||
bool stateCacheFlushRequired);
|
||||
inline size_t estimateCommandListSecondaryStart(CommandList *commandList);
|
||||
inline size_t estimateCommandListPrimaryStart(bool required);
|
||||
inline size_t estimateCommandListResidencySize(CommandList *commandList);
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "shared/source/device/device.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/helpers/api_specific_config.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/definitions/command_encoder_args.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/heap_base_address_model.h"
|
||||
@@ -96,7 +97,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
csr->isProgramActivePartitionConfigRequired(),
|
||||
performMigration,
|
||||
csr->getSipSentFlag()};
|
||||
ctx.globalInit |= ctx.isDebugEnabled && !this->commandQueueDebugCmdsProgrammed && device->getL0Debugger();
|
||||
|
||||
ctx.globalInit |= ctx.isDebugEnabled &&
|
||||
!this->commandQueueDebugCmdsProgrammed &&
|
||||
device->getL0Debugger();
|
||||
ctx.lockScratchController = lockScratchController;
|
||||
|
||||
this->startingCmdBuffer = &this->commandStream;
|
||||
@@ -104,6 +108,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
|
||||
if (this->isCopyOnlyCommandQueue) {
|
||||
ret = this->executeCommandListsCopyOnly(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
|
||||
} else if (this->heaplessStateInitEnabled) {
|
||||
ctx.globalInit = false;
|
||||
ret = this->executeCommandListsRegularHeapless(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
|
||||
} else {
|
||||
ret = this->executeCommandListsRegular(ctx, numCommandLists, phCommandLists, hFence, nullptr, 0, nullptr);
|
||||
}
|
||||
@@ -115,6 +122,119 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Executes a batch of regular command lists on the heapless-state-init path.
//
// Sequence, as written below: set up context parameters, optionally handle indirect
// allocation residency, estimate and allocate the child linear stream, emit optional
// instruction/state cache flushes, emit one batch-buffer start per command list,
// emit the return/terminating command and the task-count post sync, submit, then wait
// for completion and fold submit + completion status into the returned ze_result_t.
//
// Parameters:
//   ctx                - execution context; several fields are written here
//                        (isDirectSubmissionEnabled, childGpuAddressPositionBeforeDynamicPreamble)
//                        and by the helpers called below.
//   numCommandLists    - number of entries in commandListHandles.
//   commandListHandles - handles converted with CommandList::fromHandle().
//   hFence             - forwarded to setupCmdListsAndContextParams() and
//                        assignCsrTaskCountToFenceIfAvailable().
//   hSignalEvent, numWaitEvents, phWaitEvents - not referenced in this body.
//
// Returns the error from makeAlignedChildStreamAndSetGpuBase() if stream allocation
// fails, otherwise the result of handleSubmissionAndCompletionResults().
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
|
||||
CommandListExecutionContext &ctx,
|
||||
uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *commandListHandles,
|
||||
ze_fence_handle_t hFence,
|
||||
ze_event_handle_t hSignalEvent, uint32_t numWaitEvents,
|
||||
ze_event_handle_t *phWaitEvents) {
|
||||
|
||||
this->setupCmdListsAndContextParams(ctx, commandListHandles, numCommandLists, hFence);
|
||||
// Must be set before the size estimate below, which reads ctx.isDirectSubmissionEnabled.
ctx.isDirectSubmissionEnabled = this->csr->isDirectSubmissionEnabled();
|
||||
bool instructionCacheFlushRequired = this->csr->isInstructionCacheFlushRequired();
|
||||
auto neoDevice = this->device->getNEODevice();
|
||||
// State cache flush is only considered when a bindless heaps helper exists for the device.
bool stateCacheFlushRequired = neoDevice->getBindlessHeapsHelper() ? neoDevice->getBindlessHeapsHelper()->getStateDirtyForContext(this->csr->getOsContext().getContextId()) : false;
|
||||
|
||||
// Declared unlocked here; handleIndirectAllocationResidency() receives it by reference.
// NOTE(review): presumably the helper takes the lock so it is held until this function
// returns - confirm against handleIndirectAllocationResidency().
std::unique_lock<std::mutex> lockForIndirect;
|
||||
if (ctx.hasIndirectAccess) {
|
||||
handleIndirectAllocationResidency(ctx.unifiedMemoryControls, lockForIndirect, ctx.isMigrationRequested);
|
||||
}
|
||||
|
||||
// The estimate call also accumulates ctx.spaceForResidency, which is consumed on the next statement.
size_t linearStreamSizeEstimate = this->estimateStreamSizeForExecuteCommandListsRegularHeapless(ctx, numCommandLists, commandListHandles, instructionCacheFlushRequired, stateCacheFlushRequired);
|
||||
|
||||
this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency);
|
||||
|
||||
NEO::LinearStream child(nullptr);
|
||||
// Early exit: nothing has been written to the stream yet at this point.
if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
this->makeCsrTagAllocationResident();
|
||||
|
||||
if (instructionCacheFlushRequired) {
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(child);
|
||||
this->csr->setInstructionCacheFlushed();
|
||||
}
|
||||
|
||||
if (stateCacheFlushRequired) {
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addStateCacheFlush(child, neoDevice->getRootDeviceEnvironment());
|
||||
// Safe to dereference: stateCacheFlushRequired is only true when the helper is non-null.
neoDevice->getBindlessHeapsHelper()->clearStateDirtyForContext(this->csr->getOsContext().getContextId());
|
||||
}
|
||||
|
||||
// Per command list: patch scratch address, chain it via batch-buffer start, then handle
// prefetch, assert tracking and printf collection.
for (auto i = 0u; i < numCommandLists; ++i) {
|
||||
auto commandList = CommandList::fromHandle(commandListHandles[i]);
|
||||
|
||||
ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition();
|
||||
|
||||
this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress());
|
||||
this->programOneCmdListBatchBufferStart(commandList, child, ctx);
|
||||
|
||||
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
|
||||
if (commandList->hasKernelWithAssert()) {
|
||||
cmdListWithAssertExecuted.exchange(true);
|
||||
}
|
||||
|
||||
this->collectPrintfContentsFromCommandsList(commandList);
|
||||
}
|
||||
|
||||
this->migrateSharedAllocationsIfRequested(ctx.isMigrationRequested, ctx.firstCommandList);
|
||||
this->programLastCommandListReturnBbStart(child, ctx);
|
||||
this->assignCsrTaskCountToFenceIfAvailable(hFence);
|
||||
this->dispatchTaskCountPostSyncRegular(ctx.isDispatchTaskCountPostSyncRequired, child);
|
||||
|
||||
auto submitResult = this->prepareAndSubmitBatchBuffer(ctx, child);
|
||||
this->updateTaskCountAndPostSync(ctx.isDispatchTaskCountPostSyncRequired);
|
||||
this->csr->makeSurfacePackNonResident(this->csr->getResidencyAllocations(), false);
|
||||
|
||||
auto completionResult = this->waitForCommandQueueCompletionAndCleanHeapContainer();
|
||||
ze_result_t retVal = this->handleSubmissionAndCompletionResults(submitResult, completionResult);
|
||||
|
||||
// Residency list is cleared regardless of the submission/completion outcome.
this->csr->getResidencyAllocations().clear();
|
||||
return retVal;
|
||||
}
|
||||
|
||||
// Computes the number of bytes to reserve in the queue's linear stream for one
// heapless execute-command-lists submission.
//
// The total is the sum of:
//   - a batch-buffer start (direct submission) or batch-buffer end (otherwise),
//     plus two MI_LOAD_REGISTER_REG when DirectSubmissionRelaxedOrdering == 1,
//   - one secondary-start estimate per command list,
//   - the post-sync barrier when ctx.isDispatchTaskCountPostSyncRequired,
//   - the instruction cache flush and/or cache flush sizes when requested.
//
// Side effect: adds each command list's residency size estimate to ctx.spaceForResidency.
// Reads ctx.isDirectSubmissionEnabled, so the caller must populate it beforehand.
// Returns the accumulated size in bytes.
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRegularHeapless(CommandListExecutionContext &ctx,
|
||||
uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *commandListHandles,
|
||||
bool instructionCacheFlushRequired,
|
||||
bool stateCacheFlushRequired) {
|
||||
// NOTE(review): these two aliases are not referenced anywhere in this body.
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
|
||||
|
||||
size_t linearStreamSizeEstimate = 0u;
|
||||
// Space for the command that terminates or chains the stream.
if (ctx.isDirectSubmissionEnabled) {
|
||||
linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
|
||||
if (NEO::debugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1) {
|
||||
linearStreamSizeEstimate += 2 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_REG);
|
||||
}
|
||||
} else {
|
||||
linearStreamSizeEstimate += NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
|
||||
}
|
||||
|
||||
// Per command list: stream bytes for its start command, and residency slots as a side effect.
for (uint32_t i = 0; i < numCommandLists; i++) {
|
||||
auto cmdList = CommandList::fromHandle(commandListHandles[i]);
|
||||
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
|
||||
ctx.spaceForResidency += estimateCommandListResidencySize(cmdList);
|
||||
}
|
||||
|
||||
if (ctx.isDispatchTaskCountPostSyncRequired) {
|
||||
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(this->device->getNEODevice()->getRootDeviceEnvironment(), false);
|
||||
}
|
||||
|
||||
if (instructionCacheFlushRequired) {
|
||||
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForInstructionCacheFlush();
|
||||
}
|
||||
|
||||
// NOTE(review): the executing path emits addStateCacheFlush() while this budgets
// getSizeForFullCacheFlush() - confirm the full-flush size covers the state cache flush.
if (stateCacheFlushRequired) {
|
||||
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForFullCacheFlush();
|
||||
}
|
||||
|
||||
return linearStreamSizeEstimate;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
|
||||
CommandListExecutionContext &ctx,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (C) 2020-2023 Intel Corporation
|
||||
# Copyright (C) 2020-2024 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
#
|
||||
@@ -16,6 +16,7 @@ set(L0_MOCKS_SOURCES
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdlist.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_cmdqueue_handle_indirect_allocs.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_context.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_device.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mock_device_for_spirv.h
|
||||
|
||||
@@ -70,6 +70,8 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
|
||||
using BaseClass = ::L0::CommandQueueHw<gfxCoreFamily>;
|
||||
using BaseClass::commandStream;
|
||||
using BaseClass::estimateStreamSizeForExecuteCommandListsRegularHeapless;
|
||||
using BaseClass::executeCommandListsRegularHeapless;
|
||||
using BaseClass::prepareAndSubmitBatchBuffer;
|
||||
using BaseClass::printfKernelContainer;
|
||||
using BaseClass::startingCmdBuffer;
|
||||
@@ -79,6 +81,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
|
||||
using L0::CommandQueue::doubleSbaWa;
|
||||
using L0::CommandQueue::frontEndStateTracking;
|
||||
using L0::CommandQueue::heaplessModeEnabled;
|
||||
using L0::CommandQueue::heaplessStateInitEnabled;
|
||||
using L0::CommandQueue::internalQueueForImmediateCommandList;
|
||||
using L0::CommandQueue::internalUsage;
|
||||
using L0::CommandQueue::partitionCount;
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Copyright (C) 2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
|
||||
|
||||
namespace L0 {
|
||||
namespace ult {
|
||||
|
||||
// Test mock layered on MockCommandQueueHw that counts calls to the indirect-allocation
// residency hooks, so tests can assert how many times each hook was invoked.
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
class MockCommandQueueHandleIndirectAllocs : public MockCommandQueueHw<gfxCoreFamily> {
|
||||
public:
|
||||
// Re-export base members so tests can reach them through this type.
using typename MockCommandQueueHw<gfxCoreFamily>::CommandListExecutionContext;
|
||||
using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegular;
|
||||
using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless;
|
||||
|
||||
// Forwards all arguments unchanged to the MockCommandQueueHw constructor.
MockCommandQueueHandleIndirectAllocs(L0::Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) : MockCommandQueueHw<gfxCoreFamily>(device, csr, desc) {}
|
||||
// Counts the call, then delegates to the base implementation.
void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) override {
|
||||
handleIndirectAllocationResidencyCalledTimes++;
|
||||
MockCommandQueueHw<gfxCoreFamily>::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration);
|
||||
}
|
||||
// Counts the call only; the base implementation is intentionally not invoked here.
void makeResidentAndMigrate(bool performMigration, const NEO::ResidencyContainer &residencyContainer) override {
|
||||
makeResidentAndMigrateCalledTimes++;
|
||||
}
|
||||
// Number of handleIndirectAllocationResidency() invocations observed.
uint32_t handleIndirectAllocationResidencyCalledTimes = 0;
|
||||
// Number of makeResidentAndMigrate() invocations observed.
uint32_t makeResidentAndMigrateCalledTimes = 0;
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
@@ -245,16 +245,16 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToAsy
|
||||
desc.pNext = 0;
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
|
||||
|
||||
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.isFlushTaskSubmissionEnabled = true;
|
||||
cmdList.callBaseExecute = true;
|
||||
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
|
||||
cmdList.isSyncModeQueue = false;
|
||||
cmdList.setCsr(&csr);
|
||||
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
cmdList.setCsr(&csr);
|
||||
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
|
||||
auto commandQueue = CommandQueue::create(productFamily, device, &csr, &desc, cmdList.isCopyOnly(), false, false, result);
|
||||
cmdList.cmdQImmediate = commandQueue;
|
||||
@@ -271,18 +271,17 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToAsy
|
||||
|
||||
HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendedToSynchronousImmCommandListThenAssertIsChecked, IsAtLeastSkl) {
|
||||
ze_result_t result;
|
||||
|
||||
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
Mock<KernelImp> kernel;
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.isFlushTaskSubmissionEnabled = true;
|
||||
cmdList.callBaseExecute = true;
|
||||
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
|
||||
cmdList.isSyncModeQueue = true;
|
||||
cmdList.setCsr(&csr);
|
||||
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
|
||||
cmdList.setCsr(&csr);
|
||||
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
|
||||
|
||||
ze_command_queue_desc_t desc = {};
|
||||
@@ -317,10 +316,10 @@ HWTEST2_F(CommandListImmediateWithAssert, givenKernelWithAssertWhenAppendToSynch
|
||||
cmdList.callBaseExecute = true;
|
||||
cmdList.cmdListType = CommandList::CommandListType::typeImmediate;
|
||||
cmdList.isSyncModeQueue = true;
|
||||
cmdList.setCsr(&csr);
|
||||
result = cmdList.initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
cmdList.setCsr(&csr);
|
||||
cmdList.getCmdContainer().setImmediateCmdListCsr(&csr);
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
|
||||
|
||||
@@ -705,11 +705,13 @@ HWTEST2_F(CmdlistAppendLaunchKernelTests,
|
||||
auto commandList = std::make_unique<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>>();
|
||||
ASSERT_NE(nullptr, commandList);
|
||||
commandList->isFlushTaskSubmissionEnabled = true;
|
||||
ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||
commandList->device = device;
|
||||
commandList->cmdListType = CommandList::CommandListType::typeImmediate;
|
||||
commandList->csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
|
||||
|
||||
ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
|
||||
|
||||
ze_command_queue_desc_t desc = {};
|
||||
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
|
||||
MockCommandQueueHw<gfxCoreFamily> mockCommandQueue(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc);
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "level_zero/core/test/unit_tests/fixtures/module_fixture.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue_handle_indirect_allocs.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_memory_manager.h"
|
||||
#include "level_zero/core/test/unit_tests/mocks/mock_module.h"
|
||||
|
||||
@@ -787,23 +788,6 @@ HWTEST2_F(EngineInstancedDeviceExecuteTests, givenEngineInstancedDeviceWithFabri
|
||||
commandQueue->destroy();
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
class MockCommandQueueHandleIndirectAllocs : public MockCommandQueueHw<gfxCoreFamily> {
|
||||
public:
|
||||
using typename MockCommandQueueHw<gfxCoreFamily>::CommandListExecutionContext;
|
||||
using MockCommandQueueHw<gfxCoreFamily>::executeCommandListsRegular;
|
||||
MockCommandQueueHandleIndirectAllocs(L0::Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) : MockCommandQueueHw<gfxCoreFamily>(device, csr, desc) {}
|
||||
void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock<std::mutex> &lockForIndirect, bool performMigration) override {
|
||||
handleIndirectAllocationResidencyCalledTimes++;
|
||||
MockCommandQueueHw<gfxCoreFamily>::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration);
|
||||
}
|
||||
void makeResidentAndMigrate(bool performMigration, const NEO::ResidencyContainer &residencyContainer) override {
|
||||
makeResidentAndMigrateCalledTimes++;
|
||||
}
|
||||
uint32_t handleIndirectAllocationResidencyCalledTimes = 0;
|
||||
uint32_t makeResidentAndMigrateCalledTimes = 0;
|
||||
};
|
||||
|
||||
HWTEST2_F(CommandQueueIndirectAllocations, givenCtxWithIndirectAccessWhenExecutingCommandListImmediateWithFlushTaskThenHandleIndirectAccessCalled, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
auto csr = neoDevice->getDefaultEngine().commandStreamReceiver;
|
||||
|
||||
@@ -580,6 +580,15 @@ class CommandStreamReceiverMock : public CommandStreamReceiver {
|
||||
return cs;
|
||||
}
|
||||
|
||||
// Mock override: ignores every argument and returns a value-initialized CompletionStamp.
CompletionStamp flushImmediateTaskStateless(
|
||||
LinearStream &immediateCommandStream,
|
||||
size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
Device &device) override {
|
||||
CompletionStamp cs = {};
|
||||
return cs;
|
||||
}
|
||||
|
||||
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
|
||||
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override {
|
||||
CompletionStamp cs = {};
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "shared/source/gmm_helper/client_context/gmm_client_context.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/cache_policy.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_walk_order.h"
|
||||
@@ -284,33 +285,37 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
}
|
||||
}
|
||||
|
||||
if (container.isAnyHeapDirty() ||
|
||||
args.requiresUncachedMocs) {
|
||||
bool heaplessStateInitEnabled = rootDeviceEnvironment.getHelper<CompilerProductHelper>().isHeaplessStateInitEnabled();
|
||||
|
||||
PipeControlArgs syncArgs;
|
||||
syncArgs.dcFlushEnable = args.dcFlushEnable;
|
||||
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
|
||||
STATE_BASE_ADDRESS sbaCmd;
|
||||
auto gmmHelper = container.getDevice()->getGmmHelper();
|
||||
uint32_t statelessMocsIndex =
|
||||
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
|
||||
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
|
||||
if (heaplessStateInitEnabled == false) {
|
||||
if (container.isAnyHeapDirty() ||
|
||||
args.requiresUncachedMocs) {
|
||||
|
||||
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
|
||||
&container, // container
|
||||
sbaCmd, // sbaCmd
|
||||
nullptr, // sbaProperties
|
||||
statelessMocsIndex, // statelessMocsIndex
|
||||
l1CachePolicy, // l1CachePolicy
|
||||
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
args.partitionCount > 1, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
heaplessModeEnabled, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
PipeControlArgs syncArgs;
|
||||
syncArgs.dcFlushEnable = args.dcFlushEnable;
|
||||
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
|
||||
STATE_BASE_ADDRESS sbaCmd;
|
||||
auto gmmHelper = container.getDevice()->getGmmHelper();
|
||||
uint32_t statelessMocsIndex =
|
||||
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
|
||||
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
|
||||
|
||||
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
|
||||
&container, // container
|
||||
sbaCmd, // sbaCmd
|
||||
nullptr, // sbaProperties
|
||||
statelessMocsIndex, // statelessMocsIndex
|
||||
l1CachePolicy, // l1CachePolicy
|
||||
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
args.partitionCount > 1, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
heaplessModeEnabled, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
}
|
||||
}
|
||||
|
||||
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
||||
|
||||
@@ -105,6 +105,9 @@ class CommandStreamReceiver {
|
||||
virtual CompletionStamp flushBcsTask(LinearStream &commandStream, size_t commandStreamStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) = 0;
|
||||
virtual CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags, Device &device) = 0;
|
||||
|
||||
virtual CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags, Device &device) = 0;
|
||||
virtual SubmissionStatus sendRenderStateCacheFlush() = 0;
|
||||
|
||||
virtual bool flushBatchedSubmissions() = 0;
|
||||
|
||||
@@ -70,6 +70,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags, Device &device) override;
|
||||
|
||||
CompletionStamp flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags, Device &device) override;
|
||||
|
||||
void forcePipeControl(NEO::LinearStream &commandStreamCSR);
|
||||
|
||||
bool flushBatchedSubmissions() override;
|
||||
@@ -305,6 +308,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
|
||||
LinearStream &immediateCommandStream,
|
||||
ImmediateFlushData &flushData);
|
||||
|
||||
void handleImmediateFlushStatelessAllocationsResidency(size_t csrEstimatedSize,
|
||||
LinearStream &csrStream);
|
||||
|
||||
inline void handleImmediateFlushAllocationsResidency(Device &device,
|
||||
LinearStream &immediateCommandStream,
|
||||
ImmediateFlushData &flushData,
|
||||
|
||||
@@ -24,6 +24,19 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTaskStateless(
|
||||
return {};
|
||||
}
|
||||
|
||||
// Placeholder implementation of the stateless immediate flush for this build:
// it unconditionally trips UNRECOVERABLE_IF(true) and none of the arguments are used.
// NOTE(review): the trailing `return {}` presumably exists only to satisfy the return
// type - confirm UNRECOVERABLE_IF never returns control.
template <typename GfxFamily>
|
||||
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTaskStateless(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags, Device &device) {
|
||||
UNRECOVERABLE_IF(true);
|
||||
return {};
|
||||
}
|
||||
|
||||
// Placeholder implementation for this build: unconditionally trips
// UNRECOVERABLE_IF(true); csrEstimatedSize and csrStream are not used.
template <typename GfxFamily>
|
||||
void CommandStreamReceiverHw<GfxFamily>::handleImmediateFlushStatelessAllocationsResidency(size_t csrEstimatedSize,
|
||||
LinearStream &csrStream) {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
SubmissionStatus CommandStreamReceiverHw<GfxFamily>::programHeaplessProlog(Device &device) {
|
||||
UNRECOVERABLE_IF(true);
|
||||
|
||||
@@ -358,9 +358,9 @@ size_t MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(bool tl
|
||||
template <typename GfxFamily>
|
||||
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool tlbInvalidationRequired) {
|
||||
|
||||
size_t size = getSizeForSingleBarrier(tlbInvalidationRequired) +
|
||||
getSizeForBarrierWa(rootDeviceEnvironment) +
|
||||
getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
|
||||
size_t size = getSizeForSingleBarrier(tlbInvalidationRequired);
|
||||
size += getSizeForBarrierWa(rootDeviceEnvironment);
|
||||
size += getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
@@ -61,6 +61,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
|
||||
using BaseClass::getScratchSpaceController;
|
||||
using BaseClass::handleAllocationsResidencyForHeaplessProlog;
|
||||
using BaseClass::handleFrontEndStateTransition;
|
||||
using BaseClass::handleImmediateFlushStatelessAllocationsResidency;
|
||||
using BaseClass::handlePipelineSelectStateTransition;
|
||||
using BaseClass::handleStateBaseAddressStateTransition;
|
||||
using BaseClass::heapStorageRequiresRecyclingTag;
|
||||
|
||||
@@ -58,3 +58,13 @@ CompletionStamp MockCommandStreamReceiver::flushImmediateTask(
|
||||
CompletionStamp stamp = {taskCount, taskLevel, flushStamp->peekStamp()};
|
||||
return stamp;
|
||||
}
|
||||
|
||||
// Mock stateless immediate flush: ignores all arguments, increments taskCount, and
// returns a CompletionStamp built from the new taskCount, taskLevel and the current
// flush stamp value.
CompletionStamp MockCommandStreamReceiver::flushImmediateTaskStateless(
|
||||
LinearStream &immediateCommandStream,
|
||||
size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
Device &device) {
|
||||
++taskCount;
|
||||
CompletionStamp stamp = {taskCount, taskLevel, flushStamp->peekStamp()};
|
||||
return stamp;
|
||||
}
|
||||
|
||||
@@ -135,6 +135,12 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
Device &device) override;
|
||||
|
||||
CompletionStamp flushImmediateTaskStateless(
|
||||
LinearStream &immediateCommandStream,
|
||||
size_t immediateCommandStreamStart,
|
||||
ImmediateDispatchFlags &dispatchFlags,
|
||||
Device &device) override;
|
||||
|
||||
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
|
||||
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;
|
||||
|
||||
|
||||
@@ -5092,4 +5092,6 @@ HWTEST_F(CommandStreamReceiverHwHeaplessTest, whenHeaplessCommandStreamReceiverF
|
||||
EXPECT_ANY_THROW(csr->handleAllocationsResidencyForflushTaskStateless(nullptr, nullptr, nullptr));
|
||||
EXPECT_ANY_THROW(csr->getRequiredCmdStreamHeaplessSize(csr->recordedDispatchFlags, *pDevice));
|
||||
EXPECT_ANY_THROW(csr->getRequiredCmdStreamHeaplessSizeAligned(csr->recordedDispatchFlags, *pDevice));
|
||||
EXPECT_ANY_THROW(csr->flushImmediateTaskStateless(commandStream, 0, csr->recordedImmediateDispatchFlags, *pDevice));
|
||||
EXPECT_ANY_THROW(csr->handleImmediateFlushStatelessAllocationsResidency(0, commandStream));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user