feature: introduce states programming at driver init heapless ocl

Related-To: NEO-7824
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2024-03-07 16:50:57 +00:00
committed by Compute-Runtime-Automation
parent ea69b156d2
commit 168445784e
32 changed files with 383 additions and 96 deletions

View File

@@ -87,5 +87,11 @@ if(SUPPORT_DG2_AND_LATER)
)
endif()
# When SUPPORT_HEAPLESS is not set, additionally append
# command_stream_receiver_hw_heap_addressing.inl to the NEO_CORE_COMMAND_STREAM source list.
if(NOT SUPPORT_HEAPLESS)
list(APPEND NEO_CORE_COMMAND_STREAM
${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_hw_heap_addressing.inl
)
endif()
set_property(GLOBAL PROPERTY NEO_CORE_COMMAND_STREAM ${NEO_CORE_COMMAND_STREAM})
add_subdirectories()

View File

@@ -96,6 +96,11 @@ class CommandStreamReceiver {
virtual CompletionStamp flushTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
virtual CompletionStamp flushTaskStateless(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
virtual CompletionStamp flushBcsTask(LinearStream &commandStream, size_t commandStreamStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) = 0;
virtual CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
ImmediateDispatchFlags &dispatchFlags, Device &device) = 0;
@@ -414,7 +419,7 @@ class CommandStreamReceiver {
lastPreemptionMode = value;
}
virtual SubmissionStatus initializeDeviceWithFirstSubmission() = 0;
virtual SubmissionStatus initializeDeviceWithFirstSubmission(Device &device) = 0;
uint32_t getNumClients() const {
return this->numClients.load();

View File

@@ -59,6 +59,12 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;
CompletionStamp flushTaskStateless(LinearStream &commandStream, size_t commandStreamStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) override;
void addPipeControlFlushTaskIfNeeded(LinearStream &commandStreamCSR, TaskCountType taskLevel);
CompletionStamp flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart, const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) override;
CompletionStamp flushImmediateTask(LinearStream &immediateCommandStream, size_t immediateCommandStreamStart,
@@ -79,6 +85,10 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamSize(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdStreamSizeAligned(const DispatchBcsFlags &dispatchBcsFlags);
size_t getRequiredCmdStreamHeaplessSize(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdStreamHeaplessSizeAligned(const DispatchFlags &dispatchFlags, Device &device);
size_t getRequiredCmdSizeForPreamble(Device &device) const;
size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const;
@@ -164,7 +174,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
return getCmdSizeForStallingNoPostSyncCommands();
}
void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override;
SubmissionStatus initializeDeviceWithFirstSubmission() override;
SubmissionStatus initializeDeviceWithFirstSubmission(Device &device) override;
HeapDirtyState &getDshState() {
return dshState;
@@ -179,6 +189,12 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void dispatchRayTracingStateCommand(LinearStream &cmdStream, Device &device);
uint64_t getScratchPatchAddress();
SubmissionStatus programHeaplessProlog(Device &device);
void programHeaplessStateProlog(Device &device, LinearStream &commandStream);
void programStateBaseAddressHeapless(Device &device, LinearStream &commandStream);
void programComputeModeHeapless(Device &device, LinearStream &commandStream);
void handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh);
protected:
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
void programL3(LinearStream &csr, uint32_t &newL3Config);
@@ -198,6 +214,8 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
void programEnginePrologue(LinearStream &csr);
size_t getCmdSizeForPrologue() const;
size_t getCmdSizeForHeaplessPrologue(Device &device) const;
void handleAllocationsResidencyForHeaplessProlog(LinearStream &linearStream, Device &device);
void setClearSlmWorkAroundParameter(PipeControlArgs &args);
void addPipeControlBeforeStateSip(LinearStream &commandStream, Device &device);
@@ -328,6 +346,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t cmdStreamStart = 0;
uint32_t latestSentBcsWaValue = std::numeric_limits<uint32_t>::max();
bool heaplessPrologueSent = false;
};
} // namespace NEO

View File

@@ -181,6 +181,30 @@ size_t CommandStreamReceiverHw<GfxFamily>::getCmdsSizeForHardwareContext() const
return getCmdSizeForPrologue();
}
// Emits, in order, the synchronization commands that may be needed in the CSR stream
// before a task:
//   1. an instruction cache flush, if one was requested (the request flag is then cleared);
//   2. a single barrier when the incoming task level is above the one tracked by this CSR
//      and timestamp packet writes are disabled; the tracked task level is raised either way;
//   3. a forced pipe control when the ForcePipeControlPriorToWalker debug flag is set.
//
// commandStreamCSR - stream that receives the commands.
// taskLevel        - task level of the work about to be flushed.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::addPipeControlFlushTaskIfNeeded(LinearStream &commandStreamCSR, TaskCountType taskLevel) {
    if (this->requiresInstructionCacheFlush) {
        MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(commandStreamCSR);
        this->requiresInstructionCacheFlush = false;
    }

    // A higher task level means this work depends on a previous walker; serialize against it
    // to avoid concurrency issues.
    const bool dependsOnPreviousWalker = taskLevel > this->taskLevel;
    if (dependsOnPreviousWalker) {
        // The explicit barrier is only programmed when timestamp packet writes are disabled.
        if (!timestampPacketWriteEnabled) {
            PipeControlArgs barrierArgs;
            MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(commandStreamCSR, barrierArgs);
        }

        this->taskLevel = taskLevel;
        DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskCount", peekTaskCount());
    }

    if (debugManager.flags.ForcePipeControlPriorToWalker.get()) {
        forcePipeControl(commandStreamCSR);
    }
}
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushBcsTask(LinearStream &commandStreamTask, size_t commandStreamTaskStart,
const DispatchBcsFlags &dispatchBcsFlags, const HardwareInfo &hwInfo) {
@@ -509,25 +533,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
MemorySynchronizationCommands<GfxFamily>::addStateCacheFlush(commandStreamCSR, device.getRootDeviceEnvironment());
}
if (requiresInstructionCacheFlush) {
MemorySynchronizationCommands<GfxFamily>::addInstructionCacheFlush(commandStreamCSR);
requiresInstructionCacheFlush = false;
}
// Add a Pipe Control if we have a dependency on a previous walker to avoid concurrency issues.
if (taskLevel > this->taskLevel) {
const auto programPipeControl = !timestampPacketWriteEnabled;
if (programPipeControl) {
PipeControlArgs args;
MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(commandStreamCSR, args);
}
this->taskLevel = taskLevel;
DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "this->taskCount", peekTaskCount());
}
if (debugManager.flags.ForcePipeControlPriorToWalker.get()) {
forcePipeControl(commandStreamCSR);
}
addPipeControlFlushTaskIfNeeded(commandStreamCSR, taskLevel);
this->makeResident(*tagAllocation);
@@ -1452,11 +1458,6 @@ size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForComputeMode() {
return EncodeComputeMode<GfxFamily>::getCmdSizeForComputeMode(this->peekRootDeviceEnvironment(), hasSharedHandles(), isRcs());
}
// Performs the first submission by flushing a tag update and reports the
// status returned by that flush.
template <typename GfxFamily>
SubmissionStatus CommandStreamReceiverHw<GfxFamily>::initializeDeviceWithFirstSubmission() {
    const SubmissionStatus firstSubmissionStatus = flushTagUpdate();
    return firstSubmissionStatus;
}
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleFrontEndStateTransition(const DispatchFlags &dispatchFlags) {
if (streamProperties.frontEndState.disableOverdispatch.value != -1) {
@@ -2302,5 +2303,4 @@ inline void CommandStreamReceiverHw<GfxFamily>::chainCsrWorkToTask(LinearStream
this->makeResident(*chainedBatchBuffer);
EncodeNoop<GfxFamily>::alignToCacheLine(commandStreamCSR);
}
} // namespace NEO

View File

@@ -0,0 +1,76 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_stream/command_stream_receiver_hw.h"
namespace NEO {
// Performs the first submission by flushing a tag update and reports the
// status returned by that flush. The `device` argument is not used by this
// implementation.
template <typename GfxFamily>
SubmissionStatus CommandStreamReceiverHw<GfxFamily>::initializeDeviceWithFirstSubmission(Device &device) {
    const SubmissionStatus firstSubmissionStatus = flushTagUpdate();
    return firstSubmissionStatus;
}
// Stub of the stateless flush path: the body unconditionally hits
// UNRECOVERABLE_IF(true) and uses none of its arguments.
// NOTE(review): presumably UNRECOVERABLE_IF terminates execution, so the
// value-initialized CompletionStamp below only satisfies the signature - confirm.
template <typename GfxFamily>
CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTaskStateless(
LinearStream &commandStream, size_t commandStreamStart,
const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh,
TaskCountType taskLevel, DispatchFlags &dispatchFlags, Device &device) {
UNRECOVERABLE_IF(true);
return {};
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); `device` is not used.
// NOTE(review): presumably UNRECOVERABLE_IF terminates execution, so the
// SubmissionStatus::unsupported return is only a fallback value - confirm.
template <typename GfxFamily>
SubmissionStatus CommandStreamReceiverHw<GfxFamily>::programHeaplessProlog(Device &device) {
UNRECOVERABLE_IF(true);
return SubmissionStatus::unsupported;
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); nothing is written to
// `commandStream` and `device` is not used.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::programStateBaseAddressHeapless(Device &device, LinearStream &commandStream) {
UNRECOVERABLE_IF(true);
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); nothing is written to
// `commandStream` and `device` is not used.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::programComputeModeHeapless(Device &device, LinearStream &commandStream) {
UNRECOVERABLE_IF(true);
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); nothing is written to
// `commandStream` and `device` is not used.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::programHeaplessStateProlog(Device &device, LinearStream &commandStream) {
UNRECOVERABLE_IF(true);
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); `device` is not used.
// NOTE(review): presumably UNRECOVERABLE_IF terminates execution, so the
// `return 0` is only a fallback value - confirm.
template <typename GfxFamily>
size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForHeaplessPrologue(Device &device) const {
UNRECOVERABLE_IF(true);
return 0;
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); none of the heap
// pointers (`dsh`, `ioh`, `ssh`) are dereferenced or otherwise used.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleAllocationsResidencyForflushTaskStateless(const IndirectHeap *dsh, const IndirectHeap *ioh, const IndirectHeap *ssh) {
UNRECOVERABLE_IF(true);
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); `linearStream` and
// `device` are not used.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::handleAllocationsResidencyForHeaplessProlog(LinearStream &linearStream, Device &device) {
UNRECOVERABLE_IF(true);
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); `dispatchFlags` and
// `device` are not used.
// NOTE(review): presumably UNRECOVERABLE_IF terminates execution, so the
// `return 0u` is only a fallback value - confirm.
template <typename GfxFamily>
inline size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamHeaplessSize(const DispatchFlags &dispatchFlags, Device &device) {
UNRECOVERABLE_IF(true);
return 0u;
}
// Stub: unconditionally hits UNRECOVERABLE_IF(true); `dispatchFlags` and
// `device` are not used.
// NOTE(review): presumably UNRECOVERABLE_IF terminates execution, so the
// `return 0u` is only a fallback value - confirm.
template <typename GfxFamily>
inline size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamHeaplessSizeAligned(const DispatchFlags &dispatchFlags, Device &device) {
UNRECOVERABLE_IF(true);
return 0u;
}
} // namespace NEO