// compute-runtime/shared/source/direct_submission/direct_submission_hw.h
//
// 258 lines
// 9.3 KiB
// C++

/*
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/queue_throttle.h"
#include "shared/source/helpers/completion_stamp.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/utilities/stackvec.h"

#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
namespace NEO {
// Forward declarations: this header only stores a MemoryManager pointer and a
// RootDeviceEnvironment reference, so the full definitions are not needed here.
class MemoryManager;
struct RootDeviceEnvironment;
// Byte packing keeps every field at the exact offset implied by the padding
// arrays below; the default packing is restored right after the size check.
#pragma pack(1)
/// Fixed 256-byte layout made of four consecutive 64-byte slots.
///
/// Each slot starts with its payload field(s) and is filled up to 64 bytes by
/// a `reservedCachelineN` byte array, so the payload fields begin at offsets
/// 0, 64, 128 and 192.
struct RingSemaphoreData {
    // Slot 0: bytes 0-63.
    uint32_t queueWorkCount;
    uint8_t reservedCacheline0[64 - sizeof(uint32_t)];

    // Slot 1: bytes 64-127.
    uint32_t tagAllocation;
    uint8_t reservedCacheline1[64 - sizeof(uint32_t)];

    // Slot 2: bytes 128-191.
    uint32_t diagnosticModeCounter;
    uint32_t reserved0Uint32;
    uint64_t reserved1Uint64;
    uint8_t reservedCacheline2[64 - 2 * sizeof(uint32_t) - sizeof(uint64_t)];

    // Slot 3: bytes 192-255.
    uint64_t miFlushSpace;
    uint8_t reservedCacheline3[64 - sizeof(uint64_t)];
};
// Guards the layout: any change to the fields above must keep the total at 4 x 64 bytes.
static_assert(sizeof(RingSemaphoreData) == 4 * 64u, "Invalid size for RingSemaphoreData");
#pragma pack()
// List of GraphicsAllocation pointers; this is the parameter type of
// DirectSubmissionHw::makeResourcesResident(). The template argument 8 is
// presumably StackVec's inline capacity - confirm against stackvec.h.
// NOTE(review): GraphicsAllocation is named here before the forward declaration
// further down in this header, so this line relies on one of the includes above
// having declared it already.
using DirectSubmissionAllocations = StackVec<GraphicsAllocation *, 8>;
/// Address/value pair for a tag; filled through
/// DirectSubmissionHw::getTagAddressValue(). Both fields default to zero.
struct TagData {
    uint64_t tagAddress{0};
    uint64_t tagValue{0};
};
/// Sfence mode selector stored in DirectSubmissionHw::sfenceMode.
///
/// The numeric values are spelled out explicitly and the underlying type is
/// int32_t. Judging by the enumerator names, the modes choose whether a fence
/// is issued before and/or after the semaphore update - confirm against the
/// implementation.
enum class DirectSubmissionSfenceMode : int32_t {
    disabled = 0,
    beforeSemaphoreOnly = 1,
    beforeAndAfterSemaphore = 2,
};
// Compile-time defaults; both flags are true.
namespace UllsDefaults {
inline constexpr bool defaultDisableCacheFlush{true};
inline constexpr bool defaultDisableMonitorFence{true};
} // namespace UllsDefaults
// Forward declarations (alphabetical). The declarations below use these types
// only through pointers, references or std::unique_ptr, so incomplete types
// are sufficient in this header.
struct BatchBuffer;
class DirectSubmissionDiagnosticsCollector;
class FlushStampTracker;
class GraphicsAllocation;
struct HardwareInfo;
class MemoryOperationsHandler;
class OsContext;
// Parameter bundle accepted by the DirectSubmissionHw constructor and by
// DirectSubmissionHw::create(). It derives from NonCopyableClass and holds
// reference members, so an instance must be built through the constructor below.
struct DirectSubmissionInputParams : NonCopyableClass {
// Builds the bundle from a command stream receiver. The definition is not in
// this header; presumably every member is populated from the receiver - confirm.
// NOTE(review): the constructor is intentionally left non-explicit here; callers
// may rely on the implicit conversion from CommandStreamReceiver.
DirectSubmissionInputParams(const CommandStreamReceiver &commandStreamReceiver);
// Reference members: must be bound in the constructor's initializer list.
OsContext &osContext;
const RootDeviceEnvironment &rootDeviceEnvironment;
// Pointer members default to nullptr until the constructor assigns them.
MemoryManager *memoryManager = nullptr;
const GraphicsAllocation *globalFenceAllocation = nullptr;
GraphicsAllocation *workPartitionAllocation = nullptr;
GraphicsAllocation *completionFenceAllocation = nullptr;
// Const member without a default: must also be set in the initializer list.
const uint32_t rootDeviceIndex;
};
/// Base class template for direct submission, parameterised on the hardware
/// family and on a dispatcher type.
///
/// The public surface covers initialisation, stopping the ring buffer and
/// dispatching a BatchBuffer. Backend-specific steps are pure virtual hooks
/// (allocateOsResources, submit, handleResidency, handleSwitchRingBuffers,
/// updateTagValue, getTagAddressValue, isCompleted) that a derived class has to
/// implement. Most member functions are only declared here; their definitions
/// are not part of this header.
template <typename GfxFamily, typename Dispatcher>
class DirectSubmissionHw {
  public:
    DirectSubmissionHw(const DirectSubmissionInputParams &inputParams);
    virtual ~DirectSubmissionHw();

    bool initialize(bool submitOnInit, bool useNotify);

    MOCKABLE_VIRTUAL bool stopRingBuffer(bool blocking);

    MOCKABLE_VIRTUAL bool dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp);

    /// Returns an error code related to dispatching (see dispatchErrorCode);
    /// defined outside this header.
    uint32_t getDispatchErrorCode();

    /// Factory returning an owning pointer to a direct-submission object.
    static std::unique_ptr<DirectSubmissionHw<GfxFamily, Dispatcher>> create(const DirectSubmissionInputParams &inputParams);

    /// Base implementation has no completion value to expose and returns nullptr.
    virtual TaskCountType *getCompletionValuePointer() { return nullptr; }

    bool isRelaxedOrderingEnabled() const {
        return relaxedOrderingEnabled;
    }

    /// No-op in the base class; overridable.
    virtual void flushMonitorFence() {}

    /// Read-only accessor for the stored throttle value.
    QueueThrottle getLastSubmittedThrottle() const {
        return this->lastSubmittedThrottle;
    }

  protected:
    // Eight cache lines in bytes, and the same span counted in uint32_t units.
    static constexpr size_t prefetchSize = 8 * MemoryConstants::cacheLineSize;
    static constexpr size_t prefetchNoops = prefetchSize / sizeof(uint32_t);

    // Resource management.
    bool allocateResources();
    MOCKABLE_VIRTUAL void deallocateResources();
    MOCKABLE_VIRTUAL bool makeResourcesResident(DirectSubmissionAllocations &allocations);
    virtual bool allocateOsResources() = 0;
    virtual bool submit(uint64_t gpuAddress, size_t size) = 0;
    virtual bool handleResidency() = 0;
    void handleNewResourcesSubmission();
    bool isNewResourceHandleNeeded();
    size_t getSizeNewResourceHandler();

    // Optional hooks; both do nothing unless overridden.
    virtual void handleStopRingBuffer() {}
    virtual void ensureRingCompletion() {}

    // Ring buffer switching.
    void switchRingBuffersNeeded(size_t size, ResidencyContainer *allocationsForResidency);
    uint64_t switchRingBuffers(ResidencyContainer *allocationsForResidency);
    virtual void handleSwitchRingBuffers(ResidencyContainer *allocationsForResidency) = 0;
    GraphicsAllocation *switchRingBuffersAllocations();

    // Largest uint64_t value; presumably the failure sentinel of updateTagValue() - confirm.
    constexpr static uint64_t updateTagValueFail = std::numeric_limits<uint64_t>::max();
    virtual uint64_t updateTagValue(bool requireMonitorFence) = 0;
    virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
    virtual void getTagAddressValue(TagData &tagData) = 0;

    void unblockGpu();
    bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size);
    bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);

    void cpuCachelineFlush(void *ptr, size_t size);

    // Command-section helpers: each dispatchX() is declared next to the
    // getSize...() query with the matching name.
    void dispatchSemaphoreSection(uint32_t value);
    size_t getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired);

    MOCKABLE_VIRTUAL void dispatchRelaxedOrderingSchedulerSection(uint32_t value);

    void dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr);

    void dispatchStartSection(uint64_t gpuStartAddress);
    size_t getSizeStartSection();

    size_t getUllsStateSize();
    void dispatchUllsState();

    void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress);
    size_t getSizeSwitchRingBufferSection();

    MOCKABLE_VIRTUAL void dispatchRelaxedOrderingQueueStall();
    size_t getSizeDispatchRelaxedOrderingQueueStall();

    MOCKABLE_VIRTUAL void dispatchTaskStoreSection(uint64_t taskStartSectionVa);
    MOCKABLE_VIRTUAL void preinitializeRelaxedOrderingSections();

    void initRelaxedOrderingRegisters();

    void setReturnAddress(void *returnCmd, uint64_t returnAddress);

    void *dispatchWorkloadSection(BatchBuffer &batchBuffer, bool dispatchMonitorFence);
    size_t getSizeDispatch(bool relaxedOrderingSchedulerRequired, bool returnPtrsRequired, bool dispatchMonitorFence);

    void dispatchPrefetchMitigation();
    size_t getSizePrefetchMitigation();

    void dispatchDisablePrefetcher(bool disable);
    size_t getSizeDisablePrefetcher();

    MOCKABLE_VIRTUAL void dispatchStaticRelaxedOrderingScheduler();

    size_t getSizeEnd(bool relaxedOrderingSchedulerRequired);

    void dispatchPartitionRegisterConfiguration();
    size_t getSizePartitionRegisterConfigurationSection();

    void dispatchSystemMemoryFenceAddress();
    size_t getSizeSystemMemoryFenceAddress();

    // Diagnostics.
    void createDiagnostic();
    void initDiagnostic(bool &submitOnInit);
    MOCKABLE_VIRTUAL void performDiagnosticMode();
    void dispatchDiagnosticModeSection();
    size_t getDiagnosticModeSection();

    void setImmWritePostSyncOffset();

    virtual bool isCompleted(uint32_t ringBufferIndex) = 0;

    void updateRelaxedOrderingQueueSize(uint32_t newSize);

    /// One entry of `ringBuffers`: a ring buffer allocation paired with a
    /// completion fence value. A default-constructed entry has fence 0 and no
    /// allocation.
    struct RingBufferUse {
        RingBufferUse() = default;
        RingBufferUse(FlushStamp completionFence, GraphicsAllocation *ringBuffer) : completionFence(completionFence), ringBuffer(ringBuffer) {}

        constexpr static uint32_t initialRingBufferCount = 2u;

        FlushStamp completionFence = 0ull;
        GraphicsAllocation *ringBuffer = nullptr;
    };

    // NOTE: the data members below keep their original declaration order,
    // which determines both object layout and initialisation order.

    // Ring buffer state.
    std::vector<RingBufferUse> ringBuffers;
    std::unique_ptr<uint8_t[]> preinitializedTaskStoreSection;
    std::unique_ptr<uint8_t[]> preinitializedRelaxedOrderingScheduler;
    uint32_t currentRingBuffer = 0u;
    uint32_t previousRingBuffer = 0u;
    uint32_t maxRingBufferCount = std::numeric_limits<uint32_t>::max(); // effectively unlimited by default
    LinearStream ringCommandStream;

    std::unique_ptr<DirectSubmissionDiagnosticsCollector> diagnostic;

    // GPU virtual addresses (all zero until assigned).
    uint64_t semaphoreGpuVa = 0u;
    uint64_t gpuVaForMiFlush = 0u;
    uint64_t gpuVaForAdditionalSynchronizationWA = 0u;
    uint64_t relaxedOrderingQueueSizeLimitValueVa = 0;

    // Environment handles; the reference and const members must be set by the constructor.
    OsContext &osContext;
    const uint32_t rootDeviceIndex;
    MemoryManager *memoryManager = nullptr;
    MemoryOperationsHandler *memoryOperationHandler = nullptr;
    const HardwareInfo *hwInfo = nullptr;
    const RootDeviceEnvironment &rootDeviceEnvironment;

    // Allocations (non-owning raw pointers as far as this header shows).
    const GraphicsAllocation *globalFenceAllocation = nullptr;
    GraphicsAllocation *completionFenceAllocation = nullptr;
    GraphicsAllocation *semaphores = nullptr;
    GraphicsAllocation *workPartitionAllocation = nullptr;
    GraphicsAllocation *deferredTasksListAllocation = nullptr;
    GraphicsAllocation *relaxedOrderingSchedulerAllocation = nullptr;

    // CPU-side pointers; the volatile-qualified ones are presumably also
    // written by the GPU - confirm.
    void *semaphorePtr = nullptr;
    volatile RingSemaphoreData *semaphoreData = nullptr;
    volatile void *workloadModeOneStoreAddress = nullptr;
    uint32_t *pciBarrierPtr = nullptr;

    // Counters and modes.
    uint32_t currentQueueWorkCount = 1u;
    uint32_t workloadMode = 0;
    uint32_t workloadModeOneExpectedValue = 0u;
    uint32_t activeTiles = 1u;
    uint32_t immWritePostSyncOffset = 0u;
    uint32_t currentRelaxedOrderingQueueSize = 0;
    DirectSubmissionSfenceMode sfenceMode = DirectSubmissionSfenceMode::beforeAndAfterSemaphore;
    volatile uint32_t reserved = 0u;
    uint32_t dispatchErrorCode = 0;
    QueueThrottle lastSubmittedThrottle = QueueThrottle::MEDIUM;

    // Feature and state flags, with their in-class defaults.
    bool ringStart = false;
    bool disableCpuCacheFlush = true;
    bool disableCacheFlush = false;
    bool disableMonitorFence = false;
    bool partitionedMode = false;
    bool partitionConfigSet = true;
    bool useNotifyForPostSync = false;
    bool miMemFenceRequired = false;
    bool systemMemoryFenceAddressSet = false;
    bool completionFenceSupported = false;
    bool isDisablePrefetcherRequired = false;
    bool dcFlushRequired = false;
    bool detectGpuHang = true;
    bool relaxedOrderingEnabled = false;
    bool relaxedOrderingInitialized = false;
    bool relaxedOrderingSchedulerRequired = false;
    bool inputMonitorFenceDispatchRequirement = true;
};
} // namespace NEO