feature: Introduce ULLS light

Add the core implementation of ULLS without the VM_BIND interface, a.k.a.
ULLS light.

Related-To: NEO-13922

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-02-11 14:31:01 +00:00
committed by Compute-Runtime-Automation
parent 488ac4bb6a
commit bc2b49b958
23 changed files with 293 additions and 62 deletions

View File

@@ -112,8 +112,9 @@ class DirectSubmissionHw {
MOCKABLE_VIRTUAL void deallocateResources();
MOCKABLE_VIRTUAL bool makeResourcesResident(DirectSubmissionAllocations &allocations);
virtual bool allocateOsResources() = 0;
virtual bool submit(uint64_t gpuAddress, size_t size) = 0;
virtual bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) = 0;
virtual bool handleResidency() = 0;
// ULLS light hook: called from dispatchCommandBuffer with the batch buffer's residency container.
// The base implementation intentionally does nothing; OS-specific subclasses may override it.
virtual void handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency) {}
void handleNewResourcesSubmission();
bool isNewResourceHandleNeeded();
size_t getSizeNewResourceHandler();
@@ -129,7 +130,7 @@ class DirectSubmissionHw {
virtual bool dispatchMonitorFenceRequired(bool requireMonitorFence);
virtual void getTagAddressValue(TagData &tagData) = 0;
void unblockGpu();
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait);
bool submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait, ResidencyContainer *allocationsForResidency);
bool copyCommandBufferIntoRing(BatchBuffer &batchBuffer);
void cpuCachelineFlush(void *ptr, size_t size);

View File

@@ -502,7 +502,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
}
dispatchSemaphoreSection(currentQueueWorkCount);
ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize, nullptr);
performDiagnosticMode();
return ringStart;
}
@@ -964,6 +964,8 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchUllsState() {
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
this->handleRingRestartForUllsLightResidency(batchBuffer.allocationsForResidency);
lastSubmittedThrottle = batchBuffer.throttle;
bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);
bool inputRequiredMonitorFence = false;
@@ -1017,7 +1019,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
cpuCachelineFlush(currentPosition, dispatchSize);
auto requiresBlockingResidencyHandling = batchBuffer.pagingFenceSemInfo.requiresBlockingResidencyHandling;
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, requiresBlockingResidencyHandling)) {
if (!this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, requiresBlockingResidencyHandling, batchBuffer.allocationsForResidency)) {
return false;
}
@@ -1035,9 +1037,9 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
}
template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait) {
bool DirectSubmissionHw<GfxFamily, Dispatcher>::submitCommandBufferToGpu(bool needStart, uint64_t gpuAddress, size_t size, bool needWait, ResidencyContainer *allocationsForResidency) {
if (needStart) {
this->ringStart = this->submit(gpuAddress, size);
this->ringStart = this->submit(gpuAddress, size, allocationsForResidency);
return this->ringStart;
} else {
if (needWait) {

View File

@@ -7,6 +7,8 @@
#pragma once
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/os_interface/linux/drm_buffer_object.h"
#include "shared/source/os_interface/linux/drm_wrappers.h"
namespace NEO {
@@ -24,9 +26,10 @@ class DrmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {
protected:
bool allocateOsResources() override;
bool submit(uint64_t gpuAddress, size_t size) override;
bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) override;
bool handleResidency() override;
void handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency) override;
void handleStopRingBuffer() override;
void ensureRingCompletion() override;
@@ -43,5 +46,8 @@ class DrmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {
volatile TagAddressType *tagAddress;
TaskCountType completionFenceValue{};
std::chrono::microseconds gpuHangCheckPeriod{CommonConstants::gpuHangCheckTimeInUS};
std::vector<BufferObject *> residency{};
std::vector<ExecObject> execObjectsStorage{};
};
} // namespace NEO

View File

@@ -12,6 +12,7 @@
#include "shared/source/direct_submission/linux/drm_direct_submission.h"
#include "shared/source/os_interface/linux/drm_allocation.h"
#include "shared/source/os_interface/linux/drm_buffer_object.h"
#include "shared/source/os_interface/linux/drm_memory_operations_handler.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/linux/drm_wrappers.h"
#include "shared/source/os_interface/linux/ioctl_helper.h"
@@ -27,6 +28,7 @@ namespace NEO {
template <typename GfxFamily, typename Dispatcher>
DrmDirectSubmission<GfxFamily, Dispatcher>::DrmDirectSubmission(const DirectSubmissionInputParams &inputParams)
: DirectSubmissionHw<GfxFamily, Dispatcher>(inputParams) {
this->execObjectsStorage.resize(1u);
this->completionFenceValue = inputParams.initialCompletionFenceValue;
if (debugManager.flags.OverrideUserFenceStartValue.get() != -1) {
@@ -124,7 +126,7 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::allocateOsResources() {
}
template <typename GfxFamily, typename Dispatcher>
bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size) {
bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationsForResidency) {
auto bb = static_cast<DrmAllocation *>(this->ringCommandStream.getGraphicsAllocation())->getBO();
auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
@@ -132,9 +134,9 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
auto execFlags = osContextLinux->getEngineFlag() | drm.getIoctlHelper()->getDrmParamValue(DrmParam::execNoReloc);
auto &drmContextIds = osContextLinux->getDrmContextIds();
ExecObject execObject{};
this->handleResidency();
if (!allocationsForResidency) {
this->handleResidency();
}
auto currentBase = this->ringCommandStream.getGraphicsAllocation()->getGpuAddress();
auto offset = ptrDiff(gpuAddress, currentBase);
@@ -151,21 +153,33 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) {
if (osContextLinux->getDeviceBitfield().test(drmIterator)) {
uint32_t errorCode = bb->exec(static_cast<uint32_t>(size),
offset,
execFlags,
false,
&this->osContext,
drmIterator,
drmContextIds[drmContextId],
nullptr,
0,
&execObject,
completionFenceGpuAddress,
completionValue);
auto size = allocationsForResidency ? allocationsForResidency->size() : 0u;
for (uint32_t i = 0; i < size; i++) {
auto drmAlloc = static_cast<DrmAllocation *>((*allocationsForResidency)[i]);
drmAlloc->makeBOsResident(&this->osContext, drmIterator, &this->residency, false, false);
}
auto requiredSize = this->residency.size() + 1;
if (requiredSize > this->execObjectsStorage.size()) {
this->execObjectsStorage.resize(requiredSize);
}
auto errorCode = bb->exec(static_cast<uint32_t>(size),
offset,
execFlags,
false,
&this->osContext,
drmIterator,
drmContextIds[drmContextId],
this->residency.data(),
this->residency.size(),
this->execObjectsStorage.data(),
completionFenceGpuAddress,
completionValue);
if (errorCode != 0) {
this->dispatchErrorCode = errorCode;
ret = false;
break;
}
drmContextId++;
if (completionFenceGpuAddress) {
@@ -174,6 +188,8 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
}
}
this->residency.clear();
if (this->isCompletionFenceSupported() && ret) {
completionFenceValue++;
}
@@ -188,6 +204,16 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::handleResidency() {
return true;
}
// Stops the ring buffer when a residency container is supplied and the DRM memory
// operations handler reports new resources since the last ring submit.
// With a null allocationsForResidency the handler is not queried and nothing happens.
// NOTE(review): presumably the stop forces the next submission to restart the ring with
// the updated residency list - confirm against stopRingBuffer / submit.
template <typename GfxFamily, typename Dispatcher>
void DrmDirectSubmission<GfxFamily, Dispatcher>::handleRingRestartForUllsLightResidency(ResidencyContainer *allocationsForResidency) {
    if (allocationsForResidency == nullptr) {
        return;
    }

    auto *drmMemoryOperations = static_cast<DrmMemoryOperationsHandler *>(this->memoryOperationHandler);
    const bool newResourcesPending = drmMemoryOperations->obtainAndResetNewResourcesSinceLastRingSubmit();
    if (newResourcesPending) {
        this->stopRingBuffer(false);
    }
}
template <typename GfxFamily, typename Dispatcher>
void DrmDirectSubmission<GfxFamily, Dispatcher>::handleStopRingBuffer() {
if (this->disableMonitorFence) {
@@ -209,6 +235,11 @@ void DrmDirectSubmission<GfxFamily, Dispatcher>::handleSwitchRingBuffers(Residen
this->ringBuffers[this->previousRingBuffer].completionFence = this->currentTagData.tagValue;
}
}
if (allocationsForResidency) {
allocationsForResidency->clear();
static_cast<DrmMemoryOperationsHandler *>(this->memoryOperationHandler)->mergeWithResidencyContainer(&this->osContext, *allocationsForResidency);
}
}
template <typename GfxFamily, typename Dispatcher>

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,7 @@ class WddmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {
protected:
bool allocateOsResources() override;
bool submit(uint64_t gpuAddress, size_t size) override;
bool submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationForResidency) override;
bool handleResidency() override;
void handleCompletionFence(uint64_t completionValue, MonitoredFence &fence);

View File

@@ -73,7 +73,7 @@ inline void WddmDirectSubmission<GfxFamily, Dispatcher>::flushMonitorFence() {
Dispatcher::dispatchMonitorFence(this->ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, this->rootDeviceEnvironment, this->partitionedMode, this->dcFlushRequired);
this->dispatchSemaphoreSection(this->currentQueueWorkCount + 1);
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, true);
this->submitCommandBufferToGpu(needStart, startVA, requiredMinimalSize, true, nullptr);
this->currentQueueWorkCount++;
this->updateTagValueImpl(this->currentRingBuffer);
@@ -92,7 +92,7 @@ bool WddmDirectSubmission<GfxFamily, Dispatcher>::allocateOsResources() {
}
template <typename GfxFamily, typename Dispatcher>
bool WddmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size) {
bool WddmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, size_t size, ResidencyContainer *allocationForResidency) {
perfLogResidencyVariadicLog(wddm->getResidencyLogger(), "ULLS Submit to GPU\n");
COMMAND_BUFFER_HEADER *pHeader = reinterpret_cast<COMMAND_BUFFER_HEADER *>(commandBufferHeader.get());
pHeader->RequiresCoherency = false;