Files
compute-runtime/shared/source/os_interface/linux/drm_buffer_object.cpp
Szymon Morek f08b32761e fix: avoid race when evicting resources
Related-To: NEO-13843

DrmMemoryOperationsHandler::mutex is used for residency
handling.
However, this mutex is not being locked during eviction when vm bind
failed.
This is causing races for example when ULLS controller wants to make
resources resident, and user thread wants to evict
them.
Fix is to explicitly obtain lock in problematic path.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
2025-03-07 07:01:41 +01:00

372 lines
16 KiB
C++

/*
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/os_interface/linux/drm_buffer_object.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/task_count_helper.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/os_interface/linux/drm_memory_manager.h"
#include "shared/source/os_interface/linux/drm_memory_operations_handler.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/linux/drm_wrappers.h"
#include "shared/source/os_interface/linux/ioctl_helper.h"
#include "shared/source/os_interface/linux/os_context_linux.h"
#include "shared/source/os_interface/os_context.h"
#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
namespace NEO {
BufferObjectHandleWrapper BufferObjectHandleWrapper::acquireSharedOwnership() {
if (controlBlock == nullptr) {
controlBlock = new ControlBlock{1, 0};
}
std::lock_guard lock{controlBlock->blockMutex};
controlBlock->refCount++;
return BufferObjectHandleWrapper{boHandle, rootDeviceIndex, Ownership::strong, controlBlock};
}
BufferObjectHandleWrapper BufferObjectHandleWrapper::acquireWeakOwnership() {
if (controlBlock == nullptr) {
controlBlock = new ControlBlock{1, 0};
}
std::lock_guard lock{controlBlock->blockMutex};
controlBlock->weakRefCount++;
return BufferObjectHandleWrapper{boHandle, rootDeviceIndex, Ownership::weak, controlBlock};
}
BufferObjectHandleWrapper::~BufferObjectHandleWrapper() {
if (controlBlock == nullptr) {
return;
}
std::unique_lock lock{controlBlock->blockMutex};
if (ownership == Ownership::strong) {
controlBlock->refCount--;
} else {
controlBlock->weakRefCount--;
}
if (controlBlock->refCount == 0 && controlBlock->weakRefCount == 0) {
lock.unlock();
delete controlBlock;
controlBlock = nullptr;
}
}
bool BufferObjectHandleWrapper::canCloseBoHandle() {
if (controlBlock == nullptr) {
return true;
}
std::lock_guard lock{controlBlock->blockMutex};
return controlBlock->refCount == 1;
}
BufferObject::BufferObject(uint32_t rootDeviceIndex, Drm *drm, uint64_t patIndex, int handle, size_t size, size_t maxOsContextCount)
: BufferObject(rootDeviceIndex, drm, patIndex, BufferObjectHandleWrapper{handle, rootDeviceIndex}, size, maxOsContextCount) {}
BufferObject::BufferObject(uint32_t rootDeviceIndex, Drm *drm, uint64_t patIndex, BufferObjectHandleWrapper &&handle, size_t size, size_t maxOsContextCount)
: drm(drm), handle(std::move(handle)), size(size), refCount(1), rootDeviceIndex(rootDeviceIndex) {
auto ioctlHelper = drm->getIoctlHelper();
this->tilingMode = ioctlHelper->getDrmParamValue(DrmParam::tilingNone);
this->lockedAddress = nullptr;
this->patIndex = patIndex;
perContextVmsUsed = drm->isPerContextVMRequired();
requiresExplicitResidency = drm->hasPageFaultSupport();
if (perContextVmsUsed) {
bindInfo.resize(maxOsContextCount);
for (auto &iter : bindInfo) {
iter.fill(false);
}
} else {
bindInfo.resize(1);
bindInfo[0].fill(false);
}
}
uint32_t BufferObject::getRefCount() const {
return this->refCount.load();
}
void BufferObject::setAddress(uint64_t address) {
auto gmmHelper = drm->getRootDeviceEnvironment().getGmmHelper();
this->gpuAddress = gmmHelper->canonize(address);
}
bool BufferObject::close() {
if (!this->handle.canCloseBoHandle()) {
PRINT_DEBUG_STRING(debugManager.flags.PrintBOCreateDestroyResult.get(), stdout, "Skipped closing BO-%d - more shared users!\n", this->handle.getBoHandle());
return true;
}
GemClose close{};
close.handle = this->handle.getBoHandle();
close.userptr = this->userptr;
PRINT_DEBUG_STRING(debugManager.flags.PrintBOCreateDestroyResult.get(), stdout, "Calling gem close on handle: BO-%d\n", this->handle.getBoHandle());
auto ioctlHelper = this->drm->getIoctlHelper();
int ret = ioctlHelper->ioctl(DrmIoctl::gemClose, &close);
if (ret != 0) {
int err = errno;
CREATE_DEBUG_STRING(str, "ioctl(GEM_CLOSE) failed with %d. errno=%d(%s)\n", ret, err, strerror(err));
drm->getRootDeviceEnvironment().executionEnvironment.setErrorDescription(std::string(str.get()));
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, str.get());
DEBUG_BREAK_IF(true);
return false;
}
this->handle.setBoHandle(-1);
return true;
}
int BufferObject::wait(int64_t timeoutNs) {
if (this->drm->isVmBindAvailable()) {
return 0;
}
int ret = this->drm->waitHandle(this->handle.getBoHandle(), -1);
UNRECOVERABLE_IF(ret != 0);
return ret;
}
bool BufferObject::setTiling(uint32_t mode, uint32_t stride) {
if (this->tilingMode == mode) {
return true;
}
GemSetTiling setTiling{};
setTiling.handle = this->handle.getBoHandle();
setTiling.tilingMode = mode;
setTiling.stride = stride;
auto ioctlHelper = this->drm->getIoctlHelper();
if (!ioctlHelper->setGemTiling(&setTiling)) {
return false;
}
this->tilingMode = setTiling.tilingMode;
return setTiling.tilingMode == mode;
}
uint32_t BufferObject::getOsContextId(OsContext *osContext) {
return perContextVmsUsed ? osContext->getContextId() : 0u;
}
void BufferObject::fillExecObject(ExecObject &execObject, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId) {
const auto osContextId = drm->isPerContextVMRequired() ? osContext->getContextId() : 0;
auto ioctlHelper = drm->getIoctlHelper();
ioctlHelper->fillExecObject(execObject, this->handle.getBoHandle(), this->gpuAddress, drmContextId, this->bindInfo[osContextId][vmHandleId], this->isMarkedForCapture());
}
int BufferObject::exec(uint32_t used, size_t startOffset, unsigned int flags, bool requiresCoherency, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId,
BufferObject *const residency[], size_t residencyCount, ExecObject *execObjectsStorage, uint64_t completionGpuAddress, TaskCountType completionValue) {
for (size_t i = 0; i < residencyCount; i++) {
residency[i]->fillExecObject(execObjectsStorage[i], osContext, vmHandleId, drmContextId);
}
this->fillExecObject(execObjectsStorage[residencyCount], osContext, vmHandleId, drmContextId);
auto ioctlHelper = drm->getIoctlHelper();
ExecBuffer execbuf{};
ioctlHelper->fillExecBuffer(execbuf, reinterpret_cast<uintptr_t>(execObjectsStorage),
static_cast<uint32_t>(residencyCount + 1u), static_cast<uint32_t>(startOffset),
alignUp(used, 8), flags, drmContextId);
if (debugManager.flags.PrintExecutionBuffer.get()) {
PRINT_DEBUG_STRING(debugManager.flags.PrintExecutionBuffer.get(), stdout, "Exec called with drmVmId = %u\n",
static_cast<const OsContextLinux *>(osContext)->getDrmVmIds().size() ? static_cast<const OsContextLinux *>(osContext)->getDrmVmIds()[vmHandleId] : 0);
printExecutionBuffer(execbuf, residencyCount, execObjectsStorage, residency);
}
int ret = ioctlHelper->execBuffer(&execbuf, completionGpuAddress, completionValue);
if (ret != 0) {
int err = this->drm->getErrno();
if (err == EOPNOTSUPP) {
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "ioctl(I915_GEM_EXECBUFFER2) failed with %d. errno=%d(%s)\n", ret, err, strerror(err));
return err;
}
evictUnusedAllocations(false, true);
ret = ioctlHelper->execBuffer(&execbuf, completionGpuAddress, completionValue);
}
if (ret != 0) {
const auto status = evictUnusedAllocations(true, true);
if (status == MemoryOperationsStatus::gpuHangDetectedDuringOperation) {
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "Error! GPU hang detected in BufferObject::exec(). Returning %d\n", gpuHangDetected);
return gpuHangDetected;
}
ret = ioctlHelper->execBuffer(&execbuf, completionGpuAddress, completionValue);
}
if (ret == 0) {
return 0;
}
int err = this->drm->getErrno();
PRINT_DEBUG_STRING(debugManager.flags.PrintDebugMessages.get(), stderr, "ioctl(I915_GEM_EXECBUFFER2) failed with %d. errno=%d(%s)\n", ret, err, strerror(err));
return err;
}
MemoryOperationsStatus BufferObject::evictUnusedAllocations(bool waitForCompletion, bool isLockNeeded) {
return static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->evictUnusedAllocations(waitForCompletion, isLockNeeded);
}
void BufferObject::printBOBindingResult(OsContext *osContext, uint32_t vmHandleId, bool bind, int retVal) {
if (retVal == 0) {
if (bind) {
PRINT_DEBUG_STRING(debugManager.flags.PrintBOBindingResult.get(), stdout, "bind BO-%d to VM %u, drmVmId = %u, range: %llx - %llx, size: %lld, result: %d\n",
this->handle.getBoHandle(), vmHandleId, static_cast<const OsContextLinux *>(osContext)->getDrmVmIds().size() ? static_cast<const OsContextLinux *>(osContext)->getDrmVmIds()[vmHandleId] : 0, this->gpuAddress, ptrOffset(this->gpuAddress, this->size), this->size, retVal);
} else {
PRINT_DEBUG_STRING(debugManager.flags.PrintBOBindingResult.get(), stdout, "unbind BO-%d from VM %u, drmVmId = %u, range: %llx - %llx, size: %lld, result: %d\n",
this->handle.getBoHandle(), vmHandleId, static_cast<const OsContextLinux *>(osContext)->getDrmVmIds().size() ? static_cast<const OsContextLinux *>(osContext)->getDrmVmIds()[vmHandleId] : 0, this->gpuAddress, ptrOffset(this->gpuAddress, this->size), this->size, retVal);
}
} else {
auto err = this->drm->getErrno();
if (bind) {
PRINT_DEBUG_STRING(debugManager.flags.PrintBOBindingResult.get(), stderr, "bind BO-%d to VM %u, drmVmId = %u, range: %llx - %llx, size: %lld, result: %d, errno: %d(%s)\n",
this->handle.getBoHandle(), vmHandleId, static_cast<const OsContextLinux *>(osContext)->getDrmVmIds().size() ? static_cast<const OsContextLinux *>(osContext)->getDrmVmIds()[vmHandleId] : 0, this->gpuAddress, ptrOffset(this->gpuAddress, this->size), this->size, retVal, err, strerror(err));
} else {
PRINT_DEBUG_STRING(debugManager.flags.PrintBOBindingResult.get(), stderr, "unbind BO-%d from VM %u, drmVmId = %u, range: %llx - %llx, size: %lld, result: %d, errno: %d(%s)\n",
this->handle.getBoHandle(), vmHandleId, static_cast<const OsContextLinux *>(osContext)->getDrmVmIds().size() ? static_cast<const OsContextLinux *>(osContext)->getDrmVmIds()[vmHandleId] : 0, this->gpuAddress, ptrOffset(this->gpuAddress, this->size), this->size, retVal, err, strerror(err));
}
}
}
int BufferObject::bind(OsContext *osContext, uint32_t vmHandleId, const bool forcePagingFence) {
int retVal = 0;
auto contextId = getOsContextId(osContext);
if (!this->bindInfo[contextId][vmHandleId]) {
retVal = this->drm->bindBufferObject(osContext, vmHandleId, this, forcePagingFence);
if (debugManager.flags.PrintBOBindingResult.get()) {
printBOBindingResult(osContext, vmHandleId, true, retVal);
}
if (!retVal) {
this->bindInfo[contextId][vmHandleId] = true;
}
}
return retVal;
}
int BufferObject::unbind(OsContext *osContext, uint32_t vmHandleId) {
int retVal = 0;
auto contextId = getOsContextId(osContext);
if (this->bindInfo[contextId][vmHandleId]) {
retVal = this->drm->unbindBufferObject(osContext, vmHandleId, this);
if (debugManager.flags.PrintBOBindingResult.get()) {
printBOBindingResult(osContext, vmHandleId, false, retVal);
}
if (!retVal) {
this->bindInfo[contextId][vmHandleId] = false;
}
}
return retVal;
}
void BufferObject::printExecutionBuffer(ExecBuffer &execbuf, const size_t &residencyCount, ExecObject *execObjectsStorage, BufferObject *const residency[]) {
auto ioctlHelper = drm->getIoctlHelper();
std::stringstream logger;
ioctlHelper->logExecBuffer(execbuf, logger);
size_t i;
for (i = 0; i < residencyCount; i++) {
ioctlHelper->logExecObject(execObjectsStorage[i], logger, residency[i]->peekSize());
}
logger << "Command ";
ioctlHelper->logExecObject(execObjectsStorage[i], logger, this->peekSize());
printf("%s\n", logger.str().c_str());
}
int bindBOsWithinContext(BufferObject *const boToPin[], size_t numberOfBos, OsContext *osContext, uint32_t vmHandleId, const bool forcePagingFence) {
auto retVal = 0;
for (auto drmIterator = 0u; drmIterator < osContext->getDeviceBitfield().size(); drmIterator++) {
if (osContext->getDeviceBitfield().test(drmIterator)) {
for (size_t i = 0; i < numberOfBos; i++) {
retVal |= boToPin[i]->bind(osContext, drmIterator, forcePagingFence);
}
}
}
return retVal;
}
int BufferObject::pin(BufferObject *const boToPin[], size_t numberOfBos, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId) {
auto retVal = 0;
if (this->drm->isVmBindAvailable()) {
auto lock = static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->lockHandlerIfUsed();
retVal = bindBOsWithinContext(boToPin, numberOfBos, osContext, vmHandleId, false);
} else {
StackVec<ExecObject, maxFragmentsCount + 1> execObject(numberOfBos + 1);
retVal = this->exec(4u, 0u, 0u, false, osContext, vmHandleId, drmContextId, boToPin, numberOfBos, &execObject[0], 0, 0);
}
return retVal;
}
int BufferObject::validateHostPtr(BufferObject *const boToPin[], size_t numberOfBos, OsContext *osContext, uint32_t vmHandleId, uint32_t drmContextId) {
auto retVal = 0;
if (this->drm->isVmBindAvailable()) {
auto lock = static_cast<DrmMemoryOperationsHandler *>(this->drm->getRootDeviceEnvironment().memoryOperationsInterface.get())->lockHandlerIfUsed();
for (size_t i = 0; i < numberOfBos; i++) {
retVal = boToPin[i]->bind(osContext, vmHandleId, false);
if (retVal) {
break;
}
}
} else {
if (this->drm->getRootDeviceEnvironment().executionEnvironment.memoryManager.get()) {
const auto &engines = this->drm->getRootDeviceEnvironment().executionEnvironment.memoryManager->getRegisteredEngines(osContext->getRootDeviceIndex());
for (const auto &engine : engines) {
if (engine.osContext->isDirectSubmissionLightActive()) {
auto lock = engine.commandStreamReceiver->obtainUniqueOwnership();
engine.commandStreamReceiver->stopDirectSubmission(false);
}
}
}
StackVec<ExecObject, maxFragmentsCount + 1> execObject(numberOfBos + 1);
retVal = this->exec(4u, 0u, 0u, false, osContext, vmHandleId, drmContextId, boToPin, numberOfBos, &execObject[0], 0, 0);
}
return retVal;
}
void BufferObject::addBindExtHandle(uint32_t handle) {
bindExtHandles.push_back(handle);
}
} // namespace NEO