Files
compute-runtime/shared/source/command_stream/tbx_command_stream_receiver_hw.inl
Mateusz Hoppe b4e4fcf786 feature: add experimental extension to verify memory in aub mode
Related-To: NEO-14153, NEO-17038

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
2025-12-16 13:57:32 +01:00

552 lines
23 KiB
C++

/*
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/aub/aub_center.h"
#include "shared/source/aub/aub_helper.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.inl"
#include "shared/source/command_stream/aub_command_stream_receiver.h"
#include "shared/source/command_stream/command_stream_receiver_with_aub_dump.h"
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/command_stream/task_count_helper.h"
#include "shared/source/command_stream/tbx_command_stream_receiver_hw.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/kmd_notify_properties.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocation_type.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/utilities/shared_pool_allocation.h"
#include <cstdarg>
#include <cstdint>
#include <cstring>
#include <limits>
#include <type_traits>
namespace NEO {
// Returns the page fault manager obtained from this receiver's memory manager.
// The result is compared against nullptr by callers in this file, so it may be null.
template <typename GfxFamily>
CpuPageFaultManager *TbxCommandStreamReceiverHw<GfxFamily>::getTbxPageFaultManager() {
    auto *memoryManager = this->getMemoryManager();
    return memoryManager->getPageFaultManager();
}
// Constructs a TBX command stream receiver for the given root device.
// In order: creates the physical address allocator, initializes the AUB center (empty file name)
// and takes its AUB manager, creates the PPGTT and GGTT page-table objects, and installs
// downloadAllocationTbx() as the download routine used by downloadAllocation().
template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::TbxCommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment,
                                                                  uint32_t rootDeviceIndex,
                                                                  const DeviceBitfield deviceBitfield)
    : BaseClass(executionEnvironment, rootDeviceIndex, deviceBitfield) {
    forceSkipResourceCleanupRequired = true;

    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];

    auto releaseHelper = rootDeviceEnvironment.getReleaseHelper();
    physicalAddressAllocator.reset(this->createPhysicalAddressAllocator(&this->peekHwInfo(), releaseHelper));

    rootDeviceEnvironment.initAubCenter(this->localMemoryEnabled, "", this->getType());
    auto *createdAubCenter = rootDeviceEnvironment.aubCenter.get();
    // The receiver cannot work without an AUB center.
    UNRECOVERABLE_IF(nullptr == createdAubCenter);
    aubManager = createdAubCenter->getAubManager();

    // PPGTT uses a PML4 on 64-bit builds and a PDPE otherwise; GGTT is always a PDPE.
    using PpgttType = std::conditional_t<is64bit, PML4, PDPE>;
    ppgtt = std::make_unique<PpgttType>(physicalAddressAllocator.get());
    ggtt = std::make_unique<PDPE>(physicalAddressAllocator.get());

    // Captures `this`; the destructor resets the callback.
    this->downloadAllocationImpl = [this](GraphicsAllocation &allocation) {
        this->downloadAllocationTbx(allocation);
    };
}
// Resets the download callback installed by the constructor, which captured `this`.
template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::~TbxCommandStreamReceiverHw() {
    this->downloadAllocationImpl = nullptr;
}
// Tells whether an allocation should be tracked by the TBX page fault manager.
// Returns true only when the allocation has a driver-allocated CPU pointer, the TBX page fault
// manager is enabled through debugManager, a page fault manager exists, and the allocation type
// is AllocationType::bufferHostMemory.
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocation *gfxAlloc) {
    // indicates host memory not managed by the driver
    if (gfxAlloc->getDriverAllocatedCpuPtr() == nullptr) {
        return false;
    }
    if (!debugManager.isTbxPageFaultManagerEnabled()) {
        return false;
    }
    if (this->getTbxPageFaultManager() == nullptr) {
        return false;
    }
    return gfxAlloc->getAllocationType() == AllocationType::bufferHostMemory;
}
// Registers [cpuAddress, cpuAddress + size) of a faultable allocation with the page fault manager.
// Does nothing when isAllocTbxFaultable() rejects the allocation. The default bank is used when
// the allocation's memory bank is 0 or its storage info requests cloning of page tables.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::registerAllocationWithTbxFaultMngrIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (isAllocTbxFaultable(gfxAlloc)) {
        auto memoryBank = this->getMemoryBank(gfxAlloc);
        const bool useDefaultBank = (memoryBank == 0u) || gfxAlloc->storageInfo.cloningOfPageTables;
        if (useDefaultBank) {
            memoryBank = GraphicsAllocation::defaultBank;
        }
        getTbxPageFaultManager()->insertAllocation(this, gfxAlloc, memoryBank, cpuAddress, size);
    }
}
// Asks the page fault manager to allow CPU access to the given range.
// No-op for allocations that isAllocTbxFaultable() rejects.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::allowCPUMemoryAccessIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (isAllocTbxFaultable(gfxAlloc)) {
        getTbxPageFaultManager()->allowCPUMemoryAccess(cpuAddress, size);
    }
}
// Asks the page fault manager to protect the given range against CPU access.
// No-op for allocations that isAllocTbxFaultable() rejects.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::protectCPUMemoryAccessIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (isAllocTbxFaultable(gfxAlloc)) {
        getTbxPageFaultManager()->protectCPUMemoryAccess(cpuAddress, size);
    }
}
// Asks the page fault manager to protect the given range against CPU writes.
// No-op for allocations that isAllocTbxFaultable() rejects.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::protectCPUMemoryFromWritesIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (isAllocTbxFaultable(gfxAlloc)) {
        getTbxPageFaultManager()->protectCpuMemoryFromWrites(cpuAddress, size);
    }
}
// One-time engine setup: the first call marks the engine as initialized and, when a hardware
// context controller is present, creates its hardware contexts on the AUB manager and
// initializes them. Subsequent calls do nothing.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::initializeEngine() {
    if (isEngineInitialized) {
        return;
    }
    isEngineInitialized = true;

    if (hardwareContextController) {
        hardwareContextController->createHardwareContexts(*aubManager);
        hardwareContextController->initialize();
    }
}
// Factory for TBX command stream receivers.
// Without AUB dump a plain TbxCommandStreamReceiverHw is returned. With AUB dump the AUB center
// is initialized in tbxWithAub mode, a CommandStreamReceiverWithAUBDump wrapper is created, a
// sub-capture manager is attached when sub-capture is enabled, and the AUB manager's file is
// opened if it is not open yet.
// The capture file name comes from baseName unless the AUBDumpCaptureFileName debug flag
// differs from "unk". The caller receives a raw pointer allocated with `new`.
template <typename GfxFamily>
CommandStreamReceiver *TbxCommandStreamReceiverHw<GfxFamily>::create(const std::string &baseName,
                                                                     bool withAubDump,
                                                                     ExecutionEnvironment &executionEnvironment,
                                                                     uint32_t rootDeviceIndex,
                                                                     const DeviceBitfield deviceBitfield) {
    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();

    if (!withAubDump) {
        return new TbxCommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield);
    }

    auto localMemoryEnabled = gfxCoreHelper.getEnableLocalMemory(hwInfo);
    auto fullName = AUBCommandStreamReceiver::createFullFilePath(hwInfo, baseName, rootDeviceIndex);
    // A capture file name other than "unk" in the debug flag overrides the generated path.
    if (debugManager.flags.AUBDumpCaptureFileName.get() != "unk") {
        fullName.assign(debugManager.flags.AUBDumpCaptureFileName.get());
    }
    rootDeviceEnvironment.initAubCenter(localMemoryEnabled, fullName, CommandStreamReceiverType::tbxWithAub);

    TbxCommandStreamReceiverHw<GfxFamily> *csr =
        new CommandStreamReceiverWithAUBDump<TbxCommandStreamReceiverHw<GfxFamily>>(baseName, executionEnvironment, rootDeviceIndex, deviceBitfield);

    auto *aubCenter = rootDeviceEnvironment.aubCenter.get();
    UNRECOVERABLE_IF(nullptr == aubCenter);
    auto *subCaptureCommon = aubCenter->getSubCaptureCommon();
    UNRECOVERABLE_IF(nullptr == subCaptureCommon);

    const bool subCaptureEnabled = subCaptureCommon->subCaptureMode > AubSubCaptureManager::SubCaptureMode::off;
    if (subCaptureEnabled) {
        csr->subCaptureManager = std::make_unique<AubSubCaptureManager>(fullName, *subCaptureCommon, ApiSpecificConfig::getRegistryPath());
    }

    if (csr->aubManager && !csr->aubManager->isOpen()) {
        // With sub-capture the file name is provided by the sub-capture manager.
        const std::string captureFileName = csr->subCaptureManager ? csr->subCaptureManager->getSubCaptureFileName("") : fullName;
        csr->aubManager->open(captureFileName);
        UNRECOVERABLE_IF(!csr->aubManager->isOpen());
    }

    return csr;
}
// Submits a batch buffer through the TBX hardware context.
// In order: calls aubManager->pause(false) when both a sub-capture manager and an AUB manager exist,
// initializes the engine on first use, appends the command buffer to allocationsForResidency and
// stamps it with taskCount + 1, writes all resident allocations via processResidency(), and then
// hands the buffer to submitBatchBufferTbx().
// With a sub-capture manager the call additionally polls for completion and disables sub-capture.
// Always returns SubmissionStatus::success.
template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
if (subCaptureManager) {
if (aubManager) {
aubManager->pause(false);
}
}
initializeEngine();
// Write our batch buffer
auto pBatchBuffer = ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset);
auto batchBufferGpuAddress = ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset);
// usedSize is treated as the end offset; the submitted size is the distance from startOffset.
auto currentOffset = batchBuffer.usedSize;
DEBUG_BREAK_IF(currentOffset < batchBuffer.startOffset);
auto sizeBatchBuffer = currentOffset - batchBuffer.startOffset;
auto overrideRingHead = false;
// Task count this submission is stamped with.
auto submissionTaskCount = this->taskCount + 1;
// The command buffer itself is made resident together with the caller-provided allocations.
allocationsForResidency.push_back(batchBuffer.commandBufferAllocation);
batchBuffer.commandBufferAllocation->updateResidencyTaskCount(submissionTaskCount, this->osContext->getContextId());
batchBuffer.commandBufferAllocation->updateTaskCount(submissionTaskCount, osContext->getContextId());
// Write allocations for residency
processResidency(allocationsForResidency, 0u);
if (subCaptureManager) {
if (aubManager) {
auto status = subCaptureManager->getSubCaptureStatus();
// Sub-capture became active with this enqueue: request a ring head override for the submission.
if (!status.wasActiveInPreviousEnqueue && status.isActive) {
overrideRingHead = true;
}
// Sub-capture inactive both before and now: pause the AUB manager again.
if (!status.wasActiveInPreviousEnqueue && !status.isActive) {
aubManager->pause(true);
}
}
}
submitBatchBufferTbx(
batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer,
this->getMemoryBank(batchBuffer.commandBufferAllocation),
this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation),
overrideRingHead);
if (subCaptureManager) {
// Wait for the submission before sub-capture is disabled.
pollForCompletion();
subCaptureManager->disableSubCapture();
}
return SubmissionStatus::success;
}
// Forwards a non-empty batch buffer to the hardware context controller.
// Nothing is submitted when there is no controller or batchBufferSize is 0.
// The page size passed on is always MemoryConstants::pageSize64k; entryBits is not used here.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
    if (!hardwareContextController || !batchBufferSize) {
        return;
    }
    hardwareContextController->submit(batchBufferGpuAddress, batchBuffer, batchBufferSize, memoryBank, MemoryConstants::pageSize64k, overrideRingHead);
}
// Delegates to the hardware context controller's pollForCompletion() when a controller exists.
// skipTaskCountCheck is not used by this implementation.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion(bool skipTaskCountCheck) {
    if (!hardwareContextController) {
        return;
    }
    hardwareContextController->pollForCompletion();
}
// Low-level write entry point. In this implementation it only enforces that the engine has
// already been initialized; none of the arguments are consumed.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(uint64_t gpuAddress, void *cpuAddress, size_t size, uint32_t memoryBank, uint64_t entryBits) {
    UNRECOVERABLE_IF(!this->isEngineInitialized);
}
// Writes an allocation (or one chunk of it) to the simulator.
//
// gfxAllocation    - allocation to write
// isChunkCopy      - when true only chunkSize bytes starting at gpuVaChunkOffset are written
// gpuVaChunkOffset - offset of the chunk, applied to both the GPU and the CPU address
// chunkSize        - size of the chunk in bytes
//
// Returns false when getParametersForMemory() fails or the allocation is not TBX-writable,
// true otherwise.
//
// For allocations tracked by the TBX page fault manager the full allocation range is registered,
// write-protected for the duration of the write and access-protected afterwards.
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(GraphicsAllocation &gfxAllocation, bool isChunkCopy, uint64_t gpuVaChunkOffset, size_t chunkSize) {
    uint64_t gpuAddress = 0;
    void *cpuAddress = nullptr;
    size_t size = 0;
    if (!this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size)) {
        return false;
    }

    auto allocType = gfxAllocation.getAllocationType();
    this->registerAllocationWithTbxFaultMngrIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    if (!this->isTbxWritable(gfxAllocation)) {
        return false;
    }

    this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);
    initializeEngine();

    if (aubManager) {
        this->writeMemoryWithAubManager(gfxAllocation, isChunkCopy, gpuVaChunkOffset, chunkSize);
    } else {
        // The chunk range lives in its own locals so that cpuAddress/size keep describing the
        // whole allocation. The protection call below must cover the same range that was
        // registered and write-protected above, exactly as on the aubManager path.
        uint64_t writeGpuAddress = gpuAddress;
        void *writeCpuAddress = cpuAddress;
        size_t writeSize = size;
        if (isChunkCopy) {
            writeGpuAddress += gpuVaChunkOffset;
            writeCpuAddress = ptrOffset(cpuAddress, static_cast<uintptr_t>(gpuVaChunkOffset));
            writeSize = chunkSize;
        }
        writeMemory(writeGpuAddress, writeCpuAddress, writeSize, this->getMemoryBank(&gfxAllocation), this->getPPGTTAdditionalBits(&gfxAllocation));
    }

    // Allocation types that are written only once lose their writable flag after the first write.
    if (AubHelper::isOneTimeAubWritableAllocationType(allocType)) {
        this->setTbxWritable(false, gfxAllocation);
    }

    this->protectCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);
    return true;
}
// Forwards an MMIO register write to the hardware context controller, if one exists.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMMIO(uint32_t offset, uint32_t value) {
    if (!hardwareContextController) {
        return;
    }
    hardwareContextController->writeMMIO(offset, value);
}
// Compares simulator memory at gfxAddress with the reference data at srcAddress.
//
// gfxAddress       - GPU address to read from, passed as a pointer
// srcAddress       - reference bytes on the host
// length           - number of bytes to compare
// compareOperation - aub_stream compare operation; CompareEqual expects equality, any other
//                    value expects the memory to differ
//
// With a hardware context controller the memory is read back and compared on the host, the
// expectation is also forwarded to the controller, and the return value tells whether the
// host-side comparison matched the expectation. Without a controller the base class decides.
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::expectMemory(const void *gfxAddress, const void *srcAddress,
                                                         size_t length, uint32_t compareOperation) {
    if (!hardwareContextController) {
        return BaseClass::expectMemory(gfxAddress, srcAddress, length, compareOperation);
    }

    // Convert once with a named cast so the read-back and the expectation use the same value.
    const auto gpuAddress = reinterpret_cast<uint64_t>(gfxAddress);

    auto readMemory = std::make_unique<char[]>(length);
    // note: memory bank should not matter assuming that we call expect on the memory that was previously allocated
    hardwareContextController->readMemory(gpuAddress, readMemory.get(), length, this->getMemoryBankForGtt(), MemoryConstants::pageSize64k);

    const bool isMemoryEqual = (memcmp(readMemory.get(), srcAddress, length) == 0);
    const bool isEqualMemoryExpected = (compareOperation == aub_stream::CompareOperationValues::CompareEqual);

    hardwareContextController->expectMemory(gpuAddress, srcAddress, length, compareOperation);
    return (isMemoryEqual == isEqualMemoryExpected);
}
// Writes the slice of a pooled allocation described by sharedPoolAllocation.
// When initFullPageTables is set and the allocation is still TBX-writable for all banks, the
// whole allocation is written first. The slice itself is then written as a chunk with the
// writable flag forced on, and the flag is cleared again afterwards.
// The whole sequence runs under the pool's mutex when the pool provides one.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writePooledMemory(SharedPoolAllocation &sharedPoolAllocation, bool initFullPageTables) {
    auto &gfxAllocation = *sharedPoolAllocation.getGraphicsAllocation();

    // Stays empty (and unlocked) when the pool has no mutex.
    std::unique_lock<std::mutex> poolLock;
    if (auto mutex = sharedPoolAllocation.getMutex(); mutex) {
        poolLock = std::unique_lock<std::mutex>(*mutex);
    }

    constexpr uint32_t allBanks = std::numeric_limits<uint32_t>::max();

    if (initFullPageTables && gfxAllocation.isTbxWritable(allBanks)) {
        writeMemory(gfxAllocation, false, 0, 0);
    }

    gfxAllocation.setTbxWritable(true, allBanks);
    [[maybe_unused]] const auto writeMemoryStatus = writeMemory(gfxAllocation, true, sharedPoolAllocation.getOffset(), sharedPoolAllocation.getSize());
    DEBUG_BREAK_IF(!writeMemoryStatus);
    gfxAllocation.setTbxWritable(false, allBanks);
}
// Flushes pending work, waits for the tag to catch up and downloads queued allocations.
// After flushing batched submissions (and a tag update when latestFlushedTaskCount is still below
// taskCountToWait), the tag allocation is downloaded repeatedly until the tag value of every active
// partition reaches latestFlushedTaskCount.
// Unless skipAllocationsDownload is set, every allocation in allocationsForDownload is then
// downloaded under the unique-ownership lock and the container is cleared.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::flushSubmissionsAndDownloadAllocations(TaskCountType taskCountToWait, bool skipAllocationsDownload) {
this->flushBatchedSubmissions();
if (this->latestFlushedTaskCount < taskCountToWait) {
this->flushTagUpdate();
}
// The tag is re-read through a volatile pointer on every loop iteration.
volatile TagAddressType *pollAddress = this->getTagAddress();
for (uint32_t i = 0; i < this->activePartitions; i++) {
// Each pass downloads the tag allocation again before the tag value is re-checked.
while (*pollAddress < this->latestFlushedTaskCount) {
this->downloadAllocation(*this->getTagAllocation());
}
// Advance to the next partition's tag.
pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
}
if (skipAllocationsDownload) {
return;
}
auto lockCSR = this->obtainUniqueOwnership();
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
this->downloadAllocation(*graphicsAllocation);
}
this->allocationsForDownload.clear();
}
// Flushes submissions and downloads queued allocations (downloads are never skipped here),
// then lets the base class perform the actual wait and returns its status.
template <typename GfxFamily>
WaitStatus TbxCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) {
    constexpr bool skipAllocationsDownload = false;
    flushSubmissionsAndDownloadAllocations(taskCountToWait, skipAllocationsDownload);

    return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
}
// Flushes submissions and downloads queued allocations unless params.skipTbxDownload is set,
// then lets the base class perform the actual wait and returns its status.
template <typename GfxFamily>
WaitStatus TbxCommandStreamReceiverHw<GfxFamily>::waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) {
    const bool skipAllocationsDownload = params.skipTbxDownload;
    flushSubmissionsAndDownloadAllocations(taskCountToWait, skipAllocationsDownload);

    return BaseClass::waitForCompletionWithTimeout(params, taskCountToWait);
}
// Runs the base-class eviction while holding the unique-ownership lock of this receiver.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::processEviction() {
    auto ownershipLock = this->obtainUniqueOwnership();
    BaseClass::processEviction();
}
// Makes an allocation non-resident. If it was resident in this receiver's OS context it is
// first queued in allocationsForDownload. Runs under the unique-ownership lock.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::makeNonResident(GraphicsAllocation &gfxAllocation) {
    auto ownershipLock = this->obtainUniqueOwnership();

    const bool wasResident = gfxAllocation.isResident(osContext->getContextId());
    if (wasResident) {
        this->allocationsForDownload.insert(&gfxAllocation);
    }

    BaseClass::makeNonResident(gfxAllocation);
}
// Writes every allocation from the residency container to the simulator and stamps it with the
// upcoming residency task count (taskCount + 1).
// While dumpTbxNonWritable is set, each allocation is forced TBX-writable before being written;
// the flag is cleared at the end of the call. If the root device environment has a memory
// operations interface, its processFlushResidency() is invoked afterwards.
// handleId is not used here. Always returns SubmissionStatus::success.
template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::processResidency(ResidencyContainer &allocationsForResidency, uint32_t handleId) {
    for (GraphicsAllocation *allocation : allocationsForResidency) {
        if (dumpTbxNonWritable) {
            this->setTbxWritable(true, *allocation);
        }

        const bool written = writeMemory(*allocation);
        if (!written) {
            // A failed write is only expected for empty or non-writable allocations.
            DEBUG_BREAK_IF((allocation->getUnderlyingBufferSize() != 0) &&
                           this->isTbxWritable(*allocation));
        }

        allocation->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());
    }

    auto &memoryOperations = this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]->memoryOperationsInterface;
    if (memoryOperations) {
        memoryOperations->processFlushResidency(this);
    }

    dumpTbxNonWritable = false;
    return SubmissionStatus::success;
}
// Reads an allocation back from the simulator into its CPU storage.
// CPU access to the range is allowed first (for page-fault-tracked allocations). When a hardware
// context controller exists the memory is read and the range is write-protected again.
// When a host function streamer exists, downloads of the tag buffer are serialized with
// tagAllocationDownloadMutex for the duration of the call.
// NOTE(review): the result of getParametersForMemory() is ignored here, unlike in writeMemory();
// on failure the zero/null defaults are passed on - confirm this is intended.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocationTbx(GraphicsAllocation &gfxAllocation) {
    uint64_t gpuAddress = 0;
    void *cpuAddress = nullptr;
    size_t size = 0;

    const bool hostFunctionsActive = (this->hostFunctionStreamer.get() != nullptr);

    // Deferred lock: taken only for tag buffer downloads while host functions are active.
    std::unique_lock<CommandStreamReceiver::MutexType> tagDownloadLock(this->tagAllocationDownloadMutex, std::defer_lock);
    if (hostFunctionsActive && gfxAllocation.getAllocationType() == AllocationType::tagBuffer) {
        tagDownloadLock.lock();
    }

    this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size);
    this->allowCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    if (!hardwareContextController) {
        return;
    }

    hardwareContextController->readMemory(gpuAddress, cpuAddress, size,
                                          this->getMemoryBank(&gfxAllocation), gfxAllocation.getUsedPageSize());
    this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);
}
// Waits for the tag to reach the requested task count and downloads queued allocations.
//
// blockingWait - when false, polling of a partition gives up (and the function returns without
//                downloading the queued allocations) once more than
//                getNonBlockingDownloadTimeoutMs() milliseconds have elapsed
// taskCount    - task count to wait for; clamped to latestFlushedTaskCount for polling
//
// After polling, every allocation in allocationsForDownload is downloaded under the
// unique-ownership lock. Allocations whose task count for this context is greater than
// taskCount stay queued because another download will be needed; all others are removed.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations(bool blockingWait, TaskCountType taskCount) {
    volatile TagAddressType *pollAddress = this->getTagAddress();
    const auto waitTaskCount = std::min(taskCount, this->latestFlushedTaskCount.load());

    for (uint32_t i = 0; i < this->activePartitions; i++) {
        if (*pollAddress < waitTaskCount) {
            this->downloadAllocation(*this->getTagAllocation());

            const auto startTime = std::chrono::high_resolution_clock::now();
            while (*pollAddress < waitTaskCount) {
                if (!blockingWait) {
                    // Additional delay to reach PC in case of Event wait
                    const uint64_t elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime).count();
                    if (elapsedMs > getNonBlockingDownloadTimeoutMs()) {
                        return;
                    }
                }
                this->downloadAllocation(*this->getTagAllocation());
            }
        }
        // Advance to the next partition's tag.
        pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
    }

    auto lockCSR = this->obtainUniqueOwnership();
    const auto contextId = this->osContext->getContextId();

    // Erase finished entries in place instead of rebuilding the set from a temporary vector.
    for (auto it = this->allocationsForDownload.begin(); it != this->allocationsForDownload.end();) {
        GraphicsAllocation *graphicsAllocation = *it;
        this->downloadAllocation(*graphicsAllocation);

        // Used again while waiting for completion. Another download will be needed.
        if (graphicsAllocation->getTaskCount(contextId) > taskCount) {
            ++it;
        } else {
            it = this->allocationsForDownload.erase(it);
        }
    }
}
// Returns the constant 0x100 used as mask-and-value for poll-for-completion.
template <typename GfxFamily>
uint32_t TbxCommandStreamReceiverHw<GfxFamily>::getMaskAndValueForPollForCompletion() const {
    constexpr uint32_t maskAndValue = 0x100;
    return maskAndValue;
}
// Returns the "poll not equal" setting for poll-for-completion; always false for this receiver.
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::getpollNotEqualValueForPollForCompletion() const {
    constexpr bool pollNotEqual = false;
    return pollNotEqual;
}
// Queries the sub-capture manager for the given kernel and returns its status.
// Without a sub-capture manager an all-false status is returned. When sub-capture has just
// become active (active now, inactive in the previous enqueue), dumpTbxNonWritable is set so
// that the next processResidency() forces allocations writable.
template <typename GfxFamily>
AubSubCaptureStatus TbxCommandStreamReceiverHw<GfxFamily>::checkAndActivateAubSubCapture(const std::string &kernelName) {
    if (!subCaptureManager) {
        return {false, false};
    }

    auto captureStatus = subCaptureManager->checkAndActivateSubCapture(kernelName);

    const bool justActivated = captureStatus.isActive && !captureStatus.wasActiveInPreviousEnqueue;
    if (justActivated) {
        dumpTbxNonWritable = true;
    }
    return captureStatus;
}
// Dumps the surface of an allocation through the hardware context controller.
// Skipped when there is no controller, or when the allocation's bcsDumpOnly flag does not match
// whether this receiver runs on a BCS engine. If either the AUBDumpAllocsOnEnqueueReadOnly or
// the AUBDumpAllocsOnEnqueueSVMMemcpyOnly debug flag is set, only allocations marked dumpable
// are dumped and the mark is cleared. The controller polls for completion before dumping.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::dumpAllocation(GraphicsAllocation &gfxAllocation) {
    if (!hardwareContextController) {
        return;
    }

    const bool isBcsCsr = EngineHelpers::isBcs(this->osContext->getEngineType());
    if (gfxAllocation.getAubInfo().bcsDumpOnly != isBcsCsr) {
        return;
    }

    const bool dumpOnlyMarkedAllocations = debugManager.flags.AUBDumpAllocsOnEnqueueReadOnly.get() || debugManager.flags.AUBDumpAllocsOnEnqueueSVMMemcpyOnly.get();
    if (dumpOnlyMarkedAllocations) {
        if (!gfxAllocation.isAllocDumpable()) {
            return;
        }
        gfxAllocation.setAllocDumpable(false, isBcsCsr);
    }

    auto dumpFormat = AubAllocDump::getDumpFormat(gfxAllocation);
    auto surfaceInfo = std::unique_ptr<aub_stream::SurfaceInfo>(AubAllocDump::getDumpSurfaceInfo<GfxFamily>(gfxAllocation, *this->peekGmmHelper(), dumpFormat));
    if (!surfaceInfo) {
        return;
    }

    hardwareContextController->pollForCompletion();
    hardwareContextController->dumpSurface(*surfaceInfo);
}
// Removes an allocation from the pending-download set and, when a page fault manager exists,
// from that manager as well. Runs under the unique-ownership lock.
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::removeDownloadAllocation(GraphicsAllocation *alloc) {
    auto ownershipLock = this->obtainUniqueOwnership();

    this->allocationsForDownload.erase(alloc);

    if (auto *pageFaultManager = getTbxPageFaultManager(); pageFaultManager != nullptr) {
        pageFaultManager->removeAllocation(alloc);
    }
}
} // namespace NEO