Files
compute-runtime/shared/source/command_stream/tbx_command_stream_receiver_hw.inl
Jack Myers 51c0e80299 feature: extend TBX page fault manager from CPU implementation
In TBX mode, the host could not write to host buffers after access from device
code due to the lack of a migration mechanism post-initial TBX upload.
Migration is unnecessary with real hardware, but required for TBX.

This patch introduces a new page fault manager type that extends the original
CPU fault manager, enabling automatic migration of host buffers in TBX mode.

Refactoring was necessary to avoid diamond inheritance, achieved by using a
template parameter as the base class for OS-specific fault managers.

Related-To: NEO-12268
Signed-off-by: Jack Myers <jack.myers@intel.com>
2024-12-11 09:09:50 +01:00

728 lines
30 KiB
C++

/*
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/aub/aub_center.h"
#include "shared/source/aub/aub_helper.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.inl"
#include "shared/source/aub_mem_dump/page_table_entry_bits.h"
#include "shared/source/command_stream/aub_command_stream_receiver.h"
#include "shared/source/command_stream/command_stream_receiver_with_aub_dump.h"
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/command_stream/tbx_command_stream_receiver_hw.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocation_type.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/os_interface/aub_memory_operations_handler.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/page_fault_manager/tbx_page_fault_manager.h"
#include <cstring>
namespace NEO {
template <typename GfxFamily>
CpuPageFaultManager *TbxCommandStreamReceiverHw<GfxFamily>::getTbxPageFaultManager() {
return this->getMemoryManager()->getPageFaultManager();
}
template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::TbxCommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield)
: BaseClass(executionEnvironment, rootDeviceIndex, deviceBitfield) {
forceSkipResourceCleanupRequired = true;
auto releaseHelper = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getReleaseHelper();
physicalAddressAllocator.reset(this->createPhysicalAddressAllocator(&this->peekHwInfo(), releaseHelper));
executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->initAubCenter(this->localMemoryEnabled, "", this->getType());
auto aubCenter = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->aubCenter.get();
UNRECOVERABLE_IF(nullptr == aubCenter);
aubManager = aubCenter->getAubManager();
ppgtt = std::make_unique<std::conditional<is64bit, PML4, PDPE>::type>(physicalAddressAllocator.get());
ggtt = std::make_unique<PDPE>(physicalAddressAllocator.get());
auto debugDeviceId = debugManager.flags.OverrideAubDeviceId.get();
this->aubDeviceId = debugDeviceId == -1
? this->peekHwInfo().capabilityTable.aubDeviceId
: static_cast<uint32_t>(debugDeviceId);
this->stream = &tbxStream;
this->downloadAllocationImpl = [this](GraphicsAllocation &graphicsAllocation) {
this->downloadAllocationTbx(graphicsAllocation);
};
}
template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::~TbxCommandStreamReceiverHw() {
this->downloadAllocationImpl = nullptr;
if (streamInitialized) {
tbxStream.close();
}
this->freeEngineInfo(gttRemap);
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::allowCPUMemoryAccessIfHostBuffer(AllocationType allocType, void *cpuAddress, size_t size) {
if (allocType == AllocationType::bufferHostMemory) {
auto faultManager = getTbxPageFaultManager();
if (faultManager != nullptr) {
faultManager->allowCPUMemoryAccess(cpuAddress, size);
}
}
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::protectCPUMemoryAccessIfHostBuffer(AllocationType allocType, void *cpuAddress, size_t size) {
if (allocType == AllocationType::bufferHostMemory) {
auto faultManager = getTbxPageFaultManager();
if (faultManager != nullptr) {
faultManager->protectCPUMemoryAccess(cpuAddress, size);
}
}
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::initializeEngine() {
isEngineInitialized = true;
if (hardwareContextController) {
hardwareContextController->initialize();
return;
}
DEBUG_BREAK_IF(this->aubManager);
auto csTraits = this->getCsTraits(osContext->getEngineType());
if (engineInfo.pLRCA) {
return;
}
this->initGlobalMMIO();
this->initEngineMMIO();
this->initAdditionalMMIO();
// Global HW Status Page
{
const size_t sizeHWSP = 0x1000;
const size_t alignHWSP = 0x1000;
engineInfo.pGlobalHWStatusPage = alignedMalloc(sizeHWSP, alignHWSP);
engineInfo.ggttHWSP = gttRemap.map(engineInfo.pGlobalHWStatusPage, sizeHWSP);
auto physHWSP = ggtt->map(engineInfo.ggttHWSP, sizeHWSP, this->getGTTBits(), this->getMemoryBankForGtt());
// Write our GHWSP
AubGTTData data = {0};
this->getGTTData(reinterpret_cast<void *>(physHWSP), data);
AUB::reserveAddressGGTT(tbxStream, engineInfo.ggttHWSP, sizeHWSP, physHWSP, data);
tbxStream.writeMMIO(AubMemDump::computeRegisterOffset(csTraits.mmioBase, 0x2080), engineInfo.ggttHWSP);
}
// Allocate the LRCA
const size_t sizeLRCA = csTraits.sizeLRCA;
const size_t alignLRCA = csTraits.alignLRCA;
auto pLRCABase = alignedMalloc(sizeLRCA, alignLRCA);
engineInfo.pLRCA = pLRCABase;
// Initialize the LRCA to a known state
csTraits.initialize(pLRCABase);
// Reserve the RCS ring buffer
engineInfo.sizeRingBuffer = 0x4 * 0x1000;
{
const size_t alignRCS = 0x1000;
engineInfo.pRingBuffer = alignedMalloc(engineInfo.sizeRingBuffer, alignRCS);
engineInfo.ggttRingBuffer = gttRemap.map(engineInfo.pRingBuffer, engineInfo.sizeRingBuffer);
auto physRCS = ggtt->map(engineInfo.ggttRingBuffer, engineInfo.sizeRingBuffer, this->getGTTBits(), this->getMemoryBankForGtt());
AubGTTData data = {0};
this->getGTTData(reinterpret_cast<void *>(physRCS), data);
AUB::reserveAddressGGTT(tbxStream, engineInfo.ggttRingBuffer, engineInfo.sizeRingBuffer, physRCS, data);
}
// Initialize the ring MMIO registers
{
uint32_t ringHead = 0x000;
uint32_t ringTail = 0x000;
auto ringBase = engineInfo.ggttRingBuffer;
auto ringCtrl = (uint32_t)((engineInfo.sizeRingBuffer - 0x1000) | 1);
csTraits.setRingHead(pLRCABase, ringHead);
csTraits.setRingTail(pLRCABase, ringTail);
csTraits.setRingBase(pLRCABase, ringBase);
csTraits.setRingCtrl(pLRCABase, ringCtrl);
}
// Write our LRCA
{
engineInfo.ggttLRCA = gttRemap.map(engineInfo.pLRCA, sizeLRCA);
auto lrcAddressPhys = ggtt->map(engineInfo.ggttLRCA, sizeLRCA, this->getGTTBits(), this->getMemoryBankForGtt());
AubGTTData data = {0};
this->getGTTData(reinterpret_cast<void *>(lrcAddressPhys), data);
AUB::reserveAddressGGTT(tbxStream, engineInfo.ggttLRCA, sizeLRCA, lrcAddressPhys, data);
AUB::addMemoryWrite(
tbxStream,
lrcAddressPhys,
pLRCABase,
sizeLRCA,
this->getAddressSpace(csTraits.aubHintLRCA),
csTraits.aubHintLRCA);
}
DEBUG_BREAK_IF(!engineInfo.pLRCA);
}
template <typename GfxFamily>
CommandStreamReceiver *TbxCommandStreamReceiverHw<GfxFamily>::create(const std::string &baseName,
bool withAubDump,
ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield) {
TbxCommandStreamReceiverHw<GfxFamily> *csr;
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
const auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
if (withAubDump) {
auto localMemoryEnabled = gfxCoreHelper.getEnableLocalMemory(hwInfo);
auto fullName = AUBCommandStreamReceiver::createFullFilePath(hwInfo, baseName, rootDeviceIndex);
if (debugManager.flags.AUBDumpCaptureFileName.get() != "unk") {
fullName.assign(debugManager.flags.AUBDumpCaptureFileName.get());
}
rootDeviceEnvironment.initAubCenter(localMemoryEnabled, fullName, CommandStreamReceiverType::tbxWithAub);
csr = new CommandStreamReceiverWithAUBDump<TbxCommandStreamReceiverHw<GfxFamily>>(baseName, executionEnvironment, rootDeviceIndex, deviceBitfield);
auto aubCenter = rootDeviceEnvironment.aubCenter.get();
UNRECOVERABLE_IF(nullptr == aubCenter);
auto subCaptureCommon = aubCenter->getSubCaptureCommon();
UNRECOVERABLE_IF(nullptr == subCaptureCommon);
if (subCaptureCommon->subCaptureMode > AubSubCaptureManager::SubCaptureMode::off) {
csr->subCaptureManager = std::make_unique<AubSubCaptureManager>(fullName, *subCaptureCommon, ApiSpecificConfig::getRegistryPath());
}
if (csr->aubManager) {
if (!csr->aubManager->isOpen()) {
csr->aubManager->open(csr->subCaptureManager ? csr->subCaptureManager->getSubCaptureFileName("") : fullName);
UNRECOVERABLE_IF(!csr->aubManager->isOpen());
}
}
} else {
csr = new TbxCommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield);
}
if (!csr->aubManager) {
// Open our stream
csr->stream->open(nullptr);
// Add the file header.
bool streamInitialized = csr->stream->init(productHelper.getAubStreamSteppingFromHwRevId(hwInfo), csr->aubDeviceId);
csr->streamInitialized = streamInitialized;
}
return csr;
}
template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
if (subCaptureManager) {
if (aubManager) {
aubManager->pause(false);
}
}
initializeEngine();
// Write our batch buffer
auto pBatchBuffer = ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset);
auto batchBufferGpuAddress = ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset);
auto currentOffset = batchBuffer.usedSize;
DEBUG_BREAK_IF(currentOffset < batchBuffer.startOffset);
auto sizeBatchBuffer = currentOffset - batchBuffer.startOffset;
auto overrideRingHead = false;
auto submissionTaskCount = this->taskCount + 1;
allocationsForResidency.push_back(batchBuffer.commandBufferAllocation);
batchBuffer.commandBufferAllocation->updateResidencyTaskCount(submissionTaskCount, this->osContext->getContextId());
batchBuffer.commandBufferAllocation->updateTaskCount(submissionTaskCount, osContext->getContextId());
// Write allocations for residency
processResidency(allocationsForResidency, 0u);
if (subCaptureManager) {
if (aubManager) {
auto status = subCaptureManager->getSubCaptureStatus();
if (!status.wasActiveInPreviousEnqueue && status.isActive) {
overrideRingHead = true;
}
if (!status.wasActiveInPreviousEnqueue && !status.isActive) {
aubManager->pause(true);
}
}
}
submitBatchBufferTbx(
batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer,
this->getMemoryBank(batchBuffer.commandBufferAllocation),
this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation),
overrideRingHead);
if (subCaptureManager) {
pollForCompletion();
subCaptureManager->disableSubCapture();
}
return SubmissionStatus::success;
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
if (hardwareContextController) {
if (batchBufferSize) {
hardwareContextController->submit(batchBufferGpuAddress, batchBuffer, batchBufferSize, memoryBank, MemoryConstants::pageSize64k, overrideRingHead);
}
return;
}
auto csTraits = this->getCsTraits(osContext->getEngineType());
{
auto physBatchBuffer = ppgtt->map(static_cast<uintptr_t>(batchBufferGpuAddress), batchBufferSize, entryBits, memoryBank);
AubHelperHw<GfxFamily> aubHelperHw(this->localMemoryEnabled);
AUB::reserveAddressPPGTT(tbxStream, static_cast<uintptr_t>(batchBufferGpuAddress), batchBufferSize, physBatchBuffer,
entryBits, aubHelperHw);
AUB::addMemoryWrite(
tbxStream,
physBatchBuffer,
batchBuffer,
batchBufferSize,
this->getAddressSpace(AubMemDump::DataTypeHintValues::TraceBatchBufferPrimary),
AubMemDump::DataTypeHintValues::TraceBatchBufferPrimary);
}
// Add a batch buffer start to the RCS
auto previousTail = engineInfo.tailRingBuffer;
{
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
typedef typename GfxFamily::MI_BATCH_BUFFER_START MI_BATCH_BUFFER_START;
typedef typename GfxFamily::MI_NOOP MI_NOOP;
auto pTail = ptrOffset(engineInfo.pRingBuffer, engineInfo.tailRingBuffer);
auto ggttTail = ptrOffset(engineInfo.ggttRingBuffer, engineInfo.tailRingBuffer);
auto sizeNeeded =
sizeof(MI_BATCH_BUFFER_START) +
sizeof(MI_NOOP) +
sizeof(MI_LOAD_REGISTER_IMM);
if (engineInfo.tailRingBuffer + sizeNeeded >= engineInfo.sizeRingBuffer) {
// Pad the remaining ring with NOOPs
auto sizeToWrap = engineInfo.sizeRingBuffer - engineInfo.tailRingBuffer;
memset(pTail, 0, sizeToWrap);
// write remaining ring
auto physDumpStart = ggtt->map(ggttTail, sizeToWrap, this->getGTTBits(), this->getMemoryBankForGtt());
AUB::addMemoryWrite(
tbxStream,
physDumpStart,
pTail,
sizeToWrap,
this->getAddressSpace(AubMemDump::DataTypeHintValues::TraceCommandBuffer),
AubMemDump::DataTypeHintValues::TraceCommandBuffer);
previousTail = 0;
engineInfo.tailRingBuffer = 0;
pTail = engineInfo.pRingBuffer;
} else if (engineInfo.tailRingBuffer == 0) {
// Add a LRI if this is our first submission
auto lri = GfxFamily::cmdInitLoadRegisterImm;
lri.setRegisterOffset(AubMemDump::computeRegisterOffset(csTraits.mmioBase, 0x2244));
lri.setDataDword(0x00010000);
*(MI_LOAD_REGISTER_IMM *)pTail = lri;
pTail = ((MI_LOAD_REGISTER_IMM *)pTail) + 1;
}
// Add our BBS
auto bbs = GfxFamily::cmdInitBatchBufferStart;
bbs.setBatchBufferStartAddress(static_cast<uint64_t>(batchBufferGpuAddress));
bbs.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);
*(MI_BATCH_BUFFER_START *)pTail = bbs;
pTail = ((MI_BATCH_BUFFER_START *)pTail) + 1;
// Add a NOOP as our tail needs to be aligned to a QWORD
*(MI_NOOP *)pTail = GfxFamily::cmdInitNoop;
pTail = ((MI_NOOP *)pTail) + 1;
// Compute our new ring tail.
engineInfo.tailRingBuffer = (uint32_t)ptrDiff(pTail, engineInfo.pRingBuffer);
// Only dump the new commands
auto ggttDumpStart = ptrOffset(engineInfo.ggttRingBuffer, previousTail);
auto dumpStart = ptrOffset(engineInfo.pRingBuffer, previousTail);
auto dumpLength = engineInfo.tailRingBuffer - previousTail;
// write RCS
auto physDumpStart = ggtt->map(ggttDumpStart, dumpLength, this->getGTTBits(), this->getMemoryBankForGtt());
AUB::addMemoryWrite(
tbxStream,
physDumpStart,
dumpStart,
dumpLength,
this->getAddressSpace(AubMemDump::DataTypeHintValues::TraceCommandBuffer),
AubMemDump::DataTypeHintValues::TraceCommandBuffer);
// update the RCS mmio tail in the LRCA
auto physLRCA = ggtt->map(engineInfo.ggttLRCA, sizeof(engineInfo.tailRingBuffer), this->getGTTBits(), this->getMemoryBankForGtt());
AUB::addMemoryWrite(
tbxStream,
physLRCA + 0x101c,
&engineInfo.tailRingBuffer,
sizeof(engineInfo.tailRingBuffer),
this->getAddressSpace(AubMemDump::DataTypeHintValues::TraceNotype));
DEBUG_BREAK_IF(engineInfo.tailRingBuffer >= engineInfo.sizeRingBuffer);
}
// Submit our execlist by submitting to the execlist submit ports
{
typename AUB::MiContextDescriptorReg contextDescriptor = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
contextDescriptor.sData.valid = true;
contextDescriptor.sData.forcePageDirRestore = false;
contextDescriptor.sData.forceRestore = false;
contextDescriptor.sData.legacy = true;
contextDescriptor.sData.faultSupport = 0;
contextDescriptor.sData.privilegeAccessOrPPGTT = true;
contextDescriptor.sData.aDor64bitSupport = AUB::Traits::addressingBits > 32;
auto ggttLRCA = engineInfo.ggttLRCA;
contextDescriptor.sData.logicalRingCtxAddress = ggttLRCA / 4096;
contextDescriptor.sData.contextID = 0;
this->submitLRCA(contextDescriptor);
}
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion(bool skipTaskCountCheck) {
if (hardwareContextController) {
hardwareContextController->pollForCompletion();
return;
}
typedef typename AubMemDump::CmdServicesMemTraceRegisterPoll CmdServicesMemTraceRegisterPoll;
auto mmioBase = this->getCsTraits(osContext->getEngineType()).mmioBase;
bool pollNotEqual = getpollNotEqualValueForPollForCompletion();
uint32_t mask = getMaskAndValueForPollForCompletion();
uint32_t value = mask;
tbxStream.registerPoll(
AubMemDump::computeRegisterOffset(mmioBase, 0x2234), // EXECLIST_STATUS
mask,
value,
pollNotEqual,
CmdServicesMemTraceRegisterPoll::TimeoutActionValues::Abort);
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(uint64_t gpuAddress, void *cpuAddress, size_t size, uint32_t memoryBank, uint64_t entryBits) {
UNRECOVERABLE_IF(!isEngineInitialized);
AubHelperHw<GfxFamily> aubHelperHw(this->localMemoryEnabled);
PageWalker walker = [&](uint64_t physAddress, size_t size, size_t offset, uint64_t entryBits) {
AUB::reserveAddressGGTTAndWriteMmeory(tbxStream, static_cast<uintptr_t>(gpuAddress), cpuAddress, physAddress, size, offset, entryBits,
aubHelperHw);
};
ppgtt->pageWalk(static_cast<uintptr_t>(gpuAddress), size, 0, entryBits, walker, memoryBank);
}
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(GraphicsAllocation &gfxAllocation, bool isChunkCopy, uint64_t gpuVaChunkOffset, size_t chunkSize) {
uint64_t gpuAddress;
void *cpuAddress;
size_t size;
if (!this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size)) {
return false;
}
auto allocType = gfxAllocation.getAllocationType();
if (allocType == AllocationType::bufferHostMemory) {
auto faultManager = getTbxPageFaultManager();
if (faultManager != nullptr) {
faultManager->insertAllocation(this, &gfxAllocation, cpuAddress, size);
}
}
if (!this->isTbxWritable(gfxAllocation)) {
return false;
}
this->allowCPUMemoryAccessIfHostBuffer(allocType, cpuAddress, size);
if (!isEngineInitialized) {
initializeEngine();
}
if (aubManager) {
this->writeMemoryWithAubManager(gfxAllocation, isChunkCopy, gpuVaChunkOffset, chunkSize);
} else {
if (isChunkCopy) {
gpuAddress += gpuVaChunkOffset;
cpuAddress = ptrOffset(cpuAddress, static_cast<uintptr_t>(gpuVaChunkOffset));
size = chunkSize;
}
writeMemory(gpuAddress, cpuAddress, size, this->getMemoryBank(&gfxAllocation), this->getPPGTTAdditionalBits(&gfxAllocation));
}
if (AubHelper::isOneTimeAubWritableAllocationType(gfxAllocation.getAllocationType())) {
this->setTbxWritable(false, gfxAllocation);
}
this->protectCPUMemoryAccessIfHostBuffer(allocType, cpuAddress, size);
return true;
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMMIO(uint32_t offset, uint32_t value) {
if (hardwareContextController) {
hardwareContextController->writeMMIO(offset, value);
}
}
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::expectMemory(const void *gfxAddress, const void *srcAddress,
size_t length, uint32_t compareOperation) {
if (hardwareContextController) {
auto readMemory = std::make_unique<char[]>(length);
// note: memory bank should not matter assuming that we call expect on the memory that was previously allocated
hardwareContextController->readMemory((uint64_t)gfxAddress, readMemory.get(), length, this->getMemoryBankForGtt(), MemoryConstants::pageSize64k);
auto isMemoryEqual = (memcmp(readMemory.get(), srcAddress, length) == 0);
auto isEqualMemoryExpected = (compareOperation == AubMemDump::CmdServicesMemTraceMemoryCompare::CompareOperationValues::CompareEqual);
return (isMemoryEqual == isEqualMemoryExpected);
}
return BaseClass::expectMemory(gfxAddress, srcAddress, length, compareOperation);
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::flushSubmissionsAndDownloadAllocations(TaskCountType taskCountToWait, bool skipAllocationsDownload) {
this->flushBatchedSubmissions();
if (this->latestFlushedTaskCount < taskCountToWait) {
this->flushTagUpdate();
}
volatile TagAddressType *pollAddress = this->getTagAddress();
for (uint32_t i = 0; i < this->activePartitions; i++) {
while (*pollAddress < this->latestFlushedTaskCount) {
this->downloadAllocation(*this->getTagAllocation());
}
pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
}
if (skipAllocationsDownload) {
return;
}
auto lockCSR = this->obtainUniqueOwnership();
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
this->downloadAllocation(*graphicsAllocation);
}
this->allocationsForDownload.clear();
}
template <typename GfxFamily>
WaitStatus TbxCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) {
flushSubmissionsAndDownloadAllocations(taskCountToWait, false);
return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
}
template <typename GfxFamily>
WaitStatus TbxCommandStreamReceiverHw<GfxFamily>::waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) {
flushSubmissionsAndDownloadAllocations(taskCountToWait, params.skipTbxDownload);
return BaseClass::waitForCompletionWithTimeout(params, taskCountToWait);
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::processEviction() {
this->allocationsForDownload.insert(this->getEvictionAllocations().begin(), this->getEvictionAllocations().end());
BaseClass::processEviction();
}
template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::processResidency(ResidencyContainer &allocationsForResidency, uint32_t handleId) {
for (auto &gfxAllocation : allocationsForResidency) {
if (dumpTbxNonWritable) {
this->setTbxWritable(true, *gfxAllocation);
}
if (!writeMemory(*gfxAllocation)) {
DEBUG_BREAK_IF(!((gfxAllocation->getUnderlyingBufferSize() == 0) ||
!this->isTbxWritable(*gfxAllocation)));
}
gfxAllocation->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());
}
if (this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]->memoryOperationsInterface) {
this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]->memoryOperationsInterface->processFlushResidency(this);
}
dumpTbxNonWritable = false;
return SubmissionStatus::success;
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocationTbx(GraphicsAllocation &gfxAllocation) {
uint64_t gpuAddress = 0;
void *cpuAddress = nullptr;
size_t size = 0;
this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size);
auto allocType = gfxAllocation.getAllocationType();
this->allowCPUMemoryAccessIfHostBuffer(allocType, cpuAddress, size);
if (hardwareContextController) {
hardwareContextController->readMemory(gpuAddress, cpuAddress, size,
this->getMemoryBank(&gfxAllocation), gfxAllocation.getUsedPageSize());
return;
}
if (size) {
PageWalker walker = [&](uint64_t physAddress, size_t size, size_t offset, uint64_t entryBits) {
DEBUG_BREAK_IF(offset > size);
tbxStream.readMemory(physAddress, ptrOffset(cpuAddress, offset), size);
};
ppgtt->pageWalk(static_cast<uintptr_t>(gpuAddress), size, 0, 0, walker, this->getMemoryBank(&gfxAllocation));
}
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations(bool blockingWait, TaskCountType taskCount) {
volatile TagAddressType *pollAddress = this->getTagAddress();
auto waitTaskCount = std::min(taskCount, this->latestFlushedTaskCount.load());
for (uint32_t i = 0; i < this->activePartitions; i++) {
if (*pollAddress < waitTaskCount) {
this->downloadAllocation(*this->getTagAllocation());
auto startTime = std::chrono::high_resolution_clock::now();
uint64_t timeDiff = 0;
while (*pollAddress < waitTaskCount) {
if (!blockingWait) {
// Additional delay to reach PC in case of Event wait
timeDiff = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime).count();
if (timeDiff > getNonBlockingDownloadTimeoutMs()) {
return;
}
}
this->downloadAllocation(*this->getTagAllocation());
}
}
pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
}
auto lockCSR = this->obtainUniqueOwnership();
std::vector<GraphicsAllocation *> notReadyAllocations;
for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
this->downloadAllocation(*graphicsAllocation);
// Used again while waiting for completion. Another download will be needed.
if (graphicsAllocation->getTaskCount(this->osContext->getContextId()) > taskCount) {
notReadyAllocations.push_back(graphicsAllocation);
}
}
this->allocationsForDownload.clear();
this->allocationsForDownload = std::set<GraphicsAllocation *>(notReadyAllocations.begin(), notReadyAllocations.end());
}
template <typename GfxFamily>
uint32_t TbxCommandStreamReceiverHw<GfxFamily>::getMaskAndValueForPollForCompletion() const {
return 0x100;
}
template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::getpollNotEqualValueForPollForCompletion() const {
return false;
}
template <typename GfxFamily>
AubSubCaptureStatus TbxCommandStreamReceiverHw<GfxFamily>::checkAndActivateAubSubCapture(const std::string &kernelName) {
if (!subCaptureManager) {
return {false, false};
}
auto status = subCaptureManager->checkAndActivateSubCapture(kernelName);
if (status.isActive && !status.wasActiveInPreviousEnqueue) {
dumpTbxNonWritable = true;
}
return status;
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::dumpAllocation(GraphicsAllocation &gfxAllocation) {
if (!hardwareContextController) {
return;
}
bool isBcsCsr = EngineHelpers::isBcs(this->osContext->getEngineType());
if (isBcsCsr != gfxAllocation.getAubInfo().bcsDumpOnly) {
return;
}
if (debugManager.flags.AUBDumpAllocsOnEnqueueReadOnly.get() || debugManager.flags.AUBDumpAllocsOnEnqueueSVMMemcpyOnly.get()) {
if (!gfxAllocation.isAllocDumpable()) {
return;
}
gfxAllocation.setAllocDumpable(false, isBcsCsr);
}
auto dumpFormat = AubAllocDump::getDumpFormat(gfxAllocation);
auto surfaceInfo = std::unique_ptr<aub_stream::SurfaceInfo>(AubAllocDump::getDumpSurfaceInfo<GfxFamily>(gfxAllocation, *this->peekGmmHelper(), dumpFormat));
if (surfaceInfo) {
hardwareContextController->pollForCompletion();
hardwareContextController->dumpSurface(*surfaceInfo.get());
}
}
template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::removeDownloadAllocation(GraphicsAllocation *alloc) {
auto lockCSR = this->obtainUniqueOwnership();
this->allocationsForDownload.erase(alloc);
auto faultManager = getTbxPageFaultManager();
if (faultManager != nullptr) {
faultManager->removeAllocation(alloc);
}
}
} // namespace NEO