/*
 * Copyright (C) 2018-2025 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/aub/aub_center.h"
#include "shared/source/aub/aub_helper.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.h"
#include "shared/source/aub_mem_dump/aub_alloc_dump.inl"
#include "shared/source/command_stream/aub_command_stream_receiver.h"
#include "shared/source/command_stream/command_stream_receiver_with_aub_dump.h"
#include "shared/source/command_stream/submission_status.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/command_stream/task_count_helper.h"
#include "shared/source/command_stream/tbx_command_stream_receiver_hw.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/kmd_notify_properties.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocation_type.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/utilities/shared_pool_allocation.h"

#include <algorithm>
#include <chrono>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>

namespace NEO {

template <typename GfxFamily>
CpuPageFaultManager *TbxCommandStreamReceiverHw<GfxFamily>::getTbxPageFaultManager() {
    return this->getMemoryManager()->getPageFaultManager();
}

template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::TbxCommandStreamReceiverHw(ExecutionEnvironment &executionEnvironment,
                                                                  uint32_t rootDeviceIndex,
                                                                  const DeviceBitfield deviceBitfield)
    : BaseClass(executionEnvironment, rootDeviceIndex, deviceBitfield) {
    forceSkipResourceCleanupRequired = true;
    auto releaseHelper = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getReleaseHelper();
    physicalAddressAllocator.reset(this->createPhysicalAddressAllocator(&this->peekHwInfo(), releaseHelper));
    executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->initAubCenter(this->localMemoryEnabled, "", this->getType());
    auto aubCenter = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->aubCenter.get();
    UNRECOVERABLE_IF(nullptr == aubCenter);

    aubManager = aubCenter->getAubManager();

    ppgtt = std::make_unique<std::conditional<is64bit, PML4, PDPE>::type>(physicalAddressAllocator.get());
    ggtt = std::make_unique<PDPE>(physicalAddressAllocator.get());

    this->downloadAllocationImpl = [this](GraphicsAllocation &graphicsAllocation) {
        this->downloadAllocationTbx(graphicsAllocation);
    };
}

template <typename GfxFamily>
TbxCommandStreamReceiverHw<GfxFamily>::~TbxCommandStreamReceiverHw() {
    this->downloadAllocationImpl = nullptr;
}

template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::isAllocTbxFaultable(GraphicsAllocation *gfxAlloc) {
    // indicates host memory not managed by the driver
    if ((gfxAlloc->getDriverAllocatedCpuPtr() == nullptr) ||
        (debugManager.isTbxPageFaultManagerEnabled() == false) ||
        (this->getTbxPageFaultManager() == nullptr)) {
        return false;
    }

    auto allocType = gfxAlloc->getAllocationType();
    if (allocType == AllocationType::bufferHostMemory) {
        return true;
    }
    return false;
}

template <typename GfxFamily>
void
TbxCommandStreamReceiverHw<GfxFamily>::registerAllocationWithTbxFaultMngrIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (!isAllocTbxFaultable(gfxAlloc)) {
        return;
    }
    auto bank = this->getMemoryBank(gfxAlloc);
    if (bank == 0u || gfxAlloc->storageInfo.cloningOfPageTables) {
        bank = GraphicsAllocation::defaultBank;
    }
    auto faultManager = getTbxPageFaultManager();
    faultManager->insertAllocation(this, gfxAlloc, bank, cpuAddress, size);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::allowCPUMemoryAccessIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (!isAllocTbxFaultable(gfxAlloc)) {
        return;
    }
    auto faultManager = getTbxPageFaultManager();
    faultManager->allowCPUMemoryAccess(cpuAddress, size);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::protectCPUMemoryAccessIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (!isAllocTbxFaultable(gfxAlloc)) {
        return;
    }
    auto faultManager = getTbxPageFaultManager();
    faultManager->protectCPUMemoryAccess(cpuAddress, size);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::protectCPUMemoryFromWritesIfTbxFaultable(GraphicsAllocation *gfxAlloc, void *cpuAddress, size_t size) {
    if (!isAllocTbxFaultable(gfxAlloc)) {
        return;
    }
    auto faultManager = getTbxPageFaultManager();
    faultManager->protectCpuMemoryFromWrites(cpuAddress, size);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::initializeEngine() {
    if (!isEngineInitialized) {
        isEngineInitialized = true;
        if (hardwareContextController) {
            hardwareContextController->createHardwareContexts(*aubManager);
            hardwareContextController->initialize();
            return;
        }
    }
}

template <typename GfxFamily>
CommandStreamReceiver *TbxCommandStreamReceiverHw<GfxFamily>::create(const std::string &baseName, bool withAubDump, ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) {
    TbxCommandStreamReceiverHw<GfxFamily> *csr;
    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];
    auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();

    if (withAubDump) {
        auto localMemoryEnabled = gfxCoreHelper.getEnableLocalMemory(hwInfo);
        auto fullName = AUBCommandStreamReceiver::createFullFilePath(hwInfo, baseName, rootDeviceIndex);
        if (debugManager.flags.AUBDumpCaptureFileName.get() != "unk") {
            fullName.assign(debugManager.flags.AUBDumpCaptureFileName.get());
        }
        rootDeviceEnvironment.initAubCenter(localMemoryEnabled, fullName, CommandStreamReceiverType::tbxWithAub);

        csr = new CommandStreamReceiverWithAUBDump<TbxCommandStreamReceiverHw<GfxFamily>>(baseName, executionEnvironment, rootDeviceIndex, deviceBitfield);

        auto aubCenter = rootDeviceEnvironment.aubCenter.get();
        UNRECOVERABLE_IF(nullptr == aubCenter);

        auto subCaptureCommon = aubCenter->getSubCaptureCommon();
        UNRECOVERABLE_IF(nullptr == subCaptureCommon);

        if (subCaptureCommon->subCaptureMode > AubSubCaptureManager::SubCaptureMode::off) {
            csr->subCaptureManager = std::make_unique<AubSubCaptureManager>(fullName, *subCaptureCommon, ApiSpecificConfig::getRegistryPath());
        }

        if (csr->aubManager) {
            if (!csr->aubManager->isOpen()) {
                csr->aubManager->open(csr->subCaptureManager ?
                                          csr->subCaptureManager->getSubCaptureFileName("") : fullName);
                UNRECOVERABLE_IF(!csr->aubManager->isOpen());
            }
        }
    } else {
        csr = new TbxCommandStreamReceiverHw<GfxFamily>(executionEnvironment, rootDeviceIndex, deviceBitfield);
    }

    return csr;
}

template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
    if (subCaptureManager) {
        if (aubManager) {
            aubManager->pause(false);
        }
    }

    initializeEngine();

    // Write our batch buffer
    auto pBatchBuffer = ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset);
    auto batchBufferGpuAddress = ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset);
    auto currentOffset = batchBuffer.usedSize;
    DEBUG_BREAK_IF(currentOffset < batchBuffer.startOffset);
    auto sizeBatchBuffer = currentOffset - batchBuffer.startOffset;
    auto overrideRingHead = false;

    auto submissionTaskCount = this->taskCount + 1;
    allocationsForResidency.push_back(batchBuffer.commandBufferAllocation);
    batchBuffer.commandBufferAllocation->updateResidencyTaskCount(submissionTaskCount, this->osContext->getContextId());
    batchBuffer.commandBufferAllocation->updateTaskCount(submissionTaskCount, osContext->getContextId());

    // Write allocations for residency
    processResidency(allocationsForResidency, 0u);

    if (subCaptureManager) {
        if (aubManager) {
            auto status = subCaptureManager->getSubCaptureStatus();
            if (!status.wasActiveInPreviousEnqueue && status.isActive) {
                overrideRingHead = true;
            }
            if (!status.wasActiveInPreviousEnqueue && !status.isActive) {
                aubManager->pause(true);
            }
        }
    }

    submitBatchBufferTbx(
        batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer,
        this->getMemoryBank(batchBuffer.commandBufferAllocation),
        this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation),
        overrideRingHead);

    if (subCaptureManager) {
        pollForCompletion();
        subCaptureManager->disableSubCapture();
    }

    return SubmissionStatus::success;
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
    if (hardwareContextController && batchBufferSize) {
        hardwareContextController->submit(batchBufferGpuAddress, batchBuffer, batchBufferSize, memoryBank, MemoryConstants::pageSize64k, overrideRingHead);
    }
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::pollForCompletion(bool skipTaskCountCheck) {
    if (hardwareContextController) {
        hardwareContextController->pollForCompletion();
    }
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(uint64_t gpuAddress, void *cpuAddress, size_t size, uint32_t memoryBank, uint64_t entryBits) {
    UNRECOVERABLE_IF(!isEngineInitialized);
}

template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::writeMemory(GraphicsAllocation &gfxAllocation, bool isChunkCopy, uint64_t gpuVaChunkOffset, size_t chunkSize) {
    uint64_t gpuAddress;
    void *cpuAddress;
    size_t size;
    if (!this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size)) {
        return false;
    }

    auto allocType = gfxAllocation.getAllocationType();

    this->registerAllocationWithTbxFaultMngrIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    if (!this->isTbxWritable(gfxAllocation)) {
        return false;
    }

    this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    initializeEngine();

    if (aubManager) {
        this->writeMemoryWithAubManager(gfxAllocation, isChunkCopy, gpuVaChunkOffset, chunkSize);
    } else {
        if (isChunkCopy) {
            gpuAddress += gpuVaChunkOffset;
            cpuAddress = ptrOffset(cpuAddress,
                                   static_cast<size_t>(gpuVaChunkOffset));
            size = chunkSize;
        }
        writeMemory(gpuAddress, cpuAddress, size, this->getMemoryBank(&gfxAllocation), this->getPPGTTAdditionalBits(&gfxAllocation));
    }

    if (AubHelper::isOneTimeAubWritableAllocationType(allocType)) {
        this->setTbxWritable(false, gfxAllocation);
    }

    this->protectCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    return true;
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writeMMIO(uint32_t offset, uint32_t value) {
    if (hardwareContextController) {
        hardwareContextController->writeMMIO(offset, value);
    }
}

template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation) {
    if (hardwareContextController) {
        auto readMemory = std::make_unique<char[]>(length);
        // note: memory bank should not matter assuming that we call expect on the memory that was previously allocated
        hardwareContextController->readMemory((uint64_t)gfxAddress, readMemory.get(), length, this->getMemoryBankForGtt(), MemoryConstants::pageSize64k);
        auto isMemoryEqual = (memcmp(readMemory.get(), srcAddress, length) == 0);
        auto isEqualMemoryExpected = (compareOperation == aub_stream::CompareOperationValues::CompareEqual);

        hardwareContextController->expectMemory(reinterpret_cast<uint64_t>(gfxAddress), srcAddress, length, compareOperation);

        return (isMemoryEqual == isEqualMemoryExpected);
    }

    return BaseClass::expectMemory(gfxAddress, srcAddress, length, compareOperation);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::writePooledMemory(SharedPoolAllocation &sharedPoolAllocation, bool initFullPageTables) {
    auto &gfxAllocation = *sharedPoolAllocation.getGraphicsAllocation();

    auto writeMemoryOperation = [&]() {
        constexpr uint32_t allBanks = std::numeric_limits<uint32_t>::max();
        if (initFullPageTables && gfxAllocation.isTbxWritable(allBanks)) {
            writeMemory(gfxAllocation, false, 0, 0);
        }

        gfxAllocation.setTbxWritable(true, allBanks);
        [[maybe_unused]] const auto writeMemoryStatus = writeMemory(gfxAllocation, true, sharedPoolAllocation.getOffset(), sharedPoolAllocation.getSize());
        DEBUG_BREAK_IF(!writeMemoryStatus);
        gfxAllocation.setTbxWritable(false, allBanks);
    };

    // Serialize pooled writes when the shared pool provides a mutex
    if (auto mutex = sharedPoolAllocation.getMutex(); mutex) {
        std::lock_guard lock(*mutex);
        writeMemoryOperation();
    } else {
        writeMemoryOperation();
    }
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::flushSubmissionsAndDownloadAllocations(TaskCountType taskCountToWait, bool skipAllocationsDownload) {
    this->flushBatchedSubmissions();

    if (this->latestFlushedTaskCount < taskCountToWait) {
        this->flushTagUpdate();
    }

    // Poll the tag allocation on each active partition until the latest flushed task count is observed
    volatile TagAddressType *pollAddress = this->getTagAddress();
    for (uint32_t i = 0; i < this->activePartitions; i++) {
        while (*pollAddress < this->latestFlushedTaskCount) {
            this->downloadAllocation(*this->getTagAllocation());
        }
        pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
    }

    if (skipAllocationsDownload) {
        return;
    }

    auto lockCSR = this->obtainUniqueOwnership();
    for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
        this->downloadAllocation(*graphicsAllocation);
    }
    this->allocationsForDownload.clear();
}

template <typename GfxFamily>
WaitStatus TbxCommandStreamReceiverHw<GfxFamily>::waitForTaskCountWithKmdNotifyFallback(TaskCountType taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, QueueThrottle throttle) {
    flushSubmissionsAndDownloadAllocations(taskCountToWait, false);
    return BaseClass::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, useQuickKmdSleep, throttle);
}

template <typename GfxFamily>
WaitStatus
TbxCommandStreamReceiverHw<GfxFamily>::waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) {
    flushSubmissionsAndDownloadAllocations(taskCountToWait, params.skipTbxDownload);
    return BaseClass::waitForCompletionWithTimeout(params, taskCountToWait);
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::processEviction() {
    auto lockCSR = this->obtainUniqueOwnership();
    BaseClass::processEviction();
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::makeNonResident(GraphicsAllocation &gfxAllocation) {
    auto lock = this->obtainUniqueOwnership();
    if (gfxAllocation.isResident(osContext->getContextId())) {
        this->allocationsForDownload.insert(&gfxAllocation);
    }

    BaseClass::makeNonResident(gfxAllocation);
}

template <typename GfxFamily>
SubmissionStatus TbxCommandStreamReceiverHw<GfxFamily>::processResidency(ResidencyContainer &allocationsForResidency, uint32_t handleId) {
    for (auto &gfxAllocation : allocationsForResidency) {
        if (dumpTbxNonWritable) {
            this->setTbxWritable(true, *gfxAllocation);
        }
        if (!writeMemory(*gfxAllocation)) {
            DEBUG_BREAK_IF(!((gfxAllocation->getUnderlyingBufferSize() == 0) || !this->isTbxWritable(*gfxAllocation)));
        }
        gfxAllocation->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId());
    }

    if (this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]->memoryOperationsInterface) {
        this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]->memoryOperationsInterface->processFlushResidency(this);
    }

    dumpTbxNonWritable = false;
    return SubmissionStatus::success;
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocationTbx(GraphicsAllocation &gfxAllocation) {
    uint64_t gpuAddress = 0;
    void *cpuAddress = nullptr;
    size_t size = 0;

    auto hostFunctionsActive = this->hostFunctionStreamer.get() != nullptr;
    std::unique_lock lockCsr(this->tagAllocationDownloadMutex, std::defer_lock);
    if (hostFunctionsActive && gfxAllocation.getAllocationType() == AllocationType::tagBuffer) {
        lockCsr.lock();
    }

    this->getParametersForMemory(gfxAllocation, gpuAddress, cpuAddress, size);

    this->allowCPUMemoryAccessIfTbxFaultable(&gfxAllocation, cpuAddress, size);

    if (hardwareContextController) {
        hardwareContextController->readMemory(gpuAddress, cpuAddress, size, this->getMemoryBank(&gfxAllocation), gfxAllocation.getUsedPageSize());
        this->protectCPUMemoryFromWritesIfTbxFaultable(&gfxAllocation, cpuAddress, size);
    }
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::downloadAllocations(bool blockingWait, TaskCountType taskCount) {
    volatile TagAddressType *pollAddress = this->getTagAddress();
    auto waitTaskCount = std::min(taskCount, this->latestFlushedTaskCount.load());

    for (uint32_t i = 0; i < this->activePartitions; i++) {
        if (*pollAddress < waitTaskCount) {
            this->downloadAllocation(*this->getTagAllocation());

            auto startTime = std::chrono::high_resolution_clock::now();
            uint64_t timeDiff = 0;
            while (*pollAddress < waitTaskCount) {
                if (!blockingWait) {
                    // Additional delay to reach PC in case of Event wait
                    timeDiff = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - startTime).count();
                    if (timeDiff > getNonBlockingDownloadTimeoutMs()) {
                        return;
                    }
                }
                this->downloadAllocation(*this->getTagAllocation());
            }
        }
        pollAddress = ptrOffset(pollAddress, this->immWritePostSyncWriteOffset);
    }

    auto lockCSR = this->obtainUniqueOwnership();
    std::vector<GraphicsAllocation *> notReadyAllocations;
    for (GraphicsAllocation *graphicsAllocation : this->allocationsForDownload) {
        this->downloadAllocation(*graphicsAllocation);

        // Used again while waiting for completion. Another download will be needed.
        if (graphicsAllocation->getTaskCount(this->osContext->getContextId()) > taskCount) {
            notReadyAllocations.push_back(graphicsAllocation);
        }
    }
    this->allocationsForDownload.clear();
    this->allocationsForDownload = std::set<GraphicsAllocation *>(notReadyAllocations.begin(), notReadyAllocations.end());
}

template <typename GfxFamily>
uint32_t TbxCommandStreamReceiverHw<GfxFamily>::getMaskAndValueForPollForCompletion() const {
    return 0x100;
}

template <typename GfxFamily>
bool TbxCommandStreamReceiverHw<GfxFamily>::getpollNotEqualValueForPollForCompletion() const {
    return false;
}

template <typename GfxFamily>
AubSubCaptureStatus TbxCommandStreamReceiverHw<GfxFamily>::checkAndActivateAubSubCapture(const std::string &kernelName) {
    if (!subCaptureManager) {
        return {false, false};
    }

    auto status = subCaptureManager->checkAndActivateSubCapture(kernelName);
    if (status.isActive && !status.wasActiveInPreviousEnqueue) {
        dumpTbxNonWritable = true;
    }
    return status;
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::dumpAllocation(GraphicsAllocation &gfxAllocation) {
    if (!hardwareContextController) {
        return;
    }

    bool isBcsCsr = EngineHelpers::isBcs(this->osContext->getEngineType());

    if (isBcsCsr != gfxAllocation.getAubInfo().bcsDumpOnly) {
        return;
    }

    if (debugManager.flags.AUBDumpAllocsOnEnqueueReadOnly.get() || debugManager.flags.AUBDumpAllocsOnEnqueueSVMMemcpyOnly.get()) {
        if (!gfxAllocation.isAllocDumpable()) {
            return;
        }
        gfxAllocation.setAllocDumpable(false, isBcsCsr);
    }

    auto dumpFormat = AubAllocDump::getDumpFormat(gfxAllocation);
    auto surfaceInfo = std::unique_ptr<aub_stream::SurfaceInfo>(AubAllocDump::getDumpSurfaceInfo<GfxFamily>(gfxAllocation, *this->peekGmmHelper(), dumpFormat));
    if (surfaceInfo) {
        hardwareContextController->pollForCompletion();
        hardwareContextController->dumpSurface(*surfaceInfo.get());
    }
}

template <typename GfxFamily>
void TbxCommandStreamReceiverHw<GfxFamily>::removeDownloadAllocation(GraphicsAllocation *alloc) {
    auto lockCSR = this->obtainUniqueOwnership();
    this->allocationsForDownload.erase(alloc);
    auto faultManager = getTbxPageFaultManager();
    if (faultManager != nullptr) {
        faultManager->removeAllocation(alloc);
    }
}

} // namespace NEO