/*
 * Copyright (C) 2020-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/direct_submission_hw_diagnostic_mode.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/logical_state_helper.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/utilities/cpuintrinsics.h"

#include "create_direct_submission_hw.inl"

#include <cstring>

namespace NEO {

template <typename GfxFamily, typename Dispatcher>
DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmissionInputParams &inputParams)
    : ringBuffers(RingBufferUse::initialRingBufferCount), osContext(inputParams.osContext), rootDeviceIndex(inputParams.rootDeviceIndex) {
    memoryManager = inputParams.memoryManager;
    globalFenceAllocation = inputParams.globalFenceAllocation;
    logicalStateHelper = inputParams.logicalStateHelper;
    hwInfo = inputParams.rootDeviceEnvironment.getHardwareInfo();
    memoryOperationHandler = inputParams.rootDeviceEnvironment.memoryOperationsInterface.get();

    auto productHelper = ProductHelper::get(hwInfo->platform.eProductFamily);

    disableCacheFlush = UllsDefaults::defaultDisableCacheFlush;
    disableMonitorFence = UllsDefaults::defaultDisableMonitorFence;

    if (DebugManager.flags.DirectSubmissionMaxRingBuffers.get() != -1) {
        this->maxRingBufferCount = DebugManager.flags.DirectSubmissionMaxRingBuffers.get();
    }

    if (DebugManager.flags.DirectSubmissionDisableCacheFlush.get() != -1) {
        disableCacheFlush = !!DebugManager.flags.DirectSubmissionDisableCacheFlush.get();
    }

    miMemFenceRequired = productHelper->isGlobalFenceInDirectSubmissionRequired(*hwInfo);
    if (DebugManager.flags.DirectSubmissionInsertExtraMiMemFenceCommands.get() == 0) {
        miMemFenceRequired = false;
    }

    if (DebugManager.flags.DirectSubmissionInsertSfenceInstructionPriorToSubmission.get() != -1) {
        sfenceMode = static_cast<DirectSubmissionSfenceMode>(DebugManager.flags.DirectSubmissionInsertSfenceInstructionPriorToSubmission.get());
    }

    int32_t disableCacheFlushKey = DebugManager.flags.DirectSubmissionDisableCpuCacheFlush.get();
    if (disableCacheFlushKey != -1) {
        disableCpuCacheFlush = disableCacheFlushKey == 1 ? true : false;
    }

    isDisablePrefetcherRequired = productHelper->isPrefetcherDisablingInDirectSubmissionRequired();
    if (DebugManager.flags.DirectSubmissionDisablePrefetcher.get() != -1) {
        isDisablePrefetcherRequired = !!DebugManager.flags.DirectSubmissionDisablePrefetcher.get();
    }

    UNRECOVERABLE_IF(!CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureClflush) && !disableCpuCacheFlush);

    createDiagnostic();
    setPostSyncOffset();

    dcFlushRequired = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, *hwInfo);

    relaxedOrderingEnabled = GfxCoreHelperHw<GfxFamily>::get().isRelaxedOrderingSupported();
    if (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() != -1) {
        relaxedOrderingEnabled = (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1);
    }
    if (EngineHelpers::isBcs(this->osContext.getEngineType()) && relaxedOrderingEnabled) {
        relaxedOrderingEnabled = (DebugManager.flags.DirectSubmissionRelaxedOrderingForBcs.get() != 0);
    }
}
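
// Note: the relaxed-ordering scheduler below is a small, resident GPU-side program.
// The GPR roles listed here are inferred from this file, not from any hardware contract:
//   CS_GPR_R1  - count of queued deferred tasks (incremented by the task store section)
//   CS_GPR_R2  - current iteration index over the deferred-tasks list
//   CS_GPR_R3 / CS_GPR_R4 - jump targets of the remove-task and loop-check sections
//   CS_GPR_R5  - drain/stall request flag (set by dispatchRelaxedOrderingQueueStall())
//   CS_GPR_R6 - CS_GPR_R8 - scratch registers for task-list address arithmetic
//   CS_GPR_R9 / CS_GPR_R10 - return address into the dynamic scheduler and its offset
// Every 64-bit value is written as two 32-bit LRIs: low dword at GPR, high dword at GPR + 4.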

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStaticRelaxedOrderingScheduler() {
    LinearStream schedulerCmdStream(this->relaxedOrderingSchedulerAllocation);
    uint64_t schedulerStartAddress = schedulerCmdStream.getGpuBase();
    uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();

    uint64_t loopSectionStartAddress = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart;

    // 1. Init section
    {
        EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);

        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(
            schedulerCmdStream,
            schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::semaphoreSectionJumpStart,
            CS_GPR_R1, 0, CompareOperation::Equal, false);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R2, 0, true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R2 + 4, 0, true);

        uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::removeTaskSectionStart;
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R3, static_cast<uint32_t>(removeTaskVa & 0xFFFF'FFFFULL), true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R3 + 4, static_cast<uint32_t>(removeTaskVa >> 32), true);

        uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::tasksListLoopCheckSectionStart;
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R4, static_cast<uint32_t>(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R4 + 4, static_cast<uint32_t>(walkersLoopConditionCheckVa >> 32), true);
    }

    // 2. Dispatch task section (loop start)
    {
        EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R6, 8, true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R6 + 4, 0, true);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8, static_cast<uint32_t>(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);

        EncodeAluHelper<GfxFamily, 10> aluHelper;
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_2);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6);
        aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
        aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_7, AluRegisters::R_ACCU);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_7);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
        aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
        aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_6, AluRegisters::R_ACCU);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOADIND, AluRegisters::R_0, AluRegisters::R_ACCU);
        aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_RD);
        aluHelper.copyToCmdStream(schedulerCmdStream);

        EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&schedulerCmdStream, 0, false, true, false);
    }

    // 3. Remove task section
    {
        EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);

        EncodeMathMMIO<GfxFamily>::encodeDecrement(schedulerCmdStream, AluRegisters::R_1);
        EncodeMathMMIO<GfxFamily>::encodeDecrement(schedulerCmdStream, AluRegisters::R_2);

        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(
            schedulerCmdStream,
            schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::semaphoreSectionJumpStart,
            CS_GPR_R1, 0, CompareOperation::Equal, false);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R7, 8, true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R7 + 4, 0, true);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8, static_cast<uint32_t>(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);

        EncodeAluHelper<GfxFamily, 14> aluHelper;
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_7);
        aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
        aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_7, AluRegisters::R_ACCU);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_7);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
        aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOADIND, AluRegisters::R_7, AluRegisters::R_ACCU);
        aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_RD);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_6);
        aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD0, AluRegisters::R_SRCB, AluRegisters::OPCODE_NONE);
        aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
        aluHelper.setNextAlu(AluRegisters::OPCODE_STOREIND, AluRegisters::R_ACCU, AluRegisters::R_7);
        aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_WR);
        aluHelper.copyToCmdStream(schedulerCmdStream);
    }

    // 4. List loop check section
    {
        EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);

        EncodeMathMMIO<GfxFamily>::encodeIncrement(schedulerCmdStream, AluRegisters::R_2);

        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalRegRegBatchBufferStart(
            schedulerCmdStream,
            loopSectionStartAddress,
            AluRegisters::R_1, AluRegisters::R_2, CompareOperation::NotEqual, false);

        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R2, 0, true);
        LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R2 + 4, 0, true);
    }

    // 5. Drain request section
    {
        *schedulerCmdStream.getSpaceForCmd<typename GfxFamily::MI_ARB_CHECK>() = GfxFamily::cmdInitArbCheck;

        uint32_t queueSizeLimit = 2;
        if (DebugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get() != -1) {
            queueSizeLimit = static_cast<uint32_t>(DebugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get());
        }

        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(
            schedulerCmdStream, loopSectionStartAddress, CS_GPR_R1, queueSizeLimit, CompareOperation::GreaterOrEqual, false);
        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(
            schedulerCmdStream, loopSectionStartAddress, CS_GPR_R5, 1, CompareOperation::Equal, false);
    }

    // Exit Static scheduler
    // 6. Jump to scheduler loop check section (dynamic scheduler)
    EncodeSetMMIO<GfxFamily>::encodeREG(schedulerCmdStream, CS_GPR_R0, CS_GPR_R9);
    EncodeSetMMIO<GfxFamily>::encodeREG(schedulerCmdStream, CS_GPR_R0 + 4, CS_GPR_R9 + 4);

    EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&schedulerCmdStream, 0, false, true, false);

    // 7. Jump to Semaphore section (dynamic scheduler)
    EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);

    LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R10, static_cast<uint32_t>(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::schedulerLoopCheckSectionSize), true);
    LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R10 + 4, 0, true);

    EncodeAluHelper<GfxFamily, 4> aluHelper;
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_9);
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_10);
    aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
    aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_0, AluRegisters::R_ACCU);
    aluHelper.copyToCmdStream(schedulerCmdStream);

    EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&schedulerCmdStream, 0, false, true, false);
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
    LinearStream schedulerCmdStream(this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);

    // 1. Init section
    uint64_t schedulerStartVa = ringCommandStream.getCurrentGpuAddressPosition();

    uint64_t schedulerLoopCheckVa = schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::schedulerLoopCheckSectionStart;

    LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R9, static_cast<uint32_t>(schedulerLoopCheckVa & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R9 + 4, static_cast<uint32_t>(schedulerLoopCheckVa >> 32), true);

    schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching

    // 2. Scheduler loop check section
    {
        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(
            schedulerCmdStream,
            schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::endSectionStart,
            semaphoreGpuVa, value, CompareOperation::GreaterOrEqual, false);

        schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching
    }

    // 3. Semaphore section
    {
        using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
        using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

        schedulerCmdStream.getSpace(EncodeMiPredicate<GfxFamily>::getCmdSize()); // skip patching

        EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(schedulerCmdStream, semaphoreGpuVa, value, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }

    // skip patching End section

    auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);
    memcpy_s(dst, RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize,
             this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);
}
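
// The dynamic scheduler above is emitted with a copy-and-patch scheme:
// preinitializeRelaxedOrderingSections() encodes a complete template once, and each
// dispatchRelaxedOrderingSchedulerSection() call rewrites only the per-submission fields
// (current scheduler VA and semaphore compare value) before memcpy_s()-ing the whole
// section into the ring. The "skip patching" getSpace() calls step over command slots
// whose preinitialized contents are reused verbatim.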

template <typename GfxFamily, typename Dispatcher>
DirectSubmissionHw<GfxFamily, Dispatcher>::~DirectSubmissionHw() = default;

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::allocateResources() {
    DirectSubmissionAllocations allocations;

    bool isMultiOsContextCapable = osContext.getNumSupportedDevices() > 1u;
    constexpr size_t minimumRequiredSize = 256 * MemoryConstants::kiloByte;
    constexpr size_t additionalAllocationSize = MemoryConstants::pageSize;
    const auto allocationSize = alignUp(minimumRequiredSize + additionalAllocationSize, MemoryConstants::pageSize64k);
    const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex,
                                                                 true, allocationSize,
                                                                 AllocationType::RING_BUFFER,
                                                                 isMultiOsContextCapable, false, osContext.getDeviceBitfield()};
    for (uint32_t ringBufferIndex = 0; ringBufferIndex < RingBufferUse::initialRingBufferCount; ringBufferIndex++) {
        auto ringBuffer = memoryManager->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
        this->ringBuffers[ringBufferIndex].ringBuffer = ringBuffer;
        UNRECOVERABLE_IF(ringBuffer == nullptr);
        allocations.push_back(ringBuffer);
        memset(ringBuffer->getUnderlyingBuffer(), 0, allocationSize);
    }

    const AllocationProperties semaphoreAllocationProperties{rootDeviceIndex,
                                                             true, MemoryConstants::pageSize,
                                                             AllocationType::SEMAPHORE_BUFFER,
                                                             isMultiOsContextCapable, false, osContext.getDeviceBitfield()};
    semaphores = memoryManager->allocateGraphicsMemoryWithProperties(semaphoreAllocationProperties);
    UNRECOVERABLE_IF(semaphores == nullptr);
    allocations.push_back(semaphores);

    if (this->workPartitionAllocation != nullptr) {
        allocations.push_back(workPartitionAllocation);
    }

    if (completionFenceAllocation != nullptr) {
        allocations.push_back(completionFenceAllocation);
    }

    if (this->relaxedOrderingEnabled) {
        const AllocationProperties allocationProperties(rootDeviceIndex,
                                                        true, MemoryConstants::pageSize64k,
                                                        AllocationType::DEFERRED_TASKS_LIST,
                                                        isMultiOsContextCapable, false, osContext.getDeviceBitfield());

        deferredTasksListAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties);
        UNRECOVERABLE_IF(deferredTasksListAllocation == nullptr);

        allocations.push_back(deferredTasksListAllocation);

        const AllocationProperties relaxedOrderingSchedulerAllocationProperties(rootDeviceIndex,
                                                                                true, MemoryConstants::pageSize64k,
                                                                                AllocationType::COMMAND_BUFFER,
                                                                                isMultiOsContextCapable, false, osContext.getDeviceBitfield());

        relaxedOrderingSchedulerAllocation = memoryManager->allocateGraphicsMemoryWithProperties(relaxedOrderingSchedulerAllocationProperties);
        UNRECOVERABLE_IF(relaxedOrderingSchedulerAllocation == nullptr);

        allocations.push_back(relaxedOrderingSchedulerAllocation);
    }

    if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) {
        for (uint32_t ringBufferIndex = 0; ringBufferIndex < RingBufferUse::initialRingBufferCount; ringBufferIndex++) {
            const auto ringBuffer = this->ringBuffers[ringBufferIndex].ringBuffer;

            printf("Ring buffer %u - gpu address: %" PRIx64 " - %" PRIx64 ", cpu address: %p - %p, size: %zu \n",
                   ringBufferIndex,
                   ringBuffer->getGpuAddress(),
                   ptrOffset(ringBuffer->getGpuAddress(), ringBuffer->getUnderlyingBufferSize()),
                   ringBuffer->getUnderlyingBuffer(),
                   ptrOffset(ringBuffer->getUnderlyingBuffer(), ringBuffer->getUnderlyingBufferSize()),
                   ringBuffer->getUnderlyingBufferSize());
        }
    }

    handleResidency();
    ringCommandStream.replaceBuffer(this->ringBuffers[0u].ringBuffer->getUnderlyingBuffer(), minimumRequiredSize);
    ringCommandStream.replaceGraphicsAllocation(this->ringBuffers[0].ringBuffer);

    semaphorePtr = semaphores->getUnderlyingBuffer();
    semaphoreGpuVa = semaphores->getGpuAddress();
    semaphoreData = static_cast<volatile RingSemaphoreData *>(semaphorePtr);
    memset(semaphorePtr, 0, sizeof(RingSemaphoreData));
    semaphoreData->QueueWorkCount = 0;
    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
    workloadModeOneStoreAddress = static_cast<volatile void *>(&semaphoreData->DiagnosticModeCounter);
    *static_cast<volatile uint32_t *>(workloadModeOneStoreAddress) = 0u;

    this->gpuVaForMiFlush = this->semaphoreGpuVa + offsetof(RingSemaphoreData, miFlushSpace);

    auto ret = makeResourcesResident(allocations);

    return ret && allocateOsResources();
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::makeResourcesResident(DirectSubmissionAllocations &allocations) {
    auto ret = memoryOperationHandler->makeResidentWithinOsContext(&this->osContext, ArrayRef<GraphicsAllocation *>(allocations), false) == MemoryOperationsStatus::SUCCESS;
    return ret;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::unblockGpu() {
    if (sfenceMode >= DirectSubmissionSfenceMode::BeforeSemaphoreOnly) {
        CpuIntrinsics::sfence();
    }

    semaphoreData->QueueWorkCount = currentQueueWorkCount;

    if (sfenceMode == DirectSubmissionSfenceMode::BeforeAndAfterSemaphore) {
        CpuIntrinsics::sfence();
    }
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::cpuCachelineFlush(void *ptr, size_t size) {
    if (disableCpuCacheFlush) {
        return;
    }
    constexpr size_t cachlineBit = 6;
    static_assert(MemoryConstants::cacheLineSize == 1 << cachlineBit, "cachlineBit has invalid value");
    char *flushPtr = reinterpret_cast<char *>(ptr);
    char *flushEndPtr = reinterpret_cast<char *>(ptr) + size;

    flushPtr = alignDown(flushPtr, MemoryConstants::cacheLineSize);
    flushEndPtr = alignUp(flushEndPtr, MemoryConstants::cacheLineSize);
    size_t cachelines = (flushEndPtr - flushPtr) >> cachlineBit;
    for (size_t i = 0; i < cachelines; i++) {
        CpuIntrinsics::clFlush(flushPtr);
        flushPtr += MemoryConstants::cacheLineSize;
    }
}
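
// cpuCachelineFlush() widens the range to cacheline bounds before issuing CLFLUSH.
// Worked example: ptr = 0x1010 and size = 0x70 cover [0x1010, 0x1080); after
// alignDown/alignUp the flushed range is [0x1000, 0x1080), i.e.
// (0x1080 - 0x1000) >> 6 = 2 CLFLUSH iterations.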

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bool useNotify) {
    useNotifyForPostSync = useNotify;
    bool ret = allocateResources();

    initDiagnostic(submitOnInit);
    if (ret && submitOnInit) {
        size_t startBufferSize = Dispatcher::getSizePreemption() +
                                 getSizeSemaphoreSection(false);

        Dispatcher::dispatchPreemption(ringCommandStream);
        if (this->partitionedMode) {
            startBufferSize += getSizePartitionRegisterConfigurationSection();
            dispatchPartitionRegisterConfiguration();
            this->partitionConfigSet = true;
        }
        if (this->miMemFenceRequired) {
            startBufferSize += getSizeSystemMemoryFenceAddress();
            dispatchSystemMemoryFenceAddress();
            this->systemMemoryFenceAddressSet = true;
        }
        if (this->relaxedOrderingEnabled) {
            preinitializeRelaxedOrderingSections();

            initRelaxedOrderingRegisters();
            dispatchStaticRelaxedOrderingScheduler();
            startBufferSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();

            this->relaxedOrderingInitialized = true;
        }
        if (workloadMode == 1) {
            dispatchDiagnosticModeSection();
            startBufferSize += getDiagnosticModeSection();
        }
        dispatchSemaphoreSection(currentQueueWorkCount);

        ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
        performDiagnosticMode();
        return ringStart;
    }
    return ret;
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    if (ringStart) {
        return true;
    }

    size_t startSize = getSizeSemaphoreSection(false);
    if (!this->partitionConfigSet) {
        startSize += getSizePartitionRegisterConfigurationSection();
    }
    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
        startSize += getSizeSystemMemoryFenceAddress();
    }
    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
        startSize += RelaxedOrderingHelper::getSizeRegistersInit<GfxFamily>();
    }

    size_t requiredSize = startSize + getSizeDispatch(false, false) + getSizeEnd(false);
    if (ringCommandStream.getAvailableSpace() < requiredSize) {
        switchRingBuffers();
    }
    uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (!this->partitionConfigSet) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }

    if (this->miMemFenceRequired && !this->systemMemoryFenceAddressSet) {
        dispatchSystemMemoryFenceAddress();
        this->systemMemoryFenceAddressSet = true;
    }

    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
        preinitializeRelaxedOrderingSections();
        dispatchStaticRelaxedOrderingScheduler();
        initRelaxedOrderingRegisters();

        this->relaxedOrderingInitialized = true;
    }

    currentQueueWorkCount++;
    dispatchSemaphoreSection(currentQueueWorkCount);

    ringStart = submit(gpuStartVa, startSize);

    return ringStart;
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
    if (!ringStart) {
        return true;
    }

    bool relaxedOrderingSchedulerWasRequired = this->relaxedOrderingSchedulerRequired;
    if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingQueueStall();
    }

    void *flushPtr = ringCommandStream.getSpace(0);
    Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush);
    if (disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
    }
    Dispatcher::dispatchStopCommandBuffer(ringCommandStream);

    auto bytesToPad = Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer();
    EncodeNoop<GfxFamily>::emitNoop(ringCommandStream, bytesToPad);
    EncodeNoop<GfxFamily>::alignToCacheLine(ringCommandStream);

    cpuCachelineFlush(flushPtr, getSizeEnd(relaxedOrderingSchedulerWasRequired));

    this->unblockGpu();

    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);

    this->handleStopRingBuffer();
    this->ringStart = false;

    return true;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    dispatchDisablePrefetcher(true);

    if (this->relaxedOrderingEnabled && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingSchedulerSection(value);
    } else {
        EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
                                                              semaphoreGpuVa,
                                                              value,
                                                              COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }

    if (miMemFenceRequired) {
        MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronizationForDirectSubmission(ringCommandStream, this->gpuVaForAdditionalSynchronizationWA, true, *hwInfo);
    }

    dispatchPrefetchMitigation();
    dispatchDisablePrefetcher(false);
}
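
// Ring handshake: every dispatched section ends by parking the GPU on semaphoreGpuVa
// (a plain MI_SEMAPHORE_WAIT, or the relaxed-ordering scheduler when it is active),
// waiting for QueueWorkCount >= the next expected value. The CPU releases the GPU in
// unblockGpu() by storing the bumped count, with optional SFENCEs around the store
// according to sfenceMode.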

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection(bool relaxedOrderingSchedulerRequired) {
    size_t semaphoreSize = (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) ? RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize
                                                                                              : EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
    semaphoreSize += getSizePrefetchMitigation();

    if (isDisablePrefetcherRequired) {
        semaphoreSize += 2 * getSizeDisablePrefetcher();
    }

    if (miMemFenceRequired) {
        semaphoreSize += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleAdditionalSynchronizationForDirectSubmission(*hwInfo);
    }

    return semaphoreSize;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStartSection(uint64_t gpuStartAddress) {
    Dispatcher::dispatchStartCommandBuffer(ringCommandStream, gpuStartAddress);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeStartSection() {
    return Dispatcher::getSizeStartCommandBuffer();
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress) {
    if (disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
    }
    Dispatcher::dispatchStartCommandBuffer(ringCommandStream, nextBufferGpuAddress);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSwitchRingBufferSection() {
    size_t size = Dispatcher::getSizeStartCommandBuffer();
    if (disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }
    return size;
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd(bool relaxedOrderingSchedulerRequired) {
    size_t size = Dispatcher::getSizeStopCommandBuffer() +
                  Dispatcher::getSizeCacheFlush(*hwInfo) +
                  (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) +
                  MemoryConstants::cacheLineSize;
    if (disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }
    if (this->relaxedOrderingEnabled && relaxedOrderingSchedulerRequired) {
        size += getSizeDispatchRelaxedOrderingQueueStall();
    }
    return size;
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch(bool relaxedOrderingSchedulerRequired, bool returnPtrsRequired) {
    size_t size = getSizeSemaphoreSection(relaxedOrderingSchedulerRequired);
    if (workloadMode == 0) {
        size += getSizeStartSection();
        if (this->relaxedOrderingEnabled && returnPtrsRequired) {
            size += RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
        }
    } else if (workloadMode == 1) {
        size += getDiagnosticModeSection();
    }
    // mode 2 does not dispatch any commands

    if (!disableCacheFlush) {
        size += Dispatcher::getSizeCacheFlush(*hwInfo);
    }
    if (!disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }

    size += getSizeNewResourceHandler();

    return size;
}
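
// Size-query invariant assumed throughout this file: each getSize*() helper returns an
// upper bound on the bytes its dispatch*() counterpart emits. dispatchCommandBuffer() sums
// these bounds up front to decide whether to switch ring buffers before writing any
// command, and reuses dispatchSize as the length of the CPU cacheline flush afterwards.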

template <typename GfxFamily, typename Dispatcher>
void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBuffer &batchBuffer) {
    void *currentPosition = ringCommandStream.getSpace(0);

    if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) {
        printf("Client buffer:\n");
        printf("Command buffer allocation - gpu address: %" PRIx64 " - %" PRIx64 ", cpu address: %p - %p, size: %zu \n",
               batchBuffer.commandBufferAllocation->getGpuAddress(),
               ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.commandBufferAllocation->getUnderlyingBufferSize()),
               batchBuffer.commandBufferAllocation->getUnderlyingBuffer(),
               ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.commandBufferAllocation->getUnderlyingBufferSize()),
               batchBuffer.commandBufferAllocation->getUnderlyingBufferSize());
        printf("Command buffer - start gpu address: %" PRIx64 " - %" PRIx64 ", start cpu address: %p - %p, start offset: %zu, used size: %zu \n",
               ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset),
               ptrOffset(ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset), batchBuffer.usedSize),
               ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset),
               ptrOffset(ptrOffset(batchBuffer.commandBufferAllocation->getUnderlyingBuffer(), batchBuffer.startOffset), batchBuffer.usedSize),
               batchBuffer.startOffset,
               batchBuffer.usedSize);
    }

    if (workloadMode == 0) {
        auto commandStreamAddress = ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset);
        void *returnCmd = batchBuffer.endCmdPtr;

        LinearStream relaxedOrderingReturnPtrCmdStream;
        if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
            // preallocate and patch after start section
            auto relaxedOrderingReturnPtrCmds = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>());
            relaxedOrderingReturnPtrCmdStream.replaceBuffer(relaxedOrderingReturnPtrCmds, RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>());
        }

        dispatchStartSection(commandStreamAddress);

        uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();

        if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
            dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
        } else {
            setReturnAddress(returnCmd, returnGpuPointer);
        }
    } else if (workloadMode == 1) {
        DirectSubmissionDiagnostics::diagnosticModeOneDispatch(diagnostic.get());
        dispatchDiagnosticModeSection();
    }
    // mode 2 does not dispatch any commands

    if (this->relaxedOrderingEnabled && batchBuffer.hasRelaxedOrderingDependencies) {
        dispatchTaskStoreSection(batchBuffer.taskStartAddress);
    }

    if (!disableCacheFlush) {
        Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush);
    }

    if (!disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, this->useNotifyForPostSync, this->partitionedMode, this->dcFlushRequired);
    }

    dispatchSemaphoreSection(currentQueueWorkCount + 1);
    return currentPosition;
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStall() {
    LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart()),
                               EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());

    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
    dispatchSemaphoreSection(currentQueueWorkCount);

    // patch conditional bb_start with current GPU address
    EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(
        bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(), CS_GPR_R1, 0, CompareOperation::Equal, false);

    relaxedOrderingSchedulerRequired = false;
}

template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
    return getSizeSemaphoreSection(true) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
           EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) {
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4, static_cast<uint32_t>(returnPtr & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R4 + 4, static_cast<uint32_t>(returnPtr >> 32), true);

    uint64_t returnPtrAfterTaskStoreSection = returnPtr;
    returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();

    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3, static_cast<uint32_t>(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&cmdStream, CS_GPR_R3 + 4, static_cast<uint32_t>(returnPtrAfterTaskStoreSection >> 32), true);
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::initRelaxedOrderingRegisters() {
    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R1, 0, true);
    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R1 + 4, 0, true);
    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 0, true);
    LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5 + 4, 0, true);
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::preinitializeRelaxedOrderingSections() {
    // Task store section
    preinitializedTaskStoreSection = std::make_unique<uint8_t[]>(RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());

    LinearStream stream(preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());

    EncodeMiPredicate<GfxFamily>::encode(stream, MiPredicateType::Disable);

    uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R6, static_cast<uint32_t>(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true);
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R6 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);

    // Task start VA
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R7, 0, true);
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R7 + 4, 0, true);

    // Shift by 8 = multiply by 256. Address must be 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R8, 8, true);
    LriHelper<GfxFamily>::program(&stream, CS_GPR_R8 + 4, 0, true);

    EncodeAluHelper<GfxFamily, 9> aluHelper;
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
    aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
    aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_8, AluRegisters::R_ACCU);
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_8);
    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6);
    aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
    aluHelper.setNextAlu(AluRegisters::OPCODE_STOREIND, AluRegisters::R_ACCU, AluRegisters::R_7);
    aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_WR);
    aluHelper.copyToCmdStream(stream);

    EncodeMathMMIO<GfxFamily>::encodeIncrement(stream, AluRegisters::R_1);

    UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());

    // Scheduler section
    preinitializedRelaxedOrderingScheduler = std::make_unique<uint8_t[]>(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);
    LinearStream schedulerStream(preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);

    uint64_t schedulerStartAddress = relaxedOrderingSchedulerAllocation->getGpuAddress();

    // 1. Init section
    LriHelper<GfxFamily>::program(&schedulerStream, CS_GPR_R9, 0, true);
    LriHelper<GfxFamily>::program(&schedulerStream, CS_GPR_R9 + 4, 0, true);
    EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&schedulerStream, schedulerStartAddress, false, false, false);

    // 2. Scheduler loop check section
    {
        EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(schedulerStream, 0, 0, 0, CompareOperation::GreaterOrEqual, false);
        EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&schedulerStream, schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart, false, false, false);
    }

    // 3. Semaphore section
    {
        using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
        using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

        EncodeMiPredicate<GfxFamily>::encode(schedulerStream, MiPredicateType::Disable);

        EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(schedulerStream, 0, 0, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }

    // 4. End section
    {
        EncodeMiPredicate<GfxFamily>::encode(schedulerStream, MiPredicateType::Disable);

        LriHelper<GfxFamily>::program(&schedulerStream, CS_GPR_R5, 0, true);
    }

    UNRECOVERABLE_IF(schedulerStream.getUsed() != RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection<GfxFamily>::totalSize);
}
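
// Deferred-tasks list layout, as encoded above: entry i holds the 64-bit start VA of the
// i-th queued submission at deferredTasksListGpuVa + (i << 8). A 64-byte stride (shift by 6)
// would satisfy alignment, but MI_MATH SHL only accepts shift amounts of 1, 2, 4, 8, 16
// and 32, so a 256-byte stride is used instead; e.g. task 2 is stored at byte offset
// 2 << 8 = 512.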

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchTaskStoreSection(uint64_t taskStartSectionVa) {
    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;

    constexpr size_t patchOffset = EncodeMiPredicate<GfxFamily>::getCmdSize() + (2 * sizeof(MI_LOAD_REGISTER_IMM));

    auto lri = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ptrOffset(preinitializedTaskStoreSection.get(), patchOffset));

    lri->setDataDword(static_cast<uint32_t>(taskStartSectionVa & 0xFFFF'FFFFULL));
    lri++;
    lri->setDataDword(static_cast<uint32_t>(taskStartSectionVa >> 32));

    auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
    memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>(),
             preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
    // for now workloads requiring cache coherency are not supported
    UNRECOVERABLE_IF(batchBuffer.requiresCoherency);

    if (batchBuffer.ringBufferRestartRequest) {
        this->stopRingBuffer();
    }

    this->startRingBuffer();

    bool relaxedOrderingSchedulerWillBeNeeded = (this->relaxedOrderingSchedulerRequired || batchBuffer.hasRelaxedOrderingDependencies);

    size_t dispatchSize = getSizeDispatch(relaxedOrderingSchedulerWillBeNeeded, batchBuffer.hasRelaxedOrderingDependencies);
    size_t cycleSize = getSizeSwitchRingBufferSection();
    size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(relaxedOrderingSchedulerWillBeNeeded);

    if (this->relaxedOrderingEnabled) {
        requiredMinimalSize += RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();

        if (batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
            requiredMinimalSize += getSizeDispatchRelaxedOrderingQueueStall();
        }
        if (batchBuffer.hasRelaxedOrderingDependencies) {
            requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
        }
    }

    if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
        switchRingBuffers();
    }

    if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && this->relaxedOrderingSchedulerRequired) {
        dispatchRelaxedOrderingQueueStall();
    }

    this->relaxedOrderingSchedulerRequired |= batchBuffer.hasRelaxedOrderingDependencies;

    handleNewResourcesSubmission();

    void *currentPosition = dispatchWorkloadSection(batchBuffer);

    cpuCachelineFlush(currentPosition, dispatchSize);
    handleResidency();

    if (DebugManager.flags.DirectSubmissionReadBackCommandBuffer.get() == 1) {
        volatile auto cmdBufferStart = reinterpret_cast<uint32_t *>(batchBuffer.commandBufferAllocation->getUnderlyingBuffer());
        reserved = *cmdBufferStart;
    }
    if (DebugManager.flags.DirectSubmissionReadBackRingBuffer.get() == 1) {
        volatile auto ringBufferStart = reinterpret_cast<uint32_t *>(ringCommandStream.getSpace(0));
        reserved = *ringBufferStart;
    }

    this->unblockGpu();

    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);

    currentQueueWorkCount++;
    DirectSubmissionDiagnostics::diagnosticModeOneSubmit(diagnostic.get());

    uint64_t flushValue = updateTagValue();
    flushStamp.setStamp(flushValue);

    return ringStart;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    MI_BATCH_BUFFER_START cmd = GfxFamily::cmdInitBatchBufferStart;
    cmd.setBatchBufferStartAddress(returnAddress);
    cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);

    MI_BATCH_BUFFER_START *returnBBStart = static_cast<MI_BATCH_BUFFER_START *>(returnCmd);
    *returnBBStart = cmd;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::handleNewResourcesSubmission() {
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeNewResourceHandler() {
    return 0u;
}

template <typename GfxFamily, typename Dispatcher>
inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
    GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
    void *flushPtr = ringCommandStream.getSpace(0);
    uint64_t currentBufferGpuVa = ringCommandStream.getCurrentGpuAddressPosition();

    if (ringStart) {
        dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
        cpuCachelineFlush(flushPtr, getSizeSwitchRingBufferSection());
    }

    ringCommandStream.replaceBuffer(nextRingBuffer->getUnderlyingBuffer(), ringCommandStream.getMaxAvailableSpace());
    ringCommandStream.replaceGraphicsAllocation(nextRingBuffer);

    handleSwitchRingBuffers();

    return currentBufferGpuVa;
}

template <typename GfxFamily, typename Dispatcher>
inline GraphicsAllocation *DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffersAllocations() {
    this->previousRingBuffer = this->currentRingBuffer;

    GraphicsAllocation *nextAllocation = nullptr;
    for (uint32_t ringBufferIndex = 0; ringBufferIndex < this->ringBuffers.size(); ringBufferIndex++) {
        if (ringBufferIndex != this->currentRingBuffer && this->isCompleted(ringBufferIndex)) {
            this->currentRingBuffer = ringBufferIndex;
            nextAllocation = this->ringBuffers[ringBufferIndex].ringBuffer;
            break;
        }
    }
    if (nextAllocation == nullptr) {
        if (this->ringBuffers.size() == this->maxRingBufferCount) {
            this->currentRingBuffer = (this->currentRingBuffer + 1) % this->ringBuffers.size();
            nextAllocation = this->ringBuffers[this->currentRingBuffer].ringBuffer;
        } else {
            bool isMultiOsContextCapable = osContext.getNumSupportedDevices() > 1u;
            constexpr size_t minimumRequiredSize = 256 * MemoryConstants::kiloByte;
            constexpr size_t additionalAllocationSize = MemoryConstants::pageSize;
            const auto allocationSize = alignUp(minimumRequiredSize + additionalAllocationSize, MemoryConstants::pageSize64k);
            const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex,
                                                                         true, allocationSize,
                                                                         AllocationType::RING_BUFFER,
                                                                         isMultiOsContextCapable, false, osContext.getDeviceBitfield()};
            nextAllocation = memoryManager->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
            this->currentRingBuffer = static_cast<uint32_t>(this->ringBuffers.size());
            this->ringBuffers.emplace_back(0ull, nextAllocation);
            auto ret = memoryOperationHandler->makeResidentWithinOsContext(&this->osContext, ArrayRef<GraphicsAllocation *>(&nextAllocation, 1u), false) == MemoryOperationsStatus::SUCCESS;
            UNRECOVERABLE_IF(!ret);
        }
    }
    UNRECOVERABLE_IF(this->currentRingBuffer == this->previousRingBuffer);
    return nextAllocation;
}
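
// Ring-buffer switch policy, as implemented above: prefer any already-completed buffer;
// if none is free and the pool is below maxRingBufferCount, allocate a new buffer and make
// it resident; once at the cap, fall back to round-robin reuse, in which case the chosen
// buffer may still be in flight.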

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::deallocateResources() {
    for (uint32_t ringBufferIndex = 0; ringBufferIndex < this->ringBuffers.size(); ringBufferIndex++) {
        memoryManager->freeGraphicsMemory(this->ringBuffers[ringBufferIndex].ringBuffer);
    }
    this->ringBuffers.clear();
    if (semaphores) {
        memoryManager->freeGraphicsMemory(semaphores);
        semaphores = nullptr;
    }

    memoryManager->freeGraphicsMemory(deferredTasksListAllocation);
    memoryManager->freeGraphicsMemory(relaxedOrderingSchedulerAllocation);
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::createDiagnostic() {
    if (directSubmissionDiagnosticAvailable) {
        workloadMode = DebugManager.flags.DirectSubmissionEnableDebugBuffer.get();
        if (workloadMode > 0) {
            disableCacheFlush = DebugManager.flags.DirectSubmissionDisableCacheFlush.get();
            disableMonitorFence = DebugManager.flags.DirectSubmissionDisableMonitorFence.get();
            uint32_t executions = static_cast<uint32_t>(DebugManager.flags.DirectSubmissionDiagnosticExecutionCount.get());
            diagnostic = std::make_unique<DirectSubmissionDiagnosticsCollector>(
                executions,
                workloadMode == 1,
                DebugManager.flags.DirectSubmissionBufferPlacement.get(),
                DebugManager.flags.DirectSubmissionSemaphorePlacement.get(),
                workloadMode,
                disableCacheFlush,
                disableMonitorFence);
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::initDiagnostic(bool &submitOnInit) {
    if (directSubmissionDiagnosticAvailable) {
        if (diagnostic.get()) {
            submitOnInit = true;
            diagnostic->diagnosticModeAllocation();
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::performDiagnosticMode() {
    if (directSubmissionDiagnosticAvailable) {
        if (diagnostic.get()) {
            diagnostic->diagnosticModeDiagnostic();
            if (workloadMode == 1) {
                diagnostic->diagnosticModeOneWait(workloadModeOneStoreAddress, workloadModeOneExpectedValue);
            }
            BatchBuffer dummyBuffer = {};
            FlushStampTracker dummyTracker(true);
            for (uint32_t execution = 0; execution < diagnostic->getExecutionsCount(); execution++) {
                dispatchCommandBuffer(dummyBuffer, dummyTracker);
                if (workloadMode == 1) {
                    diagnostic->diagnosticModeOneWaitCollect(execution, workloadModeOneStoreAddress, workloadModeOneExpectedValue);
                }
            }
            workloadMode = 0;
            disableCacheFlush = UllsDefaults::defaultDisableCacheFlush;
            disableMonitorFence = UllsDefaults::defaultDisableMonitorFence;
            diagnostic.reset(nullptr);
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchDiagnosticModeSection() {
    workloadModeOneExpectedValue++;
    uint64_t storeAddress = semaphoreGpuVa;
    storeAddress += ptrDiff(workloadModeOneStoreAddress, semaphorePtr);
    Dispatcher::dispatchStoreDwordCommand(ringCommandStream, storeAddress, workloadModeOneExpectedValue);
}

template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getDiagnosticModeSection() {
    return Dispatcher::getSizeStoreDwordCommand();
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSystemMemoryFenceAddress() {
    EncodeMemoryFence<GfxFamily>::encodeSystemMemoryFence(ringCommandStream, this->globalFenceAllocation, this->logicalStateHelper);

    if (logicalStateHelper) {
        logicalStateHelper->writeStreamInline(ringCommandStream, false);
    }
}

template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSystemMemoryFenceAddress() {
    return EncodeMemoryFence<GfxFamily>::getSystemMemoryFenceSize();
}

} // namespace NEO