/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/submissions_aggregator.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/direct_submission_hw_diagnostic_mode.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/cpu_info.h"
#include "shared/source/utilities/cpuintrinsics.h"

#include "create_direct_submission_hw.inl"

#include <cstring>

namespace NEO {

template <typename GfxFamily, typename Dispatcher>
DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(Device &device, OsContext &osContext)
    : device(device), osContext(osContext) {
    disableCacheFlush = UllsDefaults::defaultDisableCacheFlush;
    disableMonitorFence = UllsDefaults::defaultDisableMonitorFence;

    if (DebugManager.flags.DirectSubmissionDisableCacheFlush.get() != -1) {
        disableCacheFlush = !!DebugManager.flags.DirectSubmissionDisableCacheFlush.get();
    }

    int32_t disableCacheFlushKey = DebugManager.flags.DirectSubmissionDisableCpuCacheFlush.get();
    if (disableCacheFlushKey != -1) {
        disableCpuCacheFlush = disableCacheFlushKey == 1 ? true : false;
    }

    UNRECOVERABLE_IF(!CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureClflush) && !disableCpuCacheFlush);

    hwInfo = &device.getHardwareInfo();

    createDiagnostic();
}

template <typename GfxFamily, typename Dispatcher>
DirectSubmissionHw<GfxFamily, Dispatcher>::~DirectSubmissionHw() = default;

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::allocateResources() {
    DirectSubmissionAllocations allocations;

    bool isMultiOsContextCapable = osContext.getNumSupportedDevices() > 1u;
    MemoryManager *memoryManager = device.getExecutionEnvironment()->memoryManager.get();
    constexpr size_t minimumRequiredSize = 256 * MemoryConstants::kiloByte;
    constexpr size_t additionalAllocationSize = MemoryConstants::pageSize;
    const auto allocationSize = alignUp(minimumRequiredSize + additionalAllocationSize, MemoryConstants::pageSize64k);
    const AllocationProperties commandStreamAllocationProperties{device.getRootDeviceIndex(),
                                                                 true, allocationSize,
                                                                 GraphicsAllocation::AllocationType::RING_BUFFER,
                                                                 isMultiOsContextCapable,
                                                                 osContext.getDeviceBitfield()};
    ringBuffer = memoryManager->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
    UNRECOVERABLE_IF(ringBuffer == nullptr);
    allocations.push_back(ringBuffer);

    ringBuffer2 = memoryManager->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
    UNRECOVERABLE_IF(ringBuffer2 == nullptr);
    allocations.push_back(ringBuffer2);

    const AllocationProperties semaphoreAllocationProperties{device.getRootDeviceIndex(),
                                                             true, MemoryConstants::pageSize,
                                                             GraphicsAllocation::AllocationType::SEMAPHORE_BUFFER,
                                                             isMultiOsContextCapable,
                                                             osContext.getDeviceBitfield()};
    semaphores = memoryManager->allocateGraphicsMemoryWithProperties(semaphoreAllocationProperties);
    UNRECOVERABLE_IF(semaphores == nullptr);
    allocations.push_back(semaphores);

    if (this->workPartitionAllocation != nullptr) {
        allocations.push_back(workPartitionAllocation);
    }

    handleResidency();
    ringCommandStream.replaceBuffer(ringBuffer->getUnderlyingBuffer(), minimumRequiredSize);
    ringCommandStream.replaceGraphicsAllocation(ringBuffer);

    memset(ringBuffer->getUnderlyingBuffer(), 0, allocationSize);
    memset(ringBuffer2->getUnderlyingBuffer(), 0, allocationSize);
    semaphorePtr = semaphores->getUnderlyingBuffer();
    semaphoreGpuVa = semaphores->getGpuAddress();
    semaphoreData = static_cast<volatile RingSemaphoreData *>(semaphorePtr);
    memset(semaphorePtr, 0, sizeof(RingSemaphoreData));
    semaphoreData->QueueWorkCount = 0;
    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
    workloadModeOneStoreAddress = static_cast<volatile void *>(&semaphoreData->DiagnosticModeCounter);
    *static_cast<volatile uint32_t *>(workloadModeOneStoreAddress) = 0u;

    this->gpuVaForMiFlush = this->semaphoreGpuVa + offsetof(RingSemaphoreData, miFlushSpace);

    auto ret = makeResourcesResident(allocations);

    return ret && allocateOsResources();
}
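
// Resource layout established in allocateResources() above: two ring buffers
// (swapped via switchRingBuffers() when the active one runs low on space) and
// one page of semaphore memory. The GPU parks on a MI_SEMAPHORE_WAIT polling
// semaphoreData->QueueWorkCount; the CPU releases it by writing a new work
// count and flushing the cacheline, so steady-state submission needs no
// kernel-mode round trip.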

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::makeResourcesResident(DirectSubmissionAllocations &allocations) {
    auto memoryInterface = this->device.getRootDeviceEnvironment().memoryOperationsInterface.get();

    auto ret = memoryInterface->makeResidentWithinOsContext(&this->osContext, ArrayRef<GraphicsAllocation *>(allocations), false) == MemoryOperationsStatus::SUCCESS;

    return ret;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::cpuCachelineFlush(void *ptr, size_t size) {
    if (disableCpuCacheFlush) {
        return;
    }
    constexpr size_t cachlineBit = 6;
    static_assert(MemoryConstants::cacheLineSize == 1 << cachlineBit, "cachlineBit has invalid value");
    char *flushPtr = reinterpret_cast<char *>(ptr);
    char *flushEndPtr = reinterpret_cast<char *>(ptr) + size;

    flushPtr = alignDown(flushPtr, MemoryConstants::cacheLineSize);
    flushEndPtr = alignUp(flushEndPtr, MemoryConstants::cacheLineSize);
    size_t cachelines = (flushEndPtr - flushPtr) >> cachlineBit;
    for (size_t i = 0; i < cachelines; i++) {
        CpuIntrinsics::clFlush(flushPtr);
        flushPtr += MemoryConstants::cacheLineSize;
    }
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit) {
    bool ret = allocateResources();

    initDiagnostic(submitOnInit);
    if (ret && submitOnInit) {
        size_t startBufferSize = Dispatcher::getSizePreemption() +
                                 getSizeSemaphoreSection();
        Dispatcher::dispatchPreemption(ringCommandStream);
        if (this->partitionedMode) {
            startBufferSize += getSizePartitionRegisterConfigurationSection();
            dispatchPartitionRegisterConfiguration();
            this->partitionConfigSet = true;
        }
        if (workloadMode == 1) {
            dispatchDiagnosticModeSection();
            startBufferSize += getDiagnosticModeSection();
        }
        dispatchSemaphoreSection(currentQueueWorkCount);

        ringStart = submit(ringCommandStream.getGraphicsAllocation()->getGpuAddress(), startBufferSize);
        performDiagnosticMode();
        return ringStart;
    }
    return ret;
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
    if (ringStart) {
        return true;
    }

    size_t startSize = getSizeSemaphoreSection();
    if (!this->partitionConfigSet) {
        startSize += getSizePartitionRegisterConfigurationSection();
    }

    size_t requiredSize = startSize + getSizeDispatch() + getSizeEnd();
    if (ringCommandStream.getAvailableSpace() < requiredSize) {
        switchRingBuffers();
    }
    uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));

    if (!this->partitionConfigSet) {
        dispatchPartitionRegisterConfiguration();
        this->partitionConfigSet = true;
    }

    currentQueueWorkCount++;
    dispatchSemaphoreSection(currentQueueWorkCount);

    ringStart = submit(gpuStartVa, startSize);

    return ringStart;
}

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::stopRingBuffer() {
    if (!ringStart) {
        return true;
    }

    void *flushPtr = ringCommandStream.getSpace(0);
    Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush);
    if (disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, false, this->partitionedMode);
    }
    Dispatcher::dispatchStopCommandBuffer(ringCommandStream);

    auto bytesToPad = Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer();
    EncodeNoop<GfxFamily>::emitNoop(ringCommandStream, bytesToPad);
    EncodeNoop<GfxFamily>::alignToCacheLine(ringCommandStream);

    cpuCachelineFlush(flushPtr, getSizeEnd());

    semaphoreData->QueueWorkCount = currentQueueWorkCount;
    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);

    this->handleStopRingBuffer();
    this->ringStart = false;

    return true;
}
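
// The semaphore section below is the GPU-side wait: MI_SEMAPHORE_WAIT with
// COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD blocks the command streamer
// until QueueWorkCount reaches the expected value. It is bracketed by
// dispatchDisablePrefetcher(true/false) plus dispatchPrefetchMitigation() so
// the streamer does not prefetch ring contents past the wait before the CPU
// has finished writing them; both hooks are hardware-specific and implemented
// per GfxFamily.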

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSemaphoreSection(uint32_t value) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    dispatchDisablePrefetcher(true);

    EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(ringCommandStream,
                                                          semaphoreGpuVa,
                                                          value,
                                                          COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);

    dispatchPrefetchMitigation();
    dispatchDisablePrefetcher(false);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSemaphoreSection() {
    size_t semaphoreSize = EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
    semaphoreSize += getSizePrefetchMitigation();
    semaphoreSize += 2 * getSizeDisablePrefetcher();
    return semaphoreSize;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStartSection(uint64_t gpuStartAddress) {
    Dispatcher::dispatchStartCommandBuffer(ringCommandStream, gpuStartAddress);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeStartSection() {
    return Dispatcher::getSizeStartCommandBuffer();
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress) {
    if (disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, false, this->partitionedMode);
    }
    Dispatcher::dispatchStartCommandBuffer(ringCommandStream, nextBufferGpuAddress);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeSwitchRingBufferSection() {
    size_t size = Dispatcher::getSizeStartCommandBuffer();
    if (disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }
    return size;
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
    size_t size = Dispatcher::getSizeStopCommandBuffer() +
                  Dispatcher::getSizeCacheFlush(*hwInfo) +
                  (Dispatcher::getSizeStartCommandBuffer() - Dispatcher::getSizeStopCommandBuffer()) +
                  MemoryConstants::cacheLineSize;
    if (disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }
    return size;
}

template <typename GfxFamily, typename Dispatcher>
inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPositionGpuAddress(void *position) {
    void *currentBase = ringCommandStream.getCpuBase();

    size_t offset = ptrDiff(position, currentBase);
    return ringCommandStream.getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(offset);
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
    size_t size = getSizeSemaphoreSection();
    if (workloadMode == 0) {
        size += getSizeStartSection();
    } else if (workloadMode == 1) {
        size += getDiagnosticModeSection();
    }
    //mode 2 does not dispatch any commands

    if (!disableCacheFlush) {
        size += Dispatcher::getSizeCacheFlush(*hwInfo);
    }
    if (!disableMonitorFence) {
        size += Dispatcher::getSizeMonitorFence(*hwInfo);
    }

    size += getSizeNewResourceHandler();

    return size;
}

template <typename GfxFamily, typename Dispatcher>
void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBuffer &batchBuffer) {
    void *currentPosition = ringCommandStream.getSpace(0);

    if (workloadMode == 0) {
        auto commandStreamAddress = ptrOffset(batchBuffer.commandBufferAllocation->getGpuAddress(), batchBuffer.startOffset);
        void *returnCmd = batchBuffer.endCmdPtr;

        dispatchStartSection(commandStreamAddress);
        void *returnPosition = ringCommandStream.getSpace(0);

        setReturnAddress(returnCmd, getCommandBufferPositionGpuAddress(returnPosition));
    } else if (workloadMode == 1) {
        DirectSubmissionDiagnostics::diagnosticModeOneDispatch(diagnostic.get());
        dispatchDiagnosticModeSection();
    }
    //mode 2 does not dispatch any commands

    if (!disableCacheFlush) {
        Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush);
    }
    if (!disableMonitorFence) {
        TagData currentTagData = {};
        getTagAddressValue(currentTagData);
        Dispatcher::dispatchMonitorFence(ringCommandStream, currentTagData.tagAddress, currentTagData.tagValue, *hwInfo, false, this->partitionedMode);
    }

    dispatchSemaphoreSection(currentQueueWorkCount + 1);
    return currentPosition;
}
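
// dispatchCommandBuffer() below is the per-submission fast path. Illustrative
// ordering (all names as defined in this file):
//   1. startRingBuffer()         - first submission only, goes through submit()
//   2. switchRingBuffers()       - only if the ring lacks requiredMinimalSize
//   3. dispatchWorkloadSection() - BB_START into the client batch buffer;
//      setReturnAddress() patches a BB_START at batchBuffer.endCmdPtr so the
//      GPU jumps back into the ring right behind the start section
//   4. cpuCachelineFlush()       - make the freshly written commands visible
//   5. QueueWorkCount store      - releases the semaphore wait on the GPU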

template <typename GfxFamily, typename Dispatcher>
bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
    //for now workloads requiring cache coherency are not supported
    UNRECOVERABLE_IF(batchBuffer.requiresCoherency);

    this->startRingBuffer();

    size_t dispatchSize = getSizeDispatch();
    size_t cycleSize = getSizeSwitchRingBufferSection();
    size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd();

    bool buffersSwitched = false;
    getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));

    if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
        switchRingBuffers();
        buffersSwitched = true;
    }

    handleNewResourcesSubmission();

    void *currentPosition = dispatchWorkloadSection(batchBuffer);

    cpuCachelineFlush(currentPosition, dispatchSize);
    handleResidency();

    //unblock GPU
    semaphoreData->QueueWorkCount = currentQueueWorkCount;
    cpuCachelineFlush(semaphorePtr, MemoryConstants::cacheLineSize);
    currentQueueWorkCount++;
    DirectSubmissionDiagnostics::diagnosticModeOneSubmit(diagnostic.get());

    uint64_t flushValue = updateTagValue();
    flushStamp.setStamp(flushValue);

    return ringStart;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::setReturnAddress(void *returnCmd, uint64_t returnAddress) {
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    MI_BATCH_BUFFER_START cmd = GfxFamily::cmdInitBatchBufferStart;
    cmd.setBatchBufferStartAddressGraphicsaddress472(returnAddress);
    cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT);

    MI_BATCH_BUFFER_START *returnBBStart = static_cast<MI_BATCH_BUFFER_START *>(returnCmd);
    *returnBBStart = cmd;
}

template <typename GfxFamily, typename Dispatcher>
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::handleNewResourcesSubmission() {
}

template <typename GfxFamily, typename Dispatcher>
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeNewResourceHandler() {
    return 0u;
}

template <typename GfxFamily, typename Dispatcher>
inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
    GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
    void *flushPtr = ringCommandStream.getSpace(0);
    uint64_t currentBufferGpuVa = getCommandBufferPositionGpuAddress(flushPtr);

    if (ringStart) {
        dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
        cpuCachelineFlush(flushPtr, getSizeSwitchRingBufferSection());
    }

    ringCommandStream.replaceBuffer(nextRingBuffer->getUnderlyingBuffer(), ringCommandStream.getMaxAvailableSpace());
    ringCommandStream.replaceGraphicsAllocation(nextRingBuffer);

    handleSwitchRingBuffers();

    return currentBufferGpuVa;
}

template <typename GfxFamily, typename Dispatcher>
inline GraphicsAllocation *DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffersAllocations() {
    GraphicsAllocation *nextAllocation = nullptr;
    if (currentRingBuffer == RingBufferUse::FirstBuffer) {
        nextAllocation = ringBuffer2;
        currentRingBuffer = RingBufferUse::SecondBuffer;
    } else {
        nextAllocation = ringBuffer;
        currentRingBuffer = RingBufferUse::FirstBuffer;
    }
    return nextAllocation;
}
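
// Teardown and diagnostic-mode support follow. workloadMode selects what a
// dispatch emits: 0 = normal batch buffer start (production path), 1 = a
// store-dword marker consumed by the diagnostic collector, 2 = no workload
// commands at all (semaphore-only). Diagnostic mode is active only when
// directSubmissionDiagnosticAvailable is set and is driven by the
// DirectSubmission* debug flags read in createDiagnostic().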

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::deallocateResources() {
    MemoryManager *memoryManager = device.getExecutionEnvironment()->memoryManager.get();

    if (ringBuffer) {
        memoryManager->freeGraphicsMemory(ringBuffer);
        ringBuffer = nullptr;
    }
    if (ringBuffer2) {
        memoryManager->freeGraphicsMemory(ringBuffer2);
        ringBuffer2 = nullptr;
    }
    if (semaphores) {
        memoryManager->freeGraphicsMemory(semaphores);
        semaphores = nullptr;
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::createDiagnostic() {
    if (directSubmissionDiagnosticAvailable) {
        workloadMode = DebugManager.flags.DirectSubmissionEnableDebugBuffer.get();
        if (workloadMode > 0) {
            disableCacheFlush = DebugManager.flags.DirectSubmissionDisableCacheFlush.get();
            disableMonitorFence = DebugManager.flags.DirectSubmissionDisableMonitorFence.get();
            uint32_t executions = static_cast<uint32_t>(DebugManager.flags.DirectSubmissionDiagnosticExecutionCount.get());
            diagnostic = std::make_unique<DirectSubmissionDiagnosticsCollector>(
                executions,
                workloadMode == 1,
                DebugManager.flags.DirectSubmissionBufferPlacement.get(),
                DebugManager.flags.DirectSubmissionSemaphorePlacement.get(),
                workloadMode,
                disableCacheFlush,
                disableMonitorFence);
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::initDiagnostic(bool &submitOnInit) {
    if (directSubmissionDiagnosticAvailable) {
        if (diagnostic.get()) {
            submitOnInit = true;
            diagnostic->diagnosticModeAllocation();
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::performDiagnosticMode() {
    if (directSubmissionDiagnosticAvailable) {
        if (diagnostic.get()) {
            diagnostic->diagnosticModeDiagnostic();
            if (workloadMode == 1) {
                diagnostic->diagnosticModeOneWait(workloadModeOneStoreAddress, workloadModeOneExpectedValue);
            }
            BatchBuffer dummyBuffer = {};
            FlushStampTracker dummyTracker(true);
            for (uint32_t execution = 0; execution < diagnostic->getExecutionsCount(); execution++) {
                dispatchCommandBuffer(dummyBuffer, dummyTracker);
                if (workloadMode == 1) {
                    diagnostic->diagnosticModeOneWaitCollect(execution, workloadModeOneStoreAddress, workloadModeOneExpectedValue);
                }
            }
            workloadMode = 0;
            disableCacheFlush = UllsDefaults::defaultDisableCacheFlush;
            disableMonitorFence = UllsDefaults::defaultDisableMonitorFence;
            diagnostic.reset(nullptr);
        }
    }
}

template <typename GfxFamily, typename Dispatcher>
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchDiagnosticModeSection() {
    workloadModeOneExpectedValue++;
    uint64_t storeAddress = semaphoreGpuVa;
    storeAddress += ptrDiff(workloadModeOneStoreAddress, semaphorePtr);
    Dispatcher::dispatchStoreDwordCommand(ringCommandStream, storeAddress, workloadModeOneExpectedValue);
}

template <typename GfxFamily, typename Dispatcher>
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getDiagnosticModeSection() {
    return Dispatcher::getSizeStoreDwordCommand();
}

} // namespace NEO
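
// Illustrative call sequence against this interface (a platform backend that
// implements submit(), handleResidency(), updateTagValue() etc. is presumably
// obtained through the factory in create_direct_submission_hw.inl, included
// above):
//   directSubmission->initialize(true);   // allocateResources + start the ring
//   directSubmission->dispatchCommandBuffer(batchBuffer, flushStamp); // per submit
//   directSubmission->stopRingBuffer();   // cache flush, optional fence, stop
//   directSubmission->deallocateResources();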