/*
 * Copyright (C) 2019-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/tag_allocator.h"

#include "aub_mem_dump.h"
#include "pipe_control_args.h"

namespace NEO {

template <typename GfxFamily>
const AuxTranslationMode HwHelperHw<GfxFamily>::defaultAuxTranslationMode = AuxTranslationMode::Builtin;

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isBufferSizeSuitableForRenderCompression(const size_t size, const HardwareInfo &hwInfo) const {
    return size > KB;
}

template <typename GfxFamily>
void HwHelperHw<GfxFamily>::setupHardwareCapabilities(HardwareCapabilities *caps, const HardwareInfo &hwInfo) {
    caps->image3DMaxHeight = 16384;
    caps->image3DMaxWidth = 16384;
    // With stateful messages there is an allocation cap of 4GB.
    // 8KB is subtracted because the driver may pad the buffer with additional pages for overfetching.
    caps->maxMemAllocSize = (4ULL * MemoryConstants::gigaByte) - (8ULL * MemoryConstants::kiloByte);
    caps->isStatelesToStatefullWithOffsetSupported = true;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isL3Configurable(const HardwareInfo &hwInfo) {
    return PreambleHelper<GfxFamily>::isL3Configurable(hwInfo);
}

template <typename GfxFamily>
SipKernelType HwHelperHw<GfxFamily>::getSipKernelType(bool debuggingActive) const {
    if (!debuggingActive) {
        return SipKernelType::Csr;
    }
    return SipKernelType::DbgCsr;
}

template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getMaxBarrierRegisterPerSlice() const {
    return 32;
}

template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getPaddingForISAAllocation() const {
    return 512;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getPitchAlignmentForImage(const HardwareInfo *hwInfo) const {
    return 4u;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getMaxNumSamplers() const {
    return 16;
}

template <typename GfxFamily>
const AubMemDump::LrcaHelper &HwHelperHw<GfxFamily>::getCsTraits(aub_stream::EngineType engineType) const {
    return *AUBFamilyMapper<GfxFamily>::csTraits[engineType];
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isFenceAllocationRequired(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::checkResourceCompatibility(GraphicsAllocation &graphicsAllocation) {
    return true;
}

template <typename Family>
void HwHelperHw<Family>::setRenderSurfaceStateForBuffer(const RootDeviceEnvironment &rootDeviceEnvironment,
                                                        void *surfaceStateBuffer,
                                                        size_t bufferSize,
                                                        uint64_t gpuVa,
                                                        size_t offset,
                                                        uint32_t pitch,
                                                        GraphicsAllocation *gfxAlloc,
                                                        bool isReadOnly,
                                                        uint32_t surfaceType,
                                                        bool forceNonAuxMode,
                                                        bool useL1Cache) {
    using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE;
    using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT;
    using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE;

    auto gmmHelper = rootDeviceEnvironment.getGmmHelper();
    auto surfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(surfaceStateBuffer);
    RENDER_SURFACE_STATE state = Family::cmdInitRenderSurfaceState;
    auto surfaceSize = alignUp(bufferSize, 4);
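    // A RAW buffer encodes its size in SURFACE_STATE as a (width, height, depth) triple;
    // SURFACE_STATE_BUFFER_LENGTH splits (surfaceSize - 1) into those bitfields, which the
    // setters below consume.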
    SURFACE_STATE_BUFFER_LENGTH Length = {0};
    Length.Length = static_cast<uint32_t>(surfaceSize - 1);

    state.setWidth(Length.SurfaceState.Width + 1);
    state.setHeight(Length.SurfaceState.Height + 1);
    state.setDepth(Length.SurfaceState.Depth + 1);
    if (pitch) {
        state.setSurfacePitch(pitch);
    }

    // The graphics allocation for the Host Ptr surface is created in the makeResident call,
    // and its GPU address is expected to be the same as the CPU address.
    auto bufferStateAddress = (gfxAlloc != nullptr) ? gfxAlloc->getGpuAddress() : gpuVa;
    bufferStateAddress += offset;

    auto bufferStateSize = (gfxAlloc != nullptr) ? gfxAlloc->getUnderlyingBufferSize() : bufferSize;

    state.setSurfaceType(static_cast<typename RENDER_SURFACE_STATE::SURFACE_TYPE>(surfaceType));

    state.setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW);
    state.setSurfaceVerticalAlignment(RENDER_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4);
    state.setSurfaceHorizontalAlignment(RENDER_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4);

    state.setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR);
    state.setVerticalLineStride(0);
    state.setVerticalLineStrideOffset(0);
    if ((isAligned<MemoryConstants::cacheLineSize>(bufferStateAddress) && isAligned<MemoryConstants::cacheLineSize>(bufferStateSize)) ||
        isReadOnly) {
        state.setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER));
    } else {
        state.setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED));
    }
    state.setSurfaceBaseAddress(bufferStateAddress);

    Gmm *gmm = gfxAlloc ? gfxAlloc->getDefaultGmm() : nullptr;
    if (gmm && gmm->isCompressionEnabled && !forceNonAuxMode) {
        // Pitch/qPitch/baseAddress are not expected to be programmed for the AUX surface in CCS scenarios.
        EncodeSurfaceState<Family>::setCoherencyType(&state, RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
        EncodeSurfaceState<Family>::setBufferAuxParamsForCCS(&state);
    } else {
        EncodeSurfaceState<Family>::setCoherencyType(&state, RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT);
        state.setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE);
    }
    setL1CachePolicy(useL1Cache, &state, rootDeviceEnvironment.getHardwareInfo());

    *surfaceState = state;
}

template <typename GfxFamily>
void NEO::HwHelperHw<GfxFamily>::setL1CachePolicy(bool useL1Cache, typename GfxFamily::RENDER_SURFACE_STATE *surfaceState, const HardwareInfo *hwInfo) {}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::getEnableLocalMemory(const HardwareInfo &hwInfo) const {
    if (DebugManager.flags.EnableLocalMemory.get() != -1) {
        return DebugManager.flags.EnableLocalMemory.get();
    } else if (DebugManager.flags.AUBDumpForceAllToLocalMemory.get()) {
        return true;
    }

    return OSInterface::osEnableLocalMemory && isLocalMemoryEnabled(hwInfo);
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::is1MbAlignmentSupported(const HardwareInfo &hwInfo, bool isCompressionEnabled) const {
    return false;
}

template <typename GfxFamily>
AuxTranslationMode HwHelperHw<GfxFamily>::getAuxTranslationMode(const HardwareInfo &hwInfo) {
    auto mode = HwHelperHw<GfxFamily>::defaultAuxTranslationMode;
    if (DebugManager.flags.ForceAuxTranslationMode.get() != -1) {
        mode = static_cast<AuxTranslationMode>(DebugManager.flags.ForceAuxTranslationMode.get());
    }

    if (mode == AuxTranslationMode::Blit && !hwInfo.capabilityTable.blitterOperationsSupported) {
        DEBUG_BREAK_IF(true);
        mode = AuxTranslationMode::Builtin;
    }

    return mode;
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
    LinearStream &commandStream,
    POST_SYNC_OPERATION operation,
    uint64_t gpuAddress,
    uint64_t immediateData,
    const HardwareInfo &hwInfo,
    PipeControlArgs &args) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    addPipeControlWA(commandStream, gpuAddress, hwInfo);

    setPostSyncExtraProperties(args, hwInfo);
    addPipeControlWithPostSync(commandStream, operation, gpuAddress, immediateData, args);

    MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(commandStream, gpuAddress, hwInfo);
}
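// The post-sync PIPE_CONTROL below carries a 64-bit GPU address split across the Address and
// AddressHigh DWORDs; immediate data is programmed only for the WRITE_IMMEDIATE_DATA operation.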
template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addPipeControlWithPostSync(
    LinearStream &commandStream,
    POST_SYNC_OPERATION operation,
    uint64_t gpuAddress,
    uint64_t immediateData,
    PipeControlArgs &args) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl;
    setPipeControl(cmd, args);
    cmd.setPostSyncOperation(operation);
    cmd.setAddress(static_cast<uint32_t>(gpuAddress & 0x0000FFFFFFFFULL));
    cmd.setAddressHigh(static_cast<uint32_t>(gpuAddress >> 32));
    if (operation == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
        cmd.setImmediateData(immediateData);
    }

    PIPE_CONTROL *pipeControl = commandStream.getSpaceForCmd<PIPE_CONTROL>();
    *pipeControl = cmd;
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::setPipeControl(typename GfxFamily::PIPE_CONTROL &pipeControl, PipeControlArgs &args) {
    pipeControl.setCommandStreamerStallEnable(true);
    pipeControl.setConstantCacheInvalidationEnable(args.constantCacheInvalidationEnable);
    pipeControl.setInstructionCacheInvalidateEnable(args.instructionCacheInvalidateEnable);
    pipeControl.setPipeControlFlushEnable(args.pipeControlFlushEnable);
    pipeControl.setRenderTargetCacheFlushEnable(args.renderTargetCacheFlushEnable);
    pipeControl.setStateCacheInvalidationEnable(args.stateCacheInvalidationEnable);
    pipeControl.setTextureCacheInvalidationEnable(args.textureCacheInvalidationEnable);
    pipeControl.setVfCacheInvalidationEnable(args.vfCacheInvalidationEnable);
    pipeControl.setGenericMediaStateClear(args.genericMediaStateClear);
    pipeControl.setTlbInvalidate(args.tlbInvalidation);
    pipeControl.setNotifyEnable(args.notifyEnable);

    if (isDcFlushAllowed()) {
        pipeControl.setDcFlushEnable(args.dcFlushEnable);
    }

    setPipeControlExtraProperties(pipeControl, args);

    if (DebugManager.flags.FlushAllCaches.get()) {
        pipeControl.setDcFlushEnable(true);
        pipeControl.setRenderTargetCacheFlushEnable(true);
        pipeControl.setInstructionCacheInvalidateEnable(true);
        pipeControl.setTextureCacheInvalidationEnable(true);
        pipeControl.setPipeControlFlushEnable(true);
        pipeControl.setVfCacheInvalidationEnable(true);
        pipeControl.setConstantCacheInvalidationEnable(true);
        pipeControl.setStateCacheInvalidationEnable(true);
        pipeControl.setTlbInvalidate(true);
    }
    if (DebugManager.flags.DoNotFlushCaches.get()) {
        pipeControl.setDcFlushEnable(false);
        pipeControl.setRenderTargetCacheFlushEnable(false);
        pipeControl.setInstructionCacheInvalidateEnable(false);
        pipeControl.setTextureCacheInvalidationEnable(false);
        pipeControl.setPipeControlFlushEnable(false);
        pipeControl.setVfCacheInvalidationEnable(false);
        pipeControl.setConstantCacheInvalidationEnable(false);
        pipeControl.setStateCacheInvalidationEnable(false);
    }
}

template <typename GfxFamily>
bool MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed() {
    return true;
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addPipeControl(LinearStream &commandStream, PipeControlArgs &args) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl;
    MemorySynchronizationCommands<GfxFamily>::setPipeControl(cmd, args);
    auto pipeControl = commandStream.getSpaceForCmd<PIPE_CONTROL>();
    *pipeControl = cmd;
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addPipeControlWithCSStallOnly(LinearStream &commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl;
    cmd.setCommandStreamerStallEnable(true);
    auto pipeControl = commandStream.getSpaceForCmd<PIPE_CONTROL>();
    *pipeControl = cmd;
}

template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl() {
    return sizeof(typename GfxFamily::PIPE_CONTROL);
}
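// When the pipe-control workaround applies, the post-sync sequence emits an extra PIPE_CONTROL
// before the one carrying the post-sync write, plus any family-specific additional synchronization.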
template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(const HardwareInfo &hwInfo) {
    const auto pipeControlCount = MemorySynchronizationCommands<GfxFamily>::isPipeControlWArequired(hwInfo) ? 2u : 1u;
    return pipeControlCount * getSizeForSinglePipeControl() + getSizeForAdditonalSynchronization(hwInfo);
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(LinearStream &commandStream, uint64_t gpuAddress, const HardwareInfo &hwInfo) {
}

template <typename GfxFamily>
inline size_t MemorySynchronizationCommands<GfxFamily>::getSizeForSingleSynchronization(const HardwareInfo &hwInfo) {
    return 0u;
}

template <typename GfxFamily>
inline size_t MemorySynchronizationCommands<GfxFamily>::getSizeForAdditonalSynchronization(const HardwareInfo &hwInfo) {
    return 0u;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getMetricsLibraryGenId() const {
    return static_cast<uint32_t>(MetricsLibraryApi::ClientGen::Gen9);
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::tilingAllowed(bool isSharedContext, bool isImage1d, bool forceLinearStorage) {
    if (DebugManager.flags.ForceLinearImages.get() || forceLinearStorage || isSharedContext) {
        return false;
    }
    return !isImage1d;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::alignSlmSize(uint32_t slmSize) {
    if (slmSize == 0u) {
        return 0u;
    }
    slmSize = std::max(slmSize, 1024u);
    slmSize = Math::nextPowerOfTwo(slmSize);
    UNRECOVERABLE_IF(slmSize > 64u * KB);
    return slmSize;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
    auto value = std::max(slmSize, 1024u);
    value = Math::nextPowerOfTwo(value);
    value = Math::getMinLsbSet(value);
    value = value - 9;
    DEBUG_BREAK_IF(value > 7);
    return value * !!slmSize;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getBarriersCountFromHasBarriers(uint32_t hasBarriers) {
    return hasBarriers;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getHwRevIdFromStepping(uint32_t stepping, const HardwareInfo &hwInfo) const {
    return CommonConstants::invalidStepping;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getSteppingFromHwRevId(const HardwareInfo &hwInfo) const {
    return CommonConstants::invalidStepping;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getAubStreamSteppingFromHwRevId(const HardwareInfo &hwInfo) const {
    switch (getSteppingFromHwRevId(hwInfo)) {
    default:
    case REVISION_A0:
    case REVISION_A1:
    case REVISION_A3:
        return AubMemDump::SteppingValues::A;
    case REVISION_B:
        return AubMemDump::SteppingValues::B;
    case REVISION_C:
        return AubMemDump::SteppingValues::C;
    case REVISION_D:
        return AubMemDump::SteppingValues::D;
    case REVISION_K:
        return AubMemDump::SteppingValues::K;
    }
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo) const {
    auto lowestHwRevIdWithBug = getHwRevIdFromStepping(lowestSteppingWithBug, hwInfo);
    auto hwRevIdWithFix = getHwRevIdFromStepping(steppingWithFix, hwInfo);
    if ((lowestHwRevIdWithBug == CommonConstants::invalidStepping) || (hwRevIdWithFix == CommonConstants::invalidStepping)) {
        return false;
    }
    return (lowestHwRevIdWithBug <= hwInfo.platform.usRevId && hwInfo.platform.usRevId < hwRevIdWithFix);
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::is3DPipelineSelectWARequired(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isForceDefaultRCSEngineWARequired(const HardwareInfo &hwInfo) {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isWaDisableRccRhwoOptimizationRequired() const {
    return false;
}
template <typename GfxFamily>
inline uint32_t HwHelperHw<GfxFamily>::getMinimalSIMDSize() {
    return 8u;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::isSpecialWorkgroupSizeRequired(const HardwareInfo &hwInfo, bool isSimulation) const {
    return false;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::allowRenderCompression(const HardwareInfo &hwInfo) const {
    return true;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::allowStatelessCompression(const HardwareInfo &hwInfo) const {
    if (DebugManager.flags.EnableStatelessCompression.get() != -1) {
        return static_cast<bool>(DebugManager.flags.EnableStatelessCompression.get());
    }
    return false;
}

template <typename GfxFamily>
inline bool HwHelperHw<GfxFamily>::isBlitCopyRequiredForLocalMemory(const HardwareInfo &hwInfo, const GraphicsAllocation &allocation) const {
    return allocation.isAllocatedInLocalMemoryPool() &&
           (getLocalMemoryAccessMode(hwInfo) == LocalMemoryAccessMode::CpuAccessDisallowed ||
            !allocation.isAllocationLockable());
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isDisableOverdispatchAvailable(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
std::unique_ptr<TagAllocatorBase> HwHelperHw<GfxFamily>::createTimestampPacketAllocator(const std::vector<uint32_t> &rootDeviceIndices, MemoryManager *memoryManager,
                                                                                        size_t initialTagCount, CommandStreamReceiverType csrType,
                                                                                        DeviceBitfield deviceBitfield) const {
    bool doNotReleaseNodes = (csrType > CommandStreamReceiverType::CSR_HW) ||
                             DebugManager.flags.DisableTimestampPacketOptimizations.get();

    auto tagAlignment = getTimestampPacketAllocatorAlignment();

    if (DebugManager.flags.OverrideTimestampPacketSize.get() != -1) {
        if (DebugManager.flags.OverrideTimestampPacketSize.get() == 4) {
            using TimestampPackets32T = TimestampPackets<uint32_t>;
            return std::make_unique<TagAllocator<TimestampPackets32T>>(rootDeviceIndices, memoryManager, initialTagCount, tagAlignment, sizeof(TimestampPackets32T), doNotReleaseNodes, deviceBitfield);
        } else if (DebugManager.flags.OverrideTimestampPacketSize.get() == 8) {
            using TimestampPackets64T = TimestampPackets<uint64_t>;
            return std::make_unique<TagAllocator<TimestampPackets64T>>(rootDeviceIndices, memoryManager, initialTagCount, tagAlignment, sizeof(TimestampPackets64T), doNotReleaseNodes, deviceBitfield);
        } else {
            UNRECOVERABLE_IF(true);
        }
    }

    using TimestampPacketType = typename GfxFamily::TimestampPacketType;
    using TimestampPacketsT = TimestampPackets<TimestampPacketType>;

    return std::make_unique<TagAllocator<TimestampPacketsT>>(rootDeviceIndices, memoryManager, initialTagCount, tagAlignment, sizeof(TimestampPacketsT), doNotReleaseNodes, deviceBitfield);
}

template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getTimestampPacketAllocatorAlignment() const {
    return MemoryConstants::cacheLineSize * 4;
}

template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getSingleTimestampPacketSize() const {
    if (DebugManager.flags.OverrideTimestampPacketSize.get() != -1) {
        if (DebugManager.flags.OverrideTimestampPacketSize.get() == 4) {
            return TimestampPackets<uint32_t>::getSinglePacketSize();
        } else if (DebugManager.flags.OverrideTimestampPacketSize.get() == 8) {
            return TimestampPackets<uint64_t>::getSinglePacketSize();
        } else {
            UNRECOVERABLE_IF(true);
        }
    }
    return TimestampPackets<typename GfxFamily::TimestampPacketType>::getSinglePacketSize();
}

template <typename GfxFamily>
LocalMemoryAccessMode HwHelperHw<GfxFamily>::getLocalMemoryAccessMode(const HardwareInfo &hwInfo) const {
    switch (static_cast<LocalMemoryAccessMode>(DebugManager.flags.ForceLocalMemoryAccessMode.get())) {
    case LocalMemoryAccessMode::Default:
    case LocalMemoryAccessMode::CpuAccessAllowed:
    case LocalMemoryAccessMode::CpuAccessDisallowed:
        return static_cast<LocalMemoryAccessMode>(DebugManager.flags.ForceLocalMemoryAccessMode.get());
    }
    return getDefaultLocalMemoryAccessMode(hwInfo);
}

template <typename GfxFamily>
inline LocalMemoryAccessMode HwHelperHw<GfxFamily>::getDefaultLocalMemoryAccessMode(const HardwareInfo &hwInfo) const {
    return LocalMemoryAccessMode::Default;
}

template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForFullCacheFlush() {
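    // A full cache flush fits in a single PIPE_CONTROL; addFullCacheFlush() below sets every
    // flush/invalidate bit on that one command.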
    return sizeof(typename GfxFamily::PIPE_CONTROL);
}

template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addFullCacheFlush(LinearStream &commandStream) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;

    PIPE_CONTROL *pipeControl = commandStream.getSpaceForCmd<PIPE_CONTROL>();
    PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl;

    PipeControlArgs args(true);
    args.renderTargetCacheFlushEnable = true;
    args.instructionCacheInvalidateEnable = true;
    args.textureCacheInvalidationEnable = true;
    args.pipeControlFlushEnable = true;
    args.constantCacheInvalidationEnable = true;
    args.stateCacheInvalidationEnable = true;
    args.tlbInvalidation = true;
    MemorySynchronizationCommands<GfxFamily>::setCacheFlushExtraProperties(args);
    MemorySynchronizationCommands<GfxFamily>::setPipeControl(cmd, args);
    *pipeControl = cmd;
}

template <typename GfxFamily>
const StackVec<size_t, 3> HwHelperHw<GfxFamily>::getDeviceSubGroupSizes() const {
    return {8, 16, 32};
}

template <typename GfxFamily>
const StackVec<uint32_t, 6> HwHelperHw<GfxFamily>::getThreadsPerEUConfigs() const {
    return {};
}

template <typename GfxFamily>
void HwHelperHw<GfxFamily>::setExtraAllocationData(AllocationData &allocationData, const AllocationProperties &properties, const HardwareInfo &hwInfo) const {}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isBankOverrideRequired(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getDefaultThreadArbitrationPolicy() const {
    return 0;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::useOnlyGlobalTimestamps() const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const {
    return !getEnableLocalMemory(hwInfo);
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isCpuImageTransferPreferred(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
bool MemorySynchronizationCommands<GfxFamily>::isPipeControlPriorToPipelineSelectWArequired(const HardwareInfo &hwInfo) {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const PRODUCT_FAMILY productFamily) const {
    return true;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isKmdMigrationSupported(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isNewResidencyModelSupported() const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isDirectSubmissionSupported(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isCopyOnlyEngineType(EngineGroupType type) const {
    return NEO::EngineGroupType::Copy == type;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isSipWANeeded(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isAdditionalFeatureFlagRequired(const FeatureTable *featureTable) const {
    return false;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getDefaultRevisionId(const HardwareInfo &hwInfo) const {
    return 0u;
}

template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getNumCacheRegions() const {
    return 0;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isSubDeviceEngineSupported(const HardwareInfo &hwInfo, const DeviceBitfield &deviceBitfield, aub_stream::EngineType engineType) const {
    return true;
}

template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::isBlitterForImagesSupported(const HardwareInfo &hwInfo) const {
    return false;
}

template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getPreemptionAllocationAlignment() const {
    return 256 * MemoryConstants::kiloByte;
}

template <typename GfxFamily>
void HwHelperHw<GfxFamily>::applyAdditionalCompressionSettings(Gmm &gmm, bool isNotCompressed) const {}

template <typename GfxFamily>
void HwHelperHw<GfxFamily>::applyRenderCompressionFlag(Gmm &gmm, uint32_t isRenderCompressed) const {
    gmm.resourceParams.Flags.Info.RenderCompressed = isRenderCompressed;
}

} // namespace NEO