From 5893fb08fbd8ddba86d67efc60d41dd1dcc42577 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 22 Jul 2025 14:10:29 +0000 Subject: [PATCH] refactor: move surface state programming method to base class Related-To: NEO-15374 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/kernel/kernel_hw.h | 84 -------------------- level_zero/core/source/kernel/kernel_imp.cpp | 80 +++++++++++++++++++ level_zero/core/source/kernel/kernel_imp.h | 3 +- 3 files changed, 82 insertions(+), 85 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index 8907fca625..4db7bf3e8d 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -29,90 +29,6 @@ namespace L0 { template struct KernelHw : public KernelImp { using KernelImp::KernelImp; - using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - - void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { - uint64_t baseAddress = alloc->getGpuAddressToPatch(); - - // Remove misaligned bytes, accounted for in bufferOffset patch token - baseAddress &= this->surfaceStateAlignmentMask; - auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress); - auto offset = ptrDiff(address, reinterpret_cast(baseAddress)); - size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize(); - // If the allocation is part of a mapped virtual range, then set size to maximum to allow for access across multiple virtual ranges. 
- Device *device = module->getDevice(); - auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast(alloc->getGpuAddress())); - - auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); - bool offsetWasPatched = NEO::patchNonPointer(getCrossThreadDataSpan(), - argInfo.bufferOffset, static_cast(offset)); - bool offsetedAddress = false; - if (false == offsetWasPatched) { - // fallback to handling offset in surface state - offsetedAddress = baseAddress != reinterpret_cast(address); - baseAddress = reinterpret_cast(address); - bufferSizeForSsh -= offset; - DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask)); - - offset = 0; - } - void *surfaceStateAddress = nullptr; - auto surfaceState = GfxFamily::cmdInitRenderSurfaceState; - - if (NEO::isValidOffset(argInfo.bindful)) { - surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), argInfo.bindful); - surfaceState = *reinterpret_cast(surfaceStateAddress); - - } else if (NEO::isValidOffset(argInfo.bindless)) { - state.isBindlessOffsetSet[argIndex] = false; - state.usingSurfaceStateHeap[argIndex] = false; - if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper() && !offsetedAddress) { - surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless); - state.isBindlessOffsetSet[argIndex] = true; - } else { - state.usingSurfaceStateHeap[argIndex] = true; - surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * sizeof(typename GfxFamily::RENDER_SURFACE_STATE)); - } - } - - uint64_t bufferAddressForSsh = baseAddress; - bufferSizeForSsh += misalignedSize; - bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment); - - bool l3Enabled = true; - // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD. 
- // Most commonly this issue will occur with Host Point Allocations from customers. - l3Enabled = isL3Capable(*alloc); - - NEO::Device *neoDevice = device->getNEODevice(); - - if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) { - l3Enabled = false; - } - - if (l3Enabled == false) { - this->state.kernelRequiresQueueUncachedMocsCount++; - } - auto isDebuggerActive = neoDevice->getDebugger() != nullptr; - NEO::EncodeSurfaceStateArgs args; - args.outMemory = &surfaceState; - args.graphicsAddress = bufferAddressForSsh; - if (allocData && allocData->virtualReservationData) { - bufferSizeForSsh = MemoryConstants::fullStatefulRegion; - } - args.size = bufferSizeForSsh; - args.mocs = device->getMOCS(l3Enabled, false); - args.numAvailableDevices = neoDevice->getNumGenericSubDevices(); - args.allocation = alloc; - args.gmmHelper = neoDevice->getGmmHelper(); - args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1; - args.implicitScaling = device->isImplicitScalingCapable(); - args.isDebuggerActive = isDebuggerActive; - - NEO::EncodeSurfaceState::encodeBuffer(args); - UNRECOVERABLE_IF(surfaceStateAddress == nullptr); - *reinterpret_cast(surfaceStateAddress) = surfaceState; - } }; } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 1d5d133dca..7bfd3f6e8d 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -1083,6 +1083,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize(); this->surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask(); this->surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment(); + this->renderSurfaceStateSize = gfxHelper.getRenderSurfaceStateSize(); if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && 
this->kernelImmData->getIsaParentAllocation() == nullptr) {
         isaAllocation->setTbxWritable(true, std::numeric_limits::max());
@@ -1662,4 +1663,97 @@ void KernelImp::evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::Kerne
                                                                         kernelDescriptor.kernelAttributes.simdSize);
 }
 
+// Programs the buffer surface state backing pointer argument `argIndex`.
+//
+// argIndex - index of the argument in the kernel descriptor's explicit arguments
+// address  - address the argument refers to
+// alloc    - graphics allocation that supplies the GPU address and size
+//
+// The allocation's patch address is masked with surfaceStateAlignmentMask and the
+// distance from that base to `address` is patched into cross-thread data at
+// bufferOffset. If the patch is not applied, `address` becomes the surface base
+// and the size is reduced by the offset. The encoder output goes to the bindful
+// slot of the surface state heap, or for bindless arguments to the slot returned
+// by patchBindlessSurfaceState() or to a surface state heap slot derived from the
+// bindless offset.
+void KernelImp::setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) {
+    uint64_t baseAddress = alloc->getGpuAddressToPatch();
+
+    // Remove misaligned bytes, accounted for in bufferOffset patch token
+    baseAddress &= this->surfaceStateAlignmentMask;
+    auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress);
+    auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
+    size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize();
+
+    Device *device = module->getDevice();
+    NEO::Device *neoDevice = device->getNEODevice();
+    auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
+
+    auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
+    bool offsetWasPatched = NEO::patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataSpan(),
+                                                                     argInfo.bufferOffset, static_cast<uint32_t>(offset));
+    bool offsetedAddress = false;
+    if (false == offsetWasPatched) {
+        // fallback to handling offset in surface state
+        offsetedAddress = baseAddress != reinterpret_cast<uintptr_t>(address);
+        baseAddress = reinterpret_cast<uintptr_t>(address);
+        bufferSizeForSsh -= offset;
+        DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask));
+    }
+
+    NEO::EncodeSurfaceStateArgs args;
+    void *surfaceStateAddress = nullptr;
+
+    if (NEO::isValidOffset(argInfo.bindful)) {
+        surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), argInfo.bindful);
+        args.inTemplateMemory = surfaceStateAddress;
+    } else if (NEO::isValidOffset(argInfo.bindless)) {
+        state.isBindlessOffsetSet[argIndex] = false;
+        state.usingSurfaceStateHeap[argIndex] = false;
+        if (neoDevice->getBindlessHeapsHelper() && !offsetedAddress) {
+            surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless);
+            state.isBindlessOffsetSet[argIndex] = true;
+        } else {
+            state.usingSurfaceStateHeap[argIndex] = true;
+            surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * this->renderSurfaceStateSize);
+        }
+    }
+    args.outMemory = surfaceStateAddress;
+
+    uint64_t bufferAddressForSsh = baseAddress;
+    bufferSizeForSsh += misalignedSize;
+    bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment);
+
+    // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
+    // Most commonly this issue will occur with Host Pointer Allocations from customers.
+    bool l3Enabled = isL3Capable(*alloc);
+
+    if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
+        l3Enabled = false;
+    }
+
+    if (l3Enabled == false) {
+        this->state.kernelRequiresQueueUncachedMocsCount++;
+    }
+    auto isDebuggerActive = neoDevice->getDebugger() != nullptr;
+
+    args.graphicsAddress = bufferAddressForSsh;
+    // If the allocation is part of a mapped virtual range, then set size to maximum to allow for access across multiple virtual ranges.
+    if (allocData && allocData->virtualReservationData) {
+        bufferSizeForSsh = MemoryConstants::fullStatefulRegion;
+    }
+    args.size = bufferSizeForSsh;
+    args.mocs = device->getMOCS(l3Enabled, false);
+    args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
+    args.allocation = alloc;
+    args.gmmHelper = neoDevice->getGmmHelper();
+    args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
+    args.implicitScaling = device->isImplicitScalingCapable();
+    args.isDebuggerActive = isDebuggerActive;
+
+    // Neither a bindful nor a bindless offset was valid, so there is no surface state slot to encode into.
+    UNRECOVERABLE_IF(surfaceStateAddress == nullptr);
+    device->getGfxCoreHelper().encodeBufferSurfaceState(args);
+}
+
 } // namespace L0
diff --git 
a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 9082e8d33f..1d6e01bd04 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -169,7 +169,7 @@ struct KernelImp : Kernel { ze_result_t setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal); - virtual void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) = 0; + MOCKABLE_VIRTUAL void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc); void setInlineSamplers(); @@ -325,6 +325,7 @@ struct KernelImp : Kernel { NEO::GraphicsAllocation *printfBuffer = nullptr; uintptr_t surfaceStateAlignmentMask = 0; uintptr_t surfaceStateAlignment = 0; + size_t renderSurfaceStateSize = 0; uint32_t implicitArgsVersion = 0; uint32_t walkerInlineDataSize = 0;