refactor: move surface state programming method to base class

Related-To: NEO-15374

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-07-22 14:10:29 +00:00
committed by Compute-Runtime-Automation
parent b90b77e4e3
commit 5893fb08fb
3 changed files with 82 additions and 85 deletions

View File

@@ -29,90 +29,6 @@ namespace L0 {
// Kernel implementation parameterized by GFX core family. It inherits the KernelImp
// constructors and overrides setBufferSurfaceState using the family's
// RENDER_SURFACE_STATE layout (GfxFamily is resolved through NEO::GfxFamilyMapper).
template <GFXCORE_FAMILY gfxCoreFamily>
struct KernelHw : public KernelImp {
using KernelImp::KernelImp;
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
// Programs the buffer surface state for the pointer argument at `argIndex`.
//   argIndex - index into the kernel descriptor's payloadMappings.explicitArgs
//   address  - address the argument refers to; its distance from the aligned base
//              address is patched as the buffer offset when possible
//   alloc    - graphics allocation backing the argument (GPU address, size, caching input)
// The state is encoded into a local RENDER_SURFACE_STATE copy and then stored at the
// resolved destination (bindful heap slot, bindless slot, or local surface state heap).
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {
uint64_t baseAddress = alloc->getGpuAddressToPatch();
// Remove misaligned bytes, accounted for in bufferOffset patch token
baseAddress &= this->surfaceStateAlignmentMask;
// Bytes between the aligned base and the allocation's patch address; added back to the size below.
auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress);
// Distance from the aligned base to the address requested by the caller.
auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize();
// If the allocation is part of a mapped virtual range, then set size to maximum to allow for access across multiple virtual ranges.
// (The size override itself happens further down, where allocData->virtualReservationData is checked.)
Device *device = module->getDevice();
// SVM bookkeeping entry for this allocation; may be null (every later use is null-checked).
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
// Try to store the offset (narrowed to 32 bits) in cross-thread data at argInfo.bufferOffset.
bool offsetWasPatched = NEO::patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataSpan(),
argInfo.bufferOffset, static_cast<uint32_t>(offset));
bool offsetedAddress = false;
if (false == offsetWasPatched) {
// fallback to handling offset in surface state
// The surface state then starts at `address` itself and the size shrinks by the skipped bytes.
offsetedAddress = baseAddress != reinterpret_cast<uintptr_t>(address);
baseAddress = reinterpret_cast<uintptr_t>(address);
bufferSizeForSsh -= offset;
// Debug-only check that the fallback base address satisfies the surface state alignment mask.
DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask));
offset = 0;
}
void *surfaceStateAddress = nullptr;
// Local working copy, initialized from the family's cmdInitRenderSurfaceState.
auto surfaceState = GfxFamily::cmdInitRenderSurfaceState;
if (NEO::isValidOffset(argInfo.bindful)) {
// Bindful: destination is the slot in the kernel's surface state heap; start from its current content.
surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), argInfo.bindful);
surfaceState = *reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress);
} else if (NEO::isValidOffset(argInfo.bindless)) {
// Bindless: reset both per-argument flags, then set exactly one depending on the path taken.
state.isBindlessOffsetSet[argIndex] = false;
state.usingSurfaceStateHeap[argIndex] = false;
if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper() && !offsetedAddress) {
// A bindless heaps helper exists and the address was not moved by the fallback above.
surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless);
state.isBindlessOffsetSet[argIndex] = true;
} else {
// Otherwise use the slot in the local surface state heap that corresponds to this bindless offset.
state.usingSurfaceStateHeap[argIndex] = true;
surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * sizeof(typename GfxFamily::RENDER_SURFACE_STATE));
}
}
uint64_t bufferAddressForSsh = baseAddress;
// Account for the bytes removed by base-address alignment, then round the size up to the required alignment.
bufferSizeForSsh += misalignedSize;
bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment);
bool l3Enabled = true;
// Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
// Most commonly this issue will occur with Host Point Allocations from customers.
l3Enabled = isL3Capable(*alloc);
NEO::Device *neoDevice = device->getNEODevice();
// Allocations flagged as locally uncached never use L3.
if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
l3Enabled = false;
}
// Count arguments that need uncached MOCS; the counter is kept in the kernel state.
if (l3Enabled == false) {
this->state.kernelRequiresQueueUncachedMocsCount++;
}
auto isDebuggerActive = neoDevice->getDebugger() != nullptr;
NEO::EncodeSurfaceStateArgs args;
// Encode into the local copy; it is stored at surfaceStateAddress after encoding.
args.outMemory = &surfaceState;
args.graphicsAddress = bufferAddressForSsh;
// Allocations carrying virtual reservation data get the full stateful region as their size.
if (allocData && allocData->virtualReservationData) {
bufferSizeForSsh = MemoryConstants::fullStatefulRegion;
}
args.size = bufferSizeForSsh;
args.mocs = device->getMOCS(l3Enabled, false);
args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
args.allocation = alloc;
args.gmmHelper = neoDevice->getGmmHelper();
args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
args.implicitScaling = device->isImplicitScalingCapable();
args.isDebuggerActive = isDebuggerActive;
NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
// surfaceStateAddress is still nullptr when neither the bindful nor the bindless offset was valid
// (or a helper returned null); there is then no destination for the encoded state.
UNRECOVERABLE_IF(surfaceStateAddress == nullptr);
*reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateAddress) = surfaceState;
}
};
} // namespace L0

View File

@@ -1083,6 +1083,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
this->walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize();
this->surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask();
this->surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment();
this->renderSurfaceStateSize = gfxHelper.getRenderSurfaceStateSize();
if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && this->kernelImmData->getIsaParentAllocation() == nullptr) {
isaAllocation->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
@@ -1662,4 +1663,83 @@ void KernelImp::evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::Kerne
kernelDescriptor.kernelAttributes.simdSize);
}
void KernelImp::setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) {
uint64_t baseAddress = alloc->getGpuAddressToPatch();
// Remove misaligned bytes, accounted for in bufferOffset patch token
baseAddress &= this->surfaceStateAlignmentMask;
auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress);
auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize();
// If the allocation is part of a mapped virtual range, then set size to maximum to allow for access across multiple virtual ranges.
Device *device = module->getDevice();
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
bool offsetWasPatched = NEO::patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataSpan(),
argInfo.bufferOffset, static_cast<uint32_t>(offset));
bool offsetedAddress = false;
if (false == offsetWasPatched) {
// fallback to handling offset in surface state
offsetedAddress = baseAddress != reinterpret_cast<uintptr_t>(address);
baseAddress = reinterpret_cast<uintptr_t>(address);
bufferSizeForSsh -= offset;
DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask));
}
NEO::EncodeSurfaceStateArgs args;
void *surfaceStateAddress = nullptr;
if (NEO::isValidOffset(argInfo.bindful)) {
surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), argInfo.bindful);
args.inTemplateMemory = surfaceStateAddress;
} else if (NEO::isValidOffset(argInfo.bindless)) {
state.isBindlessOffsetSet[argIndex] = false;
state.usingSurfaceStateHeap[argIndex] = false;
if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper() && !offsetedAddress) {
surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless);
state.isBindlessOffsetSet[argIndex] = true;
} else {
state.usingSurfaceStateHeap[argIndex] = true;
surfaceStateAddress = ptrOffset(state.surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * this->renderSurfaceStateSize);
}
}
args.outMemory = surfaceStateAddress;
uint64_t bufferAddressForSsh = baseAddress;
bufferSizeForSsh += misalignedSize;
bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment);
bool l3Enabled = true;
// Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD.
// Most commonly this issue will occur with Host Point Allocations from customers.
l3Enabled = isL3Capable(*alloc);
NEO::Device *neoDevice = device->getNEODevice();
if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
l3Enabled = false;
}
if (l3Enabled == false) {
this->state.kernelRequiresQueueUncachedMocsCount++;
}
auto isDebuggerActive = neoDevice->getDebugger() != nullptr;
args.graphicsAddress = bufferAddressForSsh;
if (allocData && allocData->virtualReservationData) {
bufferSizeForSsh = MemoryConstants::fullStatefulRegion;
}
args.size = bufferSizeForSsh;
args.mocs = device->getMOCS(l3Enabled, false);
args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
args.allocation = alloc;
args.gmmHelper = neoDevice->getGmmHelper();
args.areMultipleSubDevicesInContext = args.numAvailableDevices > 1;
args.implicitScaling = device->isImplicitScalingCapable();
args.isDebuggerActive = isDebuggerActive;
device->getGfxCoreHelper().encodeBufferSurfaceState(args);
}
} // namespace L0

View File

@@ -169,7 +169,7 @@ struct KernelImp : Kernel {
ze_result_t setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal);
virtual void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) = 0;
MOCKABLE_VIRTUAL void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc);
void setInlineSamplers();
@@ -325,6 +325,7 @@ struct KernelImp : Kernel {
NEO::GraphicsAllocation *printfBuffer = nullptr;
uintptr_t surfaceStateAlignmentMask = 0;
uintptr_t surfaceStateAlignment = 0;
size_t renderSurfaceStateSize = 0;
uint32_t implicitArgsVersion = 0;
uint32_t walkerInlineDataSize = 0;