fix: implicit arg buffer

- refactor validation of the target device and decoding of Intel GT Notes
- decoded versions are stored in singleDeviceBinary when decoding zebin
- adds parsing Intel GT notes from elf when unpacking binary
- sets indirectAccessBufferMajorVersion with correct value
- fix ImplicitArgsV1 - add simdWidth
- use correct simd size in patchImplicitArgs()

Related-To: NEO-16167, NEO-15211, IGC-12358

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2025-09-22 16:08:15 +00:00
committed by Compute-Runtime-Automation
parent 3a5b197f3a
commit 12263b2e7c
16 changed files with 384 additions and 111 deletions

View File

@@ -681,11 +681,16 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs |= addImplcictArgs;
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
uint64_t implicitArgsSize = 0;
if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 0) {
uint8_t version = kernelDescriptor.kernelMetadata.indirectAccessBuffer;
if (version == 0) {
version = pDevice->getGfxCoreHelper().getImplicitArgsVersion();
}
if (version == 0) {
implicitArgsSize = ImplicitArgsV0::getAlignedSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
} else if (version == 1) {
implicitArgsSize = ImplicitArgsV1::getAlignedSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 2) {
} else if (version == 2) {
implicitArgsSize = ImplicitArgsV2::getAlignedSize();
} else {
UNRECOVERABLE_IF(true);

View File

@@ -59,8 +59,9 @@ SingleDeviceBinary unpackSingleZebin(const ArrayRef<const uint8_t> archive, cons
bool validForTarget = true;
if (elf.elfFileHeader->machine == Elf::ElfMachine::EM_INTELGT) {
validForTarget &= Zebin::validateTargetDevice(elf, requestedTargetDevice, outErrReason, outWarning, ret);
validForTarget &= Zebin::validateTargetDevice(elf, requestedTargetDevice, outErrReason, outWarning, ret.generatorFeatureVersions, ret.generator);
} else {
Zebin::validateTargetDevice(elf, requestedTargetDevice, outErrReason, outWarning, ret.generatorFeatureVersions, ret.generator);
const auto &flags = reinterpret_cast<const NEO::Zebin::Elf::ZebinTargetFlags &>(elf.elfFileHeader->flags);
validForTarget &= flags.machineEntryUsesGfxCoreInsteadOfProductFamily
? (requestedTargetDevice.coreFamily == static_cast<GFXCORE_FAMILY>(elf.elfFileHeader->machine))
@@ -114,10 +115,16 @@ DecodeError decodeSingleZebin(ProgramInfo &dst, const SingleDeviceBinary &src, s
return DecodeError::invalidBinary;
}
GeneratorFeatureVersions generatorFeatures = {};
GeneratorType generator = {};
auto ret = Zebin::validateTargetDevice(elf, src.targetDevice, outErrReason, outWarning, generatorFeatures, generator);
if (!ret && elf.elfFileHeader->machine == Elf::ElfMachine::EM_INTELGT) {
return DecodeError::invalidBinary;
}
dst.grfSize = src.targetDevice.grfSize;
dst.minScratchSpaceSize = src.targetDevice.minScratchSpaceSize;
dst.indirectDetectionVersion = src.generatorFeatureVersions.indirectMemoryAccessDetection;
dst.indirectAccessBufferMajorVersion = src.generatorFeatureVersions.indirectAccessBuffer;
dst.indirectDetectionVersion = generatorFeatures.indirectMemoryAccessDetection;
dst.indirectAccessBufferMajorVersion = generatorFeatures.indirectAccessBuffer;
dst.samplerStateSize = src.targetDevice.samplerStateSize;
dst.samplerBorderColorStateSize = src.targetDevice.samplerBorderColorStateSize;
@@ -126,10 +133,11 @@ DecodeError decodeSingleZebin(ProgramInfo &dst, const SingleDeviceBinary &src, s
return decodeError;
}
const bool isGeneratedByIgc = src.generator == GeneratorType::igc;
const bool isGeneratedByIgc = generator == GeneratorType::igc;
for (auto &kernelInfo : dst.kernelInfos) {
kernelInfo->kernelDescriptor.kernelMetadata.isGeneratedByIgc = isGeneratedByIgc;
kernelInfo->kernelDescriptor.kernelMetadata.indirectAccessBuffer = generatorFeatures.indirectAccessBuffer;
if (KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) {
kernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

View File

@@ -75,6 +75,12 @@ struct TargetDevice {
};
TargetDevice getTargetDevice(const RootDeviceEnvironment &rootDeviceEnvironment);
// Feature versions reported by the binary's generator.
// Both fields default to 0 and are filled in from the Intel GT notes
// (indirectAccessDetectionVersion / indirectAccessBufferMajorVersion)
// while the target device is being validated.
struct GeneratorFeatureVersions {
    using VersionT = uint32_t;

    // Version of the indirect memory access detection feature.
    VersionT indirectMemoryAccessDetection{0u};
    // Major version of the indirect access buffer.
    VersionT indirectAccessBuffer{0u};
};
struct SingleDeviceBinary {
DeviceBinaryFormat format = DeviceBinaryFormat::unknown;
ArrayRef<const uint8_t> deviceBinary;
@@ -84,11 +90,7 @@ struct SingleDeviceBinary {
ConstStringRef buildOptions;
TargetDevice targetDevice;
GeneratorType generator = GeneratorType::igc;
struct GeneratorFeatureVersions {
using VersionT = uint32_t;
VersionT indirectMemoryAccessDetection = 0u;
VersionT indirectAccessBuffer = 0u;
} generatorFeatureVersions;
GeneratorFeatureVersions generatorFeatureVersions;
};
template <DeviceBinaryFormat format>

View File

@@ -92,10 +92,10 @@ bool validateTargetDevice(const TargetDevice &targetDevice, Elf::ElfIdentifierCl
return true;
}
template bool validateTargetDevice<Elf::EI_CLASS_32>(const Elf::Elf<Elf::EI_CLASS_32> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, SingleDeviceBinary &singleDeviceBinary);
template bool validateTargetDevice<Elf::EI_CLASS_64>(const Elf::Elf<Elf::EI_CLASS_64> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, SingleDeviceBinary &singleDeviceBinary);
template bool validateTargetDevice<Elf::EI_CLASS_32>(const Elf::Elf<Elf::EI_CLASS_32> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, GeneratorFeatureVersions &generatorFeatures, GeneratorType &generator);
template bool validateTargetDevice<Elf::EI_CLASS_64>(const Elf::Elf<Elf::EI_CLASS_64> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, GeneratorFeatureVersions &generatorFeatures, GeneratorType &generator);
template <Elf::ElfIdentifierClass numBits>
bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, SingleDeviceBinary &singleDeviceBinary) {
bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, GeneratorFeatureVersions &generatorFeatures, GeneratorType &generator) {
GFXCORE_FAMILY gfxCore = IGFX_UNKNOWN_CORE;
PRODUCT_FAMILY productFamily = IGFX_UNKNOWN;
AOT::PRODUCT_CONFIG productConfig = AOT::UNKNOWN_ISA;
@@ -123,7 +123,7 @@ bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targ
DEBUG_BREAK_IF(sizeof(uint32_t) != intelGTNote.data.size());
auto targetMetadataPacked = reinterpret_cast<const uint32_t *>(intelGTNote.data.begin());
targetMetadata.packed = static_cast<uint32_t>(*targetMetadataPacked);
singleDeviceBinary.generator = static_cast<GeneratorType>(targetMetadata.generatorId);
generator = static_cast<GeneratorType>(targetMetadata.generatorId);
break;
}
case Elf::IntelGTSectionType::zebinVersion: {
@@ -155,13 +155,13 @@ bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targ
case Elf::IntelGTSectionType::indirectAccessDetectionVersion: {
DEBUG_BREAK_IF(sizeof(uint32_t) != intelGTNote.data.size());
auto indirectDetectionVersion = reinterpret_cast<const uint32_t *>(intelGTNote.data.begin());
singleDeviceBinary.generatorFeatureVersions.indirectMemoryAccessDetection = static_cast<uint32_t>(*indirectDetectionVersion);
generatorFeatures.indirectMemoryAccessDetection = static_cast<uint32_t>(*indirectDetectionVersion);
break;
}
case Elf::IntelGTSectionType::indirectAccessBufferMajorVersion: {
DEBUG_BREAK_IF(sizeof(uint32_t) != intelGTNote.data.size());
auto indirectDetectionVersion = reinterpret_cast<const uint32_t *>(intelGTNote.data.begin());
singleDeviceBinary.generatorFeatureVersions.indirectAccessBuffer = static_cast<uint32_t>(*indirectDetectionVersion);
generatorFeatures.indirectAccessBuffer = static_cast<uint32_t>(*indirectDetectionVersion);
break;
}
default:

View File

@@ -52,7 +52,7 @@ bool isZebin(ArrayRef<const uint8_t> binary);
bool validateTargetDevice(const TargetDevice &targetDevice, Elf::ElfIdentifierClass numBits, PRODUCT_FAMILY productFamily, GFXCORE_FAMILY gfxCore, AOT::PRODUCT_CONFIG productConfig, Zebin::Elf::ZebinTargetFlags targetMetadata);
template <Elf::ElfIdentifierClass numBits>
bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, SingleDeviceBinary &singleDeviceBinary);
bool validateTargetDevice(const Elf::Elf<numBits> &elf, const TargetDevice &targetDevice, std::string &outErrReason, std::string &outWarning, GeneratorFeatureVersions &generatorFeatures, GeneratorType &generator);
template <Elf::ElfIdentifierClass numBits>
DecodeError decodeIntelGTNoteSection(ArrayRef<const uint8_t> intelGTNotesSection, std::vector<Elf::IntelGTNote> &intelGTNotes, std::string &outErrReason, std::string &outWarning);

View File

@@ -56,7 +56,7 @@ static_assert(ImplicitArgsV0::getSize() == (28 * sizeof(uint32_t)));
struct alignas(32) ImplicitArgsV1 {
ImplicitArgsHeader header;
uint8_t numWorkDim;
uint8_t padding0;
uint8_t simdWidth;
uint32_t localSizeX;
uint32_t localSizeY;
uint32_t localSizeZ;
@@ -71,7 +71,7 @@ struct alignas(32) ImplicitArgsV1 {
uint32_t groupCountX;
uint32_t groupCountY;
uint32_t groupCountZ;
uint32_t padding1;
uint32_t padding0;
uint64_t rtGlobalBufferPtr;
uint64_t assertBufferPtr;
uint64_t scratchPtr;
@@ -183,12 +183,16 @@ struct alignas(32) ImplicitArgs {
// Stores the SIMD width in the layout selected by the header's structVersion:
// v0 for version 0, v1 for version 1. Any other version is left untouched.
void setSimdWidth(uint32_t simd) {
    if (v0.header.structVersion == 0) {
        v0.simdWidth = simd;
        return;
    }
    if (v1.header.structVersion == 1) {
        // NOTE(review): ImplicitArgsV1::simdWidth is declared as uint8_t in this
        // change, so the 32-bit argument is implicitly narrowed here.
        v1.simdWidth = simd;
    }
}
// Returns the stored SIMD width for struct versions 0 and 1;
// yields an empty optional for every other version.
std::optional<uint32_t> getSimdWidth() const {
    std::optional<uint32_t> width; // stays empty unless a known version matches
    if (v0.header.structVersion == 0) {
        width = v0.simdWidth;
    } else if (v1.header.structVersion == 1) {
        width = v1.simdWidth;
    }
    return width;
}
@@ -328,6 +332,12 @@ struct alignas(32) ImplicitArgs {
}
}
// Records the scratch buffer address in the v1 layout.
// Does nothing when the header's structVersion is not 1.
void setScratchBufferPtr(uint64_t scratchBuffer) {
    if (v1.header.structVersion != 1) {
        return;
    }
    v1.scratchPtr = scratchBuffer;
}
void setEnqueuedLocalSize(uint32_t x, uint32_t y, uint32_t z) {
if (v1.header.structVersion == 1) {
v1.enqueuedLocalSizeX = x;

View File

@@ -59,7 +59,7 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
uint32_t localIdsSize = 0;
if (false == patchImplicitArgsBufferInCrossThread) {
auto simdSize = 32u;
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
@@ -91,7 +91,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
uint32_t lws[3] = {0, 0, 0};
implicitArgs.getLocalSize(lws[0], lws[1], lws[2]);
auto simdSize = implicitArgs.getSimdWidth().value_or(32);
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto grfSize = getGrfSize(simdSize);
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
auto dimensionOrder = getDimensionOrderForLocalIds(kernelDescriptor.kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);

View File

@@ -268,6 +268,7 @@ struct KernelDescriptor : NEO::NonCopyableAndNonMovableClass {
uint16_t compiledSubGroupsNumber = 0U;
uint8_t requiredSubGroupSize = 0U;
uint8_t requiredThreadGroupDispatchSize = 0U;
uint8_t indirectAccessBuffer = 0u;
bool isGeneratedByIgc = true;
} kernelMetadata;