fix: prevent underflow in per-thread data offset calculation

Related-To: NEO-14719

Signed-off-by: Naklicki, Mateusz <mateusz.naklicki@intel.com>
This commit is contained in:
Naklicki, Mateusz
2025-06-16 12:44:39 +00:00
committed by Compute-Runtime-Automation
parent 6368f43df8
commit b462f990b6
4 changed files with 88 additions and 13 deletions

View File

@@ -450,8 +450,7 @@ void Linker::patchInstructionsSegments(const std::vector<PatchableSegment> &inst
auto relocAddress = ptrOffset(segment.hostPointer, static_cast<uintptr_t>(relocation.offset));
if (relocation.type == LinkerInput::RelocationInfo::Type::perThreadPayloadOffset) {
uint32_t crossThreadDataSize = kernelDescriptors.at(segId)->kernelAttributes.crossThreadDataSize - kernelDescriptors.at(segId)->kernelAttributes.inlineDataPayloadSize;
*reinterpret_cast<uint32_t *>(relocAddress) = crossThreadDataSize;
*reinterpret_cast<uint32_t *>(relocAddress) = kernelDescriptors.at(segId)->getPerThreadDataOffset();
} else if (relocation.symbolName == implicitArgsRelocationSymbolName) {
pImplicitArgsRelocationAddresses[static_cast<uint32_t>(segId)].push_back(std::pair<void *, RelocationInfo::Type>(relocAddress, relocation.type));
} else if (relocation.symbolName.empty()) {
@@ -701,8 +700,7 @@ void Linker::resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolved
auto kernelDescriptor = std::find_if(kernelDescriptors.begin(), kernelDescriptors.end(), [&kernelName](const KernelDescriptor *obj) { return obj->kernelMetadata.kernelName == kernelName; });
if (kernelDescriptor != std::end(kernelDescriptors)) {
uint64_t crossThreadDataSize = (*kernelDescriptor)->kernelAttributes.crossThreadDataSize - (*kernelDescriptor)->kernelAttributes.inlineDataPayloadSize;
symbol.gpuAddress = crossThreadDataSize;
symbol.gpuAddress = (*kernelDescriptor)->getPerThreadDataOffset();
auto relocAddress = ptrOffset(instructionsSegments[outUnresolvedExternals[vecIndex].instructionsSegmentId].hostPointer,
static_cast<uintptr_t>(outUnresolvedExternals[vecIndex].unresolvedRelocation.offset));

View File

@@ -60,6 +60,9 @@ struct KernelDescriptor : NEO::NonCopyableAndNonMovableClass {
// Read-only accessor: returns a const reference to the bindlessArgsMap member (no copy is made).
const BindlessToSurfaceStateMap &getBindlessOffsetToSurfaceState() const {
return bindlessArgsMap;
}
// Returns crossThreadDataSize reduced by inlineDataPayloadSize.
// The subtracted amount is capped at crossThreadDataSize, so the result is 0
// (rather than a wrapped-around value) whenever the inline data payload is at
// least as large as the cross-thread data.
uint16_t getPerThreadDataOffset() const {
    const auto inlinedBytes = std::min(kernelAttributes.crossThreadDataSize, kernelAttributes.inlineDataPayloadSize);
    return kernelAttributes.crossThreadDataSize - inlinedBytes;
}
struct KernelAttributes {
uint32_t slmInlineSize = 0U;

View File

@@ -783,10 +783,8 @@ HWTEST_F(LinkerTests, givenInvalidLinkerInputThenLinkerFails) {
HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsWhenResolveBuiltinsIsCalledThenSubDeviceIDSymbolsAreRemoved) {
struct LinkerMock : public NEO::Linker {
public:
using NEO::Linker::Linker;
using NEO::Linker::resolveBuiltins;
LinkerMock(const LinkerInput &data) : NEO::Linker(data) {
}
};
NEO::LinkerInput linkerInput;
@@ -867,10 +865,8 @@ HWTEST_F(LinkerTests, givenUnresolvedExternalsWhenLinkThenSubDeviceIDSymbolsAreC
HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsWhenResolveBuiltinsIsCalledThenPerThreadOffSymbolsAreResolvedAndRemoved) {
struct LinkerMock : public NEO::Linker {
public:
using NEO::Linker::Linker;
using NEO::Linker::resolveBuiltins;
LinkerMock(const LinkerInput &data) : NEO::Linker(data) {
}
};
const uint64_t kernel1RelocOffset = 40;
@@ -922,10 +918,8 @@ HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsWhenResolveBuiltinsIsCalledT
HWTEST_F(LinkerTests, givenPerThreadOffSymbolInUnresolvedExternalSymbolsAndMissingKernelDescriptorForPerThreadOffSymbolWhenResolveBuiltinsThenPerThreadOffSymbolIsNotResolved) {
struct LinkerMock : public NEO::Linker {
public:
using NEO::Linker::Linker;
using NEO::Linker::resolveBuiltins;
LinkerMock(const LinkerInput &data) : NEO::Linker(data) {
}
};
const uint64_t kernelRelocOffset = 40;
@@ -961,6 +955,40 @@ HWTEST_F(LinkerTests, givenPerThreadOffSymbolInUnresolvedExternalSymbolsAndMissi
EXPECT_TRUE(isPerThreadOffUnresolved);
}
// Checks that resolveBuiltins resolves the perThreadOff symbol to offset 0 and removes it from the
// unresolved list when the kernel's crossThreadDataSize (40) is smaller than its inlineDataPayloadSize (64).
HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsAndCrossThreadDataSmallerThanInlineDataWhenResolveBuiltinsIsCalledThenPerThreadOffSymbolIsResolvedAndRemoved) {
    // Exposes the base-class constructors and resolveBuiltins to the test.
    struct LinkerMock : public NEO::Linker {
      public:
        using NEO::Linker::Linker;
        using NEO::Linker::resolveBuiltins;
    };

    const uint64_t kernelRelocOffset = 40u;

    NEO::LinkerInput linkerInput{};
    LinkerMock linker(linkerInput);

    NEO::Linker::UnresolvedExternals unresolvedExternals{};
    unresolvedExternals.push_back({{NEO::Linker::perThreadOff, kernelRelocOffset, NEO::Linker::RelocationInfo::Type::address16, NEO::SegmentType::instructions, ".text.kernel_func1"}, 0u, false});

    // The expected patched value is 0, so the buffer is pre-filled with a non-zero byte pattern;
    // with a zero-initialized buffer the value check below would pass even if nothing was written.
    const char sentinelByte = 0x5A;
    std::vector<char> instructionSegment{};
    instructionSegment.resize(kernelRelocOffset + 16, sentinelByte);

    NEO::Linker::PatchableSegments instructionsSegments{};
    instructionsSegments.push_back({instructionSegment.data(), 0u});

    KernelDescriptor kernelDescriptor1{};
    kernelDescriptor1.kernelMetadata.kernelName = "kernel_func1";
    kernelDescriptor1.kernelAttributes.crossThreadDataSize = 40u;
    kernelDescriptor1.kernelAttributes.inlineDataPayloadSize = 64u;

    NEO::Linker::KernelDescriptorsT kernelDescriptors{};
    kernelDescriptors.push_back(&kernelDescriptor1);

    linker.resolveBuiltins(pDevice, unresolvedExternals, instructionsSegments, kernelDescriptors);

    EXPECT_EQ(0U, unresolvedExternals.size());

    uint16_t gpuAddress = 0u;
    EXPECT_EQ(*reinterpret_cast<uint16_t *>(&instructionSegment[kernelRelocOffset]), gpuAddress);
}
HWTEST_F(LinkerTests, givenUnresolvedExternalWhenPatchingInstructionsThenLinkPartially) {
NEO::LinkerInput linkerInput;
vISA::GenRelocEntry entry = {};
@@ -2737,3 +2765,32 @@ TEST_F(LinkerTests, givenPerThreadPayloadOffsetRelocationWhenPatchingInstruction
uint32_t expectedPatchedValue = kd.kernelAttributes.crossThreadDataSize - kd.kernelAttributes.inlineDataPayloadSize;
EXPECT_EQ(expectedPatchedValue, static_cast<uint32_t>(*perThreadPayloadOffsetPatchedValue));
}
// Checks that a perThreadPayloadOffset relocation is patched with 0 when the kernel's
// crossThreadDataSize (40) is smaller than its inlineDataPayloadSize (64).
TEST_F(LinkerTests, givenPerThreadPayloadOffsetRelocationAndCrossThreadDataSmallerThanInlineDataWhenPatchingInstructionSegmentsThenPatchItWithOffsetZero) {
    WhiteBox<NEO::LinkerInput> linkerInput{};
    linkerInput.traits.requiresPatchingOfInstructionSegments = true;

    NEO::LinkerInput::RelocationInfo rel{};
    rel.offset = 0x4;
    rel.type = NEO::LinkerInput::RelocationInfo::Type::perThreadPayloadOffset;
    rel.relocationSegment = NEO::SegmentType::instructions;
    linkerInput.textRelocations.push_back({rel});

    NEO::Linker::KernelDescriptorsT kernelDescriptors{};
    KernelDescriptor kd{};
    kd.kernelAttributes.crossThreadDataSize = 40u;
    kd.kernelAttributes.inlineDataPayloadSize = 64u;
    kernelDescriptors.push_back(&kd);

    WhiteBox<NEO::Linker> linker(linkerInput);

    // The expected patched value is 0, so the segment starts from a non-zero pattern;
    // with a zeroed segment the check below would pass even if the relocation was never written.
    uint64_t segmentData{0xA5A5A5A5A5A5A5A5ull};
    NEO::Linker::PatchableSegment segmentToPatch{};
    segmentToPatch.hostPointer = reinterpret_cast<void *>(&segmentData);
    segmentToPatch.segmentSize = sizeof(segmentData);

    NEO::Linker::UnresolvedExternals unresolvedExternals{};
    linker.patchInstructionsSegments({segmentToPatch}, unresolvedExternals, kernelDescriptors);

    // The relocation is read back as a 32-bit value located rel.offset bytes into the segment.
    auto perThreadPayloadOffsetPatchedValue = reinterpret_cast<uint32_t *>(ptrOffset(segmentToPatch.hostPointer, static_cast<size_t>(rel.offset)));
    uint32_t expectedPatchedValue = 0u;
    EXPECT_EQ(expectedPatchedValue, static_cast<uint32_t>(*perThreadPayloadOffsetPatchedValue));
}

View File

@@ -273,3 +273,20 @@ TEST(KernelDescriptor, GivenDescriptorWithoutStatefulArgsWhenInitBindlessOffsets
desc.initBindlessOffsetToSurfaceState();
EXPECT_EQ(0u, desc.bindlessArgsMap.size());
}
// Exercises getPerThreadDataOffset for inline payload equal to, larger than, and smaller than
// the cross-thread data size.
TEST(KernelDescriptor, GivenDescriptorWhenGettingPerThreadDataOffsetThenItReturnsCorrectValue) {
    NEO::KernelDescriptor desc{};
    auto &attributes = desc.kernelAttributes;

    // Inline payload exactly matches cross-thread data.
    attributes.inlineDataPayloadSize = 64u;
    attributes.crossThreadDataSize = 64u;
    EXPECT_EQ(0u, desc.getPerThreadDataOffset());

    // crossThreadData is fully consumed by inlineDataPayload
    attributes.inlineDataPayloadSize = 64u;
    attributes.crossThreadDataSize = 40u;
    EXPECT_EQ(0u, desc.getPerThreadDataOffset());

    // Cross-thread data extends past the inline payload.
    attributes.inlineDataPayloadSize = 64u;
    attributes.crossThreadDataSize = 128u;
    EXPECT_EQ(64u, desc.getPerThreadDataOffset());
}