From 04af8bc5b4b431b6a00183ab915772543e273d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Zwoli=C5=84ski?= Date: Tue, 21 May 2024 16:48:50 +0000 Subject: [PATCH] fix: add support for __INTEL_PER_THREAD_OFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related-To: NEO-10368 Signed-off-by: Fabian Zwoliński --- shared/source/compiler_interface/linker.cpp | 24 ++++- shared/source/compiler_interface/linker.h | 5 +- .../compiler_interface/linker_tests.cpp | 100 +++++++++++++++++- 3 files changed, 125 insertions(+), 4 deletions(-) diff --git a/shared/source/compiler_interface/linker.cpp b/shared/source/compiler_interface/linker.cpp index 5bd9feddd5..67a2ac04e3 100644 --- a/shared/source/compiler_interface/linker.cpp +++ b/shared/source/compiler_interface/linker.cpp @@ -185,6 +185,7 @@ bool LinkerInput::addRelocation(Elf::Elf &elf, const SectionNameToSegme relocationInfo.symbolName = reloc.symbolName; relocationInfo.type = static_cast(reloc.relocType); relocationInfo.relocationSegment = getSegmentForSection(sectionName); + relocationInfo.relocationSegmentName = sectionName; if (SegmentType::instructions == relocationInfo.relocationSegment) { auto kernelName = sectionName.substr(Zebin::Elf::SectionNames::textPrefix.length()); @@ -326,7 +327,7 @@ LinkingStatus Linker::link(const SegmentInfo &globalVariablesSegInfo, const Segm outUnresolvedExternals, pDevice, constantsInitData, constantsInitDataSize, variablesInitData, variablesInitDataSize); removeLocalSymbolsFromRelocatedSymbols(); resolveImplicitArgs(kernelDescriptors, pDevice); - resolveBuiltins(pDevice, outUnresolvedExternals, instructionsSegments); + resolveBuiltins(pDevice, outUnresolvedExternals, instructionsSegments, kernelDescriptors); if (initialUnresolvedExternalsCount < outUnresolvedExternals.size()) { return LinkingStatus::linkedPartially; @@ -406,6 +407,9 @@ void Linker::patchAddress(void *relocAddress, const uint64_t value, const Linker case 
RelocationInfo::Type::addressHigh: *reinterpret_cast(relocAddress) = static_cast((value >> 32) & 0xffffffff); break; + case RelocationInfo::Type::address16: + *reinterpret_cast(relocAddress) = static_cast(value); + break; } } @@ -656,7 +660,7 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De } } -void Linker::resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolvedExternals, const std::vector &instructionsSegments) { +void Linker::resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolvedExternals, const std::vector &instructionsSegments, const KernelDescriptorsT &kernelDescriptors) { auto &productHelper = pDevice->getProductHelper(); auto releaseHelper = pDevice->getReleaseHelper(); @@ -673,6 +677,22 @@ void Linker::resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolved } outUnresolvedExternals[vecIndex] = outUnresolvedExternals[outUnresolvedExternals.size() - 1u]; outUnresolvedExternals.resize(outUnresolvedExternals.size() - 1u); + } else if (outUnresolvedExternals[vecIndex].unresolvedRelocation.symbolName == perThreadOff) { + RelocatedSymbol symbol; + + auto kernelName = outUnresolvedExternals[vecIndex].unresolvedRelocation.relocationSegmentName.substr(Zebin::Elf::SectionNames::textPrefix.length()); + + auto kernelDescriptor = std::find_if(kernelDescriptors.begin(), kernelDescriptors.end(), [&kernelName](const KernelDescriptor *obj) { return obj->kernelMetadata.kernelName == kernelName; }); + if (kernelDescriptor != std::end(kernelDescriptors)) { + uint64_t crossThreadDataSize = (*kernelDescriptor)->kernelAttributes.crossThreadDataSize - (*kernelDescriptor)->kernelAttributes.inlineDataPayloadSize; + symbol.gpuAddress = crossThreadDataSize; + auto relocAddress = ptrOffset(instructionsSegments[outUnresolvedExternals[vecIndex].instructionsSegmentId].hostPointer, + static_cast(outUnresolvedExternals[vecIndex].unresolvedRelocation.offset)); + + NEO::Linker::patchAddress(relocAddress, 
symbol.gpuAddress, outUnresolvedExternals[vecIndex].unresolvedRelocation); + outUnresolvedExternals[vecIndex] = outUnresolvedExternals[outUnresolvedExternals.size() - 1u]; + outUnresolvedExternals.resize(outUnresolvedExternals.size() - 1u); + } } } } diff --git a/shared/source/compiler_interface/linker.h b/shared/source/compiler_interface/linker.h index 1f64a78bde..1a25e69770 100644 --- a/shared/source/compiler_interface/linker.h +++ b/shared/source/compiler_interface/linker.h @@ -92,6 +92,7 @@ struct LinkerInput { addressLow, addressHigh, perThreadPayloadOffset, + address16 = 7, relocTypeMax }; @@ -99,6 +100,7 @@ struct LinkerInput { uint64_t offset = std::numeric_limits::max(); Type type = Type::unknown; SegmentType relocationSegment = SegmentType::unknown; + std::string relocationSegmentName; int64_t addend = 0U; }; @@ -189,6 +191,7 @@ struct LinkerInput { struct Linker { inline static const std::string subDeviceID = "__SubDeviceID"; + inline static const std::string perThreadOff = "__INTEL_PER_THREAD_OFF"; using RelocationInfo = LinkerInput::RelocationInfo; @@ -257,7 +260,7 @@ struct Linker { bool resolveExternalFunctions(const KernelDescriptorsT &kernelDescriptors, std::vector &externalFunctions); void resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, Device *pDevice); - void resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolvedExternals, const std::vector &instructionsSegments); + void resolveBuiltins(Device *pDevice, UnresolvedExternals &outUnresolvedExternals, const std::vector &instructionsSegments, const KernelDescriptorsT &kernelDescriptors); template void patchIncrement(void *dstAllocation, size_t relocationOffset, const void *initData, uint64_t incrementValue); diff --git a/shared/test/unit_test/compiler_interface/linker_tests.cpp b/shared/test/unit_test/compiler_interface/linker_tests.cpp index 2a45bfbbde..81e31ead8d 100644 --- a/shared/test/unit_test/compiler_interface/linker_tests.cpp +++ 
b/shared/test/unit_test/compiler_interface/linker_tests.cpp @@ -802,7 +802,8 @@ HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsWhenResolveBuiltinsIsCalledT debugManager.flags.CreateMultipleSubDevices.set(2); debugManager.flags.EnableImplicitScaling.set(1); - linker.resolveBuiltins(pDevice, unresolvedExternals, instructionsSegments); + NEO::Linker::KernelDescriptorsT kernelDescriptors; + linker.resolveBuiltins(pDevice, unresolvedExternals, instructionsSegments, kernelDescriptors); EXPECT_EQ(2U, unresolvedExternals.size()); for (auto &symbol : unresolvedExternals) { @@ -858,6 +859,103 @@ HWTEST_F(LinkerTests, givenUnresolvedExternalsWhenLinkThenSubDeviceIDSymbolsAreC EXPECT_EQ(*reinterpret_cast(instructionSegment.data()), static_cast(gpuAddressAs64bit & 0xffffffff)); } +HWTEST_F(LinkerTests, givenUnresolvedExternalSymbolsWhenResolveBuiltinsIsCalledThenPerThreadOffSymbolsAreResolvedAndRemoved) { + struct LinkerMock : public NEO::Linker { + public: + using NEO::Linker::resolveBuiltins; + + LinkerMock(const LinkerInput &data) : NEO::Linker(data) { + } + }; + + const uint64_t kernel1RelocOffset = 40; + const uint64_t kernel2RelocOffset = 0; + + NEO::LinkerInput linkerInput; + LinkerMock linker(linkerInput); + NEO::Linker::UnresolvedExternals unresolvedExternals; + unresolvedExternals.push_back({{"__INTEL_PER_THREAD_OFF", 0, NEO::Linker::RelocationInfo::Type::addressLow, NEO::SegmentType::instructions, ".text.kernel_func2"}, 0u, false}); + unresolvedExternals.push_back({{"__MaxHWThreadIDPerSubDevice", 156, NEO::Linker::RelocationInfo::Type::addressLow, NEO::SegmentType::instructions}, 0u, false}); + unresolvedExternals.push_back({{"__MaxHWThreadIDPerSubDevice", 140, NEO::Linker::RelocationInfo::Type::addressHigh, NEO::SegmentType::instructions}, 0u, false}); + unresolvedExternals.push_back({{"__INTEL_PER_THREAD_OFF", kernel1RelocOffset, static_cast(7u), NEO::SegmentType::instructions, ".text.kernel_func1"}, 0u, false}); + + std::vector instructionSegment; + 
instructionSegment.resize(kernel1RelocOffset + 16); + NEO::Linker::PatchableSegments instructionsSegments; + instructionsSegments.push_back({instructionSegment.data(), 0u}); + + KernelDescriptor kernelDescriptor1; + kernelDescriptor1.kernelMetadata.kernelName = "kernel_func1"; + kernelDescriptor1.kernelAttributes.crossThreadDataSize = 96; + kernelDescriptor1.kernelAttributes.inlineDataPayloadSize = 64; + + KernelDescriptor kernelDescriptor2; + kernelDescriptor2.kernelMetadata.kernelName = "kernel_func2"; + kernelDescriptor2.kernelAttributes.crossThreadDataSize = 192; + kernelDescriptor2.kernelAttributes.inlineDataPayloadSize = 64; + + NEO::Linker::KernelDescriptorsT kernelDescriptors; + kernelDescriptors.push_back(&kernelDescriptor1); + kernelDescriptors.push_back(&kernelDescriptor2); + linker.resolveBuiltins(pDevice, unresolvedExternals, instructionsSegments, kernelDescriptors); + + EXPECT_EQ(2U, unresolvedExternals.size()); + for (auto &symbol : unresolvedExternals) { + EXPECT_NE(NEO::Linker::perThreadOff, symbol.unresolvedRelocation.symbolName); + } + + uint16_t gpuAddress1 = kernelDescriptor1.kernelAttributes.crossThreadDataSize - + kernelDescriptor1.kernelAttributes.inlineDataPayloadSize; + + uint16_t gpuAddress2 = kernelDescriptor2.kernelAttributes.crossThreadDataSize - + kernelDescriptor2.kernelAttributes.inlineDataPayloadSize; + + EXPECT_EQ(*reinterpret_cast(&instructionSegment[kernel1RelocOffset]), static_cast(gpuAddress1)); + EXPECT_EQ(*reinterpret_cast(&instructionSegment[kernel2RelocOffset]), static_cast(gpuAddress2 & 0xffffffff)); +} + +HWTEST_F(LinkerTests, givenPerThreadOffSymbolInUnresolvedExternalSymbolsAndMissingKernelDescriptorForPerThreadOffSymbolWhenResolveBuiltinsThenPerThreadOffSymbolIsNotResolved) { + struct LinkerMock : public NEO::Linker { + public: + using NEO::Linker::resolveBuiltins; + + LinkerMock(const LinkerInput &data) : NEO::Linker(data) { + } + }; + + const uint64_t kernelRelocOffset = 40; + + NEO::LinkerInput linkerInput; + 
LinkerMock linker(linkerInput); + NEO::Linker::UnresolvedExternals unresolvedExternals; + unresolvedExternals.push_back({{"__MaxHWThreadIDPerSubDevice", 156, NEO::Linker::RelocationInfo::Type::addressLow, NEO::SegmentType::instructions}, 0u, false}); + unresolvedExternals.push_back({{"__MaxHWThreadIDPerSubDevice", 140, NEO::Linker::RelocationInfo::Type::addressHigh, NEO::SegmentType::instructions}, 0u, false}); + unresolvedExternals.push_back({{"__INTEL_PER_THREAD_OFF", kernelRelocOffset, NEO::Linker::RelocationInfo::Type::address16, NEO::SegmentType::instructions, ".text.kernel_func"}, 0u, false}); + + std::vector instructionSegment; + instructionSegment.resize(64); + NEO::Linker::PatchableSegments instructionsSegments; + instructionsSegments.push_back({instructionSegment.data(), 0u}); + + KernelDescriptor kernelDescriptor; + kernelDescriptor.kernelMetadata.kernelName = "kernel_name"; + kernelDescriptor.kernelAttributes.crossThreadDataSize = 96; + + NEO::Linker::KernelDescriptorsT kernelDescriptors; + kernelDescriptors.push_back(&kernelDescriptor); + linker.resolveBuiltins(pDevice, unresolvedExternals, instructionsSegments, kernelDescriptors); + + EXPECT_EQ(3U, unresolvedExternals.size()); + + bool isPerThreadOffUnresolved = false; + for (auto &symbol : unresolvedExternals) { + if (NEO::Linker::perThreadOff == symbol.unresolvedRelocation.symbolName) { + isPerThreadOffUnresolved = true; + } + } + EXPECT_TRUE(isPerThreadOffUnresolved); +} + HWTEST_F(LinkerTests, givenUnresolvedExternalWhenPatchingInstructionsThenLinkPartially) { NEO::LinkerInput linkerInput; vISA::GenRelocEntry entry = {};