Files
llvm/lld/MachO/SymbolTable.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

694 lines
25 KiB
C++
Raw Normal View History

//===- SymbolTable.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "SymbolTable.h"
#include "ConcatOutputSection.h"
#include "Config.h"
#include "InputFiles.h"
#include "InputSection.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "lld/Common/ErrorHandler.h"
#include "lld/Common/Memory.h"
#include "llvm/Demangle/Demangle.h"
using namespace llvm;
using namespace lld;
using namespace lld::macho;
Symbol *SymbolTable::find(CachedHashStringRef cachedName) {
auto it = symMap.find(cachedName);
if (it == symMap.end())
return nullptr;
return symVector[it->second];
}
std::pair<Symbol *, bool> SymbolTable::insert(StringRef name,
const InputFile *file) {
auto p = symMap.insert({CachedHashStringRef(name), (int)symVector.size()});
Symbol *sym;
if (!p.second) {
// Name already present in the symbol table.
sym = symVector[p.first->second];
} else {
// Name is a new symbol.
sym = reinterpret_cast<Symbol *>(make<SymbolUnion>());
symVector.push_back(sym);
}
sym->isUsedInRegularObj |= !file || isa<ObjFile>(file);
return {sym, p.second};
}
namespace {
struct DuplicateSymbolDiag {
// Pair containing source location and source file
const std::pair<std::string, std::string> src1;
const std::pair<std::string, std::string> src2;
const Symbol *sym;
DuplicateSymbolDiag(const std::pair<std::string, std::string> src1,
const std::pair<std::string, std::string> src2,
const Symbol *sym)
: src1(src1), src2(src2), sym(sym) {}
};
SmallVector<DuplicateSymbolDiag> dupSymDiags;
} // namespace
[lld:MachO] Allow independent override of weak symbols aliased via .set (#167825) Currently, if multiple external weak symbols are defined at the same address in an object file (e.g., by using the .set assembler directive to alias them to a single weak variable), ld64.lld treats them as a single unit. When any one of these symbols is overridden by a strong definition, all of the original weak symbols resolve to the strong definition. This patch changes the behavior in `transplantSymbolsAtOffset`. When a weak symbol is being replaced by a strong one, only non-external (local) symbols at the same offset are moved to the new symbol's section. Other *external* symbols are no longer transplanted. This allows each external weak symbol to be overridden independently. This behavior is consistent with Apple's ld-classic, but diverges from ld-prime in one case, as noted on https://github.com/llvm/llvm-project/issues/167262 (this discrepancy has recently been reported to Apple). ### Backward Compatibility This change alters linker behavior for a specific scenario. The creation of multiple external weak symbols aliased to the same address via assembler directives is primarily an advanced technique. It's unlikely that existing builds rely on the current behavior of all aliases being overridden together. If there are concerns, this could be put behind a linker option, but the new default seems more correct, less surprising, and is consistent with ld-classic. ### Testing The new lit test `test/MachO/weak-alias-override.s` verifies this behavior using llvm-nm. Fixes #167262
2025-11-21 17:03:40 -08:00
// Move local symbols at \p fromOff in \p fromIsec into \p toIsec, unless that
// symbol is \p skip, in which case we just remove it.
static void transplantSymbolsAtOffset(InputSection *fromIsec,
InputSection *toIsec, Defined *skip,
uint64_t fromOff, uint64_t toOff) {
// Ensure the symbols will still be in address order after our insertions.
auto symSucceedsOff = [](uint64_t off, const Symbol *s) {
return cast<Defined>(s)->value > off;
};
assert(std::is_partitioned(toIsec->symbols.begin(), toIsec->symbols.end(),
[symSucceedsOff, toOff](const Symbol *s) {
return !symSucceedsOff(toOff, s);
}) &&
"Symbols in toIsec must be partitioned by toOff.");
auto insertIt = llvm::upper_bound(toIsec->symbols, toOff, symSucceedsOff);
llvm::erase_if(fromIsec->symbols, [&](Symbol *s) {
auto *d = cast<Defined>(s);
[lld:MachO] Allow independent override of weak symbols aliased via .set (#167825) Currently, if multiple external weak symbols are defined at the same address in an object file (e.g., by using the .set assembler directive to alias them to a single weak variable), ld64.lld treats them as a single unit. When any one of these symbols is overridden by a strong definition, all of the original weak symbols resolve to the strong definition. This patch changes the behavior in `transplantSymbolsAtOffset`. When a weak symbol is being replaced by a strong one, only non-external (local) symbols at the same offset are moved to the new symbol's section. Other *external* symbols are no longer transplanted. This allows each external weak symbol to be overridden independently. This behavior is consistent with Apple's ld-classic, but diverges from ld-prime in one case, as noted on https://github.com/llvm/llvm-project/issues/167262 (this discrepancy has recently been reported to Apple). ### Backward Compatibility This change alters linker behavior for a specific scenario. The creation of multiple external weak symbols aliased to the same address via assembler directives is primarily an advanced technique. It's unlikely that existing builds rely on the current behavior of all aliases being overridden together. If there are concerns, this could be put behind a linker option, but the new default seems more correct, less surprising, and is consistent with ld-classic. ### Testing The new lit test `test/MachO/weak-alias-override.s` verifies this behavior using llvm-nm. Fixes #167262
2025-11-21 17:03:40 -08:00
if (d == skip)
return true;
if (d->value != fromOff || d->isExternal())
return false;
[lld:MachO] Allow independent override of weak symbols aliased via .set (#167825) Currently, if multiple external weak symbols are defined at the same address in an object file (e.g., by using the .set assembler directive to alias them to a single weak variable), ld64.lld treats them as a single unit. When any one of these symbols is overridden by a strong definition, all of the original weak symbols resolve to the strong definition. This patch changes the behavior in `transplantSymbolsAtOffset`. When a weak symbol is being replaced by a strong one, only non-external (local) symbols at the same offset are moved to the new symbol's section. Other *external* symbols are no longer transplanted. This allows each external weak symbol to be overridden independently. This behavior is consistent with Apple's ld-classic, but diverges from ld-prime in one case, as noted on https://github.com/llvm/llvm-project/issues/167262 (this discrepancy has recently been reported to Apple). ### Backward Compatibility This change alters linker behavior for a specific scenario. The creation of multiple external weak symbols aliased to the same address via assembler directives is primarily an advanced technique. It's unlikely that existing builds rely on the current behavior of all aliases being overridden together. If there are concerns, this could be put behind a linker option, but the new default seems more correct, less surprising, and is consistent with ld-classic. ### Testing The new lit test `test/MachO/weak-alias-override.s` verifies this behavior using llvm-nm. Fixes #167262
2025-11-21 17:03:40 -08:00
// This repeated insertion will be quadratic unless insertIt is the end
// iterator. However, that is typically the case for files that have
// .subsections_via_symbols set.
insertIt = toIsec->symbols.insert(insertIt, d);
d->originalIsec = toIsec;
d->value = toOff;
// We don't want to have more than one unwindEntry at a given address, so
// drop the redundant ones. We can safely drop the unwindEntries of the
// symbols in fromIsec since we will be adding another unwindEntry as we
// finish parsing toIsec's file. (We can assume that toIsec has its own
// unwindEntry because of the ODR.)
d->originalUnwindEntry = nullptr;
return true;
});
}
Defined *SymbolTable::addDefined(StringRef name, InputFile *file,
InputSection *isec, uint64_t value,
uint64_t size, bool isWeakDef,
bool isPrivateExtern,
bool isReferencedDynamically, bool noDeadStrip,
bool isWeakDefCanBeHidden) {
bool overridesWeakDef = false;
auto [s, wasInserted] = insert(name, file);
assert(!file || !isa<BitcodeFile>(file) || !isec);
[lld/mac] Write every weak symbol only once in the output Before this, if an inline function was defined in several input files, lld would write each copy of the inline function the output. With this patch, it only writes one copy. Reduces the size of Chromium Framework from 378MB to 345MB (compared to 290MB linked with ld64, which also does dead-stripping, which we don't do yet), and makes linking it faster: N Min Max Median Avg Stddev x 10 3.9957051 4.3496981 4.1411121 4.156837 0.10092097 + 10 3.908154 4.169318 3.9712729 3.9846753 0.075773012 Difference at 95.0% confidence -0.172162 +/- 0.083847 -4.14165% +/- 2.01709% (Student's t, pooled s = 0.0892373) Implementation-wise, when merging two weak symbols, this sets a "canOmitFromOutput" on the InputSection belonging to the weak symbol not put in the symbol table. We then don't write InputSections that have this set, as long as they are not referenced from other symbols. (This happens e.g. for object files that don't set .subsections_via_symbols or that use .alt_entry.) Some restrictions: - not yet done for bitcode inputs - no "comdat" handling (`kindNoneGroupSubordinate*` in ld64) -- Frame Descriptor Entries (FDEs), Language Specific Data Areas (LSDAs) (that is, catch block unwind information) and Personality Routines associated with weak functions still not stripped. This is wasteful, but harmless. - However, this does strip weaks from __unwind_info (which is needed for correctness and not just for size) - This nopes out on InputSections that are referenced form more than one symbol (eg from .alt_entry) for now Things that work based on symbols Just Work: - map files (change in MapFile.cpp is no-op and not needed; I just found it a bit more explicit) - exports Things that work with inputSections need to explicitly check if an inputSection is written (e.g. unwind info). This patch is useful in itself, but it's also likely also a useful foundation for dead_strip. I used to have a "canoncialRepresentative" pointer on InputSection instead of just the bool, which would be handy for ICF too. But I ended up not needing it for this patch, so I removed that again for now. Differential Revision: https://reviews.llvm.org/D102076
2021-05-06 14:47:57 -04:00
if (!wasInserted) {
if (auto *defined = dyn_cast<Defined>(s)) {
[lld/mac] Implement support for private extern symbols Private extern symbols are used for things scoped to the linkage unit. They cause duplicate symbol errors (so they're in the symbol table, unlike TU-scoped truly local symbols), but they don't make it into the export trie. They are created e.g. by compiling with -fvisibility=hidden. If two weak symbols have differing privateness, the combined symbol is non-private external. (Example: inline functions and some TUs that include the header defining it were built with -fvisibility-inlines-hidden and some weren't). A weak private external symbol implicitly has its "weak" dropped and behaves like a regular strong private external symbol: Weak is an export trie concept, and private symbols are not in the export trie. If a weak and a strong symbol have different privateness, the strong symbol wins. If two common symbols have differing privateness, the larger symbol wins. If they have the same size, the privateness of the symbol seen later during the link wins (!) -- this is a bit lame, but it matches ld64 and this behavior takes 2 lines less to implement than the less surprising "result is non-private external), so match ld64. (Example: `int a` in two .c files, both built with -fcommon, one built with -fvisibility=hidden and one without.) This also makes `__dyld_private` a true TU-local symbol, matching ld64. To make this work, make the `const char*` StringRefZ ctor to correctly set `size` (without this, writing the string table crashed when calling getName() on the __dyld_private symbol). Mention in CommonSymbol's comment that common symbols are now disabled by default in clang. Mention in -keep_private_externs's HelpText that the flag only has an effect with `-r` (which we don't implement yet -- so this patch here doesn't regress any behavior around -r + -keep_private_externs)). ld64 doesn't explicitly document it, but the commit text of http://reviews.llvm.org/rL216146 does, and ld64's OutputFile::buildSymbolTable() checks `_options.outputKind() == Options::kObjectFile` before calling `_options.keepPrivateExterns()` (the only reference to that function). Fixes PR48536. Differential Revision: https://reviews.llvm.org/D93609
2020-12-17 13:30:18 -05:00
if (isWeakDef) {
// See further comment in createDefined() in InputFiles.cpp
[lld/mac] Write every weak symbol only once in the output Before this, if an inline function was defined in several input files, lld would write each copy of the inline function the output. With this patch, it only writes one copy. Reduces the size of Chromium Framework from 378MB to 345MB (compared to 290MB linked with ld64, which also does dead-stripping, which we don't do yet), and makes linking it faster: N Min Max Median Avg Stddev x 10 3.9957051 4.3496981 4.1411121 4.156837 0.10092097 + 10 3.908154 4.169318 3.9712729 3.9846753 0.075773012 Difference at 95.0% confidence -0.172162 +/- 0.083847 -4.14165% +/- 2.01709% (Student's t, pooled s = 0.0892373) Implementation-wise, when merging two weak symbols, this sets a "canOmitFromOutput" on the InputSection belonging to the weak symbol not put in the symbol table. We then don't write InputSections that have this set, as long as they are not referenced from other symbols. (This happens e.g. for object files that don't set .subsections_via_symbols or that use .alt_entry.) Some restrictions: - not yet done for bitcode inputs - no "comdat" handling (`kindNoneGroupSubordinate*` in ld64) -- Frame Descriptor Entries (FDEs), Language Specific Data Areas (LSDAs) (that is, catch block unwind information) and Personality Routines associated with weak functions still not stripped. This is wasteful, but harmless. - However, this does strip weaks from __unwind_info (which is needed for correctness and not just for size) - This nopes out on InputSections that are referenced form more than one symbol (eg from .alt_entry) for now Things that work based on symbols Just Work: - map files (change in MapFile.cpp is no-op and not needed; I just found it a bit more explicit) - exports Things that work with inputSections need to explicitly check if an inputSection is written (e.g. unwind info). This patch is useful in itself, but it's also likely also a useful foundation for dead_strip. I used to have a "canoncialRepresentative" pointer on InputSection instead of just the bool, which would be handy for ICF too. But I ended up not needing it for this patch, so I removed that again for now. Differential Revision: https://reviews.llvm.org/D102076
2021-05-06 14:47:57 -04:00
if (defined->isWeakDef()) {
[lld/mac] Implement support for private extern symbols Private extern symbols are used for things scoped to the linkage unit. They cause duplicate symbol errors (so they're in the symbol table, unlike TU-scoped truly local symbols), but they don't make it into the export trie. They are created e.g. by compiling with -fvisibility=hidden. If two weak symbols have differing privateness, the combined symbol is non-private external. (Example: inline functions and some TUs that include the header defining it were built with -fvisibility-inlines-hidden and some weren't). A weak private external symbol implicitly has its "weak" dropped and behaves like a regular strong private external symbol: Weak is an export trie concept, and private symbols are not in the export trie. If a weak and a strong symbol have different privateness, the strong symbol wins. If two common symbols have differing privateness, the larger symbol wins. If they have the same size, the privateness of the symbol seen later during the link wins (!) -- this is a bit lame, but it matches ld64 and this behavior takes 2 lines less to implement than the less surprising "result is non-private external), so match ld64. (Example: `int a` in two .c files, both built with -fcommon, one built with -fvisibility=hidden and one without.) This also makes `__dyld_private` a true TU-local symbol, matching ld64. To make this work, make the `const char*` StringRefZ ctor to correctly set `size` (without this, writing the string table crashed when calling getName() on the __dyld_private symbol). Mention in CommonSymbol's comment that common symbols are now disabled by default in clang. Mention in -keep_private_externs's HelpText that the flag only has an effect with `-r` (which we don't implement yet -- so this patch here doesn't regress any behavior around -r + -keep_private_externs)). ld64 doesn't explicitly document it, but the commit text of http://reviews.llvm.org/rL216146 does, and ld64's OutputFile::buildSymbolTable() checks `_options.outputKind() == Options::kObjectFile` before calling `_options.keepPrivateExterns()` (the only reference to that function). Fixes PR48536. Differential Revision: https://reviews.llvm.org/D93609
2020-12-17 13:30:18 -05:00
defined->privateExtern &= isPrivateExtern;
defined->weakDefCanBeHidden &= isWeakDefCanBeHidden;
defined->referencedDynamically |= isReferencedDynamically;
[lld/mac] Implement -dead_strip Also adds support for live_support sections, no_dead_strip sections, .no_dead_strip symbols. Chromium Framework 345MB unstripped -> 250MB stripped (vs 290MB unstripped -> 236M stripped with ld64). Doing dead stripping is a bit faster than not, because so much less data needs to be processed: % ministat lld_* x lld_nostrip.txt + lld_strip.txt N Min Max Median Avg Stddev x 10 3.929414 4.07692 4.0269079 4.0089678 0.044214794 + 10 3.8129408 3.9025559 3.8670411 3.8642573 0.024779651 Difference at 95.0% confidence -0.144711 +/- 0.0336749 -3.60967% +/- 0.839989% (Student's t, pooled s = 0.0358398) This interacts with many parts of the linker. I tried to add test coverage for all added `isLive()` checks, so that some test will fail if any of them is removed. I checked that the test expectations for the most part match ld64's behavior (except for live-support-iterations.s, see the comment in the test). Interacts with: - debug info - export tries - import opcodes - flags like -exported_symbol(s_list) - -U / dynamic_lookup - mod_init_funcs, mod_term_funcs - weak symbol handling - unwind info - stubs - map files - -sectcreate - undefined, dylib, common, defined (both absolute and normal) symbols It's possible it interacts with more features I didn't think of, of course. I also did some manual testing: - check-llvm check-clang check-lld work with lld with this patch as host linker and -dead_strip enabled - Chromium still starts - Chromium's base_unittests still pass, including unwind tests Implemenation-wise, this is InputSection-based, so it'll work for object files with .subsections_via_symbols (which includes all object files generated by clang). I first based this on the COFF implementation, but later realized that things are more similar to ELF. I think it'd be good to refactor MarkLive.cpp to look more like the ELF part at some point, but I'd like to get a working state checked in first. Mechanical parts: - Rename canOmitFromOutput to wasCoalesced (no behavior change) since it really is for weak coalesced symbols - Add noDeadStrip to Defined, corresponding to N_NO_DEAD_STRIP (`.no_dead_strip` in asm) Fixes PR49276. Differential Revision: https://reviews.llvm.org/D103324
2021-05-07 17:10:05 -04:00
defined->noDeadStrip |= noDeadStrip;
[lld/mac] Write every weak symbol only once in the output Before this, if an inline function was defined in several input files, lld would write each copy of the inline function the output. With this patch, it only writes one copy. Reduces the size of Chromium Framework from 378MB to 345MB (compared to 290MB linked with ld64, which also does dead-stripping, which we don't do yet), and makes linking it faster: N Min Max Median Avg Stddev x 10 3.9957051 4.3496981 4.1411121 4.156837 0.10092097 + 10 3.908154 4.169318 3.9712729 3.9846753 0.075773012 Difference at 95.0% confidence -0.172162 +/- 0.083847 -4.14165% +/- 2.01709% (Student's t, pooled s = 0.0892373) Implementation-wise, when merging two weak symbols, this sets a "canOmitFromOutput" on the InputSection belonging to the weak symbol not put in the symbol table. We then don't write InputSections that have this set, as long as they are not referenced from other symbols. (This happens e.g. for object files that don't set .subsections_via_symbols or that use .alt_entry.) Some restrictions: - not yet done for bitcode inputs - no "comdat" handling (`kindNoneGroupSubordinate*` in ld64) -- Frame Descriptor Entries (FDEs), Language Specific Data Areas (LSDAs) (that is, catch block unwind information) and Personality Routines associated with weak functions still not stripped. This is wasteful, but harmless. - However, this does strip weaks from __unwind_info (which is needed for correctness and not just for size) - This nopes out on InputSections that are referenced form more than one symbol (eg from .alt_entry) for now Things that work based on symbols Just Work: - map files (change in MapFile.cpp is no-op and not needed; I just found it a bit more explicit) - exports Things that work with inputSections need to explicitly check if an inputSection is written (e.g. unwind info). This patch is useful in itself, but it's also likely also a useful foundation for dead_strip. I used to have a "canoncialRepresentative" pointer on InputSection instead of just the bool, which would be handy for ICF too. But I ended up not needing it for this patch, so I removed that again for now. Differential Revision: https://reviews.llvm.org/D102076
2021-05-06 14:47:57 -04:00
}
if (auto concatIsec = dyn_cast_or_null<ConcatInputSection>(isec)) {
concatIsec->wasCoalesced = true;
// Any local symbols that alias the coalesced symbol should be moved
// into the prevailing section. Note that we have sorted the symbols
// in ObjFile::parseSymbols() such that extern weak symbols appear
// last, so we don't need to worry about subsequent symbols being
// added to an already-coalesced section.
if (defined->isec())
transplantSymbolsAtOffset(concatIsec, defined->isec(),
/*skip=*/nullptr, value, defined->value);
}
return defined;
[lld/mac] Implement support for private extern symbols Private extern symbols are used for things scoped to the linkage unit. They cause duplicate symbol errors (so they're in the symbol table, unlike TU-scoped truly local symbols), but they don't make it into the export trie. They are created e.g. by compiling with -fvisibility=hidden. If two weak symbols have differing privateness, the combined symbol is non-private external. (Example: inline functions and some TUs that include the header defining it were built with -fvisibility-inlines-hidden and some weren't). A weak private external symbol implicitly has its "weak" dropped and behaves like a regular strong private external symbol: Weak is an export trie concept, and private symbols are not in the export trie. If a weak and a strong symbol have different privateness, the strong symbol wins. If two common symbols have differing privateness, the larger symbol wins. If they have the same size, the privateness of the symbol seen later during the link wins (!) -- this is a bit lame, but it matches ld64 and this behavior takes 2 lines less to implement than the less surprising "result is non-private external), so match ld64. (Example: `int a` in two .c files, both built with -fcommon, one built with -fvisibility=hidden and one without.) This also makes `__dyld_private` a true TU-local symbol, matching ld64. To make this work, make the `const char*` StringRefZ ctor to correctly set `size` (without this, writing the string table crashed when calling getName() on the __dyld_private symbol). Mention in CommonSymbol's comment that common symbols are now disabled by default in clang. Mention in -keep_private_externs's HelpText that the flag only has an effect with `-r` (which we don't implement yet -- so this patch here doesn't regress any behavior around -r + -keep_private_externs)). ld64 doesn't explicitly document it, but the commit text of http://reviews.llvm.org/rL216146 does, and ld64's OutputFile::buildSymbolTable() checks `_options.outputKind() == Options::kObjectFile` before calling `_options.keepPrivateExterns()` (the only reference to that function). Fixes PR48536. Differential Revision: https://reviews.llvm.org/D93609
2020-12-17 13:30:18 -05:00
}
if (defined->isWeakDef()) {
if (auto concatIsec =
dyn_cast_or_null<ConcatInputSection>(defined->isec())) {
concatIsec->wasCoalesced = true;
if (isec)
transplantSymbolsAtOffset(concatIsec, isec, defined, defined->value,
value);
}
} else {
std::string srcLoc1 = defined->getSourceLocation();
std::string srcLoc2 = isec ? isec->getSourceLocation(value) : "";
std::string srcFile1 = toString(defined->getFile());
std::string srcFile2 = toString(file);
dupSymDiags.push_back({make_pair(srcLoc1, srcFile1),
make_pair(srcLoc2, srcFile2), defined});
}
} else if (auto *dysym = dyn_cast<DylibSymbol>(s)) {
overridesWeakDef = !isWeakDef && dysym->isWeakDef();
dysym->unreference();
} else if (auto *undef = dyn_cast<Undefined>(s)) {
if (undef->wasBitcodeSymbol) {
auto objFile = dyn_cast<ObjFile>(file);
if (!objFile) {
// The file must be a native object file, as opposed to potentially
// being another bitcode file. A situation arises when some symbols
// are defined thru `module asm` and thus they are not present in the
// bitcode's symbol table. Consider bitcode modules `A`, `B`, and `C`.
// LTO compiles only `A` and `C`, since there's no explicit symbol
// reference to `B` other than a symbol from `A` via `module asm`.
// After LTO is finished, the missing symbol now appears in the
// resulting object file for `A`, which prematurely resolves another
// prevailing symbol with `B` that hasn't been compiled, instead of
// the resulting object for `C`. Consequently, an incorrect
// relocation is generated for the prevailing symbol.
assert(isa<BitcodeFile>(file) && "Bitcode file is expected.");
std::string message =
"The pending prevailing symbol(" + name.str() +
") in the bitcode file(" + toString(undef->getFile()) +
") is overridden by a non-native object (from bitcode): " +
toString(file);
error(message);
} else if (!objFile->builtFromBitcode) {
// Ideally, this should be an object file compiled from a bitcode
// file. However, this might not hold true if a LC linker option is
// used. In case LTO internalizes a prevailing hidden weak symbol,
// there's a situation where an unresolved prevailing symbol might be
// linked with the corresponding one from a native library, which is
// loaded later after LTO. Although this could potentially result in
// an ODR violation, we choose to permit this scenario as a warning.
std::string message = "The pending prevailing symbol(" + name.str() +
") in the bitcode file(" +
toString(undef->getFile()) +
") is overridden by a post-processed native "
"object (from native archive): " +
toString(file);
warn(message);
} else {
// Preserve the original bitcode file name (instead of using the
// object file name).
file = undef->getFile();
}
}
}
// Defined symbols take priority over other types of symbols, so in case
// of a name conflict, we fall through to the replaceSymbol() call below.
}
// With -flat_namespace, all extern symbols in dylibs are interposable.
bool interposable = ((config->namespaceKind == NamespaceKind::flat &&
config->outputType != MachO::MH_EXECUTE) ||
config->interposable) &&
!isPrivateExtern;
Defined *defined = replaceSymbol<Defined>(
s, name, file, isec, value, size, isWeakDef, /*isExternal=*/true,
isPrivateExtern, /*includeInSymtab=*/true, isReferencedDynamically,
noDeadStrip, overridesWeakDef, isWeakDefCanBeHidden, interposable);
return defined;
}
Defined *SymbolTable::aliasDefined(Defined *src, StringRef target,
InputFile *newFile, bool makePrivateExtern) {
bool isPrivateExtern = makePrivateExtern || src->privateExtern;
return addDefined(target, newFile, src->isec(), src->value, src->size,
src->isWeakDef(), isPrivateExtern,
src->referencedDynamically, src->noDeadStrip,
src->weakDefCanBeHidden);
}
Symbol *SymbolTable::addUndefined(StringRef name, InputFile *file,
bool isWeakRef) {
auto [s, wasInserted] = insert(name, file);
RefState refState = isWeakRef ? RefState::Weak : RefState::Strong;
if (wasInserted)
replaceSymbol<Undefined>(s, name, file, refState,
/*wasBitcodeSymbol=*/false);
else if (auto *lazy = dyn_cast<LazyArchive>(s))
lazy->fetchArchiveMember();
else if (isa<LazyObject>(s))
extract(*s->getFile(), s->getName());
else if (auto *dynsym = dyn_cast<DylibSymbol>(s))
dynsym->reference(refState);
else if (auto *undefined = dyn_cast<Undefined>(s))
undefined->refState = std::max(undefined->refState, refState);
return s;
}
Symbol *SymbolTable::addCommon(StringRef name, InputFile *file, uint64_t size,
[lld/mac] Implement support for private extern symbols Private extern symbols are used for things scoped to the linkage unit. They cause duplicate symbol errors (so they're in the symbol table, unlike TU-scoped truly local symbols), but they don't make it into the export trie. They are created e.g. by compiling with -fvisibility=hidden. If two weak symbols have differing privateness, the combined symbol is non-private external. (Example: inline functions and some TUs that include the header defining it were built with -fvisibility-inlines-hidden and some weren't). A weak private external symbol implicitly has its "weak" dropped and behaves like a regular strong private external symbol: Weak is an export trie concept, and private symbols are not in the export trie. If a weak and a strong symbol have different privateness, the strong symbol wins. If two common symbols have differing privateness, the larger symbol wins. If they have the same size, the privateness of the symbol seen later during the link wins (!) -- this is a bit lame, but it matches ld64 and this behavior takes 2 lines less to implement than the less surprising "result is non-private external), so match ld64. (Example: `int a` in two .c files, both built with -fcommon, one built with -fvisibility=hidden and one without.) This also makes `__dyld_private` a true TU-local symbol, matching ld64. To make this work, make the `const char*` StringRefZ ctor to correctly set `size` (without this, writing the string table crashed when calling getName() on the __dyld_private symbol). Mention in CommonSymbol's comment that common symbols are now disabled by default in clang. Mention in -keep_private_externs's HelpText that the flag only has an effect with `-r` (which we don't implement yet -- so this patch here doesn't regress any behavior around -r + -keep_private_externs)). ld64 doesn't explicitly document it, but the commit text of http://reviews.llvm.org/rL216146 does, and ld64's OutputFile::buildSymbolTable() checks `_options.outputKind() == Options::kObjectFile` before calling `_options.keepPrivateExterns()` (the only reference to that function). Fixes PR48536. Differential Revision: https://reviews.llvm.org/D93609
2020-12-17 13:30:18 -05:00
uint32_t align, bool isPrivateExtern) {
auto [s, wasInserted] = insert(name, file);
if (!wasInserted) {
if (auto *common = dyn_cast<CommonSymbol>(s)) {
if (size < common->size)
return s;
} else if (isa<Defined>(s)) {
return s;
}
// Common symbols take priority over all non-Defined symbols, so in case of
// a name conflict, we fall through to the replaceSymbol() call below.
}
[lld/mac] Implement support for private extern symbols Private extern symbols are used for things scoped to the linkage unit. They cause duplicate symbol errors (so they're in the symbol table, unlike TU-scoped truly local symbols), but they don't make it into the export trie. They are created e.g. by compiling with -fvisibility=hidden. If two weak symbols have differing privateness, the combined symbol is non-private external. (Example: inline functions and some TUs that include the header defining it were built with -fvisibility-inlines-hidden and some weren't). A weak private external symbol implicitly has its "weak" dropped and behaves like a regular strong private external symbol: Weak is an export trie concept, and private symbols are not in the export trie. If a weak and a strong symbol have different privateness, the strong symbol wins. If two common symbols have differing privateness, the larger symbol wins. If they have the same size, the privateness of the symbol seen later during the link wins (!) -- this is a bit lame, but it matches ld64 and this behavior takes 2 lines less to implement than the less surprising "result is non-private external), so match ld64. (Example: `int a` in two .c files, both built with -fcommon, one built with -fvisibility=hidden and one without.) This also makes `__dyld_private` a true TU-local symbol, matching ld64. To make this work, make the `const char*` StringRefZ ctor to correctly set `size` (without this, writing the string table crashed when calling getName() on the __dyld_private symbol). Mention in CommonSymbol's comment that common symbols are now disabled by default in clang. Mention in -keep_private_externs's HelpText that the flag only has an effect with `-r` (which we don't implement yet -- so this patch here doesn't regress any behavior around -r + -keep_private_externs)). ld64 doesn't explicitly document it, but the commit text of http://reviews.llvm.org/rL216146 does, and ld64's OutputFile::buildSymbolTable() checks `_options.outputKind() == Options::kObjectFile` before calling `_options.keepPrivateExterns()` (the only reference to that function). Fixes PR48536. Differential Revision: https://reviews.llvm.org/D93609
2020-12-17 13:30:18 -05:00
replaceSymbol<CommonSymbol>(s, name, file, size, align, isPrivateExtern);
return s;
}
Symbol *SymbolTable::addDylib(StringRef name, DylibFile *file, bool isWeakDef,
bool isTlv) {
auto [s, wasInserted] = insert(name, file);
RefState refState = RefState::Unreferenced;
if (!wasInserted) {
if (auto *defined = dyn_cast<Defined>(s)) {
if (isWeakDef && !defined->isWeakDef())
defined->overridesWeakDef = true;
} else if (auto *undefined = dyn_cast<Undefined>(s)) {
refState = undefined->refState;
} else if (auto *dysym = dyn_cast<DylibSymbol>(s)) {
refState = dysym->getRefState();
}
}
bool isDynamicLookup = file == nullptr;
if (wasInserted || isa<Undefined>(s) ||
(isa<DylibSymbol>(s) &&
((!isWeakDef && s->isWeakDef()) ||
(!isDynamicLookup && cast<DylibSymbol>(s)->isDynamicLookup())))) {
if (auto *dynsym = dyn_cast<DylibSymbol>(s))
dynsym->unreference();
replaceSymbol<DylibSymbol>(s, file, name, isWeakDef, refState, isTlv);
}
return s;
}
Symbol *SymbolTable::addDynamicLookup(StringRef name) {
return addDylib(name, /*file=*/nullptr, /*isWeakDef=*/false, /*isTlv=*/false);
}
Symbol *SymbolTable::addLazyArchive(StringRef name, ArchiveFile *file,
const object::Archive::Symbol &sym) {
auto [s, wasInserted] = insert(name, file);
if (wasInserted) {
replaceSymbol<LazyArchive>(s, file, sym);
} else if (isa<Undefined>(s)) {
file->fetch(sym);
} else if (auto *dysym = dyn_cast<DylibSymbol>(s)) {
if (dysym->isWeakDef()) {
if (dysym->getRefState() != RefState::Unreferenced)
file->fetch(sym);
else
replaceSymbol<LazyArchive>(s, file, sym);
}
}
return s;
}
Symbol *SymbolTable::addLazyObject(StringRef name, InputFile &file) {
auto [s, wasInserted] = insert(name, &file);
if (wasInserted) {
replaceSymbol<LazyObject>(s, file, name);
} else if (isa<Undefined>(s)) {
extract(file, name);
} else if (auto *dysym = dyn_cast<DylibSymbol>(s)) {
if (dysym->isWeakDef()) {
if (dysym->getRefState() != RefState::Unreferenced)
extract(file, name);
else
replaceSymbol<LazyObject>(s, file, name);
}
}
return s;
}
Defined *SymbolTable::addSynthetic(StringRef name, InputSection *isec,
uint64_t value, bool isPrivateExtern,
bool includeInSymtab,
bool referencedDynamically) {
assert(!isec || !isec->getFile()); // See makeSyntheticInputSection().
Defined *s = addDefined(name, /*file=*/nullptr, isec, value, /*size=*/0,
/*isWeakDef=*/false, isPrivateExtern,
referencedDynamically, /*noDeadStrip=*/false,
/*isWeakDefCanBeHidden=*/false);
s->includeInSymtab = includeInSymtab;
return s;
}
enum class Boundary {
Start,
End,
};
static Defined *createBoundarySymbol(const Undefined &sym) {
return symtab->addSynthetic(
sym.getName(), /*isec=*/nullptr, /*value=*/-1, /*isPrivateExtern=*/true,
/*includeInSymtab=*/false, /*referencedDynamically=*/false);
}
static void handleSectionBoundarySymbol(const Undefined &sym, StringRef segSect,
Boundary which) {
auto [segName, sectName] = segSect.split('$');
// Attach the symbol to any InputSection that will end up in the right
// OutputSection -- it doesn't matter which one we pick.
// Don't bother looking through inputSections for a matching
// ConcatInputSection -- we need to create ConcatInputSection for
// non-existing sections anyways, and that codepath works even if we should
// already have a ConcatInputSection with the right name.
OutputSection *osec = nullptr;
// This looks for __TEXT,__cstring etc.
for (SyntheticSection *ssec : syntheticSections)
if (ssec->segname == segName && ssec->name == sectName) {
osec = ssec->isec->parent;
break;
}
if (!osec) {
[lld-macho][nfc] Eliminate InputSection::Shared Earlier in LLD's evolution, I tried to create the illusion that subsections were indistinguishable from "top-level" sections. Thus, even though the subsections shared many common field values, I hid those common values away in a private Shared struct (see D105305). More recently, however, @gkm added a public `Section` struct in D113241 that served as an explicit way to store values that are common to an entire set of subsections (aka InputSections). Now that we have another "common value" struct, `Shared` has been rendered redundant. All its fields can be moved into `Section` instead, and the pointer to `Shared` can be replaced with a pointer to `Section`. This `Section` pointer also has the advantage of letting us inspect other subsections easily, simplifying the implementation of {D118798}. P.S. I do think that having both `Section` and `InputSection` makes for a slightly confusing naming scheme. I considered renaming `InputSection` to `Subsection`, but that would break the symmetry with `OutputSection`. It would also make us deviate from LLD-ELF's naming scheme. This change is perf-neutral on my 3.2 GHz 16-Core Intel Xeon W machine: base diff difference (95% CI) sys_time 1.258 ± 0.031 1.248 ± 0.023 [ -1.6% .. +0.1%] user_time 3.659 ± 0.047 3.658 ± 0.041 [ -0.5% .. +0.4%] wall_time 4.640 ± 0.085 4.625 ± 0.063 [ -1.0% .. +0.3%] samples 49 61 There's also no stat sig change in RSS (as measured by `time -l`): base diff difference (95% CI) time 998038627.097 ± 13567305.958 1003327715.556 ± 15210451.236 [ -0.2% .. +1.2%] samples 31 36 Reviewed By: #lld-macho, oontvoo Differential Revision: https://reviews.llvm.org/D118797
2022-02-03 19:53:29 -05:00
ConcatInputSection *isec = makeSyntheticInputSection(segName, sectName);
// This runs after markLive() and is only called for Undefineds that are
// live. Marking the isec live ensures an OutputSection is created that the
// start/end symbol can refer to.
assert(sym.isLive());
assert(isec->live);
// This runs after gatherInputSections(), so need to explicitly set parent
// and add to inputSections.
osec = isec->parent = ConcatOutputSection::getOrCreateForInput(isec);
inputSections.push_back(isec);
}
if (which == Boundary::Start)
osec->sectionStartSymbols.push_back(createBoundarySymbol(sym));
else
osec->sectionEndSymbols.push_back(createBoundarySymbol(sym));
}
static void handleSegmentBoundarySymbol(const Undefined &sym, StringRef segName,
Boundary which) {
OutputSegment *seg = getOrCreateOutputSegment(segName);
if (which == Boundary::Start)
seg->segmentStartSymbols.push_back(createBoundarySymbol(sym));
else
seg->segmentEndSymbols.push_back(createBoundarySymbol(sym));
}
// Try to find a definition for an undefined symbol.
// Returns true if a definition was found and no diagnostics are needed.
static bool recoverFromUndefinedSymbol(const Undefined &sym) {
// Handle start/end symbols.
StringRef name = sym.getName();
if (name.consume_front("section$start$")) {
handleSectionBoundarySymbol(sym, name, Boundary::Start);
return true;
}
if (name.consume_front("section$end$")) {
handleSectionBoundarySymbol(sym, name, Boundary::End);
return true;
}
if (name.consume_front("segment$start$")) {
handleSegmentBoundarySymbol(sym, name, Boundary::Start);
return true;
}
if (name.consume_front("segment$end$")) {
handleSegmentBoundarySymbol(sym, name, Boundary::End);
return true;
}
// Leave dtrace symbols, since we will handle them when we do the relocation
if (name.starts_with("___dtrace_"))
return true;
// Handle -U.
if (config->explicitDynamicLookups.count(sym.getName())) {
symtab->addDynamicLookup(sym.getName());
return true;
}
// Handle -undefined.
if (config->undefinedSymbolTreatment ==
UndefinedSymbolTreatment::dynamic_lookup ||
config->undefinedSymbolTreatment == UndefinedSymbolTreatment::suppress) {
symtab->addDynamicLookup(sym.getName());
return true;
}
// We do not return true here, as we still need to print diagnostics.
if (config->undefinedSymbolTreatment == UndefinedSymbolTreatment::warning)
symtab->addDynamicLookup(sym.getName());
return false;
}
namespace {
struct UndefinedDiag {
struct SectionAndOffset {
const InputSection *isec;
uint64_t offset;
};
std::vector<SectionAndOffset> codeReferences;
std::vector<std::string> otherReferences;
};
MapVector<const Undefined *, UndefinedDiag> undefs;
} // namespace
void macho::reportPendingDuplicateSymbols() {
for (const auto &duplicate : dupSymDiags) {
if (!config->deadStripDuplicates || duplicate.sym->isLive()) {
std::string message =
"duplicate symbol: " + toString(*duplicate.sym) + "\n>>> defined in ";
if (!duplicate.src1.first.empty())
message += duplicate.src1.first + "\n>>> ";
message += duplicate.src1.second + "\n>>> defined in ";
if (!duplicate.src2.first.empty())
message += duplicate.src2.first + "\n>>> ";
error(message + duplicate.src2.second);
}
}
}
// Check whether the definition name def is a mangled function name that matches
// the reference name ref.
static bool canSuggestExternCForCXX(StringRef ref, StringRef def) {
llvm::ItaniumPartialDemangler d;
std::string name = def.str();
if (d.partialDemangle(name.c_str()))
return false;
char *buf = d.getFunctionName(nullptr, nullptr);
if (!buf)
return false;
bool ret = ref == buf;
free(buf);
return ret;
}
// Suggest an alternative spelling of an "undefined symbol" diagnostic. Returns
// the suggested symbol, which is either in the symbol table, or in the same
// file of sym.
static const Symbol *getAlternativeSpelling(const Undefined &sym,
std::string &preHint,
std::string &postHint) {
DenseMap<StringRef, const Symbol *> map;
if (sym.getFile() && sym.getFile()->kind() == InputFile::ObjKind) {
// Build a map of local defined symbols.
for (const Symbol *s : sym.getFile()->symbols)
if (auto *defined = dyn_cast_or_null<Defined>(s))
if (!defined->isExternal())
map.try_emplace(s->getName(), s);
}
auto suggest = [&](StringRef newName) -> const Symbol * {
// If defined locally.
if (const Symbol *s = map.lookup(newName))
return s;
// If in the symbol table and not undefined.
if (const Symbol *s = symtab->find(newName))
if (!isa<Undefined>(s))
return s;
return nullptr;
};
// This loop enumerates all strings of Levenshtein distance 1 as typo
// correction candidates and suggests the one that exists as a non-undefined
// symbol.
StringRef name = sym.getName();
for (size_t i = 0, e = name.size(); i != e + 1; ++i) {
// Insert a character before name[i].
std::string newName = (name.substr(0, i) + "0" + name.substr(i)).str();
for (char c = '0'; c <= 'z'; ++c) {
newName[i] = c;
if (const Symbol *s = suggest(newName))
return s;
}
if (i == e)
break;
// Substitute name[i].
newName = std::string(name);
for (char c = '0'; c <= 'z'; ++c) {
newName[i] = c;
if (const Symbol *s = suggest(newName))
return s;
}
// Transpose name[i] and name[i+1]. This is of edit distance 2 but it is
// common.
if (i + 1 < e) {
newName[i] = name[i + 1];
newName[i + 1] = name[i];
if (const Symbol *s = suggest(newName))
return s;
}
// Delete name[i].
newName = (name.substr(0, i) + name.substr(i + 1)).str();
if (const Symbol *s = suggest(newName))
return s;
}
// Case mismatch, e.g. Foo vs FOO.
for (auto &it : map)
if (name.equals_insensitive(it.first))
return it.second;
for (Symbol *sym : symtab->getSymbols())
if (!isa<Undefined>(sym) && name.equals_insensitive(sym->getName()))
return sym;
// The reference may be a mangled name while the definition is not. Suggest a
// missing extern "C".
if (name.starts_with("__Z")) {
std::string buf = name.str();
llvm::ItaniumPartialDemangler d;
if (!d.partialDemangle(buf.c_str()))
if (char *buf = d.getFunctionName(nullptr, nullptr)) {
const Symbol *s = suggest((Twine("_") + buf).str());
free(buf);
if (s) {
preHint = ": extern \"C\" ";
return s;
}
}
} else {
StringRef nameWithoutUnderscore = name;
nameWithoutUnderscore.consume_front("_");
const Symbol *s = nullptr;
for (auto &it : map)
if (canSuggestExternCForCXX(nameWithoutUnderscore, it.first)) {
s = it.second;
break;
}
if (!s)
for (Symbol *sym : symtab->getSymbols())
if (canSuggestExternCForCXX(nameWithoutUnderscore, sym->getName())) {
s = sym;
break;
}
if (s) {
preHint = " to declare ";
postHint = " as extern \"C\"?";
return s;
}
}
return nullptr;
}
static void reportUndefinedSymbol(const Undefined &sym,
const UndefinedDiag &locations,
bool correctSpelling) {
std::string message = "undefined symbol";
if (config->archMultiple)
message += (" for arch " + getArchitectureName(config->arch())).str();
message += ": " + toString(sym);
const size_t maxUndefinedReferences = 3;
size_t i = 0;
for (const std::string &loc : locations.otherReferences) {
if (i >= maxUndefinedReferences)
break;
message += "\n>>> referenced by " + loc;
++i;
}
for (const UndefinedDiag::SectionAndOffset &loc : locations.codeReferences) {
if (i >= maxUndefinedReferences)
break;
message += "\n>>> referenced by ";
std::string src = loc.isec->getSourceLocation(loc.offset);
if (!src.empty())
message += src + "\n>>> ";
message += loc.isec->getLocation(loc.offset);
++i;
}
size_t totalReferences =
locations.otherReferences.size() + locations.codeReferences.size();
if (totalReferences > i)
message +=
("\n>>> referenced " + Twine(totalReferences - i) + " more times")
.str();
if (correctSpelling) {
std::string preHint = ": ", postHint;
if (const Symbol *corrected =
getAlternativeSpelling(sym, preHint, postHint)) {
message +=
"\n>>> did you mean" + preHint + toString(*corrected) + postHint;
if (corrected->getFile())
message += "\n>>> defined in: " + toString(corrected->getFile());
}
}
if (config->undefinedSymbolTreatment == UndefinedSymbolTreatment::error)
error(message);
else if (config->undefinedSymbolTreatment ==
UndefinedSymbolTreatment::warning)
warn(message);
else
assert(false && "diagnostics make sense for -undefined error|warning only");
}
void macho::reportPendingUndefinedSymbols() {
// Enable spell corrector for the first 2 diagnostics.
for (const auto &[i, undef] : llvm::enumerate(undefs))
reportUndefinedSymbol(*undef.first, undef.second, i < 2);
// This function is called multiple times during execution. Clear the printed
// diagnostics to avoid printing the same things again the next time.
undefs.clear();
}
void macho::treatUndefinedSymbol(const Undefined &sym, StringRef source) {
if (recoverFromUndefinedSymbol(sym))
return;
undefs[&sym].otherReferences.push_back(source.str());
}
void macho::treatUndefinedSymbol(const Undefined &sym, const InputSection *isec,
uint64_t offset) {
if (recoverFromUndefinedSymbol(sym))
return;
undefs[&sym].codeReferences.push_back({isec, offset});
}
std::unique_ptr<SymbolTable> macho::symtab;