mirror of
https://github.com/intel/llvm.git
synced 2026-01-23 16:06:39 +08:00
Fix handling of medial hyphens in Unicode Names.
If a Unicode name was stored in a way that caused a medial hyphen to be at the end of a chunk, it would not be properly ignored by the loose matching algorithm. For example, if `LEFT-TO-RIGHT OVERRIDE` was stored as `LEFT-` [...], the `-` would not be ignored. The generator now ensures nodes are not cut across medial hyphen boundaries. Fixes #64161 Differential Revision: https://reviews.llvm.org/D156518
This commit is contained in:
@@ -117,6 +117,10 @@ Bug Fixes to C++ Support
|
||||
This limit can be modified by `-fconstexpr-steps`.
|
||||
(`#63562 <https://github.com/llvm/llvm-project/issues/63562>`_)
|
||||
|
||||
- Fix a crash caused by some named unicode escape sequences designating
|
||||
a Unicode character whose name contains a ``-``.
|
||||
(`Fixes #64161 <https://github.com/llvm/llvm-project/issues/64161>`_)
|
||||
|
||||
Bug Fixes to AST Handling
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
@@ -385,10 +385,10 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
|
||||
++I;
|
||||
auto Delim = std::find(I, Input.end(), '}');
|
||||
assert(Delim != Input.end());
|
||||
StringRef Name(I, std::distance(I, Delim));
|
||||
std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
|
||||
llvm::sys::unicode::nameToCodepointLooseMatching(
|
||||
StringRef(I, std::distance(I, Delim)));
|
||||
assert(Res);
|
||||
llvm::sys::unicode::nameToCodepointLooseMatching(Name);
|
||||
assert(Res && "could not find a codepoint that was previously found");
|
||||
CodePoint = Res->CodePoint;
|
||||
assert(CodePoint != 0xFFFFFFFF);
|
||||
appendCodePoint(CodePoint, Buf);
|
||||
|
||||
@@ -159,3 +159,6 @@ int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
|
||||
// expected-warning@-1 {{incomplete}}\
|
||||
// expected-error@-1 {{expected unqualified-id}}
|
||||
#endif
|
||||
|
||||
// GH64161
|
||||
int A\N{LEFT-TO-RIGHT OVERRIDE}; // expected-error {{character <U+202D> not allowed in an identifier}}
|
||||
|
||||
@@ -119,7 +119,7 @@ static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
|
||||
|
||||
static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
|
||||
std::size_t &Consummed, char &PreviousCharInName,
|
||||
char &PreviousCharInNeedle, bool IsPrefix = false) {
|
||||
bool IsPrefix = false) {
|
||||
|
||||
Consummed = 0;
|
||||
if (Strict) {
|
||||
@@ -135,18 +135,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
|
||||
auto NeedlePos = Needle.begin();
|
||||
|
||||
char PreviousCharInNameOrigin = PreviousCharInName;
|
||||
char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
|
||||
|
||||
char PreviousCharInNeedle = *Needle.begin();
|
||||
auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
|
||||
bool IgnoreEnd = false) {
|
||||
bool IsPrefix = false) {
|
||||
while (It != End) {
|
||||
const auto Next = std::next(It);
|
||||
// Ignore spaces, underscore, medial hyphens
|
||||
// https://unicode.org/reports/tr44/#UAX44-LM2.
|
||||
// The generator ensures a needle never ends (or starts) with a medial
|
||||
// hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
|
||||
bool Ignore =
|
||||
*It == ' ' || *It == '_' ||
|
||||
(*It == '-' && isAlnum(PreviousChar) &&
|
||||
((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
|
||||
((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
|
||||
PreviousChar = *It;
|
||||
if (!Ignore)
|
||||
break;
|
||||
@@ -171,20 +171,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
|
||||
Consummed = std::distance(Name.begin(), NamePos);
|
||||
if (NeedlePos != Needle.end()) {
|
||||
PreviousCharInName = PreviousCharInNameOrigin;
|
||||
PreviousCharInNeedle = PreviousCharInNeedleOrigin;
|
||||
}
|
||||
return NeedlePos == Needle.end();
|
||||
}
|
||||
|
||||
static std::tuple<Node, bool, uint32_t>
|
||||
compareNode(uint32_t Offset, StringRef Name, bool Strict,
|
||||
char PreviousCharInName, char PreviousCharInNeedle,
|
||||
BufferType &Buffer, const Node *Parent = nullptr) {
|
||||
char PreviousCharInName, BufferType &Buffer,
|
||||
const Node *Parent = nullptr) {
|
||||
Node N = readNode(Offset, Parent);
|
||||
std::size_t Consummed = 0;
|
||||
bool DoesStartWith =
|
||||
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
|
||||
PreviousCharInName, PreviousCharInNeedle);
|
||||
bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
|
||||
PreviousCharInName);
|
||||
if (!DoesStartWith)
|
||||
return std::make_tuple(N, false, 0);
|
||||
|
||||
@@ -199,7 +197,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
|
||||
uint32_t Value;
|
||||
std::tie(C, Matches, Value) =
|
||||
compareNode(ChildOffset, Name.substr(Consummed), Strict,
|
||||
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
|
||||
PreviousCharInName, Buffer, &N);
|
||||
if (Matches) {
|
||||
std::reverse_copy(C.Name.begin(), C.Name.end(),
|
||||
std::back_inserter(Buffer));
|
||||
@@ -215,7 +213,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
|
||||
|
||||
static std::tuple<Node, bool, uint32_t>
|
||||
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
|
||||
return compareNode(Offset, Name, Strict, 0, 0, Buffer);
|
||||
return compareNode(Offset, Name, Strict, 0, Buffer);
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
@@ -262,7 +260,6 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
|
||||
char &PreviousInName, int &Pos, int Column) {
|
||||
assert(Column == 0 || Column == 1 || Column == 2);
|
||||
static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
|
||||
char NeedleStart = 0;
|
||||
int Len = -1;
|
||||
int Prev = PreviousInName;
|
||||
for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
|
||||
@@ -271,8 +268,8 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
|
||||
continue;
|
||||
std::size_t Consummed = 0;
|
||||
char PreviousInNameCopy = PreviousInName;
|
||||
bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
|
||||
PreviousInNameCopy, NeedleStart);
|
||||
bool DoesStartWith =
|
||||
startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
|
||||
if (!DoesStartWith)
|
||||
continue;
|
||||
Len = Consummed;
|
||||
@@ -290,9 +287,9 @@ nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
|
||||
Buffer.clear();
|
||||
// Hangul Syllable Decomposition
|
||||
std::size_t Consummed = 0;
|
||||
char NameStart = 0, NeedleStart = 0;
|
||||
bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
|
||||
NameStart, NeedleStart);
|
||||
char NameStart = 0;
|
||||
bool DoesStartWith =
|
||||
startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
|
||||
if (!DoesStartWith)
|
||||
return std::nullopt;
|
||||
Name = Name.substr(Consummed);
|
||||
@@ -348,9 +345,9 @@ nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
|
||||
for (auto &&Item : GeneratedNamesDataTable) {
|
||||
Buffer.clear();
|
||||
std::size_t Consummed = 0;
|
||||
char NameStart = 0, NeedleStart = 0;
|
||||
char NameStart = 0;
|
||||
bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
|
||||
NameStart, NeedleStart, /*isPrefix*/ true);
|
||||
NameStart, /*IsPrefix=*/true);
|
||||
if (!DoesStartWith)
|
||||
continue;
|
||||
auto Number = Name.substr(Consummed);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -316,7 +316,14 @@ TEST(Unicode, nameToCodepointLoose) {
|
||||
EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
|
||||
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
|
||||
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
|
||||
;
|
||||
|
||||
// GH64161
|
||||
EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT OVERRIDE"));
|
||||
EXPECT_EQ(0x202Du, map("LEFT TO RIGHT OVERRIDE"));
|
||||
EXPECT_EQ(0x202Du, map("LEFTTORIGHTOVERRIDE"));
|
||||
EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT-OVERRIDE"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("-LEFT-TO-RIGHT OVERRIDE"));
|
||||
EXPECT_FALSE(nameToCodepointLooseMatching("LEFT-TO-RIGHT OVERRIDE-"));
|
||||
|
||||
// special case
|
||||
EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
|
||||
|
||||
@@ -95,8 +95,23 @@ public:
|
||||
// Once all characters are inserted, the tree is compacted
|
||||
void insert(llvm::StringRef Name, char32_t Codepoint) {
|
||||
Node *N = Root.get();
|
||||
for (auto Ch : Name) {
|
||||
bool IsBeforeMedial = false;
|
||||
for (auto ChIt = Name.begin(); ChIt != Name.end();
|
||||
ChIt += (IsBeforeMedial ? 3 : 1)) {
|
||||
char Ch = *ChIt;
|
||||
assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
|
||||
|
||||
std::string Label(1, Ch);
|
||||
|
||||
// We need to ensure a node never ends or starts with
|
||||
// a medial hyphen as this would break the
|
||||
// loose matching algorithm.
|
||||
IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
|
||||
*(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
|
||||
llvm::isAlnum(*(ChIt + 2));
|
||||
if (IsBeforeMedial)
|
||||
Label.assign(ChIt, ChIt + 3);
|
||||
|
||||
auto It = llvm::find_if(N->Children,
|
||||
[&](const auto &C) { return C->Name == Label; });
|
||||
if (It == N->Children.end()) {
|
||||
|
||||
Reference in New Issue
Block a user