Fix handling of medial hyphens in Unicode Names.

If a Unicode name was stored in a way that caused
a medial hyphen to be at the end of a chunk, it would not
be properly ignored by the loose matching algorithm.

For example if `LEFT-TO-RIGHT OVERRIDE` was stored as
`LEFT-` [...], the `-` would not be ignored.

The generator now ensures nodes are not cut across
medial hyphen boundaries.

Fixes #64161

Differential Revision: https://reviews.llvm.org/D156518
This commit is contained in:
Corentin Jabot
2023-07-28 10:07:47 +02:00
parent a428b5afbd
commit 68410fbed7
7 changed files with 20940 additions and 20841 deletions

View File

@@ -117,6 +117,10 @@ Bug Fixes to C++ Support
This limit can be modified by `-fconstexpr-steps`.
(`#63562 <https://github.com/llvm/llvm-project/issues/63562>`_)
- Fix a crash caused by some named unicode escape sequences designating
a Unicode character whose name contains a ``-``.
(`Fixes #64161 <https://github.com/llvm/llvm-project/issues/64161>`_)
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@@ -385,10 +385,10 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
++I;
auto Delim = std::find(I, Input.end(), '}');
assert(Delim != Input.end());
StringRef Name(I, std::distance(I, Delim));
std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
llvm::sys::unicode::nameToCodepointLooseMatching(
StringRef(I, std::distance(I, Delim)));
assert(Res);
llvm::sys::unicode::nameToCodepointLooseMatching(Name);
assert(Res && "could not find a codepoint that was previously found");
CodePoint = Res->CodePoint;
assert(CodePoint != 0xFFFFFFFF);
appendCodePoint(CodePoint, Buf);

View File

@@ -159,3 +159,6 @@ int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
// expected-warning@-1 {{incomplete}}\
// expected-error@-1 {{expected unqualified-id}}
#endif
// GH64161
int A\N{LEFT-TO-RIGHT OVERRIDE}; // expected-error {{character <U+202D> not allowed in an identifier}}

View File

@@ -119,7 +119,7 @@ static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
std::size_t &Consummed, char &PreviousCharInName,
char &PreviousCharInNeedle, bool IsPrefix = false) {
bool IsPrefix = false) {
Consummed = 0;
if (Strict) {
@@ -135,18 +135,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
auto NeedlePos = Needle.begin();
char PreviousCharInNameOrigin = PreviousCharInName;
char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
char PreviousCharInNeedle = *Needle.begin();
auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
bool IgnoreEnd = false) {
bool IsPrefix = false) {
while (It != End) {
const auto Next = std::next(It);
// Ignore spaces, underscore, medial hyphens
// https://unicode.org/reports/tr44/#UAX44-LM2.
// The generator ensures a needle never ends (or starts) by a medial
// hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
bool Ignore =
*It == ' ' || *It == '_' ||
(*It == '-' && isAlnum(PreviousChar) &&
((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
PreviousChar = *It;
if (!Ignore)
break;
@@ -171,20 +171,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
Consummed = std::distance(Name.begin(), NamePos);
if (NeedlePos != Needle.end()) {
PreviousCharInName = PreviousCharInNameOrigin;
PreviousCharInNeedle = PreviousCharInNeedleOrigin;
}
return NeedlePos == Needle.end();
}
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict,
char PreviousCharInName, char PreviousCharInNeedle,
BufferType &Buffer, const Node *Parent = nullptr) {
char PreviousCharInName, BufferType &Buffer,
const Node *Parent = nullptr) {
Node N = readNode(Offset, Parent);
std::size_t Consummed = 0;
bool DoesStartWith =
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName, PreviousCharInNeedle);
bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName);
if (!DoesStartWith)
return std::make_tuple(N, false, 0);
@@ -199,7 +197,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
uint32_t Value;
std::tie(C, Matches, Value) =
compareNode(ChildOffset, Name.substr(Consummed), Strict,
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
PreviousCharInName, Buffer, &N);
if (Matches) {
std::reverse_copy(C.Name.begin(), C.Name.end(),
std::back_inserter(Buffer));
@@ -215,7 +213,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
return compareNode(Offset, Name, Strict, 0, 0, Buffer);
return compareNode(Offset, Name, Strict, 0, Buffer);
}
// clang-format off
@@ -262,7 +260,6 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
char &PreviousInName, int &Pos, int Column) {
assert(Column == 0 || Column == 1 || Column == 2);
static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
char NeedleStart = 0;
int Len = -1;
int Prev = PreviousInName;
for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
@@ -271,8 +268,8 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
continue;
std::size_t Consummed = 0;
char PreviousInNameCopy = PreviousInName;
bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
PreviousInNameCopy, NeedleStart);
bool DoesStartWith =
startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
if (!DoesStartWith)
continue;
Len = Consummed;
@@ -290,9 +287,9 @@ nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
Buffer.clear();
// Hangul Syllable Decomposition
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
NameStart, NeedleStart);
char NameStart = 0;
bool DoesStartWith =
startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
if (!DoesStartWith)
return std::nullopt;
Name = Name.substr(Consummed);
@@ -348,9 +345,9 @@ nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
for (auto &&Item : GeneratedNamesDataTable) {
Buffer.clear();
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
char NameStart = 0;
bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
NameStart, NeedleStart, /*isPrefix*/ true);
NameStart, /*IsPrefix=*/true);
if (!DoesStartWith)
continue;
auto Number = Name.substr(Consummed);

File diff suppressed because it is too large Load Diff

View File

@@ -316,7 +316,14 @@ TEST(Unicode, nameToCodepointLoose) {
EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
;
// GH64161
EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT OVERRIDE"));
EXPECT_EQ(0x202Du, map("LEFT TO RIGHT OVERRIDE"));
EXPECT_EQ(0x202Du, map("LEFTTORIGHTOVERRIDE"));
EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT-OVERRIDE"));
EXPECT_FALSE(nameToCodepointLooseMatching("-LEFT-TO-RIGHT OVERRIDE"));
EXPECT_FALSE(nameToCodepointLooseMatching("LEFT-TO-RIGHT OVERRIDE-"));
// special case
EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));

View File

@@ -95,8 +95,23 @@ public:
// Once all characters are inserted, the tree is compacted
void insert(llvm::StringRef Name, char32_t Codepoint) {
Node *N = Root.get();
for (auto Ch : Name) {
bool IsBeforeMedial = false;
for (auto ChIt = Name.begin(); ChIt != Name.end();
ChIt += (IsBeforeMedial ? 3 : 1)) {
char Ch = *ChIt;
assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
std::string Label(1, Ch);
// We need to ensure a node never ends or starts by
// a medial hyphen as this would break the
// loose matching algorithm.
IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
*(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
llvm::isAlnum(*(ChIt + 2));
if (IsBeforeMedial)
Label.assign(ChIt, ChIt + 3);
auto It = llvm::find_if(N->Children,
[&](const auto &C) { return C->Name == Label; });
if (It == N->Children.end()) {