Fix handling of medial hyphens in Unicode Names.

If a Unicode name was stored in a way that caused a medial hyphen to be at the end of a chunk, it would not be properly ignored by the loose matching algorithm. For example, if `LEFT-TO-RIGHT OVERRIDE` was stored as `LEFT-` [...], the `-` would not be ignored. The generator now ensures nodes are not cut across medial hyphen boundaries. Fixes #64161. Differential Revision: https://reviews.llvm.org/D156518
2026-01-23 16:06:39 +08:00 · 2023-07-28 10:07:47 +02:00
parent a428b5afbd
commit 68410fbed7
7 changed files with 20940 additions and 20841 deletions
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -117,6 +117,10 @@ Bug Fixes to C++ Support
  This limit can be modified by `-fconstexpr-steps`.
  (`#63562 <https://github.com/llvm/llvm-project/issues/63562>`_)

+- Fix a crash caused by some named unicode escape sequences designating
+  a Unicode character whose name contains a ``-``.
+  (`Fixes #64161 <https://github.com/llvm/llvm-project/issues/64161>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -385,10 +385,10 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
      ++I;
      auto Delim = std::find(I, Input.end(), '}');
      assert(Delim != Input.end());
+      StringRef Name(I, std::distance(I, Delim));
      std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
-          llvm::sys::unicode::nameToCodepointLooseMatching(
-              StringRef(I, std::distance(I, Delim)));
-      assert(Res);
+          llvm::sys::unicode::nameToCodepointLooseMatching(Name);
+      assert(Res && "could not find a codepoint that was previously found");
      CodePoint = Res->CodePoint;
      assert(CodePoint != 0xFFFFFFFF);
      appendCodePoint(CodePoint, Buf);
--- a/clang/test/Preprocessor/ucn-pp-identifier.c
+++ b/clang/test/Preprocessor/ucn-pp-identifier.c
@@ -159,3 +159,6 @@ int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
 // expected-warning@-1 {{incomplete}}\
 // expected-error@-1 {{expected unqualified-id}}
 #endif
+
+// GH64161
+int A\N{LEFT-TO-RIGHT OVERRIDE}; // expected-error {{character <U+202D> not allowed in an identifier}}
--- a/llvm/lib/Support/UnicodeNameToCodepoint.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepoint.cpp
@@ -119,7 +119,7 @@ static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {

 static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
                       std::size_t &Consummed, char &PreviousCharInName,
-                       char &PreviousCharInNeedle, bool IsPrefix = false) {
+                       bool IsPrefix = false) {

  Consummed = 0;
  if (Strict) {
@@ -135,18 +135,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
  auto NeedlePos = Needle.begin();

  char PreviousCharInNameOrigin = PreviousCharInName;
-  char PreviousCharInNeedleOrigin = PreviousCharInNeedle;
-
+  char PreviousCharInNeedle = *Needle.begin();
  auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
-                         bool IgnoreEnd = false) {
+                         bool IsPrefix = false) {
    while (It != End) {
      const auto Next = std::next(It);
      // Ignore spaces, underscore, medial hyphens
-      // https://unicode.org/reports/tr44/#UAX44-LM2.
+      // The generator ensures a needle never ends (or starts) with a medial
+      // hyphen. See https://unicode.org/reports/tr44/#UAX44-LM2.
      bool Ignore =
          *It == ' ' || *It == '_' ||
          (*It == '-' && isAlnum(PreviousChar) &&
-           ((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
+           ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
      PreviousChar = *It;
      if (!Ignore)
        break;
@@ -171,20 +171,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
  Consummed = std::distance(Name.begin(), NamePos);
  if (NeedlePos != Needle.end()) {
    PreviousCharInName = PreviousCharInNameOrigin;
-    PreviousCharInNeedle = PreviousCharInNeedleOrigin;
  }
  return NeedlePos == Needle.end();
 }

 static std::tuple<Node, bool, uint32_t>
 compareNode(uint32_t Offset, StringRef Name, bool Strict,
-            char PreviousCharInName, char PreviousCharInNeedle,
-            BufferType &Buffer, const Node *Parent = nullptr) {
+            char PreviousCharInName, BufferType &Buffer,
+            const Node *Parent = nullptr) {
  Node N = readNode(Offset, Parent);
  std::size_t Consummed = 0;
-  bool DoesStartWith =
-      N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
-                             PreviousCharInName, PreviousCharInNeedle);
+  bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
+                                              PreviousCharInName);
  if (!DoesStartWith)
    return std::make_tuple(N, false, 0);

@@ -199,7 +197,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
      uint32_t Value;
      std::tie(C, Matches, Value) =
          compareNode(ChildOffset, Name.substr(Consummed), Strict,
-                      PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
+                      PreviousCharInName, Buffer, &N);
      if (Matches) {
        std::reverse_copy(C.Name.begin(), C.Name.end(),
                          std::back_inserter(Buffer));
@@ -215,7 +213,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,

 static std::tuple<Node, bool, uint32_t>
 compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
-  return compareNode(Offset, Name, Strict, 0, 0, Buffer);
+  return compareNode(Offset, Name, Strict, 0, Buffer);
 }

 // clang-format off
@@ -262,7 +260,6 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
                                char &PreviousInName, int &Pos, int Column) {
  assert(Column == 0 || Column == 1 || Column == 2);
  static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
-  char NeedleStart = 0;
  int Len = -1;
  int Prev = PreviousInName;
  for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
@@ -271,8 +268,8 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
      continue;
    std::size_t Consummed = 0;
    char PreviousInNameCopy = PreviousInName;
-    bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
-                                    PreviousInNameCopy, NeedleStart);
+    bool DoesStartWith =
+        startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
    if (!DoesStartWith)
      continue;
    Len = Consummed;
@@ -290,9 +287,9 @@ nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  Buffer.clear();
  // Hangul Syllable Decomposition
  std::size_t Consummed = 0;
-  char NameStart = 0, NeedleStart = 0;
-  bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
-                                  NameStart, NeedleStart);
+  char NameStart = 0;
+  bool DoesStartWith =
+      startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
  if (!DoesStartWith)
    return std::nullopt;
  Name = Name.substr(Consummed);
@@ -348,9 +345,9 @@ nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
  for (auto &&Item : GeneratedNamesDataTable) {
    Buffer.clear();
    std::size_t Consummed = 0;
-    char NameStart = 0, NeedleStart = 0;
+    char NameStart = 0;
    bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
-                                    NameStart, NeedleStart, /*isPrefix*/ true);
+                                    NameStart, /*IsPrefix=*/true);
    if (!DoesStartWith)
      continue;
    auto Number = Name.substr(Consummed);
--- a/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
+++ b/llvm/lib/Support/UnicodeNameToCodepointGenerated.cpp
--- a/llvm/unittests/Support/UnicodeTest.cpp
+++ b/llvm/unittests/Support/UnicodeTest.cpp
@@ -316,7 +316,14 @@ TEST(Unicode, nameToCodepointLoose) {
  EXPECT_EQ(0x0F68u, map("TIBETAN LETTER-A"));
  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER -A"));
  EXPECT_EQ(0x0F60u, map("TIBETAN LETTER  -A"));
-  ;
+
+  // GH64161
+  EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT OVERRIDE"));
+  EXPECT_EQ(0x202Du, map("LEFT TO RIGHT OVERRIDE"));
+  EXPECT_EQ(0x202Du, map("LEFTTORIGHTOVERRIDE"));
+  EXPECT_EQ(0x202Du, map("LEFT-TO-RIGHT-OVERRIDE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("-LEFT-TO-RIGHT OVERRIDE"));
+  EXPECT_FALSE(nameToCodepointLooseMatching("LEFT-TO-RIGHT OVERRIDE-"));

  // special case
  EXPECT_EQ(0x1180u, map("HANGUL JUNGSEONG O-E"));
--- a/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
+++ b/llvm/utils/UnicodeData/UnicodeNameMappingGenerator.cpp
@@ -95,8 +95,23 @@ public:
  // Once all  characters are inserted, the tree is compacted
  void insert(llvm::StringRef Name, char32_t Codepoint) {
    Node *N = Root.get();
-    for (auto Ch : Name) {
+    bool IsBeforeMedial = false;
+    for (auto ChIt = Name.begin(); ChIt != Name.end();
+         ChIt += (IsBeforeMedial ? 3 : 1)) {
+      char Ch = *ChIt;
+      assert(Letters.contains(Ch) && "Unexpected symbol in Unicode name");
+
      std::string Label(1, Ch);
+
+      // We need to ensure a node never ends or starts with
+      // a medial hyphen as this would break the
+      // loose matching algorithm.
+      IsBeforeMedial = llvm::isAlnum(Ch) && ChIt + 1 != Name.end() &&
+                       *(ChIt + 1) == '-' && ChIt + 2 != Name.end() &&
+                       llvm::isAlnum(*(ChIt + 2));
+      if (IsBeforeMedial)
+        Label.assign(ChIt, ChIt + 3);
+
      auto It = llvm::find_if(N->Children,
                              [&](const auto &C) { return C->Name == Label; });
      if (It == N->Children.end()) {