[NFC][GlobPattern] Add GlobPattern::longest_substr() (#164512)

Finds the longest (almost) plain substring in the pattern.

Implementation is conservative to avoid false positives.

The result is not used to optimize
`GlobPattern::match()`, so it is calculated on
request.

For
* https://github.com/llvm/llvm-project/pull/164545

---------

Co-authored-by: Luke Lau <luke@igalia.com>
This commit is contained in:
Vitaly Buka
2025-10-22 20:46:40 -07:00
committed by GitHub
parent 3c2dae6919
commit 6fdef0bbe2
3 changed files with 117 additions and 0 deletions

View File

@@ -79,6 +79,9 @@ public:
StringRef prefix() const {
  return Pattern.substr(0, PrefixSize);
}
// Returns the plain trailing part of the pattern, i.e. its last SuffixSize
// characters.
StringRef suffix() const {
  return Pattern.take_back(SuffixSize);
}
// Returns the longest plain substring of the pattern located between
// prefix() and suffix(); the result is empty when there is none.
// The search is conservative and may report a shorter substring than the
// true longest one (for example, it stops at a brace expansion).
// The value is computed on every call; it is not cached.
StringRef longest_substr() const;
private:
StringRef Pattern;

View File

@@ -132,6 +132,49 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
return std::move(SubPatterns);
}
// Returns the longest run of characters in S that contains none of the glob
// metacharacters '?', '*', '[', '{' or '\\' and that lies outside any bracket
// expression or escape sequence. S is expected to be the part of an already
// validated glob pattern that remains once the plain prefix and suffix have
// been removed.
//
// The scan is conservative: brace expansions are not analysed, so the first
// '{' ends the search and the best candidate found so far is returned. When
// several runs share the maximum length, the earliest one wins.
static StringRef maxPlainSubstring(StringRef S) {
  StringRef Best;
  while (!S.empty()) {
    size_t PlainSize = S.find_first_of("?*[{\\");
    if (PlainSize == StringRef::npos)
      PlainSize = S.size();

    if (Best.size() < PlainSize)
      Best = S.take_front(PlainSize);
    S = S.drop_front(PlainSize);

    // The remaining input can end in a plain run when it terminates with a
    // character that is not in the metacharacter set above (for example an
    // unmatched ']' or '}', assuming the caller keeps those out of the
    // suffix). Stop here instead of calling front() on an empty string.
    if (S.empty())
      break;

    switch (S.front()) {
    case '\\':
      // Skip the backslash together with the character it escapes.
      S = S.drop_front(2);
      break;
    case '[': {
      // Drop '[' and the first character which can be ']'.
      S = S.drop_front(2);
      size_t EndBracket = S.find(']');
      // Should not be possible, SubGlobPattern::create should fail on invalid
      // pattern before we get here.
      assert(EndBracket != StringRef::npos);
      S = S.drop_front(EndBracket + 1);
      break;
    }
    case '{':
      // TODO: implement.
      // Fallback to whatever is best for now.
      return Best;
    default:
      // '?' or '*': a single metacharacter, skip it.
      S = S.drop_front(1);
    }
  }
  return Best;
}
Expected<GlobPattern>
GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
GlobPattern Pat;
@@ -202,6 +245,11 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
return Pat;
}
// Computes the longest plain substring of the pattern section that is left
// after the plain prefix and the plain suffix have been stripped.
StringRef GlobPattern::longest_substr() const {
  const StringRef Middle =
      Pattern.drop_front(PrefixSize).drop_back(SuffixSize);
  return maxPlainSubstring(Middle);
}
bool GlobPattern::match(StringRef S) const {
if (!S.consume_front(prefix()))
return false;

View File

@@ -329,6 +329,72 @@ TEST_F(GlobPatternTest, PrefixSuffix) {
EXPECT_EQ("cd", Pat->suffix());
}
// Checks GlobPattern::longest_substr() against a table of patterns and the
// substring each one is expected to report.
TEST_F(GlobPatternTest, Substr) {
  struct SubstrCase {
    const char *Glob;
    const char *Longest;
  };
  const SubstrCase Cases[] = {
      // Nothing is left between prefix and suffix.
      {"", ""},
      {"abcd", ""},
      {"a*bcd", ""},
      {"*abcd", ""},
      {"abcd*", ""},
      // Plain runs separated by '*'; the earliest of equally long runs wins.
      {"a*bc*d", "bc"},
      {"a*bc*def*g", "def"},
      {"a*bcd*ef*g", "bcd"},
      {"a*bcd*efg*h", "bcd"},
      // Bracket expressions, including ']' as the first class member.
      {"a*bcd[ef]g*h", "bcd"},
      {"a*bc[d]efg*h", "efg"},
      {"a*bc[]]efg*h", "efg"},
      // Escapes and '?'.
      {"a*bcde\\fg*h", "bcde"},
      {"a*bcde\\[fg*h", "bcde"},
      {"a*bcde?fg*h", "bcde"},
      // Brace expansion stops the search.
      {"a*bcdef{g}*h", "bcdef"},
  };

  for (const SubstrCase &C : Cases) {
    SCOPED_TRACE(C.Glob);
    auto Pat = GlobPattern::create(C.Glob);
    ASSERT_TRUE((bool)Pat);
    EXPECT_EQ(C.Longest, Pat->longest_substr());
  }
}
TEST_F(GlobPatternTest, Pathological) {
std::string P, S(40, 'a');
StringRef Pieces[] = {"a*", "[ba]*", "{b*,a*}*"};