Revert "[pseudo] Add error-recovery framework & brace-based recovery"

This reverts commit a0f4c10ae2.
This commit hadn't been reviewed yet, and was unintentionally included
on another branch.
This commit is contained in:
Sam McCall
2022-06-28 21:09:15 +02:00
parent 9553d69580
commit 743971faaf
12 changed files with 38 additions and 516 deletions

View File

@@ -144,26 +144,6 @@ void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
const ParseParams &Params);
// Heuristically recover from a state where no further parsing is possible.
//
// OldHeads is the parse state at TokenIndex.
// This function consumes zero or more tokens (advancing TokenIndex),
// and places any recovery states created in NewHeads.
//
// On failure, NewHeads is empty and TokenIndex is unchanged.
//
// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
// there is a risk of the parser falling into an infinite loop, creating an
// endless sequence of recovery nodes.
// Generally it is safe for recovery to match 0 tokens against sequence symbols
// like `statement-seq`, as the grammar won't permit another statement-seq
// immediately afterwards. However recovery strategies for `statement` should
// consume at least one token, as statements may be adjacent in the input.
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
unsigned &TokenIndex, const TokenStream &Tokens,
const ParseParams &Params,
std::vector<const GSS::Node *> &NewHeads);
} // namespace pseudo
} // namespace clang

View File

@@ -81,12 +81,9 @@ inline tok::TokenKind symbolToToken(SymbolID SID) {
assert(SID < NumTerminals);
return static_cast<tok::TokenKind>(SID);
}
inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
inline SymbolID tokenSymbol(tok::TokenKind TK) {
return TokenFlag | static_cast<SymbolID>(TK);
}
// Error recovery strategies.
// FIXME: these should be provided as extensions instead.
enum class RecoveryStrategy : uint8_t { None, Braces };
// An extension is a piece of native code specific to a grammar that modifies
// the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -110,7 +107,7 @@ struct Rule {
// length to 9 (this is the longest sequence in cxx grammar).
static constexpr unsigned SizeBits = 4;
static constexpr unsigned MaxElements = 9;
static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
static_assert(SizeBits + SymbolBits <= 16,
"Must be able to store symbol ID + size efficiently");
@@ -126,13 +123,6 @@ struct Rule {
// being set for this rule.
ExtensionID Guard = 0;
// Specifies the index within Sequence eligible for error recovery.
// Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
// should recover by finding the matching brace, and forcing stmt-seq to match
// everything between braces.
uint8_t RecoveryIndex = -1;
RecoveryStrategy Recovery = RecoveryStrategy::None;
llvm::ArrayRef<SymbolID> seq() const {
return llvm::ArrayRef<SymbolID>(Sequence, Size);
}

View File

@@ -137,20 +137,8 @@ public:
SymbolID Label;
};
// A possible error recovery: choose to match some tokens against a symbol.
//
// e.g. a state that contains
// stmt := { . stmt-seq [recover=braces] }
// has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
struct Recovery {
StateID Src; // The state we are in when encountering the error.
RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
SymbolID Result; // The symbol that is produced.
};
llvm::ArrayRef<State> states() const { return States; }
llvm::ArrayRef<Edge> edges() const { return Edges; }
llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
return StartStates;
}
@@ -159,15 +147,12 @@ public:
private:
LRGraph(std::vector<State> States, std::vector<Edge> Edges,
std::vector<Recovery> Recoveries,
std::vector<std::pair<SymbolID, StateID>> StartStates)
: States(std::move(States)), Edges(std::move(Edges)),
Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
}
StartStates(std::move(StartStates)) {}
std::vector<State> States;
std::vector<Edge> Edges;
std::vector<Recovery> Recoveries;
std::vector<std::pair<SymbolID, StateID>> StartStates;
};

View File

@@ -121,14 +121,6 @@ public:
uint16_t Value : ValueBits;
};
struct Recovery {
RecoveryStrategy Strategy;
SymbolID Result;
};
// Returns all available actions for the given state on a terminal.
// Expected to be called by LR parsers.
llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
// Returns the state after we reduce a nonterminal.
// Expected to be called by LR parsers.
// REQUIRES: Nonterminal is valid here.
@@ -159,12 +151,6 @@ public:
symbolToToken(Terminal));
}
// Looks up available recovery actions if we stopped parsing in this state.
llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
Recoveries.data() + RecoveryOffset[State + 1]);
}
// Returns the state from which the LR parser should start to parse the input
// tokens as the given StartSymbol.
//
@@ -202,15 +188,9 @@ public:
StateID State;
RuleID Rule;
};
struct RecoveryEntry {
StateID State;
RecoveryStrategy Strategy;
SymbolID Result;
};
// Build a specified table for testing purposes.
static LRTable buildForTests(const Grammar &, llvm::ArrayRef<Entry>,
llvm::ArrayRef<ReduceEntry>,
llvm::ArrayRef<RecoveryEntry> = {});
// Build a specified table for testing purposes.
static LRTable buildForTests(const Grammar &G, llvm::ArrayRef<Entry>,
llvm::ArrayRef<ReduceEntry>);
private:
// Looks up actions stored in the generic table.
@@ -242,11 +222,6 @@ private:
// This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
// as an index: Nonterminal * NUM_TOKENS + Token.
llvm::BitVector FollowSets;
// Recovery stores all recovery actions from all states.
// A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
std::vector<uint32_t> RecoveryOffset;
std::vector<Recovery> Recoveries;
};
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);

View File

@@ -24,156 +24,6 @@
namespace clang {
namespace pseudo {
namespace {
llvm::Optional<unsigned>
findRecoveryEndpoint(RecoveryStrategy Strategy,
const GSS::Node *RecoveryNode,
const TokenStream &Tokens) {
assert(Strategy == RecoveryStrategy::Braces);
const ForestNode *LBrace = RecoveryNode->Payload;
assert(LBrace->kind() == ForestNode::Terminal &&
LBrace->symbol() == tokenSymbol(tok::l_brace));
if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
return Tokens.index(*RBrace);
return llvm::None;
}
} // namespace
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
unsigned &TokenIndex, const TokenStream &Tokens,
const ParseParams &Params,
std::vector<const GSS::Node *> &NewHeads) {
LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
// Describes a possibility to recover by forcibly interpreting a range of
// tokens around the cursor as a nonterminal that we expected to see.
struct PlaceholderRecovery {
// The token prior to the nonterminal which is being recovered.
// This is the start of the region we're skipping, so higher Position is better.
Token::Index Position;
// The nonterminal which will be created in order to recover.
SymbolID Symbol;
// The heuristic used to choose the bounds of the nonterminal to recover.
RecoveryStrategy Strategy;
// The GSS head where we are expecting the recovered nonterminal.
const GSS::Node *RecoveryNode;
// Payload of nodes on the way back from the OldHead to the recovery node.
// These represent the partial parse that is being discarded.
// They should become the children of the opaque recovery node.
//
// There may be multiple paths leading to the same recovery node, we choose
// one arbitrarily.
std::vector<const ForestNode *> Path;
};
std::vector<PlaceholderRecovery> Options;
// Find recovery options by walking up the stack.
//
// This is similar to exception handling: we walk up the "frames" of nested
// rules being parsed until we find one that has a "handler" which allows us
// to determine the node bounds without parsing it.
//
// Unfortunately there's a significant difference: the stack contains both
// "upward" nodes (ancestor parses) and "leftward" ones.
// e.g. when parsing `int(2 + ?)`, the stack contains:
// expr := expr + . expr - which we're currently parsing
// expr := type ( . expr ) - (up) we should recover this outer expr
// expr := . type ( expr ) - (up+left) we should not recover this type!
//
// It's not obvious how to avoid collecting the latter as a recovery option.
// I think the distinction is ill-defined after merging items into states.
// For now, we have to take this into account when defining recovery rules.
// FIXME: find a more satisfying way to avoid such false recovery.
std::vector<const ForestNode *> Path;
llvm::DenseSet<const GSS::Node *> Seen;
auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
if (!Seen.insert(N).second)
return;
for (auto Strategy : Params.Table.getRecovery(N->State)) {
Options.push_back(PlaceholderRecovery{
NextTok,
Strategy.Result,
Strategy.Strategy,
N,
Path,
});
LLVM_DEBUG(llvm::dbgs()
<< "Option: recover " << Params.G.symbolName(Strategy.Result)
<< " at token " << NextTok << "\n");
}
Path.push_back(N->Payload);
for (const GSS::Node *Parent : N->parents())
DFS(Parent, N->Payload->startTokenIndex(), DFS);
Path.pop_back();
};
for (auto *N : llvm::reverse(OldHeads))
DFS(N, TokenIndex, DFS);
// Now we select the option(s) we will use to recover.
//
// We prefer options starting further right, as these discard less code
// (e.g. we prefer to recover inner scopes rather than outer ones).
// The options also need to agree on an endpoint, so the parser has a
// consistent position afterwards.
//
// So conceptually we're sorting by the tuple (start, end), though we avoid
// computing `end` for options that can't be winners.
// Consider options starting further right first.
// Don't drop the others yet though, we may still use them if preferred fails.
llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
return L.Position > R.Position;
});
assert(NewHeads.empty()); // We may repeatedly populate and clear it.
llvm::Optional<Token::Range> RecoveryRange;
for (const PlaceholderRecovery &Option : Options) {
// If this starts further right than options we've already found, then
// we'll never find anything better. Skip computing End for the rest.
if (RecoveryRange && Option.Position < RecoveryRange->Begin)
break;
auto End =
findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
// Only consider recovery that advances the parse.
if (!End || *End <= TokenIndex)
continue;
if (RecoveryRange) {
// If this is worse than our previous options, ignore it.
if (RecoveryRange->End < *End)
continue;
// If this is an improvement over our previous options, then drop them.
if (RecoveryRange->End > *End)
NewHeads.clear();
}
// Create recovery nodes and heads for them in the GSS. These may be
// discarded if a better recovery is later found, but this path isn't hot.
RecoveryRange = {Option.Position, *End};
const ForestNode &Placeholder =
Params.Forest.createOpaque(Option.Symbol, Option.Position);
const GSS::Node *NewHead = Params.GSStack.addNode(
Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
&Placeholder, {Option.RecoveryNode});
NewHeads.push_back(NewHead);
}
// Advance the cursor, whether recovery succeeded or not.
if (RecoveryRange) {
LLVM_DEBUG({
llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
for (const auto *Head : NewHeads)
llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
llvm::dbgs() << "\n";
});
TokenIndex = RecoveryRange->End;
} else {
LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
<< " strategies\n");
++TokenIndex;
}
}
using StateID = LRTable::StateID;
@@ -181,9 +31,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
std::vector<std::string> ParentStates;
for (const auto *Parent : N.parents())
ParentStates.push_back(llvm::formatv("{0}", Parent->State));
OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
N.Payload ? N.Payload->symbol() : 0,
llvm::join(ParentStates, ", "));
OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
N.Payload->symbol(), llvm::join(ParentStates, ", "));
return OS;
}
@@ -578,27 +427,15 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
GSS.gc(std::move(Roots));
};
// Each iteration fully processes a single token.
for (unsigned I = 0; I < Terminals.size();) {
for (unsigned I = 0; I < Terminals.size(); ++I) {
LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
"Next token {0} (id={1})\n",
G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
// Consume the token.
glrShift(Heads, Terminals[I], Params, NextHeads);
// If we weren't able to consume the token, try to skip over some tokens
// so we can keep parsing.
if (NextHeads.empty()) {
glrRecover(Heads, I, Tokens, Params, NextHeads);
if (NextHeads.empty())
// FIXME: Ensure the `_ := start-symbol` rules have a fallback
// error-recovery strategy attached. Then this condition can't happen.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
} else
++I;
// Form nonterminals containing the token we just consumed.
SymbolID Lookahead =
I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
: Terminals[I + 1].symbol();
Reduce(NextHeads, Lookahead);
// Prepare for the next token.
std::swap(Heads, NextHeads);
@@ -607,35 +444,22 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
}
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
// The parse was successful if we're in state `_ := start-symbol .`
StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
}
const ForestNode *Result = nullptr;
for (const auto *Head : Heads) {
if (Head->State == AcceptState) {
assert(Head->Payload->symbol() == StartSymbol);
assert(Result == nullptr && "multiple results!");
Result = Head->Payload;
}
return Result;
};
if (auto *Result = SearchForAccept(Heads))
}
if (Result)
return *Result;
// Failed to parse the input, attempt to run recovery.
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
// More elegant is to include EOF in the token stream, and make the
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
// would not be a special case: it shows up as a failure to shift the EOF
// token.
unsigned I = Terminals.size();
glrRecover(Heads, I, Tokens, Params, NextHeads);
Reduce(NextHeads, tokenSymbol(tok::eof));
if (auto *Result = SearchForAccept(NextHeads))
return *Result;
// We failed to parse the input, returning an opaque forest node for recovery.
// FIXME: as above, we can add fallback error handling so this is impossible.
//
// FIXME: We will need to invoke our generic error-recovery handlers when we
// reach EOF without reaching accept state, and involving the eof
// token in the above main for-loop may be the best way to reuse the code.
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
}
@@ -646,10 +470,9 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
}
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
llvm::ArrayRef<const Node *> Parents) {
Node *Result = new (allocate(Parents.size()))
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
Node({State, GCParity, static_cast<unsigned>(Parents.size())});
Alive.push_back(Result);
++NodesCreated;
Result->Payload = Symbol;

View File

@@ -59,11 +59,8 @@ std::string Grammar::dumpRule(RuleID RID) const {
llvm::raw_string_ostream OS(Result);
const Rule &R = T->Rules[RID];
OS << symbolName(R.Target) << " :=";
for (unsigned I = 0; I < R.Size; ++I) {
OS << " " << symbolName(R.Sequence[I]);
if (R.RecoveryIndex == I)
OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
}
for (SymbolID SID : R.seq())
OS << " " << symbolName(SID);
if (R.Guard)
OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
return Result;

View File

@@ -106,17 +106,6 @@ public:
assert(T->Rules.size() < (1 << RuleBits) &&
"Too many rules to fit in RuleID bits!");
// Wherever RHS contains { foo }, mark foo for brace-recovery.
// FIXME: this should be grammar annotations instead.
for (auto &Rule : T->Rules) {
for (unsigned I = 2; I < Rule.Size; ++I)
if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
!isToken(Rule.Sequence[I - 1])) {
Rule.Recovery = RecoveryStrategy::Braces;
Rule.RecoveryIndex = I - 1;
}
}
const auto &SymbolOrder = getTopologicalOrder(T.get());
llvm::stable_sort(
T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {

View File

@@ -120,20 +120,6 @@ nextAvailableKernelItems(const State &S, const Grammar &G) {
return Results;
}
std::vector<std::pair<RecoveryStrategy, SymbolID>>
availableRecovery(const State &S, const Grammar &G) {
std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
for (const Item &I : S.Items) {
const auto &Rule = G.lookupRule(I.rule());
if (I.dot() != Rule.RecoveryIndex)
continue;
Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
}
llvm::sort(Result);
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
return Result;
}
} // namespace
std::string Item::dump(const Grammar &G) const {
@@ -144,10 +130,9 @@ std::string Item::dump(const Grammar &G) const {
Results.push_back(G.symbolName(SID));
return Results;
};
return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
.str();
}
@@ -196,11 +181,6 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
Edges.push_back({Src, Dst, Label});
}
void insertRecovery(StateID Src, RecoveryStrategy Strategy,
SymbolID Result) {
Recoveries.push_back({Src, Strategy, Result});
}
// Returns a state with the given id.
const State &find(StateID ID) const {
assert(ID < States.size());
@@ -214,10 +194,9 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
LRGraph build() && {
States.shrink_to_fit();
Edges.shrink_to_fit();
Recoveries.shrink_to_fit();
llvm::sort(StartStates);
StartStates.shrink_to_fit();
return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
return LRGraph(std::move(States), std::move(Edges),
std::move(StartStates));
}
@@ -226,7 +205,6 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
std::vector<State> States;
std::vector<Edge> Edges;
std::vector<Recovery> Recoveries;
const Grammar &G;
std::vector<std::pair<SymbolID, StateID>> StartStates;
} Builder(G);
@@ -247,16 +225,15 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
}
while (!PendingStates.empty()) {
auto StateID = PendingStates.back();
auto CurrentStateID = PendingStates.back();
PendingStates.pop_back();
for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
for (auto Next :
nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
auto Insert = Builder.insert(Next.second);
if (Insert.second) // new state, insert to the pending queue.
PendingStates.push_back(Insert.first);
Builder.insertEdge(StateID, Insert.first, Next.first);
Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
}
for (auto Recovery : availableRecovery(Builder.find(StateID), G))
Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
}
return std::move(Builder).build();
}

View File

@@ -11,7 +11,6 @@
#include "clang-pseudo/grammar/LRTable.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
namespace llvm {
@@ -45,7 +44,6 @@ struct LRTable::Builder {
llvm::DenseSet<Entry> Entries;
llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduces;
std::vector<llvm::DenseSet<SymbolID>> FollowSets;
std::vector<LRGraph::Recovery> Recoveries;
LRTable build(unsigned NumStates) && {
// E.g. given the following parsing table with 3 states and 3 terminals:
@@ -90,26 +88,6 @@ struct LRTable::Builder {
}
Table.StartStates = std::move(StartStates);
// Error recovery entries: sort (no dups already), and build offset lookup.
llvm::sort(Recoveries,
[&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
return std::tie(L.Src, L.Result, L.Strategy) <
std::tie(R.Src, R.Result, R.Strategy);
});
Table.Recoveries.reserve(Recoveries.size());
for (const auto &R : Recoveries)
Table.Recoveries.push_back({R.Strategy, R.Result});
Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
SortedIndex = 0;
for (StateID State = 0; State < NumStates; ++State) {
Table.RecoveryOffset[State] = SortedIndex;
while (SortedIndex < Recoveries.size() &&
Recoveries[SortedIndex].Src == State)
SortedIndex++;
}
Table.RecoveryOffset[NumStates] = SortedIndex;
assert(SortedIndex == Recoveries.size());
// Compile the follow sets into a bitmap.
Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
@@ -136,8 +114,7 @@ struct LRTable::Builder {
};
LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
llvm::ArrayRef<ReduceEntry> Reduces,
llvm::ArrayRef<RecoveryEntry> Recoveries) {
llvm::ArrayRef<ReduceEntry> Reduces) {
StateID MaxState = 0;
for (const auto &Entry : Entries) {
MaxState = std::max(MaxState, Entry.State);
@@ -151,8 +128,6 @@ LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
for (const ReduceEntry &E : Reduces)
Build.Reduces[E.State].insert(E.Rule);
Build.FollowSets = followSets(G);
for (const auto &R : Recoveries)
Build.Recoveries.push_back({R.State, R.Strategy, R.Result});
return std::move(Build).build(/*NumStates=*/MaxState + 1);
}
@@ -160,7 +135,6 @@ LRTable LRTable::buildSLR(const Grammar &G) {
auto Graph = LRGraph::buildLR0(G);
Builder Build;
Build.StartStates = Graph.startStates();
Build.Recoveries = Graph.recoveries();
for (const auto &T : Graph.edges()) {
Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
Build.Entries.insert({T.Src, T.Label, Act});

View File

@@ -2,7 +2,7 @@
class Foo {
public:
};
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification }
// CHECK-NEXT: ├─class-head := class-key class-head-name
// CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]

View File

@@ -1,13 +0,0 @@
// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
auto x = { complete garbage };
// CHECK: translation-unit~simple-declaration
// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
// CHECK-NEXT: ├─init-declarator-list~init-declarator
// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
// CHECK-NEXT: │ ├─= := tok[2]
// CHECK-NEXT: │ └─initializer-clause~braced-init-list
// CHECK-NEXT: │ ├─{ := tok[3]
// CHECK-NEXT: │ ├─initializer-list := <opaque>
// CHECK-NEXT: │ └─} := tok[6]
// CHECK-NEXT: └─; := tok[7]

View File

@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "clang-pseudo/GLR.h"
#include "clang-pseudo/Bracket.h"
#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/LangOptions.h"
@@ -33,13 +32,11 @@ namespace {
using Action = LRTable::Action;
using testing::AllOf;
using testing::ElementsAre;
using testing::IsEmpty;
using testing::UnorderedElementsAre;
MATCHER_P(state, StateID, "") { return arg->State == StateID; }
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
testing::Matcher<const GSS::Node *>
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
@@ -241,9 +238,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
/*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
const auto *GSSNode2 = GSStack.addNode(
/*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
const auto *GSSNode3 = GSStack.addNode(
/*State=*/3, /*ForestNode=*/ClassNameNode,
/*Parents=*/{GSSNode1});
const auto *GSSNode3 =
GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
/*Parents=*/{GSSNode1});
const auto *GSSNode4 =
GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
/*Parents=*/{GSSNode2});
@@ -366,124 +363,6 @@ TEST_F(GLRTest, ReduceLookahead) {
EXPECT_THAT(Heads, ElementsAre(GSSNode1));
}
TEST_F(GLRTest, Recover) {
// Recovery while parsing "word" inside braces.
// Before:
// 0--1({)--2(?)
// After recovering a `word` at state 1:
// 0--3(word) // 3 is goto(1, word)
buildGrammar({"word"}, {});
LRTable Table = LRTable::buildForTests(
G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
/*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
auto *Question1 = &Arena.createTerminal(tok::question, 1);
const auto *Root = GSStack.addNode(0, nullptr, {});
const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
// Need a token stream with paired braces so the strategy works.
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
pairBrackets(Tokens);
std::vector<const GSS::Node *> NewHeads;
unsigned TokenIndex = 2;
glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
NewHeads);
EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
EXPECT_THAT(NewHeads, ElementsAre(
AllOf(state(3), parsedSymbolID(id("word")),
parents({OpenedBraces}), start(1u))));
EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
// Test recovery failure: omit closing brace so strategy fails
TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
pairBrackets(NoRBrace);
NewHeads.clear();
TokenIndex = 2;
glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
{G, Table, Arena, GSStack}, NewHeads);
EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
EXPECT_THAT(NewHeads, IsEmpty());
}
TEST_F(GLRTest, RecoverRightmost) {
// In a nested block structure, we recover at the innermost possible block.
// Before:
// 0--1({)--1({)--1({)
// After recovering a `block` at inside the second braces:
// 0--1({)--2(body) // 2 is goto(1, body)
buildGrammar({"body"}, {});
LRTable Table = LRTable::buildForTests(
G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
/*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
clang::LangOptions LOptions;
// Innermost brace is unmatched, to test fallback to next brace.
TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
Tokens.tokens()[0].Pair = 5;
Tokens.tokens()[1].Pair = 4;
Tokens.tokens()[4].Pair = 1;
Tokens.tokens()[5].Pair = 0;
auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
const auto *Root = GSStack.addNode(0, nullptr, {});
const auto *In1 = GSStack.addNode(1, Brace1, {Root});
const auto *In2 = GSStack.addNode(1, Brace2, {In1});
const auto *In3 = GSStack.addNode(1, Brace3, {In2});
unsigned TokenIndex = 3;
std::vector<const GSS::Node *> NewHeads;
glrRecover({In3}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, NewHeads);
EXPECT_EQ(TokenIndex, 5u);
EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
parents({In2}), start(2u))));
}
TEST_F(GLRTest, RecoverAlternatives) {
// Recovery inside braces with multiple equally good options
// Before:
// 0--1({)
// After recovering either `word` or `number` inside the braces:
// 0--1({)--2(word) // 2 is goto(1, word)
// └--3(number) // 3 is goto(1, number)
buildGrammar({"number", "word"}, {});
LRTable Table = LRTable::buildForTests(
G,
{
{/*State=*/1, id("number"), Action::goTo(2)},
{/*State=*/1, id("word"), Action::goTo(3)},
},
/*Reduce=*/{},
/*Recovery=*/
{
{/*State=*/1, RecoveryStrategy::Braces, id("number")},
{/*State=*/1, RecoveryStrategy::Braces, id("word")},
});
auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
const auto *Root = GSStack.addNode(0, nullptr, {});
const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
pairBrackets(Tokens);
std::vector<const GSS::Node *> NewHeads;
unsigned TokenIndex = 1;
glrRecover({OpenedBraces}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
NewHeads);
EXPECT_EQ(TokenIndex, 2u);
EXPECT_THAT(NewHeads,
UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
parents({OpenedBraces}), start(1u)),
AllOf(state(3), parsedSymbolID(id("word")),
parents({OpenedBraces}), start(1u))));
}
TEST_F(GLRTest, PerfectForestNodeSharing) {
// Run the GLR on a simple grammar and test that we build exactly one forest
// node per (SymbolID, token range).
@@ -552,40 +431,6 @@ TEST_F(GLRTest, GLRReduceOrder) {
"[ 0, end) └─IDENTIFIER := tok[0]\n");
}
TEST_F(GLRTest, RecoveryEndToEnd) {
// Simple example of brace-based recovery showing:
// - recovered region includes tokens both ahead of and behind the cursor
// - multiple possible recovery rules
// - recovery from outer scopes is rejected
build(R"bnf(
_ := block
block := { block }
block := { numbers }
numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
)bnf");
auto LRTable = LRTable::buildSLR(G);
clang::LangOptions LOptions;
TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
pairBrackets(Tokens);
const ForestNode &Parsed =
glrParse(Tokens, {G, LRTable, Arena, GSStack}, id("block"));
EXPECT_EQ(Parsed.dumpRecursive(G),
"[ 0, end) block := { block [recover=1] }\n"
"[ 0, 1) ├─{ := tok[0]\n"
"[ 1, 5) ├─block := <ambiguous>\n"
"[ 1, 5) │ ├─block := { block [recover=1] }\n"
"[ 1, 2) │ │ ├─{ := tok[1]\n"
"[ 2, 4) │ │ ├─block := <opaque>\n"
"[ 4, 5) │ │ └─} := tok[4]\n"
"[ 1, 5) │ └─block := { numbers [recover=1] }\n"
"[ 1, 2) │ ├─{ := tok[1]\n"
"[ 2, 4) │ ├─numbers := <opaque>\n"
"[ 4, 5) │ └─} := tok[4]\n"
"[ 5, end) └─} := tok[5]\n");
}
TEST_F(GLRTest, NoExplicitAccept) {
build(R"bnf(
_ := test