mirror of
https://github.com/intel/llvm.git
synced 2026-01-24 00:20:25 +08:00
Revert "[pseudo] Add error-recovery framework & brace-based recovery"
This reverts commit a0f4c10ae2.
This commit hadn't been reviewed yet, and was unintentionally included
on another branch.
This commit is contained in:
@@ -144,26 +144,6 @@ void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
|
||||
void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
|
||||
const ParseParams &Params);
|
||||
|
||||
// Heuristically recover from a state where no further parsing is possible.
|
||||
//
|
||||
// OldHeads is the parse state at TokenIndex.
|
||||
// This function consumes consumes zero or more tokens (advancing TokenIndex),
|
||||
// and places any recovery states created in NewHeads.
|
||||
//
|
||||
// On failure, NewHeads is empty and TokenIndex is unchanged.
|
||||
//
|
||||
// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
|
||||
// there is a risk of the parser falling into an infinite loop, creating an
|
||||
// endless sequence of recovery nodes.
|
||||
// Generally it is safe for recovery to match 0 tokens against sequence symbols
|
||||
// like `statement-seq`, as the grammar won't permit another statement-seq
|
||||
// immediately afterwards. However recovery strategies for `statement` should
|
||||
// consume at least one token, as statements may be adjacent in the input.
|
||||
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
|
||||
unsigned &TokenIndex, const TokenStream &Tokens,
|
||||
const ParseParams &Params,
|
||||
std::vector<const GSS::Node *> &NewHeads);
|
||||
|
||||
} // namespace pseudo
|
||||
} // namespace clang
|
||||
|
||||
|
||||
@@ -81,12 +81,9 @@ inline tok::TokenKind symbolToToken(SymbolID SID) {
|
||||
assert(SID < NumTerminals);
|
||||
return static_cast<tok::TokenKind>(SID);
|
||||
}
|
||||
inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
|
||||
inline SymbolID tokenSymbol(tok::TokenKind TK) {
|
||||
return TokenFlag | static_cast<SymbolID>(TK);
|
||||
}
|
||||
// Error recovery strategies.
|
||||
// FIXME: these should be provided as extensions instead.
|
||||
enum class RecoveryStrategy : uint8_t { None, Braces };
|
||||
|
||||
// An extension is a piece of native code specific to a grammar that modifies
|
||||
// the behavior of annotated rules. One ExtensionID is assigned for each unique
|
||||
@@ -110,7 +107,7 @@ struct Rule {
|
||||
// length to 9 (this is the longest sequence in cxx grammar).
|
||||
static constexpr unsigned SizeBits = 4;
|
||||
static constexpr unsigned MaxElements = 9;
|
||||
static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
|
||||
static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
|
||||
static_assert(SizeBits + SymbolBits <= 16,
|
||||
"Must be able to store symbol ID + size efficiently");
|
||||
|
||||
@@ -126,13 +123,6 @@ struct Rule {
|
||||
// being set for this rule.
|
||||
ExtensionID Guard = 0;
|
||||
|
||||
// Specifies the index within Sequence eligible for error recovery.
|
||||
// Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
|
||||
// should recover by finding the matching brace, and forcing stmt-seq to match
|
||||
// everything between braces.
|
||||
uint8_t RecoveryIndex = -1;
|
||||
RecoveryStrategy Recovery = RecoveryStrategy::None;
|
||||
|
||||
llvm::ArrayRef<SymbolID> seq() const {
|
||||
return llvm::ArrayRef<SymbolID>(Sequence, Size);
|
||||
}
|
||||
|
||||
@@ -137,20 +137,8 @@ public:
|
||||
SymbolID Label;
|
||||
};
|
||||
|
||||
// A possible error recovery: choose to match some tokens against a symbol.
|
||||
//
|
||||
// e.g. a state that contains
|
||||
// stmt := { . stmt-seq [recover=braces] }
|
||||
// has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
|
||||
struct Recovery {
|
||||
StateID Src; // The state we are in when encountering the error.
|
||||
RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
|
||||
SymbolID Result; // The symbol that is produced.
|
||||
};
|
||||
|
||||
llvm::ArrayRef<State> states() const { return States; }
|
||||
llvm::ArrayRef<Edge> edges() const { return Edges; }
|
||||
llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
|
||||
llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
|
||||
return StartStates;
|
||||
}
|
||||
@@ -159,15 +147,12 @@ public:
|
||||
|
||||
private:
|
||||
LRGraph(std::vector<State> States, std::vector<Edge> Edges,
|
||||
std::vector<Recovery> Recoveries,
|
||||
std::vector<std::pair<SymbolID, StateID>> StartStates)
|
||||
: States(std::move(States)), Edges(std::move(Edges)),
|
||||
Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
|
||||
}
|
||||
StartStates(std::move(StartStates)) {}
|
||||
|
||||
std::vector<State> States;
|
||||
std::vector<Edge> Edges;
|
||||
std::vector<Recovery> Recoveries;
|
||||
std::vector<std::pair<SymbolID, StateID>> StartStates;
|
||||
};
|
||||
|
||||
|
||||
@@ -121,14 +121,6 @@ public:
|
||||
uint16_t Value : ValueBits;
|
||||
};
|
||||
|
||||
struct Recovery {
|
||||
RecoveryStrategy Strategy;
|
||||
SymbolID Result;
|
||||
};
|
||||
|
||||
// Returns all available actions for the given state on a terminal.
|
||||
// Expected to be called by LR parsers.
|
||||
llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
|
||||
// Returns the state after we reduce a nonterminal.
|
||||
// Expected to be called by LR parsers.
|
||||
// REQUIRES: Nonterminal is valid here.
|
||||
@@ -159,12 +151,6 @@ public:
|
||||
symbolToToken(Terminal));
|
||||
}
|
||||
|
||||
// Looks up available recovery actions if we stopped parsing in this state.
|
||||
llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
|
||||
return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
|
||||
Recoveries.data() + RecoveryOffset[State + 1]);
|
||||
}
|
||||
|
||||
// Returns the state from which the LR parser should start to parse the input
|
||||
// tokens as the given StartSymbol.
|
||||
//
|
||||
@@ -202,15 +188,9 @@ public:
|
||||
StateID State;
|
||||
RuleID Rule;
|
||||
};
|
||||
struct RecoveryEntry {
|
||||
StateID State;
|
||||
RecoveryStrategy Strategy;
|
||||
SymbolID Result;
|
||||
};
|
||||
// Build a specified table for testing purposes.
|
||||
static LRTable buildForTests(const Grammar &, llvm::ArrayRef<Entry>,
|
||||
llvm::ArrayRef<ReduceEntry>,
|
||||
llvm::ArrayRef<RecoveryEntry> = {});
|
||||
// Build a specifid table for testing purposes.
|
||||
static LRTable buildForTests(const Grammar &G, llvm::ArrayRef<Entry>,
|
||||
llvm::ArrayRef<ReduceEntry>);
|
||||
|
||||
private:
|
||||
// Looks up actions stored in the generic table.
|
||||
@@ -242,11 +222,6 @@ private:
|
||||
// This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
|
||||
// as an index: Nonterminal * NUM_TOKENS + Token.
|
||||
llvm::BitVector FollowSets;
|
||||
|
||||
// Recovery stores all recovery actions from all states.
|
||||
// A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
|
||||
std::vector<uint32_t> RecoveryOffset;
|
||||
std::vector<Recovery> Recoveries;
|
||||
};
|
||||
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
|
||||
|
||||
|
||||
@@ -24,156 +24,6 @@
|
||||
|
||||
namespace clang {
|
||||
namespace pseudo {
|
||||
namespace {
|
||||
|
||||
llvm::Optional<unsigned>
|
||||
findRecoveryEndpoint(RecoveryStrategy Strategy,
|
||||
const GSS::Node *RecoveryNode,
|
||||
const TokenStream &Tokens) {
|
||||
assert(Strategy == RecoveryStrategy::Braces);
|
||||
const ForestNode *LBrace = RecoveryNode->Payload;
|
||||
assert(LBrace->kind() == ForestNode::Terminal &&
|
||||
LBrace->symbol() == tokenSymbol(tok::l_brace));
|
||||
if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
|
||||
return Tokens.index(*RBrace);
|
||||
return llvm::None;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
|
||||
unsigned &TokenIndex, const TokenStream &Tokens,
|
||||
const ParseParams &Params,
|
||||
std::vector<const GSS::Node *> &NewHeads) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
|
||||
// Describes a possibility to recover by forcibly interpreting a range of
|
||||
// tokens around the cursor as a nonterminal that we expected to see.
|
||||
struct PlaceholderRecovery {
|
||||
// The token prior to the nonterminal which is being recovered.
|
||||
// This starts of the region we're skipping, so higher Position is better.
|
||||
Token::Index Position;
|
||||
// The nonterminal which will be created in order to recover.
|
||||
SymbolID Symbol;
|
||||
// The heuristic used to choose the bounds of the nonterminal to recover.
|
||||
RecoveryStrategy Strategy;
|
||||
|
||||
// The GSS head where we are expecting the recovered nonterminal.
|
||||
const GSS::Node *RecoveryNode;
|
||||
// Payload of nodes on the way back from the OldHead to the recovery node.
|
||||
// These represent the partial parse that is being discarded.
|
||||
// They should become the children of the opaque recovery node.
|
||||
//
|
||||
// There may be multiple paths leading to the same recovery node, we choose
|
||||
// one arbitrarily.
|
||||
std::vector<const ForestNode *> Path;
|
||||
};
|
||||
std::vector<PlaceholderRecovery> Options;
|
||||
|
||||
// Find recovery options by walking up the stack.
|
||||
//
|
||||
// This is similar to exception handling: we walk up the "frames" of nested
|
||||
// rules being parsed until we find one that has a "handler" which allows us
|
||||
// to determine the node bounds without parsing it.
|
||||
//
|
||||
// Unfortunately there's a significant difference: the stack contains both
|
||||
// "upward" nodes (ancestor parses) and "leftward" ones.
|
||||
// e.g. when parsing `int(2 + ?)`, the stack contains:
|
||||
// expr := expr + . expr - which we're currently parsing
|
||||
// expr := type ( . expr ) - (up) we should recover this outer expr
|
||||
// expr := . type ( expr ) - (up+left) we should not recover this type!
|
||||
//
|
||||
// It's not obvious how to avoid collecting the latter as a recovery option.
|
||||
// I think the distinction is ill-defined after merging items into states.
|
||||
// For now, we have to take this into account when defining recovery rules.
|
||||
// FIXME: find a more satisfying way to avoid such false recovery.
|
||||
std::vector<const ForestNode *> Path;
|
||||
llvm::DenseSet<const GSS::Node *> Seen;
|
||||
auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
|
||||
if (!Seen.insert(N).second)
|
||||
return;
|
||||
for (auto Strategy : Params.Table.getRecovery(N->State)) {
|
||||
Options.push_back(PlaceholderRecovery{
|
||||
NextTok,
|
||||
Strategy.Result,
|
||||
Strategy.Strategy,
|
||||
N,
|
||||
Path,
|
||||
});
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< "Option: recover " << Params.G.symbolName(Strategy.Result)
|
||||
<< " at token " << NextTok << "\n");
|
||||
}
|
||||
Path.push_back(N->Payload);
|
||||
for (const GSS::Node *Parent : N->parents())
|
||||
DFS(Parent, N->Payload->startTokenIndex(), DFS);
|
||||
Path.pop_back();
|
||||
};
|
||||
for (auto *N : llvm::reverse(OldHeads))
|
||||
DFS(N, TokenIndex, DFS);
|
||||
|
||||
// Now we select the option(s) we will use to recover.
|
||||
//
|
||||
// We prefer options starting further right, as these discard less code
|
||||
// (e.g. we prefer to recover inner scopes rather than outer ones).
|
||||
// The options also need to agree on an endpoint, so the parser has a
|
||||
// consistent position afterwards.
|
||||
//
|
||||
// So conceptually we're sorting by the tuple (start, end), though we avoid
|
||||
// computing `end` for options that can't be winners.
|
||||
|
||||
// Consider options starting further right first.
|
||||
// Don't drop the others yet though, we may still use them if preferred fails.
|
||||
llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
|
||||
return L.Position > R.Position;
|
||||
});
|
||||
|
||||
assert(NewHeads.empty()); // We may repeatedly populate and clear it.
|
||||
llvm::Optional<Token::Range> RecoveryRange;
|
||||
for (const PlaceholderRecovery &Option : Options) {
|
||||
// If this starts further right than options we've already found, then
|
||||
// we'll never find anything better. Skip computing End for the rest.
|
||||
if (RecoveryRange && Option.Position < RecoveryRange->Begin)
|
||||
break;
|
||||
|
||||
auto End =
|
||||
findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
|
||||
// Only consider recovery that advances the parse.
|
||||
if (!End || *End <= TokenIndex)
|
||||
continue;
|
||||
if (RecoveryRange) {
|
||||
// If this is worse than our previous options, ignore it.
|
||||
if (RecoveryRange->End < *End)
|
||||
continue;
|
||||
// If this is an improvement over our previous options, then drop them.
|
||||
if (RecoveryRange->End > *End)
|
||||
NewHeads.clear();
|
||||
}
|
||||
// Create recovery nodes and heads for them in the GSS. These may be
|
||||
// discarded if a better recovery is later found, but this path isn't hot.
|
||||
RecoveryRange = {Option.Position, *End};
|
||||
const ForestNode &Placeholder =
|
||||
Params.Forest.createOpaque(Option.Symbol, Option.Position);
|
||||
const GSS::Node *NewHead = Params.GSStack.addNode(
|
||||
Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
|
||||
&Placeholder, {Option.RecoveryNode});
|
||||
NewHeads.push_back(NewHead);
|
||||
}
|
||||
|
||||
// Advance the cursor, whether recovery succeeded or not.
|
||||
if (RecoveryRange) {
|
||||
LLVM_DEBUG({
|
||||
llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
|
||||
for (const auto *Head : NewHeads)
|
||||
llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
|
||||
llvm::dbgs() << "\n";
|
||||
});
|
||||
TokenIndex = RecoveryRange->End;
|
||||
} else {
|
||||
LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
|
||||
<< " strategies\n");
|
||||
++TokenIndex;
|
||||
}
|
||||
}
|
||||
|
||||
using StateID = LRTable::StateID;
|
||||
|
||||
@@ -181,9 +31,8 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GSS::Node &N) {
|
||||
std::vector<std::string> ParentStates;
|
||||
for (const auto *Parent : N.parents())
|
||||
ParentStates.push_back(llvm::formatv("{0}", Parent->State));
|
||||
OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
|
||||
N.Payload ? N.Payload->symbol() : 0,
|
||||
llvm::join(ParentStates, ", "));
|
||||
OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
|
||||
N.Payload->symbol(), llvm::join(ParentStates, ", "));
|
||||
return OS;
|
||||
}
|
||||
|
||||
@@ -578,27 +427,15 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
|
||||
GSS.gc(std::move(Roots));
|
||||
};
|
||||
// Each iteration fully processes a single token.
|
||||
for (unsigned I = 0; I < Terminals.size();) {
|
||||
for (unsigned I = 0; I < Terminals.size(); ++I) {
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
|
||||
"Next token {0} (id={1})\n",
|
||||
G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
|
||||
// Consume the token.
|
||||
glrShift(Heads, Terminals[I], Params, NextHeads);
|
||||
|
||||
// If we weren't able to consume the token, try to skip over some tokens
|
||||
// so we can keep parsing.
|
||||
if (NextHeads.empty()) {
|
||||
glrRecover(Heads, I, Tokens, Params, NextHeads);
|
||||
if (NextHeads.empty())
|
||||
// FIXME: Ensure the `_ := start-symbol` rules have a fallback
|
||||
// error-recovery strategy attached. Then this condition can't happen.
|
||||
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
|
||||
} else
|
||||
++I;
|
||||
|
||||
// Form nonterminals containing the token we just consumed.
|
||||
SymbolID Lookahead =
|
||||
I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
|
||||
SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
|
||||
: Terminals[I + 1].symbol();
|
||||
Reduce(NextHeads, Lookahead);
|
||||
// Prepare for the next token.
|
||||
std::swap(Heads, NextHeads);
|
||||
@@ -607,35 +444,22 @@ const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
|
||||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
|
||||
|
||||
// The parse was successful if we're in state `_ := start-symbol .`
|
||||
StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
|
||||
auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
|
||||
const ForestNode *Result = nullptr;
|
||||
for (const auto *Head : Heads) {
|
||||
if (Head->State == AcceptState) {
|
||||
assert(Head->Payload->symbol() == StartSymbol);
|
||||
assert(Result == nullptr && "multiple results!");
|
||||
Result = Head->Payload;
|
||||
}
|
||||
const ForestNode *Result = nullptr;
|
||||
for (const auto *Head : Heads) {
|
||||
if (Head->State == AcceptState) {
|
||||
assert(Head->Payload->symbol() == StartSymbol);
|
||||
assert(Result == nullptr && "multiple results!");
|
||||
Result = Head->Payload;
|
||||
}
|
||||
return Result;
|
||||
};
|
||||
if (auto *Result = SearchForAccept(Heads))
|
||||
}
|
||||
if (Result)
|
||||
return *Result;
|
||||
// Failed to parse the input, attempt to run recovery.
|
||||
// FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
|
||||
// More elegant is to include EOF in the token stream, and make the
|
||||
// augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
|
||||
// would not be a special case: it show up as a failure to shift the EOF
|
||||
// token.
|
||||
unsigned I = Terminals.size();
|
||||
glrRecover(Heads, I, Tokens, Params, NextHeads);
|
||||
Reduce(NextHeads, tokenSymbol(tok::eof));
|
||||
if (auto *Result = SearchForAccept(NextHeads))
|
||||
return *Result;
|
||||
|
||||
// We failed to parse the input, returning an opaque forest node for recovery.
|
||||
// FIXME: as above, we can add fallback error handling so this is impossible.
|
||||
//
|
||||
// FIXME: We will need to invoke our generic error-recovery handlers when we
|
||||
// reach EOF without reaching accept state, and involving the eof
|
||||
// token in the above main for-loopmay be the best way to reuse the code).
|
||||
return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
|
||||
}
|
||||
|
||||
@@ -646,10 +470,9 @@ void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
|
||||
}
|
||||
|
||||
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
|
||||
|
||||
llvm::ArrayRef<const Node *> Parents) {
|
||||
Node *Result = new (allocate(Parents.size()))
|
||||
Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
|
||||
Node({State, GCParity, static_cast<unsigned>(Parents.size())});
|
||||
Alive.push_back(Result);
|
||||
++NodesCreated;
|
||||
Result->Payload = Symbol;
|
||||
|
||||
@@ -59,11 +59,8 @@ std::string Grammar::dumpRule(RuleID RID) const {
|
||||
llvm::raw_string_ostream OS(Result);
|
||||
const Rule &R = T->Rules[RID];
|
||||
OS << symbolName(R.Target) << " :=";
|
||||
for (unsigned I = 0; I < R.Size; ++I) {
|
||||
OS << " " << symbolName(R.Sequence[I]);
|
||||
if (R.RecoveryIndex == I)
|
||||
OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
|
||||
}
|
||||
for (SymbolID SID : R.seq())
|
||||
OS << " " << symbolName(SID);
|
||||
if (R.Guard)
|
||||
OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
|
||||
return Result;
|
||||
|
||||
@@ -106,17 +106,6 @@ public:
|
||||
|
||||
assert(T->Rules.size() < (1 << RuleBits) &&
|
||||
"Too many rules to fit in RuleID bits!");
|
||||
// Wherever RHS contains { foo }, mark foo for brace-recovery.
|
||||
// FIXME: this should be grammar annotations instead.
|
||||
for (auto &Rule : T->Rules) {
|
||||
for (unsigned I = 2; I < Rule.Size; ++I)
|
||||
if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
|
||||
Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
|
||||
!isToken(Rule.Sequence[I - 1])) {
|
||||
Rule.Recovery = RecoveryStrategy::Braces;
|
||||
Rule.RecoveryIndex = I - 1;
|
||||
}
|
||||
}
|
||||
const auto &SymbolOrder = getTopologicalOrder(T.get());
|
||||
llvm::stable_sort(
|
||||
T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
|
||||
|
||||
@@ -120,20 +120,6 @@ nextAvailableKernelItems(const State &S, const Grammar &G) {
|
||||
return Results;
|
||||
}
|
||||
|
||||
std::vector<std::pair<RecoveryStrategy, SymbolID>>
|
||||
availableRecovery(const State &S, const Grammar &G) {
|
||||
std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
|
||||
for (const Item &I : S.Items) {
|
||||
const auto &Rule = G.lookupRule(I.rule());
|
||||
if (I.dot() != Rule.RecoveryIndex)
|
||||
continue;
|
||||
Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
|
||||
}
|
||||
llvm::sort(Result);
|
||||
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
|
||||
return Result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::string Item::dump(const Grammar &G) const {
|
||||
@@ -144,10 +130,9 @@ std::string Item::dump(const Grammar &G) const {
|
||||
Results.push_back(G.symbolName(SID));
|
||||
return Results;
|
||||
};
|
||||
return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
|
||||
return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
|
||||
llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
|
||||
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
|
||||
Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
|
||||
llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
|
||||
.str();
|
||||
}
|
||||
|
||||
@@ -196,11 +181,6 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
|
||||
Edges.push_back({Src, Dst, Label});
|
||||
}
|
||||
|
||||
void insertRecovery(StateID Src, RecoveryStrategy Strategy,
|
||||
SymbolID Result) {
|
||||
Recoveries.push_back({Src, Strategy, Result});
|
||||
}
|
||||
|
||||
// Returns a state with the given id.
|
||||
const State &find(StateID ID) const {
|
||||
assert(ID < States.size());
|
||||
@@ -214,10 +194,9 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
|
||||
LRGraph build() && {
|
||||
States.shrink_to_fit();
|
||||
Edges.shrink_to_fit();
|
||||
Recoveries.shrink_to_fit();
|
||||
llvm::sort(StartStates);
|
||||
StartStates.shrink_to_fit();
|
||||
return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
|
||||
return LRGraph(std::move(States), std::move(Edges),
|
||||
std::move(StartStates));
|
||||
}
|
||||
|
||||
@@ -226,7 +205,6 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
|
||||
llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
|
||||
std::vector<State> States;
|
||||
std::vector<Edge> Edges;
|
||||
std::vector<Recovery> Recoveries;
|
||||
const Grammar &G;
|
||||
std::vector<std::pair<SymbolID, StateID>> StartStates;
|
||||
} Builder(G);
|
||||
@@ -247,16 +225,15 @@ LRGraph LRGraph::buildLR0(const Grammar &G) {
|
||||
}
|
||||
|
||||
while (!PendingStates.empty()) {
|
||||
auto StateID = PendingStates.back();
|
||||
auto CurrentStateID = PendingStates.back();
|
||||
PendingStates.pop_back();
|
||||
for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
|
||||
for (auto Next :
|
||||
nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
|
||||
auto Insert = Builder.insert(Next.second);
|
||||
if (Insert.second) // new state, insert to the pending queue.
|
||||
PendingStates.push_back(Insert.first);
|
||||
Builder.insertEdge(StateID, Insert.first, Next.first);
|
||||
Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
|
||||
}
|
||||
for (auto Recovery : availableRecovery(Builder.find(StateID), G))
|
||||
Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
|
||||
}
|
||||
return std::move(Builder).build();
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include "clang-pseudo/grammar/LRTable.h"
|
||||
#include "clang/Basic/TokenKinds.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cstdint>
|
||||
|
||||
namespace llvm {
|
||||
@@ -45,7 +44,6 @@ struct LRTable::Builder {
|
||||
llvm::DenseSet<Entry> Entries;
|
||||
llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduces;
|
||||
std::vector<llvm::DenseSet<SymbolID>> FollowSets;
|
||||
std::vector<LRGraph::Recovery> Recoveries;
|
||||
|
||||
LRTable build(unsigned NumStates) && {
|
||||
// E.g. given the following parsing table with 3 states and 3 terminals:
|
||||
@@ -90,26 +88,6 @@ struct LRTable::Builder {
|
||||
}
|
||||
Table.StartStates = std::move(StartStates);
|
||||
|
||||
// Error recovery entries: sort (no dups already), and build offset lookup.
|
||||
llvm::sort(Recoveries,
|
||||
[&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
|
||||
return std::tie(L.Src, L.Result, L.Strategy) <
|
||||
std::tie(R.Src, R.Result, R.Strategy);
|
||||
});
|
||||
Table.Recoveries.reserve(Recoveries.size());
|
||||
for (const auto &R : Recoveries)
|
||||
Table.Recoveries.push_back({R.Strategy, R.Result});
|
||||
Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
|
||||
SortedIndex = 0;
|
||||
for (StateID State = 0; State < NumStates; ++State) {
|
||||
Table.RecoveryOffset[State] = SortedIndex;
|
||||
while (SortedIndex < Recoveries.size() &&
|
||||
Recoveries[SortedIndex].Src == State)
|
||||
SortedIndex++;
|
||||
}
|
||||
Table.RecoveryOffset[NumStates] = SortedIndex;
|
||||
assert(SortedIndex == Recoveries.size());
|
||||
|
||||
// Compile the follow sets into a bitmap.
|
||||
Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
|
||||
for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
|
||||
@@ -136,8 +114,7 @@ struct LRTable::Builder {
|
||||
};
|
||||
|
||||
LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
|
||||
llvm::ArrayRef<ReduceEntry> Reduces,
|
||||
llvm::ArrayRef<RecoveryEntry> Recoveries) {
|
||||
llvm::ArrayRef<ReduceEntry> Reduces) {
|
||||
StateID MaxState = 0;
|
||||
for (const auto &Entry : Entries) {
|
||||
MaxState = std::max(MaxState, Entry.State);
|
||||
@@ -151,8 +128,6 @@ LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
|
||||
for (const ReduceEntry &E : Reduces)
|
||||
Build.Reduces[E.State].insert(E.Rule);
|
||||
Build.FollowSets = followSets(G);
|
||||
for (const auto &R : Recoveries)
|
||||
Build.Recoveries.push_back({R.State, R.Strategy, R.Result});
|
||||
return std::move(Build).build(/*NumStates=*/MaxState + 1);
|
||||
}
|
||||
|
||||
@@ -160,7 +135,6 @@ LRTable LRTable::buildSLR(const Grammar &G) {
|
||||
auto Graph = LRGraph::buildLR0(G);
|
||||
Builder Build;
|
||||
Build.StartStates = Graph.startStates();
|
||||
Build.Recoveries = Graph.recoveries();
|
||||
for (const auto &T : Graph.edges()) {
|
||||
Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
|
||||
Build.Entries.insert({T.Src, T.Label, Act});
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
class Foo {
|
||||
public:
|
||||
};
|
||||
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
|
||||
// CHECK: decl-specifier-seq~class-specifier := class-head { member-specification }
|
||||
// CHECK-NEXT: ├─class-head := class-key class-head-name
|
||||
// CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
|
||||
// CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
|
||||
auto x = { complete garbage };
|
||||
// CHECK: translation-unit~simple-declaration
|
||||
// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
|
||||
// CHECK-NEXT: ├─init-declarator-list~init-declarator
|
||||
// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
|
||||
// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
|
||||
// CHECK-NEXT: │ ├─= := tok[2]
|
||||
// CHECK-NEXT: │ └─initializer-clause~braced-init-list
|
||||
// CHECK-NEXT: │ ├─{ := tok[3]
|
||||
// CHECK-NEXT: │ ├─initializer-list := <opaque>
|
||||
// CHECK-NEXT: │ └─} := tok[6]
|
||||
// CHECK-NEXT: └─; := tok[7]
|
||||
@@ -7,7 +7,6 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "clang-pseudo/GLR.h"
|
||||
#include "clang-pseudo/Bracket.h"
|
||||
#include "clang-pseudo/Token.h"
|
||||
#include "clang-pseudo/grammar/Grammar.h"
|
||||
#include "clang/Basic/LangOptions.h"
|
||||
@@ -33,13 +32,11 @@ namespace {
|
||||
using Action = LRTable::Action;
|
||||
using testing::AllOf;
|
||||
using testing::ElementsAre;
|
||||
using testing::IsEmpty;
|
||||
using testing::UnorderedElementsAre;
|
||||
|
||||
MATCHER_P(state, StateID, "") { return arg->State == StateID; }
|
||||
MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
|
||||
MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
|
||||
MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
|
||||
|
||||
testing::Matcher<const GSS::Node *>
|
||||
parents(llvm::ArrayRef<const GSS::Node *> Parents) {
|
||||
@@ -241,9 +238,9 @@ TEST_F(GLRTest, ReduceJoiningWithMultipleBases) {
|
||||
/*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode2 = GSStack.addNode(
|
||||
/*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
|
||||
const auto *GSSNode3 = GSStack.addNode(
|
||||
/*State=*/3, /*ForestNode=*/ClassNameNode,
|
||||
/*Parents=*/{GSSNode1});
|
||||
const auto *GSSNode3 =
|
||||
GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
|
||||
/*Parents=*/{GSSNode1});
|
||||
const auto *GSSNode4 =
|
||||
GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
|
||||
/*Parents=*/{GSSNode2});
|
||||
@@ -366,124 +363,6 @@ TEST_F(GLRTest, ReduceLookahead) {
|
||||
EXPECT_THAT(Heads, ElementsAre(GSSNode1));
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, Recover) {
|
||||
// Recovery while parsing "word" inside braces.
|
||||
// Before:
|
||||
// 0--1({)--2(?)
|
||||
// After recovering a `word` at state 1:
|
||||
// 0--3(word) // 3 is goto(1, word)
|
||||
buildGrammar({"word"}, {});
|
||||
LRTable Table = LRTable::buildForTests(
|
||||
G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
|
||||
/*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
|
||||
|
||||
auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
|
||||
auto *Question1 = &Arena.createTerminal(tok::question, 1);
|
||||
const auto *Root = GSStack.addNode(0, nullptr, {});
|
||||
const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
|
||||
const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
|
||||
|
||||
// Need a token stream with paired braces so the strategy works.
|
||||
clang::LangOptions LOptions;
|
||||
TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
|
||||
pairBrackets(Tokens);
|
||||
std::vector<const GSS::Node *> NewHeads;
|
||||
|
||||
unsigned TokenIndex = 2;
|
||||
glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
|
||||
NewHeads);
|
||||
EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
|
||||
EXPECT_THAT(NewHeads, ElementsAre(
|
||||
AllOf(state(3), parsedSymbolID(id("word")),
|
||||
parents({OpenedBraces}), start(1u))));
|
||||
EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
|
||||
|
||||
// Test recovery failure: omit closing brace so strategy fails
|
||||
TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
|
||||
pairBrackets(NoRBrace);
|
||||
NewHeads.clear();
|
||||
TokenIndex = 2;
|
||||
glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
|
||||
{G, Table, Arena, GSStack}, NewHeads);
|
||||
EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
|
||||
EXPECT_THAT(NewHeads, IsEmpty());
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, RecoverRightmost) {
  // Within a nested block structure, recovery happens at the innermost
  // possible block.
  // Before:
  //   0--1({)--1({)--1({)
  // After recovering a `body` inside the second braces:
  //   0--1({)--2(body)   // 2 is goto(1, body)
  buildGrammar({"body"}, {});
  LRTable Table = LRTable::buildForTests(
      G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});

  // The innermost brace is deliberately left unmatched, to exercise the
  // fallback to the next enclosing brace.
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
  Tokens.tokens()[0].Pair = 5;
  Tokens.tokens()[1].Pair = 4;
  Tokens.tokens()[4].Pair = 1;
  Tokens.tokens()[5].Pair = 0;

  auto *OuterBrace = &Arena.createTerminal(tok::l_brace, 0);
  auto *MiddleBrace = &Arena.createTerminal(tok::l_brace, 1);
  auto *InnerBrace = &Arena.createTerminal(tok::l_brace, 2);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *Outer = GSStack.addNode(1, OuterBrace, {Root});
  const auto *Middle = GSStack.addNode(1, MiddleBrace, {Outer});
  const auto *Inner = GSStack.addNode(1, InnerBrace, {Middle});

  unsigned TokenIndex = 3;
  std::vector<const GSS::Node *> NewHeads;
  glrRecover({Inner}, TokenIndex, Tokens, {G, Table, Arena, GSStack}, NewHeads);
  EXPECT_EQ(TokenIndex, 5u);
  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
                                          parents({Middle}), start(2u))));
}
|
||||
|
||||
TEST_F(GLRTest, RecoverAlternatives) {
  // Recovery inside braces where multiple options are equally good.
  // Before:
  //   0--1({)
  // After recovering either `number` or `word` inside the braces:
  //   0--1({)--2(number)   // 2 is goto(1, number)
  //        └--3(word)      // 3 is goto(1, word)
  buildGrammar({"number", "word"}, {});
  LRTable Table = LRTable::buildForTests(
      G,
      {
          {/*State=*/1, id("number"), Action::goTo(2)},
          {/*State=*/1, id("word"), Action::goTo(3)},
      },
      /*Reduce=*/{},
      /*Recovery=*/
      {
          {/*State=*/1, RecoveryStrategy::Braces, id("number")},
          {/*State=*/1, RecoveryStrategy::Braces, id("word")},
      });

  auto *OpenBrace = &Arena.createTerminal(tok::l_brace, 0);
  const auto *Root = GSStack.addNode(0, nullptr, {});
  const auto *InsideBraces = GSStack.addNode(1, OpenBrace, {Root});

  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
  pairBrackets(Tokens);

  unsigned TokenIndex = 1;
  std::vector<const GSS::Node *> NewHeads;
  glrRecover({InsideBraces}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
             NewHeads);
  EXPECT_EQ(TokenIndex, 2u);
  // Both strategies fire; heads for both recovered symbols are created.
  EXPECT_THAT(NewHeads,
              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
                                         parents({InsideBraces}), start(1u)),
                                   AllOf(state(3), parsedSymbolID(id("word")),
                                         parents({InsideBraces}), start(1u))));
}
|
||||
|
||||
TEST_F(GLRTest, PerfectForestNodeSharing) {
|
||||
// Run the GLR on a simple grammar and test that we build exactly one forest
|
||||
// node per (SymbolID, token range).
|
||||
@@ -552,40 +431,6 @@ TEST_F(GLRTest, GLRReduceOrder) {
|
||||
"[ 0, end) └─IDENTIFIER := tok[0]\n");
|
||||
}
|
||||
|
||||
TEST_F(GLRTest, RecoveryEndToEnd) {
  // A simple example of brace-based recovery, demonstrating that:
  //  - the recovered region includes tokens both ahead of and behind the
  //    cursor
  //  - multiple recovery rules may apply (producing an ambiguous forest)
  //  - recovery from outer scopes is rejected
  build(R"bnf(
    _ := block

    block := { block }
    block := { numbers }
    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
  )bnf");
  auto Table = LRTable::buildSLR(G);
  clang::LangOptions LOptions;
  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
  pairBrackets(Tokens);

  const ForestNode &Parsed =
      glrParse(Tokens, {G, Table, Arena, GSStack}, id("block"));
  EXPECT_EQ(Parsed.dumpRecursive(G),
            "[ 0, end) block := { block [recover=1] }\n"
            "[ 0, 1) ├─{ := tok[0]\n"
            "[ 1, 5) ├─block := <ambiguous>\n"
            "[ 1, 5) │ ├─block := { block [recover=1] }\n"
            "[ 1, 2) │ │ ├─{ := tok[1]\n"
            "[ 2, 4) │ │ ├─block := <opaque>\n"
            "[ 4, 5) │ │ └─} := tok[4]\n"
            "[ 1, 5) │ └─block := { numbers [recover=1] }\n"
            "[ 1, 2) │ ├─{ := tok[1]\n"
            "[ 2, 4) │ ├─numbers := <opaque>\n"
            "[ 4, 5) │ └─} := tok[4]\n"
            "[ 5, end) └─} := tok[5]\n");
}
|
||||
|
||||
TEST_F(GLRTest, NoExplicitAccept) {
|
||||
build(R"bnf(
|
||||
_ := test
|
||||
|
||||
Reference in New Issue
Block a user