compute-runtime/shared/source/device_binary_format/yaml/yaml_parser.cpp

492 lines
20 KiB
C++

/*
* Copyright (C) 2020-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/device_binary_format/yaml/yaml_parser.h"
namespace NEO {
namespace Yaml {
std::string constructYamlError(size_t lineNumber, const char *lineBeg, const char *parsePos, const char *reason) {
auto ret = "NEO::Yaml : Could not parse line : [" + std::to_string(lineNumber) + "] : [" + ConstStringRef(lineBeg, parsePos - lineBeg + 1).str() + "] <-- parser position on error";
if (nullptr != reason) {
ret += ". Reason : ";
ret.append(reason);
}
ret += "\n";
return ret;
}
inline Node &addNode(NodesCache &outNodes, Node &parent) {
UNRECOVERABLE_IF(outNodes.size() >= outNodes.capacity()); // resize must not grow
parent.firstChildId = static_cast<NodeId>(outNodes.size());
parent.lastChildId = static_cast<NodeId>(outNodes.size());
outNodes.push_back(Node());
auto &curr = *outNodes.rbegin();
curr.id = parent.lastChildId;
curr.parentId = parent.id;
++parent.numChildren;
return curr;
}
inline Node &addNode(NodesCache &outNodes, Node &prevSibling, Node &parent) {
UNRECOVERABLE_IF(outNodes.size() >= outNodes.capacity()); // resize must not grow
prevSibling.nextSiblingId = static_cast<NodeId>(outNodes.size());
outNodes.push_back(Node());
auto &curr = *outNodes.rbegin();
curr.id = prevSibling.nextSiblingId;
curr.parentId = parent.id;
parent.lastChildId = curr.id;
++parent.numChildren;
return curr;
}
struct TokenizerContext {
TokenizerContext(ConstStringRef text)
: pos(text.begin()),
end(text.end()),
lineBeginPos(text.begin()) {
lineTraits.reset();
}
const char *pos = nullptr;
const char *const end = nullptr;
uint32_t lineIndent = 0U;
TokenId lineBegin = 0U;
const char *lineBeginPos = nullptr;
bool isParsingIdent = false;
Line::LineTraits lineTraits;
};
bool tokenizeEndLine(ConstStringRef text, LinesCache &outLines, TokensCache &outTokens, std::string &outErrReason, std::string &outWarning, TokenizerContext &context) {
TokenId lineEnd = static_cast<uint32_t>(outTokens.size());
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
auto lineBegToken = outTokens[context.lineBegin];
Line::LineType lineType = Line::LineType::empty;
if (lineEnd != context.lineBegin) {
switch (lineBegToken.traits.type) {
default:
outErrReason = constructYamlError(outLines.size(), lineBegToken.pos, context.pos, "Internal error - undefined line type");
return false;
case Token::singleCharacter:
switch (lineBegToken.traits.character0) {
default:
outErrReason = constructYamlError(outLines.size(), lineBegToken.pos, context.pos, (std::string("Unhandled keyword character : ") + lineBegToken.traits.character0).c_str());
return false;
case '#':
lineType = Line::LineType::comment;
break;
case '-':
lineType = Line::LineType::listEntry;
break;
}
break;
case Token::identifier:
lineType = Line::LineType::dictionaryEntry;
break;
case Token::fileSectionBeg:
lineType = Line::LineType::fileSection;
break;
case Token::fileSectionEnd:
lineType = Line::LineType::fileSection;
break;
}
}
outLines.push_back(Line{lineType, static_cast<uint16_t>(context.lineIndent), context.lineBegin, lineEnd, context.lineTraits});
++context.pos;
context.lineIndent = 0U;
context.lineBegin = static_cast<uint32_t>(outTokens.size());
context.lineBeginPos = context.pos;
context.isParsingIdent = true;
context.lineTraits.reset();
return true;
}
bool isValidInlineCollectionFormat(const char *context, const char *contextEnd) {
auto consumeAlphaNum = [](const char *&text) {
while (isAlphaNumeric(*text)) {
text++;
}
};
bool endNum = false;
bool endCollection = false;
context++; // skip '['
while (context < contextEnd && *context != '\n') {
if (isWhitespace(*context)) {
context++;
} else if (false == endNum) {
if (isAlphaNumeric(*context)) {
consumeAlphaNum(context);
endNum = true;
} else {
return false;
}
} else if (false == endCollection) {
if (*context == ',') {
context++;
endNum = false;
} else if (*context == ']') {
context++;
endCollection = true;
} else {
return false;
}
} else {
return false;
}
}
return endCollection;
}
bool tokenize(ConstStringRef text, LinesCache &outLines, TokensCache &outTokens, std::string &outErrReason, std::string &outWarning) {
if (text.empty()) {
outWarning.append("NEO::Yaml : input text is empty\n");
return true;
}
TokenizerContext context{text};
context.isParsingIdent = true;
while (context.pos < context.end) {
reserveBasedOnEstimates(outTokens, text.begin(), text.end(), context.pos);
switch (context.pos[0]) {
case ' ':
context.lineIndent += context.isParsingIdent ? 1 : 0;
++context.pos;
break;
case '\t':
if (context.isParsingIdent) {
context.lineIndent += 4U;
outWarning.append("NEO::Yaml : Tabs used as indent at line : " + std::to_string(outLines.size()) + "\n");
}
++context.pos;
break;
case '\r':
case '\0':
++context.pos;
break;
case '#': {
context.isParsingIdent = false;
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
auto commentIt = context.pos + 1;
while (commentIt < context.end) {
if ('\n' == commentIt[0]) {
break;
}
++commentIt;
}
if (context.pos + 1 != commentIt) {
outTokens.push_back(Token(ConstStringRef(context.pos + 1, commentIt - (context.pos + 1)), Token::comment));
}
context.pos = commentIt;
break;
}
case '\n': {
reserveBasedOnEstimates(outLines, text.begin(), text.end(), context.pos);
if (false == tokenizeEndLine(text, outLines, outTokens, outErrReason, outWarning, context)) {
return false;
}
} break;
case '\"':
case '\'': {
context.isParsingIdent = false;
auto parseTokEnd = consumeStringLiteral(text, context.pos);
if (parseTokEnd == context.pos) {
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "Unterminated string");
return false;
}
outTokens.push_back(Token(ConstStringRef(context.pos, parseTokEnd - context.pos), Token::literalString));
context.pos = parseTokEnd;
break;
}
case '-': {
ConstStringRef fileSectionMarker("---");
if ((context.isParsingIdent) && isMatched(text, context.pos, fileSectionMarker)) {
outTokens.push_back(Token(ConstStringRef(context.pos, fileSectionMarker.size()), Token::fileSectionBeg));
context.pos += fileSectionMarker.size();
} else {
auto tokEnd = consumeNumberOrSign(text, context.pos);
if (tokEnd > context.pos + 1) {
outTokens.push_back(Token(ConstStringRef(context.pos, tokEnd - context.pos), Token::literalNumber));
} else {
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
}
context.pos = tokEnd;
}
context.isParsingIdent = false;
break;
}
case '.': {
ConstStringRef fileSectionMarker("...");
if ((context.isParsingIdent) && isMatched(text, context.pos, fileSectionMarker)) {
outTokens.push_back(Token(ConstStringRef(context.pos, fileSectionMarker.size()), Token::fileSectionEnd));
context.pos += fileSectionMarker.size();
} else {
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
++context.pos;
}
context.isParsingIdent = false;
break;
}
case '[':
if (false == isValidInlineCollectionFormat(context.pos, text.end())) {
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
return false;
}
context.lineTraits.hasInlineDataMarkers = true;
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::collectionBeg));
++context.pos;
break;
case ']':
if (false == context.lineTraits.hasInlineDataMarkers) {
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
return false;
}
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::collectionEnd));
++context.pos;
break;
case ',':
if (false == context.lineTraits.hasInlineDataMarkers) {
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
return false;
}
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
++context.pos;
break;
case '{':
case '}':
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "NEO::Yaml : Inline dictionaries are not supported");
return false;
case ':':
context.lineTraits.hasDictionaryEntry = true;
context.isParsingIdent = false;
outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
++context.pos;
break;
default: {
context.isParsingIdent = false;
auto tokEnd = consumeNameIdentifier(text, context.pos);
if (tokEnd != context.pos) {
auto tokenData = ConstStringRef(context.pos, tokEnd - context.pos);
tokenData = tokenData.trimEnd(isWhitespace);
if (context.lineTraits.hasDictionaryEntry) {
outTokens.push_back(Token(tokenData, Token::literalString));
} else {
outTokens.push_back(Token(tokenData, Token::identifier));
}
} else {
tokEnd = consumeNumberOrSign(text, context.pos);
if (tokEnd > context.pos) {
outTokens.push_back(Token(ConstStringRef(context.pos, tokEnd - context.pos), Token::literalNumber));
} else {
outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "Invalid numeric literal");
return false;
}
}
context.pos = tokEnd;
break;
}
}
}
if (outTokens.empty()) {
outWarning.append("NEO::Yaml : text tokenized to 0 tokens\n");
} else {
if ('\n' != *outTokens.rbegin()) {
outWarning.append("NEO::Yaml : text does not end with newline\n");
tokenizeEndLine(text, outLines, outTokens, outErrReason, outWarning, context);
}
}
return true;
}
void finalizeNode(NodeId nodeId, const TokensCache &tokens, NodesCache &outNodes, std::string &outErrReason, std::string &outWarning) {
auto &node = outNodes[nodeId];
if (invalidTokenId != node.key) {
return;
}
if (invalidTokenId == node.value) {
return;
}
auto valueTokenIt = node.value + 1;
auto colon = invalidTokenId;
while ('\n' != tokens[valueTokenIt]) {
if (':' == tokens[valueTokenIt]) {
colon = valueTokenIt;
}
++valueTokenIt;
}
UNRECOVERABLE_IF((colon == invalidTokenId) || (colon + 1 == valueTokenIt));
UNRECOVERABLE_IF(invalidNodeID == node.lastChildId)
outNodes[node.lastChildId].nextSiblingId = static_cast<NodeId>(outNodes.size());
outNodes.push_back(Node());
auto &newNode = *outNodes.rbegin();
newNode.id = static_cast<NodeId>(outNodes.size() - 1);
newNode.parentId = nodeId;
node.lastChildId = outNodes.rbegin()->id;
newNode.key = node.value;
newNode.value = colon + 1;
node.value = invalidTokenId;
++node.numChildren;
}
bool isEmptyVector(const Token &token, size_t lineId, std::string &outError) {
if (isVectorDataType(token)) {
outError = constructYamlError(lineId, token.pos, token.pos + token.len, "Vector data type expects to have at least one value starting with -");
return false;
}
return true;
}
bool buildTree(const LinesCache &lines, const TokensCache &tokens, NodesCache &outNodes, std::string &outErrReason, std::string &outWarning) {
StackVec<NodeId, 64> nesting;
size_t lineId = 0U;
size_t lastUsedLine = 0u;
outNodes.push_back(Node());
outNodes.rbegin()->id = 0U;
outNodes.rbegin()->firstChildId = 1U;
outNodes.rbegin()->lastChildId = 1U;
nesting.resize(1); // root
while (lineId < lines.size()) {
if (isUnused(lines[lineId].lineType)) {
++lineId;
continue;
}
auto currLineIndent = lines[lineId].indent;
if (currLineIndent == outNodes.rbegin()->indent) {
if (lineId > 0u && false == isEmptyVector(tokens[lines[lastUsedLine].first], lastUsedLine, outErrReason)) {
return false;
}
reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
auto &prev = *outNodes.rbegin();
auto &parent = outNodes[*nesting.rbegin()];
auto &curr = addNode(outNodes, prev, parent);
curr.indent = currLineIndent;
} else if (currLineIndent > outNodes.rbegin()->indent) {
reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
auto &parent = *outNodes.rbegin();
auto &curr = addNode(outNodes, parent);
curr.indent = currLineIndent;
nesting.push_back(parent.id);
} else {
while (currLineIndent < outNodes[*nesting.rbegin()].indent) {
reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
finalizeNode(*nesting.rbegin(), tokens, outNodes, outErrReason, outWarning);
UNRECOVERABLE_IF(nesting.empty());
nesting.pop_back();
}
bool hasInvalidIndent = (currLineIndent != outNodes[*nesting.rbegin()].indent);
if (hasInvalidIndent) {
outErrReason = constructYamlError(lineId, tokens[lines[lineId].first].pos, tokens[lines[lineId].first].pos + 1, "Invalid indentation");
return false;
} else {
reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
auto &prev = outNodes[*nesting.rbegin()];
auto &parent = outNodes[prev.parentId];
auto &curr = addNode(outNodes, prev, parent);
curr.indent = currLineIndent;
}
}
if (Line::LineType::dictionaryEntry == lines[lineId].lineType) {
auto numTokensInLine = lines[lineId].last - lines[lineId].first + 1;
outNodes.rbegin()->key = lines[lineId].first;
UNRECOVERABLE_IF(numTokensInLine < 3); // at least key, : and \n
if (lines[lineId].traits.hasInlineDataMarkers) {
auto collectionBeg = lines[lineId].first + 2;
auto collectionEnd = lines[lineId].last - 1;
UNRECOVERABLE_IF(tokens[collectionBeg].traits.type != Token::Type::collectionBeg || tokens[collectionEnd].traits.type != Token::Type::collectionEnd);
auto parentNodeId = outNodes.size() - 1;
auto previousSiblingId = std::numeric_limits<size_t>::max();
for (auto currTokenId = collectionBeg + 1; currTokenId < collectionEnd; currTokenId += 2) {
auto tokenType = tokens[currTokenId].traits.type;
UNRECOVERABLE_IF(tokenType != Token::Type::literalNumber && tokenType != Token::Type::literalString);
reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
auto &parentNode = outNodes[parentNodeId];
if (previousSiblingId == std::numeric_limits<size_t>::max()) {
addNode(outNodes, parentNode);
} else {
auto &previousSibling = outNodes[previousSiblingId];
addNode(outNodes, previousSibling, parentNode);
}
previousSiblingId = outNodes.size() - 1;
outNodes[previousSiblingId].indent = currLineIndent + 1;
outNodes[previousSiblingId].value = currTokenId;
}
nesting.push_back(static_cast<NodeId>(parentNodeId));
} else if (('#' != tokens[lines[lineId].first + 2]) && ('\n' != tokens[lines[lineId].first + 2])) {
outNodes.rbegin()->value = lines[lineId].first + 2;
}
} else {
auto numTokensInLine = lines[lineId].last - lines[lineId].first + 1;
(void)numTokensInLine;
UNRECOVERABLE_IF(numTokensInLine < 2); // at least : - and \n
UNRECOVERABLE_IF(Line::LineType::listEntry != lines[lineId].lineType);
UNRECOVERABLE_IF('-' != tokens[lines[lineId].first]);
if (('#' != tokens[lines[lineId].first + 1]) && ('\n' != tokens[lines[lineId].first + 1])) {
outNodes.rbegin()->value = lines[lineId].first + 1;
}
}
lastUsedLine = lineId;
++lineId;
}
outNodes.reserve(outNodes.size() + nesting.size());
while (false == nesting.empty()) {
finalizeNode(*nesting.rbegin(), tokens, outNodes, outErrReason, outWarning);
nesting.pop_back();
}
if (1U == outNodes.size()) {
outWarning.append("NEO::Yaml : Text has no data\n");
outNodes.clear();
return true;
}
return true;
}
DebugNode *buildDebugNodes(NEO::Yaml::NodeId rootId, const NEO::Yaml::NodesCache &nodes, const NEO::Yaml::TokensCache &tokens) {
DebugNode *curr = new DebugNode;
auto &src = nodes[rootId];
curr->src = &src;
if (src.key != NEO::Yaml::invalidTokenId) {
curr->key = tokens[src.key].cstrref();
}
if (src.value != NEO::Yaml::invalidTokenId) {
curr->value = tokens[src.value].cstrref();
}
auto childId = src.firstChildId;
while (NEO::Yaml::invalidNodeID != childId) {
curr->children.push_back(buildDebugNodes(childId, nodes, tokens));
(*curr->children.rbegin())->parent = curr;
childId = nodes[childId].nextSiblingId;
}
return curr;
}
DebugNode *YamlParser::buildDebugNodes(const Node &parent) const {
return NEO::Yaml::buildDebugNodes(parent.id, nodes, tokens);
}
DebugNode *YamlParser::buildDebugNodes() const {
return (false == empty()) ? NEO::Yaml::buildDebugNodes(0U, nodes, tokens) : nullptr;
}
} // namespace Yaml
} // namespace NEO