/*
 * Copyright (C) 2020-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/device_binary_format/yaml/yaml_parser.h"

namespace NEO {

namespace Yaml {

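// Builds a human-readable parse error pointing at the parser position within the offending line;
// the optional reason, when provided, is appended to the message.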
std::string constructYamlError(size_t lineNumber, const char *lineBeg, const char *parsePos, const char *reason) {
    auto ret = "NEO::Yaml : Could not parse line : [" + std::to_string(lineNumber) + "] : [" + ConstStringRef(lineBeg, parsePos - lineBeg + 1).str() + "] <-- parser position on error";
    if (nullptr != reason) {
        ret += ". Reason : ";
        ret.append(reason);
    }
    ret += "\n";
    return ret;
}

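// Appends a new node to the cache and registers it as the first (and last) child of parent.
// The cache must have been reserved up front - growing it here would invalidate references held by callers.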
inline Node &addNode(NodesCache &outNodes, Node &parent) {
    UNRECOVERABLE_IF(outNodes.size() >= outNodes.capacity()); // resize must not grow
    parent.firstChildId = static_cast<NodeId>(outNodes.size());
    parent.lastChildId = static_cast<NodeId>(outNodes.size());
    outNodes.push_back(Node());
    auto &curr = *outNodes.rbegin();
    curr.id = parent.lastChildId;
    curr.parentId = parent.id;
    ++parent.numChildren;
    return curr;
}

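// Appends a new node to the cache and links it as the next sibling of prevSibling under parent,
// updating the parent's last-child link and child count.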
inline Node &addNode(NodesCache &outNodes, Node &prevSibling, Node &parent) {
    UNRECOVERABLE_IF(outNodes.size() >= outNodes.capacity()); // resize must not grow
    prevSibling.nextSiblingId = static_cast<NodeId>(outNodes.size());
    outNodes.push_back(Node());
    auto &curr = *outNodes.rbegin();
    curr.id = prevSibling.nextSiblingId;
    curr.parentId = parent.id;
    parent.lastChildId = curr.id;
    ++parent.numChildren;
    return curr;
}

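// Mutable state threaded through the tokenizer: current/end positions, indentation,
// and per-line traits of the line currently being scanned.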
struct TokenizerContext {
    TokenizerContext(ConstStringRef text)
        : pos(text.begin()),
          end(text.end()),
          lineBeginPos(text.begin()) {
        lineTraits.reset();
    }

    const char *pos = nullptr;
    const char *const end = nullptr;

    uint32_t lineIndent = 0U;
    TokenId lineBegin = 0U;
    const char *lineBeginPos = nullptr;
    bool isParsingIdent = false;
    Line::LineTraits lineTraits;
};

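// Emits the newline token, classifies the finished line (comment, list entry, dictionary entry
// or file section) based on its first token, appends it to outLines and resets the per-line
// state in context for the next line.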
bool tokenizeEndLine(ConstStringRef text, LinesCache &outLines, TokensCache &outTokens, std::string &outErrReason, std::string &outWarning, TokenizerContext &context) {
    TokenId lineEnd = static_cast<uint32_t>(outTokens.size());
    outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
    auto lineBegToken = outTokens[context.lineBegin];
    Line::LineType lineType = Line::LineType::empty;
    if (lineEnd != context.lineBegin) {
        switch (lineBegToken.traits.type) {
        default:
            outErrReason = constructYamlError(outLines.size(), lineBegToken.pos, context.pos, "Internal error - undefined line type");
            return false;
        case Token::singleCharacter:
            switch (lineBegToken.traits.character0) {
            default:
                outErrReason = constructYamlError(outLines.size(), lineBegToken.pos, context.pos, (std::string("Unhandled keyword character : ") + lineBegToken.traits.character0).c_str());
                return false;
            case '#':
                lineType = Line::LineType::comment;
                break;
            case '-':
                lineType = Line::LineType::listEntry;
                break;
            }
            break;
        case Token::identifier:
            lineType = Line::LineType::dictionaryEntry;
            break;
        case Token::fileSectionBeg:
            lineType = Line::LineType::fileSection;
            break;
        case Token::fileSectionEnd:
            lineType = Line::LineType::fileSection;
            break;
        }
    }
    outLines.push_back(Line{lineType, static_cast<uint16_t>(context.lineIndent), context.lineBegin, lineEnd, context.lineTraits});
    ++context.pos;

    context.lineIndent = 0U;
    context.lineBegin = static_cast<uint32_t>(outTokens.size());
    context.lineBeginPos = context.pos;
    context.isParsingIdent = true;
    context.lineTraits.reset();
    return true;
}

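// Validates that an inline collection starting at '[' is a flat, comma-separated list of
// alphanumeric values closed with ']' before the end of the line; anything else is rejected.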
bool isValidInlineCollectionFormat(const char *context, const char *contextEnd) {
    auto consumeAlphaNum = [](const char *&text) {
        while (isAlphaNumeric(*text)) {
            text++;
        }
    };

    bool endNum = false;
    bool endCollection = false;
    context++; // skip '['
    while (context < contextEnd && *context != '\n') {
        if (isWhitespace(*context)) {
            context++;
        } else if (false == endNum) {
            if (isAlphaNumeric(*context)) {
                consumeAlphaNum(context);
                endNum = true;
            } else {
                return false;
            }
        } else if (false == endCollection) {
            if (*context == ',') {
                context++;
                endNum = false;
            } else if (*context == ']') {
                context++;
                endCollection = true;
            } else {
                return false;
            }
        } else {
            return false;
        }
    }
    return endCollection;
}

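// Splits the input text into tokens and line descriptors. Returns false and fills outErrReason
// on malformed input; non-fatal findings (tabs used as indent, missing trailing newline,
// empty input) are reported through outWarning instead.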
bool tokenize(ConstStringRef text, LinesCache &outLines, TokensCache &outTokens, std::string &outErrReason, std::string &outWarning) {
    if (text.empty()) {
        outWarning.append("NEO::Yaml : input text is empty\n");
        return true;
    }

    TokenizerContext context{text};
    context.isParsingIdent = true;

    while (context.pos < context.end) {
        reserveBasedOnEstimates(outTokens, text.begin(), text.end(), context.pos);
        switch (context.pos[0]) {
        case ' ':
            context.lineIndent += context.isParsingIdent ? 1 : 0;
            ++context.pos;
            break;
        case '\t':
            if (context.isParsingIdent) {
                context.lineIndent += 4U;
                outWarning.append("NEO::Yaml : Tabs used as indent at line : " + std::to_string(outLines.size()) + "\n");
            }
            ++context.pos;
            break;
        case '\r':
        case '\0':
            ++context.pos;
            break;
        case '#': {
            context.isParsingIdent = false;
            outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
            auto commentIt = context.pos + 1;
            while (commentIt < context.end) {
                if ('\n' == commentIt[0]) {
                    break;
                }
                ++commentIt;
            }
            if (context.pos + 1 != commentIt) {
                outTokens.push_back(Token(ConstStringRef(context.pos + 1, commentIt - (context.pos + 1)), Token::comment));
            }
            context.pos = commentIt;
            break;
        }
        case '\n': {
            reserveBasedOnEstimates(outLines, text.begin(), text.end(), context.pos);
            if (false == tokenizeEndLine(text, outLines, outTokens, outErrReason, outWarning, context)) {
                return false;
            }
        } break;
        case '\"':
        case '\'': {
            context.isParsingIdent = false;
            auto parseTokEnd = consumeStringLiteral(text, context.pos);
            if (parseTokEnd == context.pos) {
                outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "Unterminated string");
                return false;
            }
            outTokens.push_back(Token(ConstStringRef(context.pos, parseTokEnd - context.pos), Token::literalString));
            context.pos = parseTokEnd;
            break;
        }
        case '-': {
            ConstStringRef fileSectionMarker("---");
            if ((context.isParsingIdent) && isMatched(text, context.pos, fileSectionMarker)) {
                outTokens.push_back(Token(ConstStringRef(context.pos, fileSectionMarker.size()), Token::fileSectionBeg));
                context.pos += fileSectionMarker.size();
            } else {
                auto tokEnd = consumeNumberOrSign(text, context.pos);
                if (tokEnd > context.pos + 1) {
                    outTokens.push_back(Token(ConstStringRef(context.pos, tokEnd - context.pos), Token::literalNumber));
                } else {
                    outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
                }

                context.pos = tokEnd;
            }
            context.isParsingIdent = false;
            break;
        }
        case '.': {
            ConstStringRef fileSectionMarker("...");
            if ((context.isParsingIdent) && isMatched(text, context.pos, fileSectionMarker)) {
                outTokens.push_back(Token(ConstStringRef(context.pos, fileSectionMarker.size()), Token::fileSectionEnd));
                context.pos += fileSectionMarker.size();
            } else {
                outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
                ++context.pos;
            }
            context.isParsingIdent = false;
            break;
        }
        case '[':
            if (false == isValidInlineCollectionFormat(context.pos, text.end())) {
                outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
                return false;
            }
            context.lineTraits.hasInlineDataMarkers = true;
            outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::collectionBeg));
            ++context.pos;
            break;
        case ']':
            if (false == context.lineTraits.hasInlineDataMarkers) {
                outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
                return false;
            }
            outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::collectionEnd));
            ++context.pos;
            break;
        case ',':
            if (false == context.lineTraits.hasInlineDataMarkers) {
                outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, inlineCollectionYamlErrorMsg.data());
                return false;
            }
            outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
            ++context.pos;
            break;
        case '{':
        case '}':
            outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "NEO::Yaml : Inline dictionaries are not supported");
            return false;
        case ':':
            context.lineTraits.hasDictionaryEntry = true;
            context.isParsingIdent = false;
            outTokens.push_back(Token(ConstStringRef(context.pos, 1), Token::singleCharacter));
            ++context.pos;
            break;
        default: {
            context.isParsingIdent = false;
            auto tokEnd = consumeNameIdentifier(text, context.pos);
            if (tokEnd != context.pos) {
                auto tokenData = ConstStringRef(context.pos, tokEnd - context.pos);
                tokenData = tokenData.trimEnd(isWhitespace);
                if (context.lineTraits.hasDictionaryEntry) {
                    outTokens.push_back(Token(tokenData, Token::literalString));
                } else {
                    outTokens.push_back(Token(tokenData, Token::identifier));
                }
            } else {
                tokEnd = consumeNumberOrSign(text, context.pos);
                if (tokEnd > context.pos) {
                    outTokens.push_back(Token(ConstStringRef(context.pos, tokEnd - context.pos), Token::literalNumber));
                } else {
                    outErrReason = constructYamlError(outLines.size(), context.lineBeginPos, context.pos, "Invalid numeric literal");
                    return false;
                }
            }

            context.pos = tokEnd;
            break;
        }
        }
    }

    if (outTokens.empty()) {
        outWarning.append("NEO::Yaml : text tokenized to 0 tokens\n");
    } else {
        if ('\n' != *outTokens.rbegin()) {
            outWarning.append("NEO::Yaml : text does not end with newline\n");
            tokenizeEndLine(text, outLines, outTokens, outErrReason, outWarning, context);
        }
    }
    return true;
}

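// Invoked when a node's indentation block is closed. If the node has no key but its value token
// is followed by ':' on the same line, that value is really a nested "key : value" pair - it is
// re-attached as an additional child node and the owning node's value is cleared.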
void finalizeNode(NodeId nodeId, const TokensCache &tokens, NodesCache &outNodes, std::string &outErrReason, std::string &outWarning) {
    auto &node = outNodes[nodeId];
    if (invalidTokenId != node.key) {
        return;
    }
    if (invalidTokenId == node.value) {
        return;
    }
    auto valueTokenIt = node.value + 1;
    auto colon = invalidTokenId;
    while ('\n' != tokens[valueTokenIt]) {
        if (':' == tokens[valueTokenIt]) {
            colon = valueTokenIt;
        }
        ++valueTokenIt;
    }
    UNRECOVERABLE_IF((colon == invalidTokenId) || (colon + 1 == valueTokenIt));
    UNRECOVERABLE_IF(invalidNodeID == node.lastChildId);
    outNodes[node.lastChildId].nextSiblingId = static_cast<NodeId>(outNodes.size());

    outNodes.push_back(Node());
    auto &newNode = *outNodes.rbegin();
    newNode.id = static_cast<NodeId>(outNodes.size() - 1);
    newNode.parentId = nodeId;
    node.lastChildId = outNodes.rbegin()->id;
    newNode.key = node.value;
    newNode.value = colon + 1;

    node.value = invalidTokenId;
    ++node.numChildren;
}

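// Guard used by buildTree: a token of a vector data type is expected to be followed by at least
// one "- ..." entry; returns false and fills outError when that expectation is not met.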
bool isEmptyVector(const Token &token, size_t lineId, std::string &outError) {
    if (isVectorDataType(token)) {
        outError = constructYamlError(lineId, token.pos, token.pos + token.len, "Vector data type expects to have at least one value starting with -");
        return false;
    }
    return true;
}

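// Builds the node tree from the tokenized lines. Indentation drives nesting: equal indent adds
// a sibling, deeper indent descends, shallower indent pops (and finalizes) levels from the
// nesting stack. Dictionary entries record key/value token ids; inline collections expand into
// one child node per element.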
bool buildTree(const LinesCache &lines, const TokensCache &tokens, NodesCache &outNodes, std::string &outErrReason, std::string &outWarning) {
    StackVec<NodeId, 64> nesting;
    size_t lineId = 0U;
    size_t lastUsedLine = 0u;
    outNodes.push_back(Node());
    outNodes.rbegin()->id = 0U;
    outNodes.rbegin()->firstChildId = 1U;
    outNodes.rbegin()->lastChildId = 1U;
    nesting.resize(1); // root
    while (lineId < lines.size()) {
        if (isUnused(lines[lineId].lineType)) {
            ++lineId;
            continue;
        }
        auto currLineIndent = lines[lineId].indent;
        if (currLineIndent == outNodes.rbegin()->indent) {
            if (lineId > 0u && false == isEmptyVector(tokens[lines[lastUsedLine].first], lastUsedLine, outErrReason)) {
                return false;
            }
            reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
            auto &prev = *outNodes.rbegin();
            auto &parent = outNodes[*nesting.rbegin()];
            auto &curr = addNode(outNodes, prev, parent);
            curr.indent = currLineIndent;
        } else if (currLineIndent > outNodes.rbegin()->indent) {
            reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
            auto &parent = *outNodes.rbegin();
            auto &curr = addNode(outNodes, parent);
            curr.indent = currLineIndent;
            nesting.push_back(parent.id);
        } else {
            while (currLineIndent < outNodes[*nesting.rbegin()].indent) {
                reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
                finalizeNode(*nesting.rbegin(), tokens, outNodes, outErrReason, outWarning);
                UNRECOVERABLE_IF(nesting.empty());
                nesting.pop_back();
            }
            bool hasInvalidIndent = (currLineIndent != outNodes[*nesting.rbegin()].indent);
            if (hasInvalidIndent) {
                outErrReason = constructYamlError(lineId, tokens[lines[lineId].first].pos, tokens[lines[lineId].first].pos + 1, "Invalid indentation");
                return false;
            } else {
                reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);
                auto &prev = outNodes[*nesting.rbegin()];
                auto &parent = outNodes[prev.parentId];
                auto &curr = addNode(outNodes, prev, parent);
                curr.indent = currLineIndent;
            }
        }

        if (Line::LineType::dictionaryEntry == lines[lineId].lineType) {
            auto numTokensInLine = lines[lineId].last - lines[lineId].first + 1;
            outNodes.rbegin()->key = lines[lineId].first;
            UNRECOVERABLE_IF(numTokensInLine < 3); // at least key, : and \n

            if (lines[lineId].traits.hasInlineDataMarkers) {
                auto collectionBeg = lines[lineId].first + 2;
                auto collectionEnd = lines[lineId].last - 1;
                UNRECOVERABLE_IF(tokens[collectionBeg].traits.type != Token::Type::collectionBeg || tokens[collectionEnd].traits.type != Token::Type::collectionEnd);

                auto parentNodeId = outNodes.size() - 1;
                auto previousSiblingId = std::numeric_limits<size_t>::max();

                for (auto currTokenId = collectionBeg + 1; currTokenId < collectionEnd; currTokenId += 2) {
                    auto tokenType = tokens[currTokenId].traits.type;
                    UNRECOVERABLE_IF(tokenType != Token::Type::literalNumber && tokenType != Token::Type::literalString);
                    reserveBasedOnEstimates(outNodes, static_cast<size_t>(0U), lines.size(), lineId);

                    auto &parentNode = outNodes[parentNodeId];
                    if (previousSiblingId == std::numeric_limits<size_t>::max()) {
                        addNode(outNodes, parentNode);
                    } else {
                        auto &previousSibling = outNodes[previousSiblingId];
                        addNode(outNodes, previousSibling, parentNode);
                    }
                    previousSiblingId = outNodes.size() - 1;
                    outNodes[previousSiblingId].indent = currLineIndent + 1;
                    outNodes[previousSiblingId].value = currTokenId;
                }
                nesting.push_back(static_cast<NodeId>(parentNodeId));
            } else if (('#' != tokens[lines[lineId].first + 2]) && ('\n' != tokens[lines[lineId].first + 2])) {
                outNodes.rbegin()->value = lines[lineId].first + 2;
            }
        } else {
            auto numTokensInLine = lines[lineId].last - lines[lineId].first + 1;
            (void)numTokensInLine;
            UNRECOVERABLE_IF(numTokensInLine < 2); // at least : - and \n
            UNRECOVERABLE_IF(Line::LineType::listEntry != lines[lineId].lineType);
            UNRECOVERABLE_IF('-' != tokens[lines[lineId].first]);
            if (('#' != tokens[lines[lineId].first + 1]) && ('\n' != tokens[lines[lineId].first + 1])) {
                outNodes.rbegin()->value = lines[lineId].first + 1;
            }
        }
        lastUsedLine = lineId;
        ++lineId;
    }
    outNodes.reserve(outNodes.size() + nesting.size());
    while (false == nesting.empty()) {
        finalizeNode(*nesting.rbegin(), tokens, outNodes, outErrReason, outWarning);
        nesting.pop_back();
    }
    if (1U == outNodes.size()) {
        outWarning.append("NEO::Yaml : Text has no data\n");
        outNodes.clear();
        return true;
    }
    return true;
}

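// Recursively mirrors the flat node cache into heap-allocated DebugNode objects with resolved
// key/value strings; intended as a debugging aid, the returned tree is owned by the caller.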
DebugNode *buildDebugNodes(NEO::Yaml::NodeId rootId, const NEO::Yaml::NodesCache &nodes, const NEO::Yaml::TokensCache &tokens) {
    DebugNode *curr = new DebugNode;
    auto &src = nodes[rootId];
    curr->src = &src;
    if (src.key != NEO::Yaml::invalidTokenId) {
        curr->key = tokens[src.key].cstrref();
    }
    if (src.value != NEO::Yaml::invalidTokenId) {
        curr->value = tokens[src.value].cstrref();
    }

    auto childId = src.firstChildId;
    while (NEO::Yaml::invalidNodeID != childId) {
        curr->children.push_back(buildDebugNodes(childId, nodes, tokens));
        (*curr->children.rbegin())->parent = curr;
        childId = nodes[childId].nextSiblingId;
    }
    return curr;
}

DebugNode *YamlParser::buildDebugNodes(const Node &parent) const {
    return NEO::Yaml::buildDebugNodes(parent.id, nodes, tokens);
}

DebugNode *YamlParser::buildDebugNodes() const {
    return (false == empty()) ? NEO::Yaml::buildDebugNodes(0U, nodes, tokens) : nullptr;
}

} // namespace Yaml

} // namespace NEO