mirror of
https://github.com/intel/llvm.git
synced 2026-02-05 22:17:23 +08:00
[Doc parsing] Patch to parse Doxygen-supported HTML character
references to their UTF-8 encoding. Reviewed offline by Doug. // rdar://12392215 llvm-svn: 173850
This commit is contained in:
@@ -282,11 +282,18 @@ private:
|
||||
/// it stands for (e.g., "<").
|
||||
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
|
||||
|
||||
/// Given a Doxygen-supported named character reference (e.g., "&trade;"),
|
||||
/// return its UTF-8 encoding.
|
||||
StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
|
||||
|
||||
/// Given a Unicode codepoint as base-10 integer, return the character.
|
||||
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
|
||||
|
||||
/// Given a Unicode codepoint as base-16 integer, return the character.
|
||||
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
|
||||
|
||||
/// Helper routine to do part of the work for resolveHTMLHexCharacterReference.
|
||||
StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
|
||||
|
||||
void formTokenWithChars(Token &Result, const char *TokEnd,
|
||||
tok::TokenKind Kind) {
|
||||
|
||||
@@ -34,6 +34,31 @@ bool isHTMLHexCharacterReferenceCharacter(char C) {
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
/// Interpret \p Name as a sequence of hexadecimal digits and return the
/// integer value it spells (most significant digit first).
///
/// Each character is checked with isHTMLHexCharacterReferenceCharacter only
/// through an assert, so callers are expected to pass pre-validated digits.
static unsigned getCodePoint(StringRef Name) {
  unsigned Value = 0;
  for (StringRef::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
    assert(isHTMLHexCharacterReferenceCharacter(*I));
    // Shift the digits accumulated so far up by one hex place, then add the
    // new digit.
    Value = Value * 16 + llvm::hexDigitValue(*I);
  }
  return Value;
}
|
||||
|
||||
/// Encode \p CodePoint as UTF-8 into storage obtained from the lexer's
/// Allocator.
///
/// \returns a StringRef covering the bytes written by ConvertCodePointToUTF8,
/// or an empty StringRef when that conversion reports failure.
StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
  // Reserve room for the longest possible encoding of a single code point.
  char *const Buffer =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter receives this pointer by reference; the distance it has
  // moved from Buffer afterwards is used as the encoded length.
  char *Cursor = Buffer;
  if (!ConvertCodePointToUTF8(CodePoint, Cursor))
    return StringRef();
  return StringRef(Buffer, Cursor - Buffer);
}
|
||||
|
||||
/// Resolve a hexadecimal character reference: \p Name holds the hex digits,
/// which are turned into a code point by getCodePoint and then encoded by
/// helperResolveHTMLHexCharacterReference.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  return helperResolveHTMLHexCharacterReference(getCodePoint(Name));
}
|
||||
|
||||
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
|
||||
return llvm::StringSwitch<StringRef>(Name)
|
||||
.Case("amp", "&")
|
||||
@@ -41,8 +66,154 @@ StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
|
||||
.Case("gt", ">")
|
||||
.Case("quot", "\"")
|
||||
.Case("apos", "\'")
|
||||
.Case("minus", "-")
|
||||
.Case("sim", "~")
|
||||
.Default("");
|
||||
}
|
||||
|
||||
/// Map a Doxygen-supported named character reference to its UTF-8 encoding.
///
/// \param Name the reference name to look up (matched case-sensitively
/// against the table below, e.g. "copy", "Delta").
/// \returns the UTF-8 encoding produced by
/// helperResolveHTMLHexCharacterReference for the matching code point, or an
/// empty string when \p Name is not in the table.
///
/// The name is first mapped to a code point and only the selected code point
/// is encoded.  Passing the helper call directly as each Case() argument would
/// evaluate every one of them (an allocation plus a conversion apiece) on
/// each lookup, regardless of which name matched.
StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
  // 0 is used as the "no match" sentinel; no entry below maps to U+0000.
  const unsigned CodePoint = llvm::StringSwitch<unsigned>(Name)
    .Case("copy", 0x000A9)
    .Case("trade", 0x02122)
    .Case("reg", 0x000AE)
    .Case("lt", 0x0003C)
    // '>' is U+003E; this entry previously repeated U+003C ('<').
    .Case("gt", 0x0003E)
    .Case("amp", 0x00026)
    .Case("apos", 0x00027)
    .Case("quot", 0x00022)
    .Case("lsquo", 0x02018)
    .Case("rsquo", 0x02019)
    .Case("ldquo", 0x0201C)
    .Case("rdquo", 0x0201D)
    .Case("ndash", 0x02013)
    .Case("mdash", 0x02014)
    .Case("Auml", 0x000C4)
    .Case("Euml", 0x000CB)
    .Case("Iuml", 0x000CF)
    .Case("Ouml", 0x000D6)
    .Case("Uuml", 0x000DC)
    .Case("Yuml", 0x00178)
    .Case("auml", 0x000E4)
    .Case("euml", 0x000EB)
    .Case("iuml", 0x000EF)
    .Case("ouml", 0x000F6)
    .Case("uuml", 0x000FC)
    .Case("yuml", 0x000FF)
    .Case("Aacute", 0x000C1)
    .Case("Eacute", 0x000C9)
    .Case("Iacute", 0x000CD)
    .Case("Oacute", 0x000D3)
    .Case("Uacute", 0x000DA)
    .Case("Yacute", 0x000DD)
    .Case("aacute", 0x000E1)
    .Case("eacute", 0x000E9)
    .Case("iacute", 0x000ED)
    .Case("oacute", 0x000F3)
    .Case("uacute", 0x000FA)
    .Case("yacute", 0x000FD)
    .Case("Agrave", 0x000C0)
    .Case("Egrave", 0x000C8)
    .Case("Igrave", 0x000CC)
    .Case("Ograve", 0x000D2)
    .Case("Ugrave", 0x000D9)
    .Case("agrave", 0x000E0)
    .Case("egrave", 0x000E8)
    .Case("igrave", 0x000EC)
    .Case("ograve", 0x000F2)
    .Case("ugrave", 0x000F9)
    .Case("ygrave", 0x01EF3)
    .Case("Acirc", 0x000C2)
    .Case("Ecirc", 0x000CA)
    .Case("Icirc", 0x000CE)
    .Case("Ocirc", 0x000D4)
    .Case("Ucirc", 0x000DB)
    .Case("acirc", 0x000E2)
    .Case("ecirc", 0x000EA)
    .Case("icirc", 0x000EE)
    .Case("ocirc", 0x000F4)
    .Case("ucirc", 0x000FB)
    .Case("ycirc", 0x00177)
    .Case("Atilde", 0x000C3)
    .Case("Ntilde", 0x000D1)
    .Case("Otilde", 0x000D5)
    .Case("atilde", 0x000E3)
    .Case("ntilde", 0x000F1)
    .Case("otilde", 0x000F5)
    .Case("szlig", 0x000DF)
    .Case("ccedil", 0x000E7)
    .Case("Ccedil", 0x000C7)
    .Case("aring", 0x000E5)
    .Case("Aring", 0x000C5)
    .Case("nbsp", 0x000A0)
    .Case("Gamma", 0x00393)
    .Case("Delta", 0x00394)
    .Case("Theta", 0x00398)
    .Case("Lambda", 0x0039B)
    .Case("Xi", 0x0039E)
    .Case("Pi", 0x003A0)
    .Case("Sigma", 0x003A3)
    .Case("Upsilon", 0x003A5)
    .Case("Phi", 0x003A6)
    .Case("Psi", 0x003A8)
    .Case("Omega", 0x003A9)
    .Case("alpha", 0x003B1)
    .Case("beta", 0x003B2)
    .Case("gamma", 0x003B3)
    .Case("delta", 0x003B4)
    .Case("epsilon", 0x003B5)
    .Case("zeta", 0x003B6)
    .Case("eta", 0x003B7)
    .Case("theta", 0x003B8)
    .Case("iota", 0x003B9)
    .Case("kappa", 0x003BA)
    .Case("lambda", 0x003BB)
    .Case("mu", 0x003BC)
    .Case("nu", 0x003BD)
    .Case("xi", 0x003BE)
    .Case("pi", 0x003C0)
    .Case("rho", 0x003C1)
    .Case("sigma", 0x003C3)
    .Case("tau", 0x003C4)
    .Case("upsilon", 0x003C5)
    .Case("phi", 0x003C6)
    .Case("chi", 0x003C7)
    .Case("psi", 0x003C8)
    .Case("omega", 0x003C9)
    .Case("sigmaf", 0x003C2)
    .Case("sect", 0x000A7)
    .Case("deg", 0x000B0)
    .Case("prime", 0x02032)
    .Case("Prime", 0x02033)
    .Case("infin", 0x0221E)
    .Case("empty", 0x02205)
    .Case("plusmn", 0x000B1)
    .Case("times", 0x000D7)
    .Case("minus", 0x02212)
    .Case("sdot", 0x022C5)
    .Case("part", 0x02202)
    .Case("nabla", 0x02207)
    .Case("radic", 0x0221A)
    .Case("perp", 0x022A5)
    .Case("sum", 0x02211)
    .Case("int", 0x0222B)
    .Case("prod", 0x0220F)
    .Case("sim", 0x0223C)
    .Case("asymp", 0x02248)
    .Case("ne", 0x02260)
    .Case("equiv", 0x02261)
    .Case("prop", 0x0221D)
    .Case("le", 0x02264)
    .Case("ge", 0x02265)
    .Case("larr", 0x02190)
    .Case("rarr", 0x02192)
    .Case("isin", 0x02208)
    .Case("notin", 0x02209)
    .Case("lceil", 0x02308)
    .Case("rceil", 0x02309)
    .Case("lfloor", 0x0230A)
    .Case("rfloor", 0x0230B)
    .Default(0);

  // Unknown name: report "not resolved" with an empty string, as before.
  if (CodePoint == 0)
    return StringRef("");

  return helperResolveHTMLHexCharacterReference(CodePoint);
}
|
||||
|
||||
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
|
||||
unsigned CodePoint = 0;
|
||||
@@ -60,23 +231,6 @@ StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
|
||||
return StringRef();
|
||||
}
|
||||
|
||||
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
|
||||
unsigned CodePoint = 0;
|
||||
for (unsigned i = 0, e = Name.size(); i != e; ++i) {
|
||||
CodePoint *= 16;
|
||||
const char C = Name[i];
|
||||
assert(isHTMLHexCharacterReferenceCharacter(C));
|
||||
CodePoint += llvm::hexDigitValue(C);
|
||||
}
|
||||
|
||||
char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
|
||||
char *ResolvedPtr = Resolved;
|
||||
if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
|
||||
return StringRef(Resolved, ResolvedPtr - Resolved);
|
||||
else
|
||||
return StringRef();
|
||||
}
|
||||
|
||||
void Lexer::skipLineStartingDecorations() {
|
||||
// This function should be called only for C comments
|
||||
assert(CommentState == LCS_InsideCComment);
|
||||
@@ -573,8 +727,17 @@ void Lexer::lexHTMLCharacterReference(Token &T) {
|
||||
StringRef Name(NamePtr, TokenPtr - NamePtr);
|
||||
TokenPtr++; // Skip semicolon.
|
||||
StringRef Resolved;
|
||||
if (isNamed)
|
||||
if (isNamed) {
|
||||
Resolved = resolveHTMLNamedCharacterReference(Name);
|
||||
if (Resolved.empty()) {
|
||||
Resolved = HTMLDoxygenCharacterReference(Name);
|
||||
if (!Resolved.empty()) {
|
||||
formTokenWithChars(T, TokenPtr, tok::text);
|
||||
T.setText(Resolved);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (isDecimal)
|
||||
Resolved = resolveHTMLDecimalCharacterReference(Name);
|
||||
else
|
||||
|
||||
28
clang/test/Index/special-html-characters.m
Normal file
28
clang/test/Index/special-html-characters.m
Normal file
@@ -0,0 +1,28 @@
|
||||
// RUN: rm -rf %t
|
||||
// RUN: mkdir %t
|
||||
// RUN: c-index-test -test-load-source all -comments-xml-schema=%S/../../bindings/xml/comment-xml-schema.rng -triple x86_64-apple-darwin10 %s > %t/out
|
||||
// RUN: FileCheck %s < %t/out
|
||||
// rdar://13067629
|
||||
|
||||
// Ensure that XML we generate is not invalid.
|
||||
// RUN: FileCheck %s -check-prefix=WRONG < %t/out
|
||||
// WRONG-NOT: CommentXMLInvalid
|
||||
|
||||
// rdar://12392215
|
||||
@interface I
|
||||
@end
|
||||
|
||||
@implementation I
|
||||
/*!
|
||||
&copy; the copyright symbol
|
||||
&trade; the trade mark symbol
|
||||
&reg; the registered trade mark symbol
|
||||
&nbsp; a non breakable space.
|
||||
&Delta; Greek letter Delta Δ.
|
||||
&Gamma; Greek letter Gamma Γ.
|
||||
*/
|
||||
- (void)phoneHome:(id)sender {
|
||||
|
||||
}
|
||||
@end
|
||||
// CHECK: FullCommentAsHTML=[<p class="para-brief">\t© the copyright symbol\t™ the trade mark symbol ® the registered trade mark symbol\t a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </p>] FullCommentAsXML=[<Function isInstanceMethod="1" file="{{[^"]+}}special-html-characters.m" line="[[@LINE-4]]" column="1"><Name>phoneHome:</Name><USR>c:objc(cs)I(im)phoneHome:</USR><Declaration>- (void)phoneHome:(id)sender;</Declaration><Abstract><Para>\t© the copyright symbol\t™ the trade mark symbol ® the registered trade mark symbol\t a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </Para></Abstract></Function>]
|
||||
Reference in New Issue
Block a user