[Doc parsing] Patch to parse Doxygen-supported HTML character

references to their UTF-8 encoding. Reviewed offline by Doug.
// rdar://12392215

llvm-svn: 173850
This commit is contained in:
Fariborz Jahanian
2013-01-29 23:42:26 +00:00
parent 5e9d55eca0
commit 7b3ae19048
3 changed files with 216 additions and 18 deletions

View File

@@ -282,11 +282,18 @@ private:
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
/// Given the name of a Doxygen-supported named character reference
/// (e.g., "trade" for "&trade;"), return its UTF-8 encoding.  Returns an
/// empty string if the name is not recognized.
StringRef HTMLDoxygenCharacterReference(StringRef Name) const;
/// Given a Unicode code point written as a base-10 integer, return the
/// character.
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode code point written as a base-16 integer, return the
/// character as UTF-8, or an empty string if it cannot be converted.
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
/// Convert an already-parsed code point to UTF-8.  Shared by
/// resolveHTMLHexCharacterReference and HTMLDoxygenCharacterReference.
/// Returns an empty string if the conversion fails.
StringRef helperResolveHTMLHexCharacterReference(unsigned CodePoint) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind) {

View File

@@ -34,6 +34,31 @@ bool isHTMLHexCharacterReferenceCharacter(char C) {
} // unnamed namespace
/// Interpret \p Name as a run of hexadecimal digits and return the value they
/// spell.  Every character is asserted to be a valid hex reference character.
static unsigned getCodePoint(StringRef Name) {
  unsigned Value = 0;
  for (StringRef::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
    assert(isHTMLHexCharacterReferenceCharacter(*I));
    // Make room for the next nibble, then add it in.
    Value = Value * 16 + llvm::hexDigitValue(*I);
  }
  return Value;
}
/// Convert \p CodePoint to UTF-8 in storage obtained from Allocator.
///
/// \returns the encoded bytes, or an empty StringRef if
/// ConvertCodePointToUTF8 reports failure.
StringRef Lexer::helperResolveHTMLHexCharacterReference(unsigned CodePoint) const {
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter takes the write cursor by reference; the distance it has
  // moved afterwards is used as the encoded length.
  char *End = Begin;
  if (!ConvertCodePointToUTF8(CodePoint, End))
    return StringRef();
  return StringRef(Begin, End - Begin);
}
/// Resolve a hexadecimal character reference: parse the digits in \p Name
/// into a code point and hand it to the UTF-8 helper.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  return helperResolveHTMLHexCharacterReference(getCodePoint(Name));
}
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
return llvm::StringSwitch<StringRef>(Name)
.Case("amp", "&")
@@ -41,8 +66,154 @@ StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
.Case("gt", ">")
.Case("quot", "\"")
.Case("apos", "\'")
.Case("minus", "-")
.Case("sim", "~")
.Default("");
}
/// Resolve a Doxygen-supported named character reference to UTF-8.
///
/// \param Name the entity name as matched by the StringSwitch below, e.g.
/// "copy" or "Delta".
///
/// \returns the UTF-8 encoding of the corresponding character, or an empty
/// string if \p Name is not in the table.
///
/// The table maps names to code points rather than to already-encoded
/// strings.  StringSwitch evaluates every Case argument up front, so encoding
/// inside the Case list would run the UTF-8 helper (and allocate) once per
/// table entry on every lookup; here only the matching entry is encoded.
StringRef Lexer::HTMLDoxygenCharacterReference(StringRef Name) const {
  // 0 is used as the "not found" marker; no entry below maps to U+0000.
  const unsigned CodePoint = llvm::StringSwitch<unsigned>(Name)
    .Case("copy", 0x000A9)
    .Case("trade", 0x02122)
    .Case("reg", 0x000AE)
    .Case("lt", 0x0003C)
    .Case("gt", 0x0003E) // '>' -- was 0x3C, which is '<'.
    .Case("amp", 0x00026)
    .Case("apos", 0x00027)
    .Case("quot", 0x00022)
    .Case("lsquo", 0x02018)
    .Case("rsquo", 0x02019)
    .Case("ldquo", 0x0201C)
    .Case("rdquo", 0x0201D)
    .Case("ndash", 0x02013)
    .Case("mdash", 0x02014)
    .Case("Auml", 0x000C4)
    .Case("Euml", 0x000CB)
    .Case("Iuml", 0x000CF)
    .Case("Ouml", 0x000D6)
    .Case("Uuml", 0x000DC)
    .Case("Yuml", 0x00178)
    .Case("auml", 0x000E4)
    .Case("euml", 0x000EB)
    .Case("iuml", 0x000EF)
    .Case("ouml", 0x000F6)
    .Case("uuml", 0x000FC)
    .Case("yuml", 0x000FF)
    .Case("Aacute", 0x000C1)
    .Case("Eacute", 0x000C9)
    .Case("Iacute", 0x000CD)
    .Case("Oacute", 0x000D3)
    .Case("Uacute", 0x000DA)
    .Case("Yacute", 0x000DD)
    .Case("aacute", 0x000E1)
    .Case("eacute", 0x000E9)
    .Case("iacute", 0x000ED)
    .Case("oacute", 0x000F3)
    .Case("uacute", 0x000FA)
    .Case("yacute", 0x000FD)
    .Case("Agrave", 0x000C0)
    .Case("Egrave", 0x000C8)
    .Case("Igrave", 0x000CC)
    .Case("Ograve", 0x000D2)
    .Case("Ugrave", 0x000D9)
    .Case("agrave", 0x000E0)
    .Case("egrave", 0x000E8)
    .Case("igrave", 0x000EC)
    .Case("ograve", 0x000F2)
    .Case("ugrave", 0x000F9)
    .Case("ygrave", 0x01EF3)
    .Case("Acirc", 0x000C2)
    .Case("Ecirc", 0x000CA)
    .Case("Icirc", 0x000CE)
    .Case("Ocirc", 0x000D4)
    .Case("Ucirc", 0x000DB)
    .Case("acirc", 0x000E2)
    .Case("ecirc", 0x000EA)
    .Case("icirc", 0x000EE)
    .Case("ocirc", 0x000F4)
    .Case("ucirc", 0x000FB)
    .Case("ycirc", 0x00177)
    .Case("Atilde", 0x000C3)
    .Case("Ntilde", 0x000D1)
    .Case("Otilde", 0x000D5)
    .Case("atilde", 0x000E3)
    .Case("ntilde", 0x000F1)
    .Case("otilde", 0x000F5)
    .Case("szlig", 0x000DF)
    .Case("ccedil", 0x000E7)
    .Case("Ccedil", 0x000C7)
    .Case("aring", 0x000E5)
    .Case("Aring", 0x000C5)
    .Case("nbsp", 0x000A0)
    .Case("Gamma", 0x00393)
    .Case("Delta", 0x00394)
    .Case("Theta", 0x00398)
    .Case("Lambda", 0x0039B)
    .Case("Xi", 0x0039E)
    .Case("Pi", 0x003A0)
    .Case("Sigma", 0x003A3)
    .Case("Upsilon", 0x003A5)
    .Case("Phi", 0x003A6)
    .Case("Psi", 0x003A8)
    .Case("Omega", 0x003A9)
    .Case("alpha", 0x003B1)
    .Case("beta", 0x003B2)
    .Case("gamma", 0x003B3)
    .Case("delta", 0x003B4)
    .Case("epsilon", 0x003B5)
    .Case("zeta", 0x003B6)
    .Case("eta", 0x003B7)
    .Case("theta", 0x003B8)
    .Case("iota", 0x003B9)
    .Case("kappa", 0x003BA)
    .Case("lambda", 0x003BB)
    .Case("mu", 0x003BC)
    .Case("nu", 0x003BD)
    .Case("xi", 0x003BE)
    .Case("pi", 0x003C0)
    .Case("rho", 0x003C1)
    .Case("sigma", 0x003C3)
    .Case("tau", 0x003C4)
    .Case("upsilon", 0x003C5)
    .Case("phi", 0x003C6)
    .Case("chi", 0x003C7)
    .Case("psi", 0x003C8)
    .Case("omega", 0x003C9)
    .Case("sigmaf", 0x003C2)
    .Case("sect", 0x000A7)
    .Case("deg", 0x000B0)
    .Case("prime", 0x02032)
    .Case("Prime", 0x02033)
    .Case("infin", 0x0221E)
    .Case("empty", 0x02205)
    .Case("plusmn", 0x000B1)
    .Case("times", 0x000D7)
    .Case("minus", 0x02212)
    .Case("sdot", 0x022C5)
    .Case("part", 0x02202)
    .Case("nabla", 0x02207)
    .Case("radic", 0x0221A)
    .Case("perp", 0x022A5)
    .Case("sum", 0x02211)
    .Case("int", 0x0222B)
    .Case("prod", 0x0220F)
    .Case("sim", 0x0223C)
    .Case("asymp", 0x02248)
    .Case("ne", 0x02260)
    .Case("equiv", 0x02261)
    .Case("prop", 0x0221D)
    .Case("le", 0x02264)
    .Case("ge", 0x02265)
    .Case("larr", 0x02190)
    .Case("rarr", 0x02192)
    .Case("isin", 0x02208)
    .Case("notin", 0x02209)
    .Case("lceil", 0x02308)
    .Case("rceil", 0x02309)
    .Case("lfloor", 0x0230A)
    .Case("rfloor", 0x0230B)
    .Default(0);

  // Unknown name: report "no match" with an empty string, as before.
  if (CodePoint == 0)
    return "";
  return helperResolveHTMLHexCharacterReference(CodePoint);
}
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
unsigned CodePoint = 0;
@@ -60,23 +231,6 @@ StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
return StringRef();
}
/// Resolve a hexadecimal character reference: accumulate the hex digits of
/// \p Name into a code point and convert it to UTF-8 in storage obtained from
/// Allocator.
///
/// \returns the encoded bytes, or an empty StringRef if
/// ConvertCodePointToUTF8 reports failure.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  unsigned Value = 0;
  for (StringRef::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
    assert(isHTMLHexCharacterReferenceCharacter(*I));
    Value = Value * 16 + llvm::hexDigitValue(*I);
  }

  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter takes the write cursor by reference; the distance it has
  // moved afterwards is used as the encoded length.
  char *End = Begin;
  if (!ConvertCodePointToUTF8(Value, End))
    return StringRef();
  return StringRef(Begin, End - Begin);
}
void Lexer::skipLineStartingDecorations() {
// This function should be called only for C comments
assert(CommentState == LCS_InsideCComment);
@@ -573,8 +727,17 @@ void Lexer::lexHTMLCharacterReference(Token &T) {
StringRef Name(NamePtr, TokenPtr - NamePtr);
TokenPtr++; // Skip semicolon.
StringRef Resolved;
if (isNamed)
if (isNamed) {
Resolved = resolveHTMLNamedCharacterReference(Name);
if (Resolved.empty()) {
Resolved = HTMLDoxygenCharacterReference(Name);
if (!Resolved.empty()) {
formTokenWithChars(T, TokenPtr, tok::text);
T.setText(Resolved);
return;
}
}
}
else if (isDecimal)
Resolved = resolveHTMLDecimalCharacterReference(Name);
else

View File

@@ -0,0 +1,28 @@
// RUN: rm -rf %t
// RUN: mkdir %t
// RUN: c-index-test -test-load-source all -comments-xml-schema=%S/../../bindings/xml/comment-xml-schema.rng -triple x86_64-apple-darwin10 %s > %t/out
// RUN: FileCheck %s < %t/out
// rdar://13067629
// Ensure that XML we generate is not invalid.
// RUN: FileCheck %s -check-prefix=WRONG < %t/out
// WRONG-NOT: CommentXMLInvalid
// rdar://12392215
// Empty class; it exists only to host the documented method below.
@interface I
@end
// The documentation comment on -phoneHome: uses several named character
// references; the FileCheck expectations at the end of this file verify the
// HTML and XML generated from it.
@implementation I
/*!
&copy; the copyright symbol
&trade; the trade mark symbol
&reg; the registered trade mark symbol
&nbsp; a non breakable space.
&Delta; Greek letter Delta Δ.
&Gamma; Greek letter Gamma Γ.
*/
- (void)phoneHome:(id)sender {
}
@end
// CHECK: FullCommentAsHTML=[<p class="para-brief">\t© the copyright symbol\t the trade mark symbol ® the registered trade mark symbol\t  a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </p>] FullCommentAsXML=[<Function isInstanceMethod="1" file="{{[^"]+}}special-html-characters.m" line="[[@LINE-4]]" column="1"><Name>phoneHome:</Name><USR>c:objc(cs)I(im)phoneHome:</USR><Declaration>- (void)phoneHome:(id)sender;</Declaration><Abstract><Para>\t© the copyright symbol\t the trade mark symbol ® the registered trade mark symbol\t  a non breakable space. Δ Greek letter Delta Δ. Γ Greek letter Gamma Γ. </Para></Abstract></Function>]