improve highlighting of invalid string encodings

limit highlight to exactly the bad encoding, and highlight every
bad encoding in a string.

llvm-svn: 166900
This commit is contained in:
Seth Cantrell
2012-10-28 18:24:46 +00:00
parent 59a8e49f01
commit 4cfc817a9a
2 changed files with 70 additions and 15 deletions

View File

@@ -49,6 +49,20 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
}
}
/// \brief Build the character range for a sub-span of a literal token.
///
/// The span is described by two pointers, \p TokRangeBegin and
/// \p TokRangeEnd, whose positions are measured relative to \p TokBegin.
/// The start location is found by advancing \p TokLoc by the distance from
/// \p TokBegin to \p TokRangeBegin; the end location is found by advancing
/// that start location by the length of the span.
///
/// \returns the result of CharSourceRange::getCharRange for the two computed
/// locations.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  // Distance from the start of the token to the start of the span, and the
  // span's own length, both in characters of the token buffer.
  const unsigned RangeOffset = TokRangeBegin - TokBegin;
  const unsigned RangeLength = TokRangeEnd - TokRangeBegin;

  const SourceLocation RangeStart = Lexer::AdvanceToTokenCharacter(
      TokLoc, RangeOffset, TokLoc.getManager(), Features);
  // The end is derived from the already-advanced start, not from TokLoc.
  const SourceLocation RangeStop = Lexer::AdvanceToTokenCharacter(
      RangeStart, RangeLength, TokLoc.getManager(), Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
/// \brief Produce a diagnostic highlighting some portion of a literal.
///
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
@@ -61,11 +75,8 @@ static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
SourceLocation Begin =
Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
TokLoc.getManager(), Features);
SourceLocation End =
Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
TokLoc.getManager(), Features);
return Diags->Report(Begin, DiagID)
<< CharSourceRange::getCharRange(Begin, End);
return Diags->Report(Begin, DiagID) <<
MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
}
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
@@ -1372,6 +1383,15 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
}
}
/// \brief Find where scanning should resume after a badly encoded byte.
///
/// Starting at \p err, skips that byte and then any following bytes of the
/// form 10xxxxxx, without going past \p end and without going further than
/// getNumBytesForUTF8(*err) bytes from \p err (presumably the sequence length
/// implied by the lead byte -- confirm against ConvertUTF).
///
/// \returns \p end if \p err already equals \p end; otherwise a pointer
/// strictly greater than \p err marking the end of the skipped bytes.
static const char *resync_utf8(const char *err, const char *end) {
  if (err == end)
    return end;

  // Never look further than the remaining input or the length reported for
  // the first byte, whichever is smaller.
  const char *limit =
      err + std::min<unsigned>(getNumBytesForUTF8(*err), end - err);

  // The byte at err is always consumed; after that only bytes whose top two
  // bits are 10 are skipped.
  const char *cursor = err + 1;
  while (cursor != limit && (*cursor & 0xC0) == 0x80)
    ++cursor;
  return cursor;
}
/// \brief This function copies from Fragment, which is a sequence of bytes
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
/// Performs widening for multi-byte characters.
@@ -1381,7 +1401,6 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
const UTF8 *ErrorPtrTmp;
if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
return false;
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
// If we see bad encoding for unprefixed string literals, warn and
// simply copy the byte values, for compatibility with gcc and older
@@ -1391,12 +1410,31 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
memcpy(ResultPtr, Fragment.data(), Fragment.size());
ResultPtr += Fragment.size();
}
if (Diags) {
Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
Fragment.end() - ErrorPtr),
NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
: diag::err_bad_string_encoding);
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
FullSourceLoc SourceLoc(Tok.getLocation(), SM);
const DiagnosticBuilder &Builder =
Diag(Diags, Features, SourceLoc, TokBegin,
ErrorPtr, resync_utf8(ErrorPtr, Fragment.end()),
NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
: diag::err_bad_string_encoding);
char *SavedResultPtr = ResultPtr;
const char *NextStart = resync_utf8(ErrorPtr, Fragment.end());
StringRef NextFragment(NextStart, Fragment.end()-NextStart);
while (!ConvertUTF8toWide(CharByteWidth, NextFragment, ResultPtr,
ErrorPtrTmp)) {
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
NextStart = resync_utf8(ErrorPtr, Fragment.end());
Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
ErrorPtr, NextStart);
NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
}
ResultPtr = SavedResultPtr;
}
return !NoErrorOnBadEncoding;
}

View File

@@ -1,16 +1,33 @@
// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck -strict-whitespace %s
void foo() {
"§Ã"; // ø
// CHECK: {{^ "<A7><C3>"; // <F8>}}
// CHECK: {{^ \^~~~}}
// CHECK: {{^ \^~~~~~~}}
/* þ« */ const char *d = "¥";
// CHECK: {{^ /\* <FE><AB> \*/ const char \*d = "<A5>";}}
// CHECK: {{^ \^~~~}}
// CHECK: {{^ "<A7><C3>"; // <F8>}}
// CHECK: {{^ \^~~~~~~~~~}}
"xxé¿¿¿d";
// CHECK: {{^ "xx<U\+9FFF><BF>d";}}
// CHECK: {{^ \^~~~}}
"xxé¿bcd";
// CHECK: {{^ "xx<E9><BF>bcd";}}
// CHECK: {{^ \^~~~~~~~}}
"xxéabcd";
// CHECK: {{^ "xx<E9>abcd";}}
// CHECK: {{^ \^~~~}}
"xxé¿é¿d";
// CHECK: {{^ "xx<E9><BF><E9><BF>d";}}
// CHECK: {{^ \^~~~~~~~~~~~~~~}}
"xxé¿xxxxxxxxxxxxxxxxxxxxxé¿xx";
// CHECK: {{^ "xx<E9><BF>xxxxxxxxxxxxxxxxxxxxx<E9><BF>xx";}}
// CHECK: {{^ \^~~~~~~~ ~~~~~~~~}}
}