Preprocessor: preserve whitespace in -traditional-cpp mode.

Note that, unlike GNU cpp, we currently do not preserve whitespace in macros
(even in -traditional-cpp mode).

<rdar://problem/12897179>

llvm-svn: 175778
2026-02-03 10:39:35 +08:00 · 2013-02-21 18:53:19 +00:00
parent 50f0c80341
commit cb8a1aca35
5 changed files with 99 additions and 32 deletions
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -174,8 +174,8 @@ public:
  /// SetKeepWhitespaceMode - This method lets clients enable or disable
  /// whitespace retention mode.
  void SetKeepWhitespaceMode(bool Val) {
-    assert((!Val || LexingRawMode) &&
-           "Can only enable whitespace retention in raw mode");
+    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
+           "Can only retain whitespace in raw mode or -traditional-cpp");
    ExtendedTokenMode = Val ? 2 : 0;
  }

@@ -194,6 +194,14 @@ public:
    ExtendedTokenMode = Mode ? 1 : 0;
  }

+  /// Sets the extended token mode back to its initial value, according to the
+  /// language options and preprocessor. This controls whether the lexer
+  /// produces comment and whitespace tokens.
+  ///
+  /// This requires the lexer to have an associated preprocessor. A standalone
+  /// lexer has nothing to reset to.
+  void resetExtendedTokenMode();
+
  const char *getBufferStart() const { return BufferStart; }

  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
--- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp
+++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp
@@ -548,7 +548,7 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,

      // Tokens that can contain embedded newlines need to adjust our current
      // line number.
-      if (Tok.getKind() == tok::comment)
+      if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown)
        Callbacks->HandleNewlinesInToken(TokPtr, Len);
    } else {
      std::string S = PP.getSpelling(Tok);
@@ -556,7 +556,7 @@ static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,

      // Tokens that can contain embedded newlines need to adjust our current
      // line number.
-      if (Tok.getKind() == tok::comment)
+      if (Tok.getKind() == tok::comment || Tok.getKind() == tok::unknown)
        Callbacks->HandleNewlinesInToken(&S[0], S.size());
    }
    Callbacks->setEmittedTokensOnThisLine();
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -122,8 +122,15 @@ Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

-  // Default to keeping comments if the preprocessor wants them.
-  SetCommentRetentionState(PP.getCommentRetentionState());
+  resetExtendedTokenMode();
+}
+
+void Lexer::resetExtendedTokenMode() {
+  assert(PP && "Cannot reset token mode without a preprocessor");
+  if (LangOpts.TraditionalCPP)
+    SetKeepWhitespaceMode(true);
+  else
+    SetCommentRetentionState(PP->getCommentRetentionState());
 }

 /// Lexer constructor - Create a new raw lexer object.  This object is only
@@ -1844,6 +1851,8 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
 ///
 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
+  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
+
  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
@@ -1851,7 +1860,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
-    if (Char != '\n' && Char != '\r')
+    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
@@ -1861,24 +1870,27 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
    }

    // ok, but handle newline.
-    // The returned token is at the start of the line.
-    Result.setFlag(Token::StartOfLine);
-    // No leading whitespace seen so far.
-    Result.clearFlag(Token::LeadingSpace);
+    SawNewline = true;
    Char = *++CurPtr;
  }

-  // If this isn't immediately after a newline, there is leading space.
-  char PrevChar = CurPtr[-1];
-  if (PrevChar != '\n' && PrevChar != '\r')
-    Result.setFlag(Token::LeadingSpace);
-
  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
+    if (SawNewline)
+      IsAtStartOfLine = true;
+    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

+  // If this isn't immediately after a newline, there is leading space.
+  char PrevChar = CurPtr[-1];
+  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
+
+  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
+  if (SawNewline)
+    Result.setFlag(Token::StartOfLine);
+
  BufferPtr = CurPtr;
  return false;
 }
@@ -2269,7 +2281,6 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
-    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }
@@ -2351,7 +2362,7 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
-    SetCommentRetentionState(PP->getCommentRetentionState());
+    resetExtendedTokenMode();
    return true;  // Have a token.
  }
 
@@ -2718,6 +2729,7 @@ LexNextToken:
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
+      // FIXME: The next token will not have LeadingSpace set.
      return;
    }

@@ -2785,7 +2797,7 @@ LexNextToken:

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
-        SetCommentRetentionState(PP->getCommentRetentionState());
+        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
@@ -2793,8 +2805,7 @@ LexNextToken:
      Kind = tok::eod;
      break;
    }
-    // The returned token is at the start of the line.
-    Result.setFlag(Token::StartOfLine);
+
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -269,7 +269,7 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
    if (Tok.isNot(tok::raw_identifier)) {
      CurPPLexer->ParsingPreprocessorDirective = false;
      // Restore comment saving mode.
-      if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+      if (CurLexer) CurLexer->resetExtendedTokenMode();
      continue;
    }

@@ -285,7 +285,7 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
        FirstChar != 'i' && FirstChar != 'e') {
      CurPPLexer->ParsingPreprocessorDirective = false;
      // Restore comment saving mode.
-      if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+      if (CurLexer) CurLexer->resetExtendedTokenMode();
      continue;
    }

@@ -302,7 +302,7 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,
      if (IdLen >= 20) {
        CurPPLexer->ParsingPreprocessorDirective = false;
        // Restore comment saving mode.
-        if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+        if (CurLexer) CurLexer->resetExtendedTokenMode();
        continue;
      }
      memcpy(DirectiveBuf, &DirectiveStr[0], IdLen);
@@ -408,7 +408,7 @@ void Preprocessor::SkipExcludedConditionalBlock(SourceLocation IfTokenLoc,

    CurPPLexer->ParsingPreprocessorDirective = false;
    // Restore comment saving mode.
-    if (CurLexer) CurLexer->SetCommentRetentionState(KeepComments);
+    if (CurLexer) CurLexer->resetExtendedTokenMode();
  }

  // Finally, if we are out of the conditional (saw an #endif or ran off the end
@@ -594,6 +594,7 @@ void Preprocessor::HandleDirective(Token &Result) {
  // mode.  Tell the lexer this so any newlines we see will be converted into an
  // EOD token (which terminates the directive).
  CurPPLexer->ParsingPreprocessorDirective = true;
+  if (CurLexer) CurLexer->SetKeepWhitespaceMode(false);

  ++NumDirectives;

@@ -638,14 +639,9 @@ void Preprocessor::HandleDirective(Token &Result) {
  // and reset to previous state when returning from this function.
  ResetMacroExpansionHelper helper(this);

-TryAgain:
  switch (Result.getKind()) {
  case tok::eod:
    return;   // null directive.
-  case tok::comment:
-    // Handle stuff like "# /*foo*/ define X" in -E -C mode.
-    LexUnexpandedToken(Result);
-    goto TryAgain;
  case tok::code_completion:
    if (CodeComplete)
      CodeComplete->CodeCompleteDirective(
--- a/clang/test/Preprocessor/traditional-cpp.c
+++ b/clang/test/Preprocessor/traditional-cpp.c
@@ -4,9 +4,61 @@

 /*
 RUN: %clang_cc1 -traditional-cpp %s -E -o %t
- RUN: FileCheck < %t %s
+ RUN: FileCheck -strict-whitespace < %t %s
 */

-/* CHECK: foo // bar
+/* CHECK: {{^}}foo // bar{{$}}
 */
 foo // bar
+
+
+/* The lines in this file contain hard tab characters and trailing whitespace; 
+ * do not change them! */
+
+/* CHECK: {{^}}	indented!{{$}}
+ * CHECK: {{^}}tab	separated	values{{$}}
+ */
+	indented!
+tab	separated	values
+
+#define bracket(x) >>>x<<<
+bracket(|  spaces  |)
+/* CHECK: {{^}}>>>|  spaces  |<<<{{$}}
+ */
+
+/* This is still a preprocessing directive. */
+# define foo bar
+foo!
+-
+	foo!	foo!	
+/* CHECK: {{^}}bar!{{$}}
+ * CHECK: {{^}}	bar!	bar!	{{$}}
+ */
+
+/* Deliberately check a leading newline with spaces on that line. */
+   
+# define foo bar
+foo!
+-
+	foo!	foo!	
+/* CHECK: {{^}}bar!{{$}}
+ * CHECK: {{^}}	bar!	bar!	{{$}}
+ */
+
+/* FIXME: -traditional-cpp should not consider this a preprocessing directive
+ * because the # isn't in the first column.
+ */
+ #define foo2 bar
+foo2!
+/* If this were working, both of these checks would be on.
+ * CHECK-NOT: {{^}} #define foo2 bar{{$}}
+ * CHECK-NOT: {{^}}foo2!{{$}}
+ */
+
+/* FIXME: -traditional-cpp should not homogenize whitespace in macros.
+ */
+#define bracket2(x) >>>  x  <<<
+bracket2(spaces)
+/* If this were working, this check would be on.
+ * CHECK-NOT: {{^}}>>>  spaces  <<<{{$}}
+ */