doxygen/CommentLexer_8cpp_source.html

//===--- CommentLexer.cpp -------------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


#include "clang/AST/CommentLexer.h"

#include "clang/AST/CommentCommandTraits.h"

#include "clang/Basic/CharInfo.h"

#include "clang/Basic/DiagnosticComment.h"

#include "llvm/ADT/StringExtras.h"

#include "llvm/ADT/StringSwitch.h"

#include "llvm/Support/ConvertUTF.h"

#include "llvm/Support/ErrorHandling.h"


namespace clang {

namespace comments {


/// Writes a one-line debug description of this token to llvm::errs(): its
/// kind, its source location, its length and its spelling in double quotes.
void Token::dump(const Lexer &L, const SourceManager &SM) const {
  auto &OS = llvm::errs();
  OS << "comments::Token Kind=" << Kind << " ";
  Loc.print(OS, SM);
  OS << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
}


/// Returns true if \p C may appear in the name part of an HTML named
/// character reference (the text between '&' and ';').  Only letters, as
/// decided by isLetter(), are accepted.
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}


/// Returns true if \p C may appear in the numeric part of a decimal HTML
/// character reference (the digits after "&#").  Defers to isDigit().
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}


/// Returns true if \p C may appear in the numeric part of a hexadecimal HTML
/// character reference (the digits after "&#x").  Defers to isHexDigit().
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}


/// Encodes \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns a StringRef over the encoded bytes, or an empty StringRef when
/// llvm::ConvertCodePointToUTF8() reports failure.
static inline StringRef convertCodePointToUTF8(
                                      llvm::BumpPtrAllocator &Allocator,
                                      unsigned CodePoint) {
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter advances this cursor past the bytes it wrote.
  char *Cursor = Begin;
  if (!llvm::ConvertCodePointToUTF8(CodePoint, Cursor))
    return StringRef();
  return StringRef(Begin, Cursor - Begin);
}


namespace {


#include "clang/AST/CommentHTMLTags.inc"

#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"


} // end anonymous namespace


/// Resolves the name of an HTML named character reference (the text between
/// '&' and ';', e.g. "amp") to its replacement text.
///
/// \param Name the reference name without the surrounding '&' and ';'.
/// \returns the replacement text for the handful of hard-coded common names,
/// otherwise whatever translateHTMLNamedCharacterReferenceToUTF8() returns
/// for \p Name.
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // Fast path, first check a few most widely used named character references.
  //
  // The table lookup is intentionally NOT passed to .Default(): a function
  // argument is evaluated before the call, so doing that would run the slow
  // lookup for every name, including the ones matched here.
  const StringRef Common = llvm::StringSwitch<StringRef>(Name)
      .Case("amp", "&")
      .Case("lt", "<")
      .Case("gt", ">")
      .Case("quot", "\"")
      .Case("apos", "\'")
      .Default(StringRef());
  // Every fast-path replacement is non-empty, so "empty" means "no match".
  if (!Common.empty())
    return Common;

  // Slow path.
  return translateHTMLNamedCharacterReferenceToUTF8(Name);
}


/// Resolves the digits of a decimal HTML character reference (the text
/// between "&#" and ';') to the UTF-8 encoding of that code point.
///
/// \param Name the decimal digits only; every character must satisfy
/// isHTMLDecimalCharacterReferenceCharacter().
/// \returns the UTF-8 text, or an empty StringRef if the value cannot be
/// encoded (including values larger than the highest Unicode code point).
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // Highest Unicode code point.  Stopping as soon as the running value passes
  // it keeps the accumulator from wrapping around on long digit strings and
  // accidentally landing on a valid code point.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    CodePoint *= 10;
    CodePoint += Name[i] - '0';
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}


/// Resolves the digits of a hexadecimal HTML character reference (the text
/// between "&#x" and ';') to the UTF-8 encoding of that code point.
///
/// \param Name the hex digits only; every character must satisfy
/// isHTMLHexCharacterReferenceCharacter().
/// \returns the UTF-8 text, or an empty StringRef if the value cannot be
/// encoded (including values larger than the highest Unicode code point).
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // Highest Unicode code point.  Stopping as soon as the running value passes
  // it keeps the accumulator from wrapping around on long digit strings and
  // accidentally landing on a valid code point.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    CodePoint *= 16;
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    CodePoint += llvm::hexDigitValue(C);
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}


/// Skips the decoration at the start of a C-comment line: optional horizontal
/// whitespace followed by a single '*'.  BufferPtr is only moved when that
/// '*' is actually found before CommentEnd; otherwise it is left untouched.
void Lexer::skipLineStartingDecorations() {
  // This function should be called only for C comments
  assert(CommentState == LCS_InsideCComment);

  const char *Cursor = BufferPtr;
  for (; Cursor != CommentEnd && isHorizontalWhitespace(*Cursor); ++Cursor)
    ;

  // Ran out of comment before seeing anything but whitespace.
  if (Cursor == CommentEnd)
    return;

  if (*Cursor == '*')
    BufferPtr = Cursor + 1;
}


namespace {

/// Scans [BufferPtr, BufferEnd) for the first vertical-whitespace character.
///
/// \returns a pointer to that character, or \p BufferEnd if there is none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && !isVerticalWhitespace(*Cur))
    ++Cur;
  return Cur;
}


/// Steps over one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
/// \returns a pointer just past the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *Next = BufferPtr + 1;
  // Treat "\r\n" as a single terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}


/// Advances over the longest prefix of [BufferPtr, BufferEnd) made only of
/// characters accepted by isHTMLNamedCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipNamedCharacterReference(const char *BufferPtr,
                                        const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isHTMLNamedCharacterReferenceCharacter(*Cur))
    ++Cur;
  return Cur;
}


/// Advances over the longest prefix of [BufferPtr, BufferEnd) made only of
/// characters accepted by isHTMLDecimalCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipDecimalCharacterReference(const char *BufferPtr,
                                          const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isHTMLDecimalCharacterReferenceCharacter(*Cur))
    ++Cur;
  return Cur;
}


/// Advances over the longest prefix of [BufferPtr, BufferEnd) made only of
/// characters accepted by isHTMLHexCharacterReferenceCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipHexCharacterReference(const char *BufferPtr,
                                      const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isHTMLHexCharacterReferenceCharacter(*Cur))
    ++Cur;
  return Cur;
}


/// Returns true if \p C can be the first character of an HTML identifier
/// (tag or attribute name).  Only letters, as decided by isLetter(), qualify.
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}


/// Returns true if \p C can appear inside an HTML identifier (tag or
/// attribute name).  Letters and digits, as decided by isAlphanumeric(),
/// qualify.
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}


/// Advances over the longest prefix of [BufferPtr, BufferEnd) made only of
/// characters accepted by isHTMLIdentifierCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isHTMLIdentifierCharacter(*Cur))
    ++Cur;
  return Cur;
}


/// Finds the end of an HTML string that starts at \p BufferPtr with a single
/// or double quote.  A quote character directly preceded by a backslash does
/// not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated inside the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    // Cur[-1] is always inside the string: Cur starts one past the opener.
    const bool Escaped = Cur[-1] == '\\';
    if (*Cur == Quote && !Escaped)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}


/// Advances over leading whitespace (as decided by the single-character
/// isWhitespace()) in [BufferPtr, BufferEnd).
///
/// \returns a pointer to the first non-whitespace character, or \p BufferEnd.
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isWhitespace(*Cur))
    ++Cur;
  return Cur;
}


/// Returns true if the range [BufferPtr, BufferEnd) consists solely of
/// whitespace, i.e. skipWhitespace() runs all the way to \p BufferEnd.  An
/// empty range therefore yields true.
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
}


/// Returns true if \p C can be the first character of a comment command name
/// (the text after '\' or '@').  Only letters, as decided by isLetter(),
/// qualify.
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}


/// Returns true if \p C can appear inside a comment command name.  Letters
/// and digits, as decided by isAlphanumeric(), qualify.
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}


/// Advances over the longest prefix of [BufferPtr, BufferEnd) made only of
/// characters accepted by isCommandNameCharacter().
///
/// \returns a pointer to the first rejected character, or \p BufferEnd.
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && isCommandNameCharacter(*Cur))
    ++Cur;
  return Cur;
}


/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with backslash or with the trigraph for backslash
/// ("??/"); horizontal whitespace between the escape and the newline is
/// tolerated.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  for (const char *Cur = BufferPtr; Cur != BufferEnd;) {
    // Move forward to the next newline, if there is one.
    while (!isVerticalWhitespace(*Cur)) {
      if (++Cur == BufferEnd)
        return BufferEnd;
    }

    // Walk back over trailing horizontal whitespace to the last significant
    // character of the line.
    const char *Last = Cur - 1;
    while (isHorizontalWhitespace(*Last))
      --Last;

    const bool Escaped =
        *Last == '\\' ||
        (Last - 2 >= BufferPtr && Last[0] == '/' && Last[-1] == '?' &&
         Last[-2] == '?');
    if (!Escaped)
      return Cur; // Plain newline: the comment ends here.

    // Escaped newline: the comment continues on the next line.
    Cur = skipNewline(Cur, BufferEnd);
  }
  return BufferEnd;
}


/// Return the one past end pointer for C comments, i.e. a pointer to the '*'
/// of the first "*/" in the buffer.
/// Deliberately simple: escaped newlines and trigraphs are not handled.  The
/// buffer is expected to contain "*/"; running off the end is treated as
/// unreachable.
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd) {
    if (*Cur == '*') {
      assert(Cur + 1 != BufferEnd);
      if (Cur[1] == '/')
        return Cur;
    }
    ++Cur;
  }
  llvm_unreachable("buffer end hit before '*/' was seen");
}


} // end anonymous namespace


/// Fills \p Result with a token of kind \p Kind covering [BufferPtr, TokEnd)
/// and then advances BufferPtr to \p TokEnd.
///
/// In builds with assertions enabled the token's text pointer and integer
/// payload are set to recognizable placeholder values.
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
                               tok::TokenKind Kind) {
  // The location must be taken before BufferPtr is moved below.
  const SourceLocation StartLoc = getSourceLocation(BufferPtr);
  Result.setLocation(StartLoc);
  Result.setKind(Kind);
  Result.setLength(static_cast<unsigned>(TokEnd - BufferPtr));
#ifndef NDEBUG
  Result.TextPtr = "<UNSET>";
  Result.IntVal = 7;
#endif
  BufferPtr = TokEnd;
}


const char *Lexer::skipTextToken() {

  const char *TokenPtr = BufferPtr;

  assert(TokenPtr < CommentEnd);

  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";


again:

  size_t End =

      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);

  if (End == StringRef::npos)

    return CommentEnd;


  // Doxygen doesn't recognize any commands in a one-line double quotation.

  // If we don't find an ending quotation mark, we pretend it never began.

  if (*(TokenPtr + End) == '\"') {

    TokenPtr += End + 1;

    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");

    if (End != StringRef::npos && *(TokenPtr + End) == '\"')

      TokenPtr += End + 1;

    goto again;

  }

  return TokenPtr + End;

}


/// Lexes one token from inside a comment body, starting at BufferPtr.
///
/// When command parsing is disabled only text and newline tokens are
/// produced.  Otherwise the call is dispatched on the current lexer State to
/// the matching verbatim/HTML sub-lexer, and in the normal state on the first
/// character: '\' or '@' starts a command or escape sequence, '&' an HTML
/// character reference, '<' an HTML tag, anything else text or a newline.
///
/// Must only be called while inside a BCPL or C comment with
/// BufferPtr < CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may begin with "  * " decoration.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default:
        return formTextToken(T, skipTextToken());
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Any state other than LS_Normal is handled by its dedicated sub-lexer.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A lone marker at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token covers the marker too, but its text is only what follows
        // the marker.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, not counting the leading marker.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
            C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        // Unknown name: try a typo correction before giving up.
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          // The fix-it replaces only the name, not the marker character.
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          // Fall through and lex as the corrected command.
        } else {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      if (Info->IsVerbatimBlockCommand) {
        // *BufferPtr is the marker ('\' or '@') that introduced the command.
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A lone '<' at the very end of the comment is just text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}


/// Emits the tok::verbatim_block_begin token for a verbatim block command and
/// prepares the lexer for the block's contents.
///
/// \param T         receives the begin token.
/// \param TextBegin one past the end of the opening command.
/// \param Marker    the character that introduced the command; the end
///                  command is expected with a backslash if this is a
///                  backslash, and with '@' otherwise.
/// \param Info      the opening command, which must be a verbatim block
///                  command.
void Lexer::setupAndLexVerbatimBlock(Token &T,
                                     const char *TextBegin,
                                     char Marker, const CommandInfo *Info) {
  assert(Info->IsVerbatimBlockCommand);

  // Remember the exact spelling of the command that closes this block.
  const char *MarkerText = (Marker == '\\') ? "\\" : "@";
  VerbatimBlockEndCommandName.clear();
  VerbatimBlockEndCommandName.append(MarkerText);
  VerbatimBlockEndCommandName.append(Info->EndCommandName);

  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  T.setVerbatimBlockID(Info->getID());

  const bool NewlineFollows =
      BufferPtr != CommentEnd && isVerticalWhitespace(*BufferPtr);
  if (!NewlineFollows) {
    State = LS_VerbatimBlockFirstLine;
    return;
  }

  // A newline directly follows the opening command: skip it so that no
  // tok::verbatim_block_line with empty text content is created.
  BufferPtr = skipNewline(BufferPtr, CommentEnd);
  State = LS_VerbatimBlockBody;
}


/// Lexes one token from the current line of a verbatim block.
///
/// Produces either a tok::verbatim_block_line holding the text up to the end
/// of the line (or up to the block's end command), or, when the end command
/// is at the current position, a tok::verbatim_block_end, after which the
/// lexer returns to LS_Normal.  Requires BufferPtr < CommentEnd.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
again:
  assert(BufferPtr < CommentEnd);

  // FIXME: It would be better to scan the text once, finding either the block
  // end command or newline.
  //
  // Extract current line.
  const char *Newline = findNewline(BufferPtr, CommentEnd);
  StringRef Line(BufferPtr, Newline - BufferPtr);

  // Look for end command in current line.
  size_t Pos = Line.find(VerbatimBlockEndCommandName);
  const char *TextEnd;  // End of the verbatim text to emit.
  const char *NextLine; // Where lexing resumes after the emitted token.
  if (Pos == StringRef::npos) {
    // Current line is completely verbatim.
    TextEnd = Newline;
    NextLine = skipNewline(Newline, CommentEnd);
  } else if (Pos == 0) {
    // The end command is right at the current position.
    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    // Drop the leading marker character to get the bare command name.
    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    formTokenWithChars(T, End, tok::verbatim_block_end);
    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    State = LS_Normal;
    return;
  } else {
    // There is some text, followed by end command.  Extract text first.
    TextEnd = BufferPtr + Pos;
    NextLine = TextEnd;
    // If there is only whitespace before end command, skip whitespace.
    if (isWhitespace(BufferPtr, TextEnd)) {
      BufferPtr = TextEnd;
      // Retry from the end command, which is now at position 0.
      goto again;
    }
  }

  // Capture the text before formTokenWithChars() moves BufferPtr.
  StringRef Text(BufferPtr, TextEnd - BufferPtr);
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  T.setVerbatimBlockText(Text);

  State = LS_VerbatimBlockBody;
}


/// Lexes one line of a verbatim block body.  In a C comment the line-start
/// decoration is skipped first.  If nothing is left in the comment an empty
/// tok::verbatim_block_line is produced; otherwise the line is handled by
/// lexVerbatimBlockFirstLine().
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  // The comment ended inside the block: emit a zero-length line token.
  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}


/// Emits the tok::verbatim_line_name token for a verbatim line command and
/// switches the lexer to LS_VerbatimLineText so that the next token is the
/// line's text.
///
/// \param T         receives the name token.
/// \param TextBegin one past the end of the command; the token extends from
///                  BufferPtr up to here.
/// \param Info      the command, which must be a verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  State = LS_VerbatimLineText;
}


/// Emits a tok::verbatim_line_text token covering everything from BufferPtr
/// up to (not including) the next newline or CommentEnd, then returns the
/// lexer to LS_Normal.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // Remember the start: formTokenWithChars() moves BufferPtr.
  const char *LineBegin = BufferPtr;
  const char *LineEnd = findNewline(LineBegin, CommentEnd);
  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(StringRef(LineBegin, LineEnd - LineBegin));

  State = LS_Normal;
}


/// Lexes an HTML character reference starting at the '&' under BufferPtr.
///
/// Recognizes the named ("&name;"), decimal ("&#123;") and hexadecimal
/// ("&#x1F;" / "&#X1F;") forms.  A well-formed reference that resolves to
/// non-empty text yields a tok::text token whose text is the resolved
/// string.  Anything else is emitted as a plain text token covering the
/// characters consumed so far.
void Lexer::lexHTMLCharacterReference(Token &T) {
  assert(*BufferPtr == '&');
  const char *Cursor = BufferPtr + 1;
  if (Cursor == CommentEnd)
    return formTextToken(T, Cursor);

  enum { RK_Named, RK_Decimal, RK_Hex } RefKind = RK_Hex;
  const char *NameBegin;
  if (isHTMLNamedCharacterReferenceCharacter(*Cursor)) {
    RefKind = RK_Named;
    NameBegin = Cursor;
    Cursor = skipNamedCharacterReference(Cursor, CommentEnd);
  } else if (*Cursor == '#') {
    ++Cursor;
    if (Cursor == CommentEnd)
      return formTextToken(T, Cursor);

    const char Lead = *Cursor;
    if (isHTMLDecimalCharacterReferenceCharacter(Lead)) {
      RefKind = RK_Decimal;
      NameBegin = Cursor;
      Cursor = skipDecimalCharacterReference(Cursor, CommentEnd);
    } else if (Lead == 'x' || Lead == 'X') {
      ++Cursor; // Step over the 'x'.
      NameBegin = Cursor;
      Cursor = skipHexCharacterReference(Cursor, CommentEnd);
    } else {
      return formTextToken(T, Cursor);
    }
  } else {
    return formTextToken(T, Cursor);
  }

  // A reference needs at least one name character and a terminating ';'.
  const bool WellFormed =
      NameBegin != Cursor && Cursor != CommentEnd && *Cursor == ';';
  if (!WellFormed)
    return formTextToken(T, Cursor);

  const StringRef Name(NameBegin, Cursor - NameBegin);
  ++Cursor; // Skip semicolon.

  StringRef Resolved;
  switch (RefKind) {
  case RK_Named:
    Resolved = resolveHTMLNamedCharacterReference(Name);
    break;
  case RK_Decimal:
    Resolved = resolveHTMLDecimalCharacterReference(Name);
    break;
  case RK_Hex:
    Resolved = resolveHTMLHexCharacterReference(Name);
    break;
  }

  // An unresolvable reference is passed through as ordinary text.
  if (Resolved.empty())
    return formTextToken(T, Cursor);

  formTokenWithChars(T, Cursor, tok::text);
  T.setText(Resolved);
}


/// Lexes the opening of an HTML start tag ("<name") at BufferPtr.
///
/// If the identifier after '<' is not a known HTML tag name, the characters
/// are emitted as plain text.  Otherwise a tok::html_start_tag token carrying
/// the tag name is produced, following whitespace is skipped, and the lexer
/// enters LS_HTMLStartTag when the next character can continue the tag ('>',
/// '/' or the start of an attribute name).
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  // Test the bound before looking at the character: at CommentEnd there is
  // nothing left in the comment to inspect, and the lexer stays in its
  // current state.
  if (BufferPtr == CommentEnd)
    return;

  const char C = *BufferPtr;
  if (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))
    State = LS_HTMLStartTag;
}


/// Lexes one token from inside an HTML start tag: an attribute identifier,
/// '=', a quoted string, '>' or "/>".
///
/// '>' and '/' end the tag and return the lexer to LS_Normal.  After any
/// other token, whitespace is skipped and the lexer also returns to LS_Normal
/// unless the next character can start another start-tag token.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      // Equals CommentEnd when the string is unterminated.
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      // The stored string excludes both quote characters.
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        // A '/' not followed by '>' is emitted as plain text.
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
    State = LS_Normal;
    return;
  }
}


/// Lexes an HTML end tag opening ("</name") at BufferPtr.
///
/// Whitespace is permitted between "</" and the name and after the name.  If
/// the name is not a known HTML tag, the characters are emitted as plain
/// text.  Otherwise a tok::html_end_tag token carrying the name is produced
/// and, if a '>' follows, the lexer enters LS_HTMLEndTag to consume it.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  const StringRef Name(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(Name))
    return formTextToken(T, NameEnd);

  // The token also swallows whitespace that trails the tag name.
  formTokenWithChars(T, skipWhitespace(NameEnd, CommentEnd),
                     tok::html_end_tag);
  T.setHTMLTagEndName(Name);

  if (BufferPtr == CommentEnd)
    return;
  if (*BufferPtr == '>')
    State = LS_HTMLEndTag;
}


/// Consumes the '>' that closes an HTML end tag, emitting it as a
/// tok::html_greater token, and returns the lexer to LS_Normal.  BufferPtr
/// must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}


/// Constructs a comment lexer over the buffer [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart in the LCS_BeforeComment / LS_Normal
/// state.  \p ParseCommands selects whether commands, HTML tags and character
/// references are recognized or only text and newlines are produced (see
/// lexCommentText()).  The remaining arguments are stored for later use.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}


/// Lexes the next token into \p T.
///
/// Drives the outer state machine over a buffer holding one or more
/// consecutive comments: it recognizes the comment opener ("//" or "/*"
/// plus optional Doxygen markers), delegates comment bodies to
/// lexCommentText(), synthesizes a newline after each C comment, turns the
/// gap between comments into a newline token, and produces tok::eof when the
/// buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept across the start of a new BCPL
      // comment; every other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' immediately followed by '/' is the comment terminator, not a
      // marker, so it is left in place.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}


/// Returns the source text covered by \p Tok, read directly from the buffer
/// that \p SourceMgr holds for the token's file.
///
/// \returns an empty StringRef if the buffer data is reported as invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  const std::pair<FileID, unsigned> Decomposed =
      SourceMgr.getDecomposedLoc(Tok.getLocation());

  bool Invalid = false;
  const StringRef Buffer =
      SourceMgr.getBufferData(Decomposed.first, &Invalid);
  if (Invalid)
    return StringRef();

  // Decomposed.second is the token's offset from the start of the buffer.
  return StringRef(Buffer.data() + Decomposed.second, Tok.getLength());
}


} // end namespace comments

} // end namespace clang

SM
#define SM(sm)
Definition: Cuda.cpp:84

CharInfo.h

Kind
enum clang::sema::@1724::IndirectLocalPathEntry::EntryKind Kind

CommentCommandTraits.h

CommentLexer.h

isNamed
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition: Decl.cpp:3270

skipNewline
static unsigned skipNewline(const char *&First, const char *End)
Definition: DependencyDirectivesScanner.cpp:316

DiagnosticComment.h

Text
StringRef Text
Definition: Format.cpp:3052

Loc
SourceLocation Loc
Definition: SemaObjC.cpp:759

skipWhitespace
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
Definition: TextDiagnostic.cpp:512

Begin
SourceLocation Begin
Definition: USRLocFinder.cpp:165

clang::DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231

clang::FixItHint::CreateReplacement
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:138

clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:88

clang::SourceLocation::print
void print(raw_ostream &OS, const SourceManager &SM) const
Definition: SourceLocation.cpp:62

clang::SourceManager
This class handles loading and caching of source files into memory.
Definition: SourceManager.h:663

clang::SourceManager::getBufferData
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
Definition: SourceManager.cpp:767

clang::SourceManager::getDecomposedLoc
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
Definition: SourceManager.h:1272

clang::comments::CommandTraits
This class provides information about commands that can be used in comments.
Definition: CommentCommandTraits.h:149

clang::comments::CommandTraits::getCommandInfo
const CommandInfo * getCommandInfo(StringRef Name) const
Definition: CommentCommandTraits.h:167

clang::comments::CommandTraits::getCommandInfoOrNULL
const CommandInfo * getCommandInfoOrNULL(StringRef Name) const
Definition: CommentCommandTraits.cpp:33

clang::comments::CommandTraits::getTypoCorrectCommandInfo
const CommandInfo * getTypoCorrectCommandInfo(StringRef Typo) const
Definition: CommentCommandTraits.cpp:46

clang::comments::Lexer
Comment lexer.
Definition: CommentLexer.h:220

clang::comments::Lexer::getSpelling
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Definition: CommentLexer.cpp:847

clang::comments::Lexer::lex
void lex(Token &T)
Definition: CommentLexer.cpp:745

clang::comments::Token
Comment token.
Definition: CommentLexer.h:55

clang::comments::Token::dump
void dump(const Lexer &L, const SourceManager &SM) const
Definition: CommentLexer.cpp:21

clang::comments::Token::getLength
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95

clang::comments::Token::getLocation
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80

clang::comments::tok::TokenKind
TokenKind
Definition: CommentLexer.h:32

clang::comments::tok::verbatim_block_line
@ verbatim_block_line
Definition: CommentLexer.h:40

clang::comments::tok::at_command
@ at_command
Definition: CommentLexer.h:38

clang::comments::tok::html_slash_greater
@ html_slash_greater
Definition: CommentLexer.h:49

clang::comments::tok::html_quoted_string
@ html_quoted_string
Definition: CommentLexer.h:47

clang::comments::tok::verbatim_line_text
@ verbatim_line_text
Definition: CommentLexer.h:43

clang::comments::tok::verbatim_block_end
@ verbatim_block_end
Definition: CommentLexer.h:41

clang::comments::tok::text
@ text
Definition: CommentLexer.h:35

clang::comments::tok::verbatim_block_begin
@ verbatim_block_begin
Definition: CommentLexer.h:39

clang::comments::tok::html_end_tag
@ html_end_tag
Definition: CommentLexer.h:50

clang::comments::tok::unknown_command
@ unknown_command
Definition: CommentLexer.h:36

clang::comments::tok::backslash_command
@ backslash_command
Definition: CommentLexer.h:37

clang::comments::tok::html_ident
@ html_ident
Definition: CommentLexer.h:45

clang::comments::tok::html_equals
@ html_equals
Definition: CommentLexer.h:46

clang::comments::tok::eof
@ eof
Definition: CommentLexer.h:33

clang::comments::tok::html_greater
@ html_greater
Definition: CommentLexer.h:48

clang::comments::tok::newline
@ newline
Definition: CommentLexer.h:34

clang::comments::tok::html_start_tag
@ html_start_tag
Definition: CommentLexer.h:44

clang::comments::tok::verbatim_line_name
@ verbatim_line_name
Definition: CommentLexer.h:42

clang::comments::isHTMLHexCharacterReferenceCharacter
static bool isHTMLHexCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:35

clang::comments::convertCodePointToUTF8
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
Definition: CommentLexer.cpp:39

clang::comments::isHTMLNamedCharacterReferenceCharacter
static bool isHTMLNamedCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:27

clang::comments::isHTMLDecimalCharacterReferenceCharacter
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:31

clang
The JSON file list parser is used to communicate input to InstallAPI.
Definition: CalledOnceCheck.h:17

clang::isVerticalWhitespace
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99

clang::LinkageSpecLanguageIDs::C
@ C

clang::isLetter
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition: CharInfo.h:132

clang::isAlphanumeric
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:138

clang::isHorizontalWhitespace
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91

clang::ObjCSubstitutionContext::Result
@ Result
The result type of a method or function.

clang::isDigit
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition: CharInfo.h:114

clang::isWhitespace
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108

clang::isHexDigit
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:144

clang::T
const FunctionProtoType * T
Definition: RecursiveASTVisitor.h:1364

clang::SourceLocIdentKind::File
@ File

clang::SourceLocIdentKind::Line
@ Line

clang::comments::CommandInfo::Name
const char * Name
Definition: CommentCommandTraits.h:37

clang::comments::CommandInfo::getID
unsigned getID() const
Definition: CommentCommandTraits.h:33