14#include "llvm/ADT/StringExtras.h"
15#include "llvm/ADT/StringSwitch.h"
16#include "llvm/Support/ConvertUTF.h"
17#include "llvm/Support/ErrorHandling.h"
23 llvm::errs() <<
"comments::Token Kind=" << Kind <<
" ";
24 Loc.print(llvm::errs(),
SM);
25 llvm::errs() <<
" " << Length <<
" \"" << L.
getSpelling(*
this,
SM) <<
"\"\n";
41 llvm::BumpPtrAllocator &Allocator,
43 char *Resolved = Allocator.Allocate<
char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44 char *ResolvedPtr = Resolved;
45 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46 return StringRef(Resolved, ResolvedPtr - Resolved);
53#include "clang/AST/CommentHTMLTags.inc"
54#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
58StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name)
const {
60 return llvm::StringSwitch<StringRef>(Name)
67 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
70StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name)
const {
71 unsigned CodePoint = 0;
72 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
75 CodePoint += Name[i] -
'0';
80StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name)
const {
81 unsigned CodePoint = 0;
82 for (
unsigned i = 0, e = Name.size(); i != e; ++i) {
84 const char C = Name[i];
86 CodePoint += llvm::hexDigitValue(
C);
91void Lexer::skipLineStartingDecorations() {
93 assert(CommentState == LCS_InsideCComment);
95 if (BufferPtr == CommentEnd)
98 const char *NewBufferPtr = BufferPtr;
100 if (++NewBufferPtr == CommentEnd)
102 if (*NewBufferPtr ==
'*')
103 BufferPtr = NewBufferPtr + 1;
108const char *findNewline(
const char *BufferPtr,
const char *BufferEnd) {
109 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
116const char *
skipNewline(
const char *BufferPtr,
const char *BufferEnd) {
117 if (BufferPtr == BufferEnd)
120 if (*BufferPtr ==
'\n')
123 assert(*BufferPtr ==
'\r');
125 if (BufferPtr != BufferEnd && *BufferPtr ==
'\n')
131const char *skipNamedCharacterReference(
const char *BufferPtr,
132 const char *BufferEnd) {
133 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
140const char *skipDecimalCharacterReference(
const char *BufferPtr,
141 const char *BufferEnd) {
142 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
149const char *skipHexCharacterReference(
const char *BufferPtr,
150 const char *BufferEnd) {
151 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
158bool isHTMLIdentifierStartingCharacter(
char C) {
162bool isHTMLIdentifierCharacter(
char C) {
166const char *skipHTMLIdentifier(
const char *BufferPtr,
const char *BufferEnd) {
167 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168 if (!isHTMLIdentifierCharacter(*BufferPtr))
178const char *skipHTMLQuotedString(
const char *BufferPtr,
const char *BufferEnd)
180 const char Quote = *BufferPtr;
181 assert(Quote ==
'\"' || Quote ==
'\'');
184 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
185 const char C = *BufferPtr;
186 if (
C == Quote && BufferPtr[-1] !=
'\\')
192const char *
skipWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
193 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
200const char *skipHorizontalWhitespace(
const char *BufferPtr,
201 const char *BufferEnd) {
202 for (; BufferPtr != BufferEnd; ++BufferPtr) {
209bool isWhitespace(
const char *BufferPtr,
const char *BufferEnd) {
213bool isCommandNameStartCharacter(
char C) {
217bool isCommandNameCharacter(
char C) {
221const char *skipCommandName(
const char *BufferPtr,
const char *BufferEnd) {
222 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
223 if (!isCommandNameCharacter(*BufferPtr))
231const char *findBCPLCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
232 const char *CurPtr = BufferPtr;
233 while (CurPtr != BufferEnd) {
236 if (CurPtr == BufferEnd)
240 const char *EscapePtr = CurPtr - 1;
244 if (*EscapePtr ==
'\\' ||
245 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] ==
'/' &&
246 EscapePtr[-1] ==
'?' && EscapePtr[-2] ==
'?')) {
257const char *findCCommentEnd(
const char *BufferPtr,
const char *BufferEnd) {
258 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
259 if (*BufferPtr ==
'*') {
260 assert(BufferPtr + 1 != BufferEnd);
261 if (*(BufferPtr + 1) ==
'/')
265 llvm_unreachable(
"buffer end hit before '*/' was seen");
270void Lexer::formTokenWithChars(
Token &
Result,
const char *TokEnd,
272 const unsigned TokLen = TokEnd - BufferPtr;
273 Result.setLocation(getSourceLocation(BufferPtr));
277 Result.TextPtr =
"<UNSET>";
283const char *Lexer::skipTextToken() {
284 const char *TokenPtr = BufferPtr;
285 assert(TokenPtr < CommentEnd);
286 StringRef TokStartSymbols = ParseCommands ?
"\n\r\\@\"&<" :
"\n\r";
290 StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
291 if (End == StringRef::npos)
296 if (*(TokenPtr + End) ==
'\"') {
298 End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(
"\n\r\"");
299 if (End != StringRef::npos && *(TokenPtr + End) ==
'\"')
303 return TokenPtr + End;
306void Lexer::lexCommentText(
Token &T) {
307 assert(CommentState == LCS_InsideBCPLComment ||
308 CommentState == LCS_InsideCComment);
311 auto HandleNonCommandToken = [&]() ->
void {
312 assert(State == LS_Normal);
314 const char *TokenPtr = BufferPtr;
315 assert(TokenPtr < CommentEnd);
322 if (CommentState == LCS_InsideCComment)
323 skipLineStartingDecorations();
327 return formTextToken(T, skipTextToken());
332 return HandleNonCommandToken();
337 case LS_VerbatimBlockFirstLine:
338 lexVerbatimBlockFirstLine(T);
340 case LS_VerbatimBlockBody:
341 lexVerbatimBlockBody(T);
343 case LS_VerbatimLineText:
344 lexVerbatimLineText(T);
346 case LS_HTMLStartTag:
354 assert(State == LS_Normal);
355 const char *TokenPtr = BufferPtr;
356 assert(TokenPtr < CommentEnd);
366 if (TokenPtr == CommentEnd) {
367 formTextToken(T, TokenPtr);
375 case '\\':
case '@':
case '&':
case '$':
376 case '#':
case '<':
case '>':
case '%':
377 case '\"':
case '.':
case ':':
380 if (
C ==
':' && TokenPtr != CommentEnd && *TokenPtr ==
':') {
384 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
385 formTokenWithChars(T, TokenPtr,
tok::text);
386 T.setText(UnescapedText);
391 if (!isCommandNameStartCharacter(*TokenPtr)) {
392 formTextToken(T, TokenPtr);
396 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
397 unsigned Length = TokenPtr - (BufferPtr + 1);
401 if (Length == 1 && TokenPtr[-1] ==
'f' && TokenPtr != CommentEnd) {
403 if (
C ==
'$' ||
C ==
'(' ||
C ==
')' ||
C ==
'[' ||
C ==
']' ||
404 C ==
'{' ||
C ==
'}') {
410 StringRef CommandName(BufferPtr + 1, Length);
412 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
414 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
415 StringRef CorrectedName = Info->Name;
416 SourceLocation Loc = getSourceLocation(BufferPtr);
417 SourceLocation EndLoc = getSourceLocation(TokenPtr);
418 SourceRange FullRange = SourceRange(Loc, EndLoc);
419 SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
420 Diag(Loc, diag::warn_correct_comment_command_name)
421 << FullRange << CommandName << CorrectedName
424 formTokenWithChars(T, TokenPtr,
428 T.setUnknownCommandName(CommandName);
429 Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
430 << SourceRange(T.getLocation(), T.getEndLocation());
434 if (Info->IsVerbatimBlockCommand) {
435 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
438 if (Info->IsVerbatimLineCommand) {
439 setupAndLexVerbatimLine(T, TokenPtr, Info);
442 formTokenWithChars(T, TokenPtr, CommandKind);
443 T.setCommandID(Info->getID());
448 lexHTMLCharacterReference(T);
453 if (TokenPtr == CommentEnd) {
454 formTextToken(T, TokenPtr);
457 const char C = *TokenPtr;
458 if (isHTMLIdentifierStartingCharacter(
C))
459 setupAndLexHTMLStartTag(T);
461 setupAndLexHTMLEndTag(T);
463 formTextToken(T, TokenPtr);
468 return HandleNonCommandToken();
472void Lexer::setupAndLexVerbatimBlock(
Token &T,
473 const char *TextBegin,
475 assert(Info->IsVerbatimBlockCommand);
477 VerbatimBlockEndCommandName.clear();
478 VerbatimBlockEndCommandName.append(Marker ==
'\\' ?
"\\" :
"@");
479 VerbatimBlockEndCommandName.append(Info->EndCommandName);
482 T.setVerbatimBlockID(Info->getID());
487 if (BufferPtr != CommentEnd &&
490 State = LS_VerbatimBlockBody;
494 State = LS_VerbatimBlockFirstLine;
497void Lexer::lexVerbatimBlockFirstLine(
Token &T) {
499 assert(BufferPtr < CommentEnd);
505 const char *Newline = findNewline(BufferPtr, CommentEnd);
506 StringRef
Line(BufferPtr, Newline - BufferPtr);
509 size_t Pos =
Line.find(VerbatimBlockEndCommandName);
511 const char *NextLine;
512 if (Pos == StringRef::npos) {
516 }
else if (Pos == 0) {
518 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
519 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
521 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
526 TextEnd = BufferPtr + Pos;
535 StringRef
Text(BufferPtr, TextEnd - BufferPtr);
537 T.setVerbatimBlockText(
Text);
539 State = LS_VerbatimBlockBody;
542void Lexer::lexVerbatimBlockBody(
Token &T) {
543 assert(State == LS_VerbatimBlockBody);
545 if (CommentState == LCS_InsideCComment)
546 skipLineStartingDecorations();
548 if (BufferPtr == CommentEnd) {
550 T.setVerbatimBlockText(
"");
554 lexVerbatimBlockFirstLine(T);
557void Lexer::setupAndLexVerbatimLine(
Token &T,
const char *TextBegin,
559 assert(Info->IsVerbatimLineCommand);
561 T.setVerbatimLineID(Info->getID());
563 State = LS_VerbatimLineText;
566void Lexer::lexVerbatimLineText(
Token &T) {
567 assert(State == LS_VerbatimLineText);
570 const char *Newline = findNewline(BufferPtr, CommentEnd);
571 StringRef
Text(BufferPtr, Newline - BufferPtr);
573 T.setVerbatimLineText(
Text);
578void Lexer::lexHTMLCharacterReference(
Token &T) {
579 const char *TokenPtr = BufferPtr;
580 assert(*TokenPtr ==
'&');
582 if (TokenPtr == CommentEnd) {
583 formTextToken(T, TokenPtr);
588 bool isDecimal =
false;
592 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
594 }
else if (
C ==
'#') {
596 if (TokenPtr == CommentEnd) {
597 formTextToken(T, TokenPtr);
603 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
605 }
else if (
C ==
'x' ||
C ==
'X') {
608 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
610 formTextToken(T, TokenPtr);
614 formTextToken(T, TokenPtr);
617 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
619 formTextToken(T, TokenPtr);
622 StringRef Name(NamePtr, TokenPtr - NamePtr);
626 Resolved = resolveHTMLNamedCharacterReference(Name);
628 Resolved = resolveHTMLDecimalCharacterReference(Name);
630 Resolved = resolveHTMLHexCharacterReference(Name);
632 if (Resolved.empty()) {
633 formTextToken(T, TokenPtr);
636 formTokenWithChars(T, TokenPtr,
tok::text);
640void Lexer::setupAndLexHTMLStartTag(
Token &T) {
641 assert(BufferPtr[0] ==
'<' &&
642 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
643 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
644 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
645 if (!isHTMLTagName(Name)) {
646 formTextToken(T, TagNameEnd);
651 T.setHTMLTagStartName(Name);
653 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
654 if (BufferPtr == CommentEnd) {
655 State = LS_HTMLStartTag;
659 const char C = *BufferPtr;
660 if (BufferPtr != CommentEnd &&
662 isHTMLIdentifierStartingCharacter(
C)))
663 State = LS_HTMLStartTag;
666void Lexer::lexHTMLStartTag(
Token &T) {
667 assert(State == LS_HTMLStartTag);
673 if (CommentState == LCS_InsideCComment)
674 skipLineStartingDecorations();
676 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
677 if (BufferPtr == CommentEnd) {
688 const char *TokenPtr = BufferPtr;
690 if (isHTMLIdentifierCharacter(
C)) {
691 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
692 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
694 T.setHTMLIdent(Ident);
703 const char *OpenQuote = TokenPtr;
704 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
705 const char *ClosingQuote = TokenPtr;
706 if (TokenPtr != CommentEnd)
709 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
710 ClosingQuote - (OpenQuote + 1)));
720 if (TokenPtr != CommentEnd && *TokenPtr ==
'>') {
724 formTextToken(T, TokenPtr);
733 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
734 if (BufferPtr == CommentEnd) {
740 C !=
'=' &&
C !=
'\"' &&
C !=
'\'' &&
C !=
'>' &&
C !=
'/') {
746void Lexer::setupAndLexHTMLEndTag(
Token &T) {
747 assert(BufferPtr[0] ==
'<' && BufferPtr[1] ==
'/');
749 const char *TagNameBegin =
skipWhitespace(BufferPtr + 2, CommentEnd);
750 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
751 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
752 if (!isHTMLTagName(Name)) {
753 formTextToken(T, TagNameEnd);
760 T.setHTMLTagEndName(Name);
762 if (BufferPtr != CommentEnd && *BufferPtr ==
'>')
763 State = LS_HTMLEndTag;
766void Lexer::lexHTMLEndTag(
Token &T) {
767 assert(BufferPtr != CommentEnd && *BufferPtr ==
'>');
775 const char *BufferStart,
const char *BufferEnd,
bool ParseCommands)
776 : Allocator(Allocator), Diags(Diags), Traits(Traits),
777 BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
778 FileLoc(FileLoc), ParseCommands(ParseCommands),
779 CommentState(LCS_BeforeComment), State(LS_Normal) {}
783 switch (CommentState) {
784 case LCS_BeforeComment:
785 if (BufferPtr == BufferEnd) {
786 formTokenWithChars(T, BufferPtr,
tok::eof);
790 assert(*BufferPtr ==
'/');
796 if (BufferPtr != BufferEnd) {
801 const char C = *BufferPtr;
802 if (
C ==
'/' ||
C ==
'!')
809 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
812 CommentState = LCS_InsideBCPLComment;
814 case LS_VerbatimBlockFirstLine:
815 case LS_VerbatimBlockBody:
817 case LS_HTMLStartTag:
818 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
824 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
831 const char C = *BufferPtr;
832 if ((
C ==
'*' && *(BufferPtr + 1) !=
'/') ||
C ==
'!')
836 if (BufferPtr != BufferEnd && *BufferPtr ==
'<')
839 CommentState = LCS_InsideCComment;
841 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
845 llvm_unreachable(
"second character of comment should be '/' or '*'");
848 case LCS_BetweenComments: {
851 const char *EndWhitespace = BufferPtr;
852 while(EndWhitespace != BufferEnd && *EndWhitespace !=
'/')
857 if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
858 CommentState = LCS_BeforeComment;
859 BufferPtr = EndWhitespace;
869 CommentState = LCS_BeforeComment;
873 case LCS_InsideBCPLComment:
874 case LCS_InsideCComment:
875 if (BufferPtr != CommentEnd) {
880 if (CommentState == LCS_InsideCComment) {
881 assert(BufferPtr[0] ==
'*' && BufferPtr[1] ==
'/');
883 assert(BufferPtr <= BufferEnd);
888 if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
889 CommentState = LCS_BetweenComments;
897 CommentState = LCS_BetweenComments;
901 CommentState = LCS_BetweenComments;
913 bool InvalidTemp =
false;
914 StringRef
File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
918 const char *Begin =
File.data() + LocInfo.second;
919 return StringRef(Begin,
Tok.getLength());
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
static unsigned skipNewline(const char *&First, const char *End)
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
Concrete class used by the front-end to report problems and issues.
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Encodes a location in the source.
This class handles loading and caching of source files into memory.
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
std::pair< FileID, unsigned > FileIDAndOffset
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
@ Result
The result type of a method or function.
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].