clang 23.0.0git
CommentLexer.cpp
Go to the documentation of this file.
1//===--- CommentLexer.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "clang/AST/Comment.h"
14#include "llvm/ADT/StringExtras.h"
15#include "llvm/ADT/StringSwitch.h"
16#include "llvm/Support/ConvertUTF.h"
17#include "llvm/Support/ErrorHandling.h"
18
19namespace clang {
20namespace comments {
21
22void Token::dump(const Lexer &L, const SourceManager &SM) const {
23 llvm::errs() << "comments::Token Kind=" << Kind << " ";
24 Loc.print(llvm::errs(), SM);
25 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
26}
27
28static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
29 return isLetter(C);
30}
31
33 return isDigit(C);
34}
35
36static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
37 return isHexDigit(C);
38}
39
40static inline StringRef convertCodePointToUTF8(
41 llvm::BumpPtrAllocator &Allocator,
42 unsigned CodePoint) {
43 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44 char *ResolvedPtr = Resolved;
45 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46 return StringRef(Resolved, ResolvedPtr - Resolved);
47 else
48 return StringRef();
49}
50
51namespace {
52
53#include "clang/AST/CommentHTMLTags.inc"
54#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55
56} // end anonymous namespace
57
58StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
59 // Fast path, first check a few most widely used named character references.
60 return llvm::StringSwitch<StringRef>(Name)
61 .Case("amp", "&")
62 .Case("lt", "<")
63 .Case("gt", ">")
64 .Case("quot", "\"")
65 .Case("apos", "\'")
66 // Slow path.
67 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
68}
69
70StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71 unsigned CodePoint = 0;
72 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74 CodePoint *= 10;
75 CodePoint += Name[i] - '0';
76 }
77 return convertCodePointToUTF8(Allocator, CodePoint);
78}
79
80StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
81 unsigned CodePoint = 0;
82 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
83 CodePoint *= 16;
84 const char C = Name[i];
86 CodePoint += llvm::hexDigitValue(C);
87 }
88 return convertCodePointToUTF8(Allocator, CodePoint);
89}
90
91void Lexer::skipLineStartingDecorations() {
92 // This function should be called only for C comments
93 assert(CommentState == LCS_InsideCComment);
94
95 if (BufferPtr == CommentEnd)
96 return;
97
98 const char *NewBufferPtr = BufferPtr;
99 while (isHorizontalWhitespace(*NewBufferPtr))
100 if (++NewBufferPtr == CommentEnd)
101 return;
102 if (*NewBufferPtr == '*')
103 BufferPtr = NewBufferPtr + 1;
104}
105
106namespace {
107/// Returns pointer to the first newline character in the string.
108const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
109 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
110 if (isVerticalWhitespace(*BufferPtr))
111 return BufferPtr;
112 }
113 return BufferEnd;
114}
115
/// Skips one line terminator ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
/// \returns pointer to the character following the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferEnd;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  const bool FollowedByLF = BufferPtr != BufferEnd && *BufferPtr == '\n';
  return FollowedByLF ? BufferPtr + 1 : BufferPtr;
}
130
131const char *skipNamedCharacterReference(const char *BufferPtr,
132 const char *BufferEnd) {
133 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
135 return BufferPtr;
136 }
137 return BufferEnd;
138}
139
140const char *skipDecimalCharacterReference(const char *BufferPtr,
141 const char *BufferEnd) {
142 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144 return BufferPtr;
145 }
146 return BufferEnd;
147}
148
149const char *skipHexCharacterReference(const char *BufferPtr,
150 const char *BufferEnd) {
151 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153 return BufferPtr;
154 }
155 return BufferEnd;
156}
157
158bool isHTMLIdentifierStartingCharacter(char C) {
159 return isLetter(C);
160}
161
162bool isHTMLIdentifierCharacter(char C) {
163 return isAlphanumeric(C);
164}
165
166const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
167 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
168 if (!isHTMLIdentifierCharacter(*BufferPtr))
169 return BufferPtr;
170 }
171 return BufferEnd;
172}
173
/// Skip an HTML string quoted in single or double quotes. A quote character
/// immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool PrecededByBackslash = Cursor[-1] == '\\';
    if (*Cursor == Quote && !PrecededByBackslash)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
191
192const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
193 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
194 if (!isWhitespace(*BufferPtr))
195 return BufferPtr;
196 }
197 return BufferEnd;
198}
199
200const char *skipHorizontalWhitespace(const char *BufferPtr,
201 const char *BufferEnd) {
202 for (; BufferPtr != BufferEnd; ++BufferPtr) {
203 if (!isHorizontalWhitespace(*BufferPtr))
204 return BufferPtr;
205 }
206 return BufferEnd;
207}
208
209bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
210 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
211}
212
213bool isCommandNameStartCharacter(char C) {
214 return isLetter(C);
215}
216
217bool isCommandNameCharacter(char C) {
218 return isAsciiIdentifierContinue(C, false);
219}
220
221const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
222 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
223 if (!isCommandNameCharacter(*BufferPtr))
224 return BufferPtr;
225 }
226 return BufferEnd;
227}
228
229/// Return the one past end pointer for BCPL comments.
230/// Handles newlines escaped with backslash or trigraph for backslahs.
231const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
232 const char *CurPtr = BufferPtr;
233 while (CurPtr != BufferEnd) {
234 while (!isVerticalWhitespace(*CurPtr)) {
235 CurPtr++;
236 if (CurPtr == BufferEnd)
237 return BufferEnd;
238 }
239 // We found a newline, check if it is escaped.
240 const char *EscapePtr = CurPtr - 1;
241 while(isHorizontalWhitespace(*EscapePtr))
242 EscapePtr--;
243
244 if (*EscapePtr == '\\' ||
245 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
246 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
247 // We found an escaped newline.
248 CurPtr = skipNewline(CurPtr, BufferEnd);
249 } else
250 return CurPtr; // Not an escaped newline.
251 }
252 return BufferEnd;
253}
254
255/// Return the one past end pointer for C comments.
256/// Very dumb, does not handle escaped newlines or trigraphs.
257const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
258 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
259 if (*BufferPtr == '*') {
260 assert(BufferPtr + 1 != BufferEnd);
261 if (*(BufferPtr + 1) == '/')
262 return BufferPtr;
263 }
264 }
265 llvm_unreachable("buffer end hit before '*/' was seen");
266}
267
268} // end anonymous namespace
269
270void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
271 tok::TokenKind Kind) {
272 const unsigned TokLen = TokEnd - BufferPtr;
273 Result.setLocation(getSourceLocation(BufferPtr));
274 Result.setKind(Kind);
275 Result.setLength(TokLen);
276#ifndef NDEBUG
277 Result.TextPtr = "<UNSET>";
278 Result.IntVal = 7;
279#endif
280 BufferPtr = TokEnd;
281}
282
283const char *Lexer::skipTextToken() {
284 const char *TokenPtr = BufferPtr;
285 assert(TokenPtr < CommentEnd);
286 StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
287
288again:
289 size_t End =
290 StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
291 if (End == StringRef::npos)
292 return CommentEnd;
293
294 // Doxygen doesn't recognize any commands in a one-line double quotation.
295 // If we don't find an ending quotation mark, we pretend it never began.
296 if (*(TokenPtr + End) == '\"') {
297 TokenPtr += End + 1;
298 End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
299 if (End != StringRef::npos && *(TokenPtr + End) == '\"')
300 TokenPtr += End + 1;
301 goto again;
302 }
303 return TokenPtr + End;
304}
305
/// Lexes one token from inside a comment, starting at BufferPtr.
///
/// When commands are not being parsed, only text and newline tokens are
/// produced. Otherwise the current lexer State selects a specialised lexer
/// (verbatim block/line, HTML start/end tag); in the normal state the first
/// character decides between a command ('\\' or '@'), an HTML character
/// reference ('&'), an HTML tag ('<') and plain text.
306void Lexer::lexCommentText(Token &T) {
307 assert(CommentState == LCS_InsideBCPLComment ||
308 CommentState == LCS_InsideCComment);
309
310 // Handles lexing non-command text, i.e. text and newline.
311 auto HandleNonCommandToken = [&]() -> void {
312 assert(State == LS_Normal);
313
314 const char *TokenPtr = BufferPtr;
315 assert(TokenPtr < CommentEnd);
316 switch (*TokenPtr) {
317 case '\n':
318 case '\r':
319 TokenPtr = skipNewline(TokenPtr, CommentEnd);
320 formTokenWithChars(T, TokenPtr, tok::newline);
321
// Inside a C comment, the next line may start with a '*' decoration.
322 if (CommentState == LCS_InsideCComment)
323 skipLineStartingDecorations();
324 return;
325
326 default:
327 return formTextToken(T, skipTextToken());
328 }
329 };
330
331 if (!ParseCommands)
332 return HandleNonCommandToken();
333
// Any state other than LS_Normal is handled by its dedicated lexer.
334 switch (State) {
335 case LS_Normal:
336 break;
337 case LS_VerbatimBlockFirstLine:
338 lexVerbatimBlockFirstLine(T);
339 return;
340 case LS_VerbatimBlockBody:
341 lexVerbatimBlockBody(T);
342 return;
343 case LS_VerbatimLineText:
344 lexVerbatimLineText(T);
345 return;
346 case LS_HTMLStartTag:
347 lexHTMLStartTag(T);
348 return;
349 case LS_HTMLEndTag:
350 lexHTMLEndTag(T);
351 return;
352 }
353
354 assert(State == LS_Normal);
355 const char *TokenPtr = BufferPtr;
356 assert(TokenPtr < CommentEnd);
357 switch(*TokenPtr) {
358 case '\\':
359 case '@': {
360 // Commands that start with a backslash and commands that start with
361 // 'at' have equivalent semantics. But we keep information about the
362 // exact syntax in AST for comments.
363 tok::TokenKind CommandKind =
364 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
365 TokenPtr++;
// A lone marker at the very end of the comment is plain text.
366 if (TokenPtr == CommentEnd) {
367 formTextToken(T, TokenPtr);
368 return;
369 }
370 char C = *TokenPtr;
371 switch (C) {
372 default:
373 break;
374
375 case '\\': case '@': case '&': case '$':
376 case '#': case '<': case '>': case '%':
377 case '\"': case '.': case ':':
378 // This is one of \\ \@ \& \$ etc escape sequences.
379 TokenPtr++;
380 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
381 // This is the \:: escape sequence.
382 TokenPtr++;
383 }
// The token spans the marker too, but its text is only the escaped part.
384 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
385 formTokenWithChars(T, TokenPtr, tok::text);
386 T.setText(UnescapedText);
387 return;
388 }
389
390 // Don't make zero-length commands.
391 if (!isCommandNameStartCharacter(*TokenPtr)) {
392 formTextToken(T, TokenPtr);
393 return;
394 }
395
396 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
397 unsigned Length = TokenPtr - (BufferPtr + 1);
398
399 // Hardcoded support for lexing LaTeX formula commands
400 // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
401 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
402 C = *TokenPtr;
403 if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
404 C == '{' || C == '}') {
405 TokenPtr++;
406 Length++;
407 }
408 }
409
410 StringRef CommandName(BufferPtr + 1, Length);
411
412 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
413 if (!Info) {
// Unknown name: if a typo correction exists, warn with a fix-it and carry on
// with the corrected command's Info; otherwise emit an unknown-command token.
414 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
415 StringRef CorrectedName = Info->Name;
416 SourceLocation Loc = getSourceLocation(BufferPtr);
417 SourceLocation EndLoc = getSourceLocation(TokenPtr);
418 SourceRange FullRange = SourceRange(Loc, EndLoc);
419 SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
420 Diag(Loc, diag::warn_correct_comment_command_name)
421 << FullRange << CommandName << CorrectedName
422 << FixItHint::CreateReplacement(CommandRange, CorrectedName);
423 } else {
// NOTE(review): the two lines that complete this conditional expression (the
// ?: arms and the closing parenthesis) are missing from this listing —
// restore them from the upstream file before building.
424 formTokenWithChars(T, TokenPtr,
425 CommandKind == tok::backslash_command
428 T.setUnknownCommandName(CommandName);
429 Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
430 << SourceRange(T.getLocation(), T.getEndLocation());
431 return;
432 }
433 }
434 if (Info->IsVerbatimBlockCommand) {
435 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
436 return;
437 }
438 if (Info->IsVerbatimLineCommand) {
439 setupAndLexVerbatimLine(T, TokenPtr, Info);
440 return;
441 }
442 formTokenWithChars(T, TokenPtr, CommandKind);
443 T.setCommandID(Info->getID());
444 return;
445 }
446
447 case '&':
448 lexHTMLCharacterReference(T);
449 return;
450
451 case '<': {
452 TokenPtr++;
453 if (TokenPtr == CommentEnd) {
454 formTextToken(T, TokenPtr);
455 return;
456 }
// "<" + letter starts a tag, "</" starts an end tag, anything else is text.
457 const char C = *TokenPtr;
458 if (isHTMLIdentifierStartingCharacter(C))
459 setupAndLexHTMLStartTag(T);
460 else if (C == '/')
461 setupAndLexHTMLEndTag(T);
462 else
463 formTextToken(T, TokenPtr);
464 return;
465 }
466
467 default:
468 return HandleNonCommandToken();
469 }
470}
471
472void Lexer::setupAndLexVerbatimBlock(Token &T,
473 const char *TextBegin,
474 char Marker, const CommandInfo *Info) {
475 assert(Info->IsVerbatimBlockCommand);
476
477 VerbatimBlockEndCommandName.clear();
478 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
479 VerbatimBlockEndCommandName.append(Info->EndCommandName);
480
481 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
482 T.setVerbatimBlockID(Info->getID());
483
484 // If there is a newline following the verbatim opening command, skip the
485 // newline so that we don't create an tok::verbatim_block_line with empty
486 // text content.
487 if (BufferPtr != CommentEnd &&
488 isVerticalWhitespace(*BufferPtr)) {
489 BufferPtr = skipNewline(BufferPtr, CommentEnd);
490 State = LS_VerbatimBlockBody;
491 return;
492 }
493
494 State = LS_VerbatimBlockFirstLine;
495}
496
497void Lexer::lexVerbatimBlockFirstLine(Token &T) {
498again:
499 assert(BufferPtr < CommentEnd);
500
501 // FIXME: It would be better to scan the text once, finding either the block
502 // end command or newline.
503 //
504 // Extract current line.
505 const char *Newline = findNewline(BufferPtr, CommentEnd);
506 StringRef Line(BufferPtr, Newline - BufferPtr);
507
508 // Look for end command in current line.
509 size_t Pos = Line.find(VerbatimBlockEndCommandName);
510 const char *TextEnd;
511 const char *NextLine;
512 if (Pos == StringRef::npos) {
513 // Current line is completely verbatim.
514 TextEnd = Newline;
515 NextLine = skipNewline(Newline, CommentEnd);
516 } else if (Pos == 0) {
517 // Current line contains just an end command.
518 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
519 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
520 formTokenWithChars(T, End, tok::verbatim_block_end);
521 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
522 State = LS_Normal;
523 return;
524 } else {
525 // There is some text, followed by end command. Extract text first.
526 TextEnd = BufferPtr + Pos;
527 NextLine = TextEnd;
528 // If there is only whitespace before end command, skip whitespace.
529 if (isWhitespace(BufferPtr, TextEnd)) {
530 BufferPtr = TextEnd;
531 goto again;
532 }
533 }
534
535 StringRef Text(BufferPtr, TextEnd - BufferPtr);
536 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
537 T.setVerbatimBlockText(Text);
538
539 State = LS_VerbatimBlockBody;
540}
541
542void Lexer::lexVerbatimBlockBody(Token &T) {
543 assert(State == LS_VerbatimBlockBody);
544
545 if (CommentState == LCS_InsideCComment)
546 skipLineStartingDecorations();
547
548 if (BufferPtr == CommentEnd) {
549 formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
550 T.setVerbatimBlockText("");
551 return;
552 }
553
554 lexVerbatimBlockFirstLine(T);
555}
556
557void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
558 const CommandInfo *Info) {
559 assert(Info->IsVerbatimLineCommand);
560 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
561 T.setVerbatimLineID(Info->getID());
562
563 State = LS_VerbatimLineText;
564}
565
566void Lexer::lexVerbatimLineText(Token &T) {
567 assert(State == LS_VerbatimLineText);
568
569 // Extract current line.
570 const char *Newline = findNewline(BufferPtr, CommentEnd);
571 StringRef Text(BufferPtr, Newline - BufferPtr);
572 formTokenWithChars(T, Newline, tok::verbatim_line_text);
573 T.setVerbatimLineText(Text);
574
575 State = LS_Normal;
576}
577
578void Lexer::lexHTMLCharacterReference(Token &T) {
579 const char *TokenPtr = BufferPtr;
580 assert(*TokenPtr == '&');
581 TokenPtr++;
582 if (TokenPtr == CommentEnd) {
583 formTextToken(T, TokenPtr);
584 return;
585 }
586 const char *NamePtr;
587 bool isNamed = false;
588 bool isDecimal = false;
589 char C = *TokenPtr;
591 NamePtr = TokenPtr;
592 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
593 isNamed = true;
594 } else if (C == '#') {
595 TokenPtr++;
596 if (TokenPtr == CommentEnd) {
597 formTextToken(T, TokenPtr);
598 return;
599 }
600 C = *TokenPtr;
602 NamePtr = TokenPtr;
603 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
604 isDecimal = true;
605 } else if (C == 'x' || C == 'X') {
606 TokenPtr++;
607 NamePtr = TokenPtr;
608 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
609 } else {
610 formTextToken(T, TokenPtr);
611 return;
612 }
613 } else {
614 formTextToken(T, TokenPtr);
615 return;
616 }
617 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
618 *TokenPtr != ';') {
619 formTextToken(T, TokenPtr);
620 return;
621 }
622 StringRef Name(NamePtr, TokenPtr - NamePtr);
623 TokenPtr++; // Skip semicolon.
624 StringRef Resolved;
625 if (isNamed)
626 Resolved = resolveHTMLNamedCharacterReference(Name);
627 else if (isDecimal)
628 Resolved = resolveHTMLDecimalCharacterReference(Name);
629 else
630 Resolved = resolveHTMLHexCharacterReference(Name);
631
632 if (Resolved.empty()) {
633 formTextToken(T, TokenPtr);
634 return;
635 }
636 formTokenWithChars(T, TokenPtr, tok::text);
637 T.setText(Resolved);
638}
639
640void Lexer::setupAndLexHTMLStartTag(Token &T) {
641 assert(BufferPtr[0] == '<' &&
642 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
643 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
644 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
645 if (!isHTMLTagName(Name)) {
646 formTextToken(T, TagNameEnd);
647 return;
648 }
649
650 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
651 T.setHTMLTagStartName(Name);
652
653 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
654 if (BufferPtr == CommentEnd) { // in BCPL comments
655 State = LS_HTMLStartTag;
656 return;
657 }
658
659 const char C = *BufferPtr;
660 if (BufferPtr != CommentEnd &&
661 (C == '>' || C == '/' || isVerticalWhitespace(C) ||
662 isHTMLIdentifierStartingCharacter(C)))
663 State = LS_HTMLStartTag;
664}
665
666void Lexer::lexHTMLStartTag(Token &T) {
667 assert(State == LS_HTMLStartTag);
668
669 // Skip leading whitespace and comment decorations
670 while (isVerticalWhitespace(*BufferPtr)) {
671 BufferPtr = skipNewline(BufferPtr, CommentEnd);
672
673 if (CommentState == LCS_InsideCComment)
674 skipLineStartingDecorations();
675
676 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
677 if (BufferPtr == CommentEnd) {
678 // HTML starting tags must be defined in a single comment block.
679 // It's likely a user-error where they forgot to terminate the comment.
680 State = LS_Normal;
681 // Since at least one newline was skipped and one token needs to be lexed,
682 // return a newline.
683 formTokenWithChars(T, BufferPtr, tok::newline);
684 return;
685 }
686 }
687
688 const char *TokenPtr = BufferPtr;
689 char C = *TokenPtr;
690 if (isHTMLIdentifierCharacter(C)) {
691 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
692 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
693 formTokenWithChars(T, TokenPtr, tok::html_ident);
694 T.setHTMLIdent(Ident);
695 } else {
696 switch (C) {
697 case '=':
698 TokenPtr++;
699 formTokenWithChars(T, TokenPtr, tok::html_equals);
700 break;
701 case '\"':
702 case '\'': {
703 const char *OpenQuote = TokenPtr;
704 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
705 const char *ClosingQuote = TokenPtr;
706 if (TokenPtr != CommentEnd) // Skip closing quote.
707 TokenPtr++;
708 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
709 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
710 ClosingQuote - (OpenQuote + 1)));
711 break;
712 }
713 case '>':
714 TokenPtr++;
715 formTokenWithChars(T, TokenPtr, tok::html_greater);
716 State = LS_Normal;
717 return;
718 case '/':
719 TokenPtr++;
720 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
721 TokenPtr++;
722 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
723 } else
724 formTextToken(T, TokenPtr);
725
726 State = LS_Normal;
727 return;
728 }
729 }
730
731 // Now look ahead and return to normal state if we don't see any HTML tokens
732 // ahead.
733 BufferPtr = skipHorizontalWhitespace(BufferPtr, CommentEnd);
734 if (BufferPtr == CommentEnd) {
735 return;
736 }
737
738 C = *BufferPtr;
739 if (!isHTMLIdentifierStartingCharacter(C) && !isVerticalWhitespace(C) &&
740 C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
741 State = LS_Normal;
742 return;
743 }
744}
745
746void Lexer::setupAndLexHTMLEndTag(Token &T) {
747 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
748
749 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
750 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
751 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
752 if (!isHTMLTagName(Name)) {
753 formTextToken(T, TagNameEnd);
754 return;
755 }
756
757 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
758
759 formTokenWithChars(T, End, tok::html_end_tag);
760 T.setHTMLTagEndName(Name);
761
762 if (BufferPtr != CommentEnd && *BufferPtr == '>')
763 State = LS_HTMLEndTag;
764}
765
766void Lexer::lexHTMLEndTag(Token &T) {
767 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
768
769 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
770 State = LS_Normal;
771}
772
773Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
774 const CommandTraits &Traits, SourceLocation FileLoc,
775 const char *BufferStart, const char *BufferEnd, bool ParseCommands)
776 : Allocator(Allocator), Diags(Diags), Traits(Traits),
777 BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
778 FileLoc(FileLoc), ParseCommands(ParseCommands),
779 CommentState(LCS_BeforeComment), State(LS_Normal) {}
780
782again:
783 switch (CommentState) {
784 case LCS_BeforeComment:
785 if (BufferPtr == BufferEnd) {
786 formTokenWithChars(T, BufferPtr, tok::eof);
787 return;
788 }
789
790 assert(*BufferPtr == '/');
791 BufferPtr++; // Skip first slash.
792 switch(*BufferPtr) {
793 case '/': { // BCPL comment.
794 BufferPtr++; // Skip second slash.
795
796 if (BufferPtr != BufferEnd) {
797 // Skip Doxygen magic marker, if it is present.
798 // It might be missing because of a typo //< or /*<, or because we
799 // merged this non-Doxygen comment into a bunch of Doxygen comments
800 // around it: /** ... */ /* ... */ /** ... */
801 const char C = *BufferPtr;
802 if (C == '/' || C == '!')
803 BufferPtr++;
804 }
805
806 // Skip less-than symbol that marks trailing comments.
807 // Skip it even if the comment is not a Doxygen one, because //< and /*<
808 // are frequent typos.
809 if (BufferPtr != BufferEnd && *BufferPtr == '<')
810 BufferPtr++;
811
812 CommentState = LCS_InsideBCPLComment;
813 switch (State) {
814 case LS_VerbatimBlockFirstLine:
815 case LS_VerbatimBlockBody:
816 break;
817 case LS_HTMLStartTag:
818 BufferPtr = skipHorizontalWhitespace(BufferPtr, BufferEnd);
819 break;
820 default:
821 State = LS_Normal;
822 break;
823 }
824 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
825 goto again;
826 }
827 case '*': { // C comment.
828 BufferPtr++; // Skip star.
829
830 // Skip Doxygen magic marker.
831 const char C = *BufferPtr;
832 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
833 BufferPtr++;
834
835 // Skip less-than symbol that marks trailing comments.
836 if (BufferPtr != BufferEnd && *BufferPtr == '<')
837 BufferPtr++;
838
839 CommentState = LCS_InsideCComment;
840 State = LS_Normal;
841 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
842 goto again;
843 }
844 default:
845 llvm_unreachable("second character of comment should be '/' or '*'");
846 }
847
848 case LCS_BetweenComments: {
849 // Consecutive comments are extracted only if there is only whitespace
850 // between them. So we can search for the start of the next comment.
851 const char *EndWhitespace = BufferPtr;
852 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
853 EndWhitespace++;
854
855 // When lexing the start of an HTML tag (i.e. going through the attributes)
856 // there won't be any newlines generated.
857 if (State == LS_HTMLStartTag && EndWhitespace != BufferEnd) {
858 CommentState = LCS_BeforeComment;
859 BufferPtr = EndWhitespace;
860 goto again;
861 }
862
863 // Turn any whitespace between comments (and there is only whitespace
864 // between them -- guaranteed by comment extraction) into a newline. We
865 // have two newlines between C comments in total (first one was synthesized
866 // after a comment).
867 formTokenWithChars(T, EndWhitespace, tok::newline);
868
869 CommentState = LCS_BeforeComment;
870 break;
871 }
872
873 case LCS_InsideBCPLComment:
874 case LCS_InsideCComment:
875 if (BufferPtr != CommentEnd) {
876 lexCommentText(T);
877 break;
878 } else {
879 // Skip C comment closing sequence.
880 if (CommentState == LCS_InsideCComment) {
881 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
882 BufferPtr += 2;
883 assert(BufferPtr <= BufferEnd);
884
885 // When lexing the start of an HTML tag (i.e. going through the
886 // attributes) there won't be any newlines generated - whitespace still
887 // needs to be skipped.
888 if (State == LS_HTMLStartTag && BufferPtr != BufferEnd) {
889 CommentState = LCS_BetweenComments;
890 goto again;
891 }
892
893 // Synthenize newline just after the C comment, regardless if there is
894 // actually a newline.
895 formTokenWithChars(T, BufferPtr, tok::newline);
896
897 CommentState = LCS_BetweenComments;
898 break;
899 } else {
900 // Don't synthesized a newline after BCPL comment.
901 CommentState = LCS_BetweenComments;
902 goto again;
903 }
904 }
905 }
906}
907
908StringRef Lexer::getSpelling(const Token &Tok,
909 const SourceManager &SourceMgr) const {
910 SourceLocation Loc = Tok.getLocation();
911 FileIDAndOffset LocInfo = SourceMgr.getDecomposedLoc(Loc);
912
913 bool InvalidTemp = false;
914 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
915 if (InvalidTemp)
916 return StringRef();
917
918 const char *Begin = File.data() + LocInfo.second;
919 return StringRef(Begin, Tok.getLength());
920}
921
922} // end namespace comments
923} // end namespace clang
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition Decl.cpp:3283
static unsigned skipNewline(const char *&First, const char *End)
Token Tok
The Token.
#define SM(sm)
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
Concrete class used by the front-end to report problems and issues.
Definition Diagnostic.h:233
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition Diagnostic.h:141
Encodes a location in the source.
This class handles loading and caching of source files into memory.
This class provides information about commands that can be used in comments.
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
void dump(const Lexer &L, const SourceManager &SM) const
static bool isHTMLHexCharacterReferenceCharacter(char C)
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
static bool isHTMLNamedCharacterReferenceCharacter(char C)
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition CharInfo.h:61
std::pair< FileID, unsigned > FileIDAndOffset
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition CharInfo.h:132
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition CharInfo.h:138
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
@ Result
The result type of a method or function.
Definition TypeBase.h:905
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition CharInfo.h:114
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition CharInfo.h:144
Information about a single command.