clang  14.0.0git
CommentLexer.cpp
Go to the documentation of this file.
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
12 #include "clang/Basic/CharInfo.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
17 
18 namespace clang {
19 namespace comments {
20 
21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
22  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23  Loc.print(llvm::errs(), SM);
24  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 }
26 
27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28  return isLetter(C);
29 }
30 
31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32  return isDigit(C);
33 }
34 
35 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36  return isHexDigit(C);
37 }
38 
39 static inline StringRef convertCodePointToUTF8(
40  llvm::BumpPtrAllocator &Allocator,
41  unsigned CodePoint) {
42  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43  char *ResolvedPtr = Resolved;
44  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45  return StringRef(Resolved, ResolvedPtr - Resolved);
46  else
47  return StringRef();
48 }
49 
50 namespace {
51 
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 
55 } // end anonymous namespace
56 
57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58  // Fast path, first check a few most widely used named character references.
59  return llvm::StringSwitch<StringRef>(Name)
60  .Case("amp", "&")
61  .Case("lt", "<")
62  .Case("gt", ">")
63  .Case("quot", "\"")
64  .Case("apos", "\'")
65  // Slow path.
66  .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 }
68 
69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70  unsigned CodePoint = 0;
71  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73  CodePoint *= 10;
74  CodePoint += Name[i] - '0';
75  }
76  return convertCodePointToUTF8(Allocator, CodePoint);
77 }
78 
79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80  unsigned CodePoint = 0;
81  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82  CodePoint *= 16;
83  const char C = Name[i];
85  CodePoint += llvm::hexDigitValue(C);
86  }
87  return convertCodePointToUTF8(Allocator, CodePoint);
88 }
89 
90 void Lexer::skipLineStartingDecorations() {
91  // This function should be called only for C comments
92  assert(CommentState == LCS_InsideCComment);
93 
94  if (BufferPtr == CommentEnd)
95  return;
96 
97  const char *NewBufferPtr = BufferPtr;
98  while (isHorizontalWhitespace(*NewBufferPtr))
99  if (++NewBufferPtr == CommentEnd)
100  return;
101  if (*NewBufferPtr == '*')
102  BufferPtr = NewBufferPtr + 1;
103 }
104 
105 namespace {
106 /// Returns pointer to the first newline character in the string.
107 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109  if (isVerticalWhitespace(*BufferPtr))
110  return BufferPtr;
111  }
112  return BufferEnd;
113 }
114 
/// Steps over one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \returns the position just after the terminator, or \p BufferPtr unchanged
/// when it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  // Anything else must be a carriage return, optionally followed by '\n'.
  assert(First == '\r');
  const bool FollowedByLF = BufferPtr != BufferEnd && *BufferPtr == '\n';
  return FollowedByLF ? BufferPtr + 1 : BufferPtr;
}
129 
130 const char *skipNamedCharacterReference(const char *BufferPtr,
131  const char *BufferEnd) {
132  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
134  return BufferPtr;
135  }
136  return BufferEnd;
137 }
138 
139 const char *skipDecimalCharacterReference(const char *BufferPtr,
140  const char *BufferEnd) {
141  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
143  return BufferPtr;
144  }
145  return BufferEnd;
146 }
147 
148 const char *skipHexCharacterReference(const char *BufferPtr,
149  const char *BufferEnd) {
150  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151  if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152  return BufferPtr;
153  }
154  return BufferEnd;
155 }
156 
157 bool isHTMLIdentifierStartingCharacter(char C) {
158  return isLetter(C);
159 }
160 
161 bool isHTMLIdentifierCharacter(char C) {
162  return isAlphanumeric(C);
163 }
164 
165 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167  if (!isHTMLIdentifierCharacter(*BufferPtr))
168  return BufferPtr;
169  }
170  return BufferEnd;
171 }
172 
/// Skips an HTML string delimited by single or double quotes.  \p BufferPtr
/// must point at the opening quote.  A quote character immediately preceded
/// by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char OpeningQuote = *BufferPtr;
  assert(OpeningQuote == '\"' || OpeningQuote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool PrecededByBackslash = Cur[-1] == '\\';
    if (*Cur == OpeningQuote && !PrecededByBackslash)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
190 
191 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193  if (!isWhitespace(*BufferPtr))
194  return BufferPtr;
195  }
196  return BufferEnd;
197 }
198 
199 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201 }
202 
203 bool isCommandNameStartCharacter(char C) {
204  return isLetter(C);
205 }
206 
207 bool isCommandNameCharacter(char C) {
208  return isAlphanumeric(C);
209 }
210 
211 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213  if (!isCommandNameCharacter(*BufferPtr))
214  return BufferPtr;
215  }
216  return BufferEnd;
217 }
218 
219 /// Return the one past end pointer for BCPL comments.
220 /// Handles newlines escaped with backslash or trigraph for backslahs.
221 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222  const char *CurPtr = BufferPtr;
223  while (CurPtr != BufferEnd) {
224  while (!isVerticalWhitespace(*CurPtr)) {
225  CurPtr++;
226  if (CurPtr == BufferEnd)
227  return BufferEnd;
228  }
229  // We found a newline, check if it is escaped.
230  const char *EscapePtr = CurPtr - 1;
231  while(isHorizontalWhitespace(*EscapePtr))
232  EscapePtr--;
233 
234  if (*EscapePtr == '\\' ||
235  (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236  EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237  // We found an escaped newline.
238  CurPtr = skipNewline(CurPtr, BufferEnd);
239  } else
240  return CurPtr; // Not an escaped newline.
241  }
242  return BufferEnd;
243 }
244 
245 /// Return the one past end pointer for C comments.
246 /// Very dumb, does not handle escaped newlines or trigraphs.
247 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249  if (*BufferPtr == '*') {
250  assert(BufferPtr + 1 != BufferEnd);
251  if (*(BufferPtr + 1) == '/')
252  return BufferPtr;
253  }
254  }
255  llvm_unreachable("buffer end hit before '*/' was seen");
256 }
257 
258 } // end anonymous namespace
259 
260 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
262  const unsigned TokLen = TokEnd - BufferPtr;
263  Result.setLocation(getSourceLocation(BufferPtr));
264  Result.setKind(Kind);
265  Result.setLength(TokLen);
266 #ifndef NDEBUG
267  Result.TextPtr = "<UNSET>";
268  Result.IntVal = 7;
269 #endif
270  BufferPtr = TokEnd;
271 }
272 
273 const char *Lexer::skipTextToken() {
274  const char *TokenPtr = BufferPtr;
275  assert(TokenPtr < CommentEnd);
276  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277 
278 again:
279  size_t End =
280  StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281  if (End == StringRef::npos)
282  return CommentEnd;
283 
284  // Doxygen doesn't recognize any commands in a one-line double quotation.
285  // If we don't find an ending quotation mark, we pretend it never began.
286  if (*(TokenPtr + End) == '\"') {
287  TokenPtr += End + 1;
288  End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289  if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290  TokenPtr += End + 1;
291  goto again;
292  }
293  return TokenPtr + End;
294 }
295 
296 void Lexer::lexCommentText(Token &T) {
297  assert(CommentState == LCS_InsideBCPLComment ||
298  CommentState == LCS_InsideCComment);
299 
300  // Handles lexing non-command text, i.e. text and newline.
301  auto HandleNonCommandToken = [&]() -> void {
302  assert(State == LS_Normal);
303 
304  const char *TokenPtr = BufferPtr;
305  assert(TokenPtr < CommentEnd);
306  switch (*TokenPtr) {
307  case '\n':
308  case '\r':
309  TokenPtr = skipNewline(TokenPtr, CommentEnd);
310  formTokenWithChars(T, TokenPtr, tok::newline);
311 
312  if (CommentState == LCS_InsideCComment)
313  skipLineStartingDecorations();
314  return;
315 
316  default:
317  return formTextToken(T, skipTextToken());
318  }
319  };
320 
321  if (!ParseCommands)
322  return HandleNonCommandToken();
323 
324  switch (State) {
325  case LS_Normal:
326  break;
327  case LS_VerbatimBlockFirstLine:
328  lexVerbatimBlockFirstLine(T);
329  return;
330  case LS_VerbatimBlockBody:
331  lexVerbatimBlockBody(T);
332  return;
333  case LS_VerbatimLineText:
334  lexVerbatimLineText(T);
335  return;
336  case LS_HTMLStartTag:
337  lexHTMLStartTag(T);
338  return;
339  case LS_HTMLEndTag:
340  lexHTMLEndTag(T);
341  return;
342  }
343 
344  assert(State == LS_Normal);
345  const char *TokenPtr = BufferPtr;
346  assert(TokenPtr < CommentEnd);
347  switch(*TokenPtr) {
348  case '\\':
349  case '@': {
350  // Commands that start with a backslash and commands that start with
351  // 'at' have equivalent semantics. But we keep information about the
352  // exact syntax in AST for comments.
353  tok::TokenKind CommandKind =
354  (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
355  TokenPtr++;
356  if (TokenPtr == CommentEnd) {
357  formTextToken(T, TokenPtr);
358  return;
359  }
360  char C = *TokenPtr;
361  switch (C) {
362  default:
363  break;
364 
365  case '\\': case '@': case '&': case '$':
366  case '#': case '<': case '>': case '%':
367  case '\"': case '.': case ':':
368  // This is one of \\ \@ \& \$ etc escape sequences.
369  TokenPtr++;
370  if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
371  // This is the \:: escape sequence.
372  TokenPtr++;
373  }
374  StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375  formTokenWithChars(T, TokenPtr, tok::text);
376  T.setText(UnescapedText);
377  return;
378  }
379 
380  // Don't make zero-length commands.
381  if (!isCommandNameStartCharacter(*TokenPtr)) {
382  formTextToken(T, TokenPtr);
383  return;
384  }
385 
386  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387  unsigned Length = TokenPtr - (BufferPtr + 1);
388 
389  // Hardcoded support for lexing LaTeX formula commands
390  // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
392  C = *TokenPtr;
393  if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
394  C == '{' || C == '}') {
395  TokenPtr++;
396  Length++;
397  }
398  }
399 
400  StringRef CommandName(BufferPtr + 1, Length);
401 
402  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403  if (!Info) {
404  if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405  StringRef CorrectedName = Info->Name;
406  SourceLocation Loc = getSourceLocation(BufferPtr);
407  SourceLocation EndLoc = getSourceLocation(TokenPtr);
408  SourceRange FullRange = SourceRange(Loc, EndLoc);
409  SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410  Diag(Loc, diag::warn_correct_comment_command_name)
411  << FullRange << CommandName << CorrectedName
412  << FixItHint::CreateReplacement(CommandRange, CorrectedName);
413  } else {
414  formTokenWithChars(T, TokenPtr, tok::unknown_command);
415  T.setUnknownCommandName(CommandName);
416  Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417  << SourceRange(T.getLocation(), T.getEndLocation());
418  return;
419  }
420  }
421  if (Info->IsVerbatimBlockCommand) {
422  setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423  return;
424  }
425  if (Info->IsVerbatimLineCommand) {
426  setupAndLexVerbatimLine(T, TokenPtr, Info);
427  return;
428  }
429  formTokenWithChars(T, TokenPtr, CommandKind);
430  T.setCommandID(Info->getID());
431  return;
432  }
433 
434  case '&':
435  lexHTMLCharacterReference(T);
436  return;
437 
438  case '<': {
439  TokenPtr++;
440  if (TokenPtr == CommentEnd) {
441  formTextToken(T, TokenPtr);
442  return;
443  }
444  const char C = *TokenPtr;
445  if (isHTMLIdentifierStartingCharacter(C))
446  setupAndLexHTMLStartTag(T);
447  else if (C == '/')
448  setupAndLexHTMLEndTag(T);
449  else
450  formTextToken(T, TokenPtr);
451  return;
452  }
453 
454  default:
455  return HandleNonCommandToken();
456  }
457 }
458 
459 void Lexer::setupAndLexVerbatimBlock(Token &T,
460  const char *TextBegin,
461  char Marker, const CommandInfo *Info) {
462  assert(Info->IsVerbatimBlockCommand);
463 
464  VerbatimBlockEndCommandName.clear();
465  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466  VerbatimBlockEndCommandName.append(Info->EndCommandName);
467 
468  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469  T.setVerbatimBlockID(Info->getID());
470 
471  // If there is a newline following the verbatim opening command, skip the
472  // newline so that we don't create an tok::verbatim_block_line with empty
473  // text content.
474  if (BufferPtr != CommentEnd &&
475  isVerticalWhitespace(*BufferPtr)) {
476  BufferPtr = skipNewline(BufferPtr, CommentEnd);
477  State = LS_VerbatimBlockBody;
478  return;
479  }
480 
481  State = LS_VerbatimBlockFirstLine;
482 }
483 
484 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485 again:
486  assert(BufferPtr < CommentEnd);
487 
488  // FIXME: It would be better to scan the text once, finding either the block
489  // end command or newline.
490  //
491  // Extract current line.
492  const char *Newline = findNewline(BufferPtr, CommentEnd);
493  StringRef Line(BufferPtr, Newline - BufferPtr);
494 
495  // Look for end command in current line.
496  size_t Pos = Line.find(VerbatimBlockEndCommandName);
497  const char *TextEnd;
498  const char *NextLine;
499  if (Pos == StringRef::npos) {
500  // Current line is completely verbatim.
501  TextEnd = Newline;
502  NextLine = skipNewline(Newline, CommentEnd);
503  } else if (Pos == 0) {
504  // Current line contains just an end command.
505  const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506  StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507  formTokenWithChars(T, End, tok::verbatim_block_end);
508  T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509  State = LS_Normal;
510  return;
511  } else {
512  // There is some text, followed by end command. Extract text first.
513  TextEnd = BufferPtr + Pos;
514  NextLine = TextEnd;
515  // If there is only whitespace before end command, skip whitespace.
516  if (isWhitespace(BufferPtr, TextEnd)) {
517  BufferPtr = TextEnd;
518  goto again;
519  }
520  }
521 
522  StringRef Text(BufferPtr, TextEnd - BufferPtr);
523  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524  T.setVerbatimBlockText(Text);
525 
526  State = LS_VerbatimBlockBody;
527 }
528 
529 void Lexer::lexVerbatimBlockBody(Token &T) {
530  assert(State == LS_VerbatimBlockBody);
531 
532  if (CommentState == LCS_InsideCComment)
533  skipLineStartingDecorations();
534 
535  if (BufferPtr == CommentEnd) {
536  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537  T.setVerbatimBlockText("");
538  return;
539  }
540 
541  lexVerbatimBlockFirstLine(T);
542 }
543 
544 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545  const CommandInfo *Info) {
546  assert(Info->IsVerbatimLineCommand);
547  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548  T.setVerbatimLineID(Info->getID());
549 
550  State = LS_VerbatimLineText;
551 }
552 
553 void Lexer::lexVerbatimLineText(Token &T) {
554  assert(State == LS_VerbatimLineText);
555 
556  // Extract current line.
557  const char *Newline = findNewline(BufferPtr, CommentEnd);
558  StringRef Text(BufferPtr, Newline - BufferPtr);
559  formTokenWithChars(T, Newline, tok::verbatim_line_text);
560  T.setVerbatimLineText(Text);
561 
562  State = LS_Normal;
563 }
564 
565 void Lexer::lexHTMLCharacterReference(Token &T) {
566  const char *TokenPtr = BufferPtr;
567  assert(*TokenPtr == '&');
568  TokenPtr++;
569  if (TokenPtr == CommentEnd) {
570  formTextToken(T, TokenPtr);
571  return;
572  }
573  const char *NamePtr;
574  bool isNamed = false;
575  bool isDecimal = false;
576  char C = *TokenPtr;
578  NamePtr = TokenPtr;
579  TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580  isNamed = true;
581  } else if (C == '#') {
582  TokenPtr++;
583  if (TokenPtr == CommentEnd) {
584  formTextToken(T, TokenPtr);
585  return;
586  }
587  C = *TokenPtr;
589  NamePtr = TokenPtr;
590  TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591  isDecimal = true;
592  } else if (C == 'x' || C == 'X') {
593  TokenPtr++;
594  NamePtr = TokenPtr;
595  TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596  } else {
597  formTextToken(T, TokenPtr);
598  return;
599  }
600  } else {
601  formTextToken(T, TokenPtr);
602  return;
603  }
604  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
605  *TokenPtr != ';') {
606  formTextToken(T, TokenPtr);
607  return;
608  }
609  StringRef Name(NamePtr, TokenPtr - NamePtr);
610  TokenPtr++; // Skip semicolon.
611  StringRef Resolved;
612  if (isNamed)
613  Resolved = resolveHTMLNamedCharacterReference(Name);
614  else if (isDecimal)
615  Resolved = resolveHTMLDecimalCharacterReference(Name);
616  else
617  Resolved = resolveHTMLHexCharacterReference(Name);
618 
619  if (Resolved.empty()) {
620  formTextToken(T, TokenPtr);
621  return;
622  }
623  formTokenWithChars(T, TokenPtr, tok::text);
624  T.setText(Resolved);
625 }
626 
627 void Lexer::setupAndLexHTMLStartTag(Token &T) {
628  assert(BufferPtr[0] == '<' &&
629  isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632  if (!isHTMLTagName(Name)) {
633  formTextToken(T, TagNameEnd);
634  return;
635  }
636 
637  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638  T.setHTMLTagStartName(Name);
639 
640  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641 
642  const char C = *BufferPtr;
643  if (BufferPtr != CommentEnd &&
644  (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
645  State = LS_HTMLStartTag;
646 }
647 
648 void Lexer::lexHTMLStartTag(Token &T) {
649  assert(State == LS_HTMLStartTag);
650 
651  const char *TokenPtr = BufferPtr;
652  char C = *TokenPtr;
653  if (isHTMLIdentifierCharacter(C)) {
654  TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655  StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656  formTokenWithChars(T, TokenPtr, tok::html_ident);
657  T.setHTMLIdent(Ident);
658  } else {
659  switch (C) {
660  case '=':
661  TokenPtr++;
662  formTokenWithChars(T, TokenPtr, tok::html_equals);
663  break;
664  case '\"':
665  case '\'': {
666  const char *OpenQuote = TokenPtr;
667  TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668  const char *ClosingQuote = TokenPtr;
669  if (TokenPtr != CommentEnd) // Skip closing quote.
670  TokenPtr++;
671  formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672  T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673  ClosingQuote - (OpenQuote + 1)));
674  break;
675  }
676  case '>':
677  TokenPtr++;
678  formTokenWithChars(T, TokenPtr, tok::html_greater);
679  State = LS_Normal;
680  return;
681  case '/':
682  TokenPtr++;
683  if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684  TokenPtr++;
685  formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686  } else
687  formTextToken(T, TokenPtr);
688 
689  State = LS_Normal;
690  return;
691  }
692  }
693 
694  // Now look ahead and return to normal state if we don't see any HTML tokens
695  // ahead.
696  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697  if (BufferPtr == CommentEnd) {
698  State = LS_Normal;
699  return;
700  }
701 
702  C = *BufferPtr;
703  if (!isHTMLIdentifierStartingCharacter(C) &&
704  C != '=' && C != '\"' && C != '\'' && C != '>') {
705  State = LS_Normal;
706  return;
707  }
708 }
709 
710 void Lexer::setupAndLexHTMLEndTag(Token &T) {
711  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712 
713  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716  if (!isHTMLTagName(Name)) {
717  formTextToken(T, TagNameEnd);
718  return;
719  }
720 
721  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722 
723  formTokenWithChars(T, End, tok::html_end_tag);
724  T.setHTMLTagEndName(Name);
725 
726  if (BufferPtr != CommentEnd && *BufferPtr == '>')
727  State = LS_HTMLEndTag;
728 }
729 
730 void Lexer::lexHTMLEndTag(Token &T) {
731  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732 
733  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734  State = LS_Normal;
735 }
736 
737 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738  const CommandTraits &Traits, SourceLocation FileLoc,
739  const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740  : Allocator(Allocator), Diags(Diags), Traits(Traits),
741  BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742  FileLoc(FileLoc), ParseCommands(ParseCommands),
743  CommentState(LCS_BeforeComment), State(LS_Normal) {}
744 
745 void Lexer::lex(Token &T) {
746 again:
747  switch (CommentState) {
748  case LCS_BeforeComment:
749  if (BufferPtr == BufferEnd) {
750  formTokenWithChars(T, BufferPtr, tok::eof);
751  return;
752  }
753 
754  assert(*BufferPtr == '/');
755  BufferPtr++; // Skip first slash.
756  switch(*BufferPtr) {
757  case '/': { // BCPL comment.
758  BufferPtr++; // Skip second slash.
759 
760  if (BufferPtr != BufferEnd) {
761  // Skip Doxygen magic marker, if it is present.
762  // It might be missing because of a typo //< or /*<, or because we
763  // merged this non-Doxygen comment into a bunch of Doxygen comments
764  // around it: /** ... */ /* ... */ /** ... */
765  const char C = *BufferPtr;
766  if (C == '/' || C == '!')
767  BufferPtr++;
768  }
769 
770  // Skip less-than symbol that marks trailing comments.
771  // Skip it even if the comment is not a Doxygen one, because //< and /*<
772  // are frequent typos.
773  if (BufferPtr != BufferEnd && *BufferPtr == '<')
774  BufferPtr++;
775 
776  CommentState = LCS_InsideBCPLComment;
777  if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
778  State = LS_Normal;
779  CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780  goto again;
781  }
782  case '*': { // C comment.
783  BufferPtr++; // Skip star.
784 
785  // Skip Doxygen magic marker.
786  const char C = *BufferPtr;
787  if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
788  BufferPtr++;
789 
790  // Skip less-than symbol that marks trailing comments.
791  if (BufferPtr != BufferEnd && *BufferPtr == '<')
792  BufferPtr++;
793 
794  CommentState = LCS_InsideCComment;
795  State = LS_Normal;
796  CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797  goto again;
798  }
799  default:
800  llvm_unreachable("second character of comment should be '/' or '*'");
801  }
802 
803  case LCS_BetweenComments: {
804  // Consecutive comments are extracted only if there is only whitespace
805  // between them. So we can search for the start of the next comment.
806  const char *EndWhitespace = BufferPtr;
807  while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808  EndWhitespace++;
809 
810  // Turn any whitespace between comments (and there is only whitespace
811  // between them -- guaranteed by comment extraction) into a newline. We
812  // have two newlines between C comments in total (first one was synthesized
813  // after a comment).
814  formTokenWithChars(T, EndWhitespace, tok::newline);
815 
816  CommentState = LCS_BeforeComment;
817  break;
818  }
819 
820  case LCS_InsideBCPLComment:
821  case LCS_InsideCComment:
822  if (BufferPtr != CommentEnd) {
823  lexCommentText(T);
824  break;
825  } else {
826  // Skip C comment closing sequence.
827  if (CommentState == LCS_InsideCComment) {
828  assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829  BufferPtr += 2;
830  assert(BufferPtr <= BufferEnd);
831 
832  // Synthenize newline just after the C comment, regardless if there is
833  // actually a newline.
834  formTokenWithChars(T, BufferPtr, tok::newline);
835 
836  CommentState = LCS_BetweenComments;
837  break;
838  } else {
839  // Don't synthesized a newline after BCPL comment.
840  CommentState = LCS_BetweenComments;
841  goto again;
842  }
843  }
844  }
845 }
846 
847 StringRef Lexer::getSpelling(const Token &Tok,
848  const SourceManager &SourceMgr) const {
849  SourceLocation Loc = Tok.getLocation();
850  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851 
852  bool InvalidTemp = false;
853  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854  if (InvalidTemp)
855  return StringRef();
856 
857  const char *Begin = File.data() + LocInfo.second;
858  return StringRef(Begin, Tok.getLength());
859 }
860 
861 } // end namespace comments
862 } // end namespace clang
clang::comments::tok::text
@ text
Definition: CommentLexer.h:35
clang::DeclaratorContext::File
@ File
clang::comments::tok::unknown_command
@ unknown_command
Definition: CommentLexer.h:36
clang::comments::tok::html_start_tag
@ html_start_tag
Definition: CommentLexer.h:44
isNamed
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition: Decl.cpp:3065
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:86
clang::comments::tok::TokenKind
TokenKind
Definition: CommentLexer.h:32
AttributeLangSupport::C
@ C
Definition: SemaDeclAttr.cpp:54
clang::DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:191
clang::comments::Token::getLocation
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
skipWhitespace
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
Definition: TextDiagnostic.cpp:514
clang::comments::Token::dump
void dump(const Lexer &L, const SourceManager &SM) const
Definition: CommentLexer.cpp:21
clang::comments::CommandTraits::getCommandInfoOrNULL
const CommandInfo * getCommandInfoOrNULL(StringRef Name) const
Definition: CommentCommandTraits.cpp:34
End
SourceLocation End
Definition: USRLocFinder.cpp:167
clang::SourceManager
This class handles loading and caching of source files into memory.
Definition: SourceManager.h:626
clang::comments::CommandTraits::getCommandInfo
const CommandInfo * getCommandInfo(StringRef Name) const
Definition: CommentCommandTraits.h:145
clang::comments::CommandTraits::getTypoCorrectCommandInfo
const CommandInfo * getTypoCorrectCommandInfo(StringRef Typo) const
Definition: CommentCommandTraits.cpp:47
clang::SourceLocation::print
void print(raw_ostream &OS, const SourceManager &SM) const
Definition: SourceLocation.cpp:62
clang::comments::tok::html_ident
@ html_ident
Definition: CommentLexer.h:45
skipNewline
static unsigned skipNewline(const char *&First, const char *End)
Definition: DependencyDirectivesSourceMinimizer.cpp:229
clang::isLetter
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition: CharInfo.h:116
clang::comments::tok::verbatim_block_begin
@ verbatim_block_begin
Definition: CommentLexer.h:39
clang::comments::isHTMLHexCharacterReferenceCharacter
static bool isHTMLHexCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:35
clang::comments::CommandInfo::Name
const char * Name
Definition: CommentCommandTraits.h:37
clang::comments::Token
Comment token.
Definition: CommentLexer.h:55
clang::comments::isHTMLDecimalCharacterReferenceCharacter
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:31
clang::comments::convertCodePointToUTF8
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
Definition: CommentLexer.cpp:39
clang::comments::tok::html_equals
@ html_equals
Definition: CommentLexer.h:46
Line
const AnnotatedLine * Line
Definition: UsingDeclarationsSorter.cpp:68
clang::comments::tok::at_command
@ at_command
Definition: CommentLexer.h:38
clang::comments::tok::html_slash_greater
@ html_slash_greater
Definition: CommentLexer.h:49
clang::SourceManager::getDecomposedLoc
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
Definition: SourceManager.h:1236
clang::comments::tok::verbatim_line_name
@ verbatim_line_name
Definition: CommentLexer.h:42
clang::isHexDigit
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:128
clang::comments::tok::verbatim_line_text
@ verbatim_line_text
Definition: CommentLexer.h:43
clang::isWhitespace
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:92
clang::comments::Lexer::getSpelling
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Definition: CommentLexer.cpp:847
clang::comments::tok::verbatim_block_line
@ verbatim_block_line
Definition: CommentLexer.h:40
CharInfo.h
clang::comments::tok::html_greater
@ html_greater
Definition: CommentLexer.h:48
Begin
SourceLocation Begin
Definition: USRLocFinder.cpp:165
clang::isAlphanumeric
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:122
State
LineState State
Definition: UnwrappedLineFormatter.cpp:1052
clang::comments::tok::html_end_tag
@ html_end_tag
Definition: CommentLexer.h:50
clang::SourceManager::getBufferData
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
Definition: SourceManager.cpp:735
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::comments::tok::verbatim_block_end
@ verbatim_block_end
Definition: CommentLexer.h:41
clang::comments::isHTMLNamedCharacterReferenceCharacter
static bool isHTMLNamedCharacterReferenceCharacter(char C)
Definition: CommentLexer.cpp:27
clang::isVerticalWhitespace
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:83
clang
Definition: CalledOnceCheck.h:17
Text
StringRef Text
Definition: Format.cpp:2430
clang::isDigit
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition: CharInfo.h:98
clang::comments::Lexer::lex
void lex(Token &T)
Definition: CommentLexer.cpp:745
clang::isHorizontalWhitespace
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:75
clang::comments::tok::eof
@ eof
Definition: CommentLexer.h:33
clang::comments::Token::getLength
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
clang::comments::CommandTraits
This class provides information about commands that can be used in comments.
Definition: CommentCommandTraits.h:127
SM
#define SM(sm)
Definition: Cuda.cpp:81
clang::comments::tok::backslash_command
@ backslash_command
Definition: CommentLexer.h:37
CommentCommandTraits.h
CommentDiagnostic.h
clang::comments::tok::newline
@ newline
Definition: CommentLexer.h:34
clang::comments::CommandInfo::getID
unsigned getID() const
Definition: CommentCommandTraits.h:33
clang::FixItHint::CreateReplacement
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:133
CommentLexer.h
clang::comments::Lexer
Comment lexer.
Definition: CommentLexer.h:220
clang::comments::tok::html_quoted_string
@ html_quoted_string
Definition: CommentLexer.h:47