clang  8.0.0svn
CommentLexer.cpp
Go to the documentation of this file.
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "clang/AST/CommentLexer.h"
13 #include "clang/Basic/CharInfo.h"
14 #include "llvm/ADT/StringExtras.h"
15 #include "llvm/ADT/StringSwitch.h"
16 #include "llvm/Support/ConvertUTF.h"
17 #include "llvm/Support/ErrorHandling.h"
18 
19 namespace clang {
20 namespace comments {
21 
22 void Token::dump(const Lexer &L, const SourceManager &SM) const {
23  llvm::errs() << "comments::Token Kind=" << Kind << " ";
24  Loc.print(llvm::errs(), SM);
25  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
26 }
27 
28 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
29  return isLetter(C);
30 }
31 
32 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
33  return isDigit(C);
34 }
35 
36 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
37  return isHexDigit(C);
38 }
39 
40 static inline StringRef convertCodePointToUTF8(
41  llvm::BumpPtrAllocator &Allocator,
42  unsigned CodePoint) {
43  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44  char *ResolvedPtr = Resolved;
45  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46  return StringRef(Resolved, ResolvedPtr - Resolved);
47  else
48  return StringRef();
49 }
50 
51 namespace {
52 
53 #include "clang/AST/CommentHTMLTags.inc"
54 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55 
56 } // end anonymous namespace
57 
58 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
59  // Fast path, first check a few most widely used named character references.
60  return llvm::StringSwitch<StringRef>(Name)
61  .Case("amp", "&")
62  .Case("lt", "<")
63  .Case("gt", ">")
64  .Case("quot", "\"")
65  .Case("apos", "\'")
66  // Slow path.
67  .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
68 }
69 
70 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71  unsigned CodePoint = 0;
72  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74  CodePoint *= 10;
75  CodePoint += Name[i] - '0';
76  }
77  return convertCodePointToUTF8(Allocator, CodePoint);
78 }
79 
80 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
81  unsigned CodePoint = 0;
82  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
83  CodePoint *= 16;
84  const char C = Name[i];
86  CodePoint += llvm::hexDigitValue(C);
87  }
88  return convertCodePointToUTF8(Allocator, CodePoint);
89 }
90 
91 void Lexer::skipLineStartingDecorations() {
92  // This function should be called only for C comments
93  assert(CommentState == LCS_InsideCComment);
94 
95  if (BufferPtr == CommentEnd)
96  return;
97 
98  switch (*BufferPtr) {
99  case ' ':
100  case '\t':
101  case '\f':
102  case '\v': {
103  const char *NewBufferPtr = BufferPtr;
104  NewBufferPtr++;
105  if (NewBufferPtr == CommentEnd)
106  return;
107 
108  char C = *NewBufferPtr;
109  while (isHorizontalWhitespace(C)) {
110  NewBufferPtr++;
111  if (NewBufferPtr == CommentEnd)
112  return;
113  C = *NewBufferPtr;
114  }
115  if (C == '*')
116  BufferPtr = NewBufferPtr + 1;
117  break;
118  }
119  case '*':
120  BufferPtr++;
121  break;
122  }
123 }
124 
125 namespace {
126 /// Returns pointer to the first newline character in the string.
127 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
128  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
129  if (isVerticalWhitespace(*BufferPtr))
130  return BufferPtr;
131  }
132  return BufferEnd;
133 }
134 
/// Skip one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (nothing is skipped) or point
/// at a newline character.  Returns the position just past the newline.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  if (First == '\n')
    return BufferPtr + 1;

  assert(First == '\r');
  const char *Next = BufferPtr + 1;
  // Treat "\r\n" as a single newline.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
149 
150 const char *skipNamedCharacterReference(const char *BufferPtr,
151  const char *BufferEnd) {
152  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
154  return BufferPtr;
155  }
156  return BufferEnd;
157 }
158 
159 const char *skipDecimalCharacterReference(const char *BufferPtr,
160  const char *BufferEnd) {
161  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
163  return BufferPtr;
164  }
165  return BufferEnd;
166 }
167 
168 const char *skipHexCharacterReference(const char *BufferPtr,
169  const char *BufferEnd) {
170  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
171  if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
172  return BufferPtr;
173  }
174  return BufferEnd;
175 }
176 
177 bool isHTMLIdentifierStartingCharacter(char C) {
178  return isLetter(C);
179 }
180 
181 bool isHTMLIdentifierCharacter(char C) {
182  return isAlphanumeric(C);
183 }
184 
185 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
186  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
187  if (!isHTMLIdentifierCharacter(*BufferPtr))
188  return BufferPtr;
189  }
190  return BufferEnd;
191 }
192 
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// Returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool IsDelimiter = *Cursor == Delimiter;
    if (IsDelimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
210 
211 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
212  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213  if (!isWhitespace(*BufferPtr))
214  return BufferPtr;
215  }
216  return BufferEnd;
217 }
218 
219 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
220  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
221 }
222 
223 bool isCommandNameStartCharacter(char C) {
224  return isLetter(C);
225 }
226 
227 bool isCommandNameCharacter(char C) {
228  return isAlphanumeric(C);
229 }
230 
231 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
232  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
233  if (!isCommandNameCharacter(*BufferPtr))
234  return BufferPtr;
235  }
236  return BufferEnd;
237 }
238 
239 /// Return the one past end pointer for BCPL comments.
240 /// Handles newlines escaped with backslash or trigraph for backslahs.
241 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
242  const char *CurPtr = BufferPtr;
243  while (CurPtr != BufferEnd) {
244  while (!isVerticalWhitespace(*CurPtr)) {
245  CurPtr++;
246  if (CurPtr == BufferEnd)
247  return BufferEnd;
248  }
249  // We found a newline, check if it is escaped.
250  const char *EscapePtr = CurPtr - 1;
251  while(isHorizontalWhitespace(*EscapePtr))
252  EscapePtr--;
253 
254  if (*EscapePtr == '\\' ||
255  (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
256  EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
257  // We found an escaped newline.
258  CurPtr = skipNewline(CurPtr, BufferEnd);
259  } else
260  return CurPtr; // Not an escaped newline.
261  }
262  return BufferEnd;
263 }
264 
265 /// Return the one past end pointer for C comments.
266 /// Very dumb, does not handle escaped newlines or trigraphs.
267 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
268  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
269  if (*BufferPtr == '*') {
270  assert(BufferPtr + 1 != BufferEnd);
271  if (*(BufferPtr + 1) == '/')
272  return BufferPtr;
273  }
274  }
275  llvm_unreachable("buffer end hit before '*/' was seen");
276 }
277 
278 } // end anonymous namespace
279 
280 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
282  const unsigned TokLen = TokEnd - BufferPtr;
283  Result.setLocation(getSourceLocation(BufferPtr));
284  Result.setKind(Kind);
285  Result.setLength(TokLen);
286 #ifndef NDEBUG
287  Result.TextPtr = "<UNSET>";
288  Result.IntVal = 7;
289 #endif
290  BufferPtr = TokEnd;
291 }
292 
293 void Lexer::lexCommentText(Token &T) {
294  assert(CommentState == LCS_InsideBCPLComment ||
295  CommentState == LCS_InsideCComment);
296 
297  // Handles lexing non-command text, i.e. text and newline.
298  auto HandleNonCommandToken = [&]() -> void {
299  assert(State == LS_Normal);
300 
301  const char *TokenPtr = BufferPtr;
302  assert(TokenPtr < CommentEnd);
303  switch (*TokenPtr) {
304  case '\n':
305  case '\r':
306  TokenPtr = skipNewline(TokenPtr, CommentEnd);
307  formTokenWithChars(T, TokenPtr, tok::newline);
308 
309  if (CommentState == LCS_InsideCComment)
310  skipLineStartingDecorations();
311  return;
312 
313  default: {
314  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
315  size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
316  .find_first_of(TokStartSymbols);
317  if (End != StringRef::npos)
318  TokenPtr += End;
319  else
320  TokenPtr = CommentEnd;
321  formTextToken(T, TokenPtr);
322  return;
323  }
324  }
325  };
326 
327  if (!ParseCommands)
328  return HandleNonCommandToken();
329 
330  switch (State) {
331  case LS_Normal:
332  break;
333  case LS_VerbatimBlockFirstLine:
334  lexVerbatimBlockFirstLine(T);
335  return;
336  case LS_VerbatimBlockBody:
337  lexVerbatimBlockBody(T);
338  return;
339  case LS_VerbatimLineText:
340  lexVerbatimLineText(T);
341  return;
342  case LS_HTMLStartTag:
343  lexHTMLStartTag(T);
344  return;
345  case LS_HTMLEndTag:
346  lexHTMLEndTag(T);
347  return;
348  }
349 
350  assert(State == LS_Normal);
351  const char *TokenPtr = BufferPtr;
352  assert(TokenPtr < CommentEnd);
353  switch(*TokenPtr) {
354  case '\\':
355  case '@': {
356  // Commands that start with a backslash and commands that start with
357  // 'at' have equivalent semantics. But we keep information about the
358  // exact syntax in AST for comments.
359  tok::TokenKind CommandKind =
360  (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
361  TokenPtr++;
362  if (TokenPtr == CommentEnd) {
363  formTextToken(T, TokenPtr);
364  return;
365  }
366  char C = *TokenPtr;
367  switch (C) {
368  default:
369  break;
370 
371  case '\\': case '@': case '&': case '$':
372  case '#': case '<': case '>': case '%':
373  case '\"': case '.': case ':':
374  // This is one of \\ \@ \& \$ etc escape sequences.
375  TokenPtr++;
376  if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
377  // This is the \:: escape sequence.
378  TokenPtr++;
379  }
380  StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
381  formTokenWithChars(T, TokenPtr, tok::text);
382  T.setText(UnescapedText);
383  return;
384  }
385 
386  // Don't make zero-length commands.
387  if (!isCommandNameStartCharacter(*TokenPtr)) {
388  formTextToken(T, TokenPtr);
389  return;
390  }
391 
392  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
393  unsigned Length = TokenPtr - (BufferPtr + 1);
394 
395  // Hardcoded support for lexing LaTeX formula commands
396  // \f$ \f[ \f] \f{ \f} as a single command.
397  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
398  C = *TokenPtr;
399  if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
400  TokenPtr++;
401  Length++;
402  }
403  }
404 
405  StringRef CommandName(BufferPtr + 1, Length);
406 
407  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
408  if (!Info) {
409  if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
410  StringRef CorrectedName = Info->Name;
411  SourceLocation Loc = getSourceLocation(BufferPtr);
412  SourceLocation EndLoc = getSourceLocation(TokenPtr);
413  SourceRange FullRange = SourceRange(Loc, EndLoc);
414  SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
415  Diag(Loc, diag::warn_correct_comment_command_name)
416  << FullRange << CommandName << CorrectedName
417  << FixItHint::CreateReplacement(CommandRange, CorrectedName);
418  } else {
419  formTokenWithChars(T, TokenPtr, tok::unknown_command);
420  T.setUnknownCommandName(CommandName);
421  Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
423  return;
424  }
425  }
426  if (Info->IsVerbatimBlockCommand) {
427  setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
428  return;
429  }
430  if (Info->IsVerbatimLineCommand) {
431  setupAndLexVerbatimLine(T, TokenPtr, Info);
432  return;
433  }
434  formTokenWithChars(T, TokenPtr, CommandKind);
435  T.setCommandID(Info->getID());
436  return;
437  }
438 
439  case '&':
440  lexHTMLCharacterReference(T);
441  return;
442 
443  case '<': {
444  TokenPtr++;
445  if (TokenPtr == CommentEnd) {
446  formTextToken(T, TokenPtr);
447  return;
448  }
449  const char C = *TokenPtr;
450  if (isHTMLIdentifierStartingCharacter(C))
451  setupAndLexHTMLStartTag(T);
452  else if (C == '/')
453  setupAndLexHTMLEndTag(T);
454  else
455  formTextToken(T, TokenPtr);
456  return;
457  }
458 
459  default:
460  return HandleNonCommandToken();
461  }
462 }
463 
464 void Lexer::setupAndLexVerbatimBlock(Token &T,
465  const char *TextBegin,
466  char Marker, const CommandInfo *Info) {
467  assert(Info->IsVerbatimBlockCommand);
468 
469  VerbatimBlockEndCommandName.clear();
470  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
471  VerbatimBlockEndCommandName.append(Info->EndCommandName);
472 
473  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
474  T.setVerbatimBlockID(Info->getID());
475 
476  // If there is a newline following the verbatim opening command, skip the
477  // newline so that we don't create an tok::verbatim_block_line with empty
478  // text content.
479  if (BufferPtr != CommentEnd &&
480  isVerticalWhitespace(*BufferPtr)) {
481  BufferPtr = skipNewline(BufferPtr, CommentEnd);
482  State = LS_VerbatimBlockBody;
483  return;
484  }
485 
486  State = LS_VerbatimBlockFirstLine;
487 }
488 
489 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
490 again:
491  assert(BufferPtr < CommentEnd);
492 
493  // FIXME: It would be better to scan the text once, finding either the block
494  // end command or newline.
495  //
496  // Extract current line.
497  const char *Newline = findNewline(BufferPtr, CommentEnd);
498  StringRef Line(BufferPtr, Newline - BufferPtr);
499 
500  // Look for end command in current line.
501  size_t Pos = Line.find(VerbatimBlockEndCommandName);
502  const char *TextEnd;
503  const char *NextLine;
504  if (Pos == StringRef::npos) {
505  // Current line is completely verbatim.
506  TextEnd = Newline;
507  NextLine = skipNewline(Newline, CommentEnd);
508  } else if (Pos == 0) {
509  // Current line contains just an end command.
510  const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
511  StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
512  formTokenWithChars(T, End, tok::verbatim_block_end);
513  T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
514  State = LS_Normal;
515  return;
516  } else {
517  // There is some text, followed by end command. Extract text first.
518  TextEnd = BufferPtr + Pos;
519  NextLine = TextEnd;
520  // If there is only whitespace before end command, skip whitespace.
521  if (isWhitespace(BufferPtr, TextEnd)) {
522  BufferPtr = TextEnd;
523  goto again;
524  }
525  }
526 
527  StringRef Text(BufferPtr, TextEnd - BufferPtr);
528  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
529  T.setVerbatimBlockText(Text);
530 
531  State = LS_VerbatimBlockBody;
532 }
533 
534 void Lexer::lexVerbatimBlockBody(Token &T) {
535  assert(State == LS_VerbatimBlockBody);
536 
537  if (CommentState == LCS_InsideCComment)
538  skipLineStartingDecorations();
539 
540  if (BufferPtr == CommentEnd) {
541  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
542  T.setVerbatimBlockText("");
543  return;
544  }
545 
546  lexVerbatimBlockFirstLine(T);
547 }
548 
549 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
550  const CommandInfo *Info) {
551  assert(Info->IsVerbatimLineCommand);
552  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
553  T.setVerbatimLineID(Info->getID());
554 
555  State = LS_VerbatimLineText;
556 }
557 
558 void Lexer::lexVerbatimLineText(Token &T) {
559  assert(State == LS_VerbatimLineText);
560 
561  // Extract current line.
562  const char *Newline = findNewline(BufferPtr, CommentEnd);
563  StringRef Text(BufferPtr, Newline - BufferPtr);
564  formTokenWithChars(T, Newline, tok::verbatim_line_text);
565  T.setVerbatimLineText(Text);
566 
567  State = LS_Normal;
568 }
569 
570 void Lexer::lexHTMLCharacterReference(Token &T) {
571  const char *TokenPtr = BufferPtr;
572  assert(*TokenPtr == '&');
573  TokenPtr++;
574  if (TokenPtr == CommentEnd) {
575  formTextToken(T, TokenPtr);
576  return;
577  }
578  const char *NamePtr;
579  bool isNamed = false;
580  bool isDecimal = false;
581  char C = *TokenPtr;
583  NamePtr = TokenPtr;
584  TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
585  isNamed = true;
586  } else if (C == '#') {
587  TokenPtr++;
588  if (TokenPtr == CommentEnd) {
589  formTextToken(T, TokenPtr);
590  return;
591  }
592  C = *TokenPtr;
594  NamePtr = TokenPtr;
595  TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
596  isDecimal = true;
597  } else if (C == 'x' || C == 'X') {
598  TokenPtr++;
599  NamePtr = TokenPtr;
600  TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
601  } else {
602  formTextToken(T, TokenPtr);
603  return;
604  }
605  } else {
606  formTextToken(T, TokenPtr);
607  return;
608  }
609  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
610  *TokenPtr != ';') {
611  formTextToken(T, TokenPtr);
612  return;
613  }
614  StringRef Name(NamePtr, TokenPtr - NamePtr);
615  TokenPtr++; // Skip semicolon.
616  StringRef Resolved;
617  if (isNamed)
618  Resolved = resolveHTMLNamedCharacterReference(Name);
619  else if (isDecimal)
620  Resolved = resolveHTMLDecimalCharacterReference(Name);
621  else
622  Resolved = resolveHTMLHexCharacterReference(Name);
623 
624  if (Resolved.empty()) {
625  formTextToken(T, TokenPtr);
626  return;
627  }
628  formTokenWithChars(T, TokenPtr, tok::text);
629  T.setText(Resolved);
630 }
631 
632 void Lexer::setupAndLexHTMLStartTag(Token &T) {
633  assert(BufferPtr[0] == '<' &&
634  isHTMLIdentifierStartingCharacter(BufferPtr[1]));
635  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
636  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
637  if (!isHTMLTagName(Name)) {
638  formTextToken(T, TagNameEnd);
639  return;
640  }
641 
642  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
643  T.setHTMLTagStartName(Name);
644 
645  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
646 
647  const char C = *BufferPtr;
648  if (BufferPtr != CommentEnd &&
649  (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
650  State = LS_HTMLStartTag;
651 }
652 
653 void Lexer::lexHTMLStartTag(Token &T) {
654  assert(State == LS_HTMLStartTag);
655 
656  const char *TokenPtr = BufferPtr;
657  char C = *TokenPtr;
658  if (isHTMLIdentifierCharacter(C)) {
659  TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
660  StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
661  formTokenWithChars(T, TokenPtr, tok::html_ident);
662  T.setHTMLIdent(Ident);
663  } else {
664  switch (C) {
665  case '=':
666  TokenPtr++;
667  formTokenWithChars(T, TokenPtr, tok::html_equals);
668  break;
669  case '\"':
670  case '\'': {
671  const char *OpenQuote = TokenPtr;
672  TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
673  const char *ClosingQuote = TokenPtr;
674  if (TokenPtr != CommentEnd) // Skip closing quote.
675  TokenPtr++;
676  formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
677  T.setHTMLQuotedString(StringRef(OpenQuote + 1,
678  ClosingQuote - (OpenQuote + 1)));
679  break;
680  }
681  case '>':
682  TokenPtr++;
683  formTokenWithChars(T, TokenPtr, tok::html_greater);
684  State = LS_Normal;
685  return;
686  case '/':
687  TokenPtr++;
688  if (TokenPtr != CommentEnd && *TokenPtr == '>') {
689  TokenPtr++;
690  formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
691  } else
692  formTextToken(T, TokenPtr);
693 
694  State = LS_Normal;
695  return;
696  }
697  }
698 
699  // Now look ahead and return to normal state if we don't see any HTML tokens
700  // ahead.
701  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
702  if (BufferPtr == CommentEnd) {
703  State = LS_Normal;
704  return;
705  }
706 
707  C = *BufferPtr;
708  if (!isHTMLIdentifierStartingCharacter(C) &&
709  C != '=' && C != '\"' && C != '\'' && C != '>') {
710  State = LS_Normal;
711  return;
712  }
713 }
714 
715 void Lexer::setupAndLexHTMLEndTag(Token &T) {
716  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
717 
718  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
719  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
720  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
721  if (!isHTMLTagName(Name)) {
722  formTextToken(T, TagNameEnd);
723  return;
724  }
725 
726  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
727 
728  formTokenWithChars(T, End, tok::html_end_tag);
729  T.setHTMLTagEndName(Name);
730 
731  if (BufferPtr != CommentEnd && *BufferPtr == '>')
732  State = LS_HTMLEndTag;
733 }
734 
735 void Lexer::lexHTMLEndTag(Token &T) {
736  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
737 
738  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
739  State = LS_Normal;
740 }
741 
742 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
743  const CommandTraits &Traits, SourceLocation FileLoc,
744  const char *BufferStart, const char *BufferEnd,
745  bool ParseCommands)
746  : Allocator(Allocator), Diags(Diags), Traits(Traits),
747  BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
748  BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
749  ParseCommands(ParseCommands) {}
750 
751 void Lexer::lex(Token &T) {
752 again:
753  switch (CommentState) {
754  case LCS_BeforeComment:
755  if (BufferPtr == BufferEnd) {
756  formTokenWithChars(T, BufferPtr, tok::eof);
757  return;
758  }
759 
760  assert(*BufferPtr == '/');
761  BufferPtr++; // Skip first slash.
762  switch(*BufferPtr) {
763  case '/': { // BCPL comment.
764  BufferPtr++; // Skip second slash.
765 
766  if (BufferPtr != BufferEnd) {
767  // Skip Doxygen magic marker, if it is present.
768  // It might be missing because of a typo //< or /*<, or because we
769  // merged this non-Doxygen comment into a bunch of Doxygen comments
770  // around it: /** ... */ /* ... */ /** ... */
771  const char C = *BufferPtr;
772  if (C == '/' || C == '!')
773  BufferPtr++;
774  }
775 
776  // Skip less-than symbol that marks trailing comments.
777  // Skip it even if the comment is not a Doxygen one, because //< and /*<
778  // are frequent typos.
779  if (BufferPtr != BufferEnd && *BufferPtr == '<')
780  BufferPtr++;
781 
782  CommentState = LCS_InsideBCPLComment;
783  if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
784  State = LS_Normal;
785  CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
786  goto again;
787  }
788  case '*': { // C comment.
789  BufferPtr++; // Skip star.
790 
791  // Skip Doxygen magic marker.
792  const char C = *BufferPtr;
793  if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
794  BufferPtr++;
795 
796  // Skip less-than symbol that marks trailing comments.
797  if (BufferPtr != BufferEnd && *BufferPtr == '<')
798  BufferPtr++;
799 
800  CommentState = LCS_InsideCComment;
801  State = LS_Normal;
802  CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
803  goto again;
804  }
805  default:
806  llvm_unreachable("second character of comment should be '/' or '*'");
807  }
808 
809  case LCS_BetweenComments: {
810  // Consecutive comments are extracted only if there is only whitespace
811  // between them. So we can search for the start of the next comment.
812  const char *EndWhitespace = BufferPtr;
813  while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
814  EndWhitespace++;
815 
816  // Turn any whitespace between comments (and there is only whitespace
817  // between them -- guaranteed by comment extraction) into a newline. We
818  // have two newlines between C comments in total (first one was synthesized
819  // after a comment).
820  formTokenWithChars(T, EndWhitespace, tok::newline);
821 
822  CommentState = LCS_BeforeComment;
823  break;
824  }
825 
826  case LCS_InsideBCPLComment:
827  case LCS_InsideCComment:
828  if (BufferPtr != CommentEnd) {
829  lexCommentText(T);
830  break;
831  } else {
832  // Skip C comment closing sequence.
833  if (CommentState == LCS_InsideCComment) {
834  assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
835  BufferPtr += 2;
836  assert(BufferPtr <= BufferEnd);
837 
838  // Synthenize newline just after the C comment, regardless if there is
839  // actually a newline.
840  formTokenWithChars(T, BufferPtr, tok::newline);
841 
842  CommentState = LCS_BetweenComments;
843  break;
844  } else {
845  // Don't synthesized a newline after BCPL comment.
846  CommentState = LCS_BetweenComments;
847  goto again;
848  }
849  }
850  }
851 }
852 
853 StringRef Lexer::getSpelling(const Token &Tok,
854  const SourceManager &SourceMgr,
855  bool *Invalid) const {
856  SourceLocation Loc = Tok.getLocation();
857  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
858 
859  bool InvalidTemp = false;
860  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
861  if (InvalidTemp) {
862  *Invalid = true;
863  return StringRef();
864  }
865 
866  const char *Begin = File.data() + LocInfo.second;
867  return StringRef(Begin, Tok.getLength());
868 }
869 
870 } // end namespace comments
871 } // end namespace clang
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:200
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:178
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:84
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:96
void setLength(unsigned L)
Definition: CommentLexer.h:97
void setText(StringRef Text)
Definition: CommentLexer.h:104
const char * EndCommandName
Name of the command that ends the verbatim block.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:71
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:167
Information about a single command.
void print(raw_ostream &OS, const SourceManager &SM) const
LineState State
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition: CharInfo.h:112
void setCommandID(unsigned ID)
Definition: CommentLexer.h:126
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Definition: CharInfo.h:88
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:136
const FormatToken & Tok
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:149
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:82
void dump(const Lexer &L, const SourceManager &SM) const
unsigned IsVerbatimLineCommand
True if this command is a verbatim line command.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static bool isHTMLHexCharacterReferenceCharacter(char C)
SourceLocation End
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:115
const AnnotatedLine * Line
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid=nullptr) const
SourceLocation Begin
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:118
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:157
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:211
static bool isHTMLNamedCharacterReferenceCharacter(char C)
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:81
const SourceManager & SM
Definition: Format.cpp:1472
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition: Decl.cpp:2734
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:146
This class provides information about commands that can be used in comments.
Kind
Encodes a location in the source.
Comment lexer.
Definition: CommentLexer.h:221
Dataflow Directional Tag Classes.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:79
unsigned IsVerbatimBlockCommand
True if this command is a verbatim-like block command.
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:91
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:124
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition: CharInfo.h:94
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
Comment token.
Definition: CommentLexer.h:56
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
Definition: Diagnostic.h:129
StringRef Text
Definition: Format.cpp:1603
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:189
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.