clang  10.0.0svn
CommentLexer.cpp
Go to the documentation of this file.
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
12 #include "clang/Basic/CharInfo.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
17 
18 namespace clang {
19 namespace comments {
20 
21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
22  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23  Loc.print(llvm::errs(), SM);
24  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 }
26 
27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28  return isLetter(C);
29 }
30 
31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32  return isDigit(C);
33 }
34 
35 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36  return isHexDigit(C);
37 }
38 
39 static inline StringRef convertCodePointToUTF8(
40  llvm::BumpPtrAllocator &Allocator,
41  unsigned CodePoint) {
42  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43  char *ResolvedPtr = Resolved;
44  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45  return StringRef(Resolved, ResolvedPtr - Resolved);
46  else
47  return StringRef();
48 }
49 
50 namespace {
51 
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 
55 } // end anonymous namespace
56 
57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58  // Fast path, first check a few most widely used named character references.
59  return llvm::StringSwitch<StringRef>(Name)
60  .Case("amp", "&")
61  .Case("lt", "<")
62  .Case("gt", ">")
63  .Case("quot", "\"")
64  .Case("apos", "\'")
65  // Slow path.
66  .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 }
68 
69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70  unsigned CodePoint = 0;
71  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73  CodePoint *= 10;
74  CodePoint += Name[i] - '0';
75  }
76  return convertCodePointToUTF8(Allocator, CodePoint);
77 }
78 
79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80  unsigned CodePoint = 0;
81  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82  CodePoint *= 16;
83  const char C = Name[i];
85  CodePoint += llvm::hexDigitValue(C);
86  }
87  return convertCodePointToUTF8(Allocator, CodePoint);
88 }
89 
90 void Lexer::skipLineStartingDecorations() {
91  // This function should be called only for C comments
92  assert(CommentState == LCS_InsideCComment);
93 
94  if (BufferPtr == CommentEnd)
95  return;
96 
97  switch (*BufferPtr) {
98  case ' ':
99  case '\t':
100  case '\f':
101  case '\v': {
102  const char *NewBufferPtr = BufferPtr;
103  NewBufferPtr++;
104  if (NewBufferPtr == CommentEnd)
105  return;
106 
107  char C = *NewBufferPtr;
108  while (isHorizontalWhitespace(C)) {
109  NewBufferPtr++;
110  if (NewBufferPtr == CommentEnd)
111  return;
112  C = *NewBufferPtr;
113  }
114  if (C == '*')
115  BufferPtr = NewBufferPtr + 1;
116  break;
117  }
118  case '*':
119  BufferPtr++;
120  break;
121  }
122 }
123 
124 namespace {
125 /// Returns pointer to the first newline character in the string.
126 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
127  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
128  if (isVerticalWhitespace(*BufferPtr))
129  return BufferPtr;
130  }
131  return BufferEnd;
132 }
133 
/// Step over one newline sequence ("\n", "\r" or "\r\n") at BufferPtr.
///
/// BufferPtr must point at '\n' or '\r' unless it already equals BufferEnd,
/// in which case it is returned unchanged.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *Next = BufferPtr + 1;
  // Treat "\r\n" as a single newline.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
148 
149 const char *skipNamedCharacterReference(const char *BufferPtr,
150  const char *BufferEnd) {
151  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153  return BufferPtr;
154  }
155  return BufferEnd;
156 }
157 
158 const char *skipDecimalCharacterReference(const char *BufferPtr,
159  const char *BufferEnd) {
160  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162  return BufferPtr;
163  }
164  return BufferEnd;
165 }
166 
167 const char *skipHexCharacterReference(const char *BufferPtr,
168  const char *BufferEnd) {
169  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
170  if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171  return BufferPtr;
172  }
173  return BufferEnd;
174 }
175 
176 bool isHTMLIdentifierStartingCharacter(char C) {
177  return isLetter(C);
178 }
179 
180 bool isHTMLIdentifierCharacter(char C) {
181  return isAlphanumeric(C);
182 }
183 
184 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
185  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186  if (!isHTMLIdentifierCharacter(*BufferPtr))
187  return BufferPtr;
188  }
189  return BufferEnd;
190 }
191 
/// Skip an HTML string quoted in single or double quotes. A quote preceded by
/// a backslash does not terminate the string.
///
/// BufferPtr must point at the opening quote.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Pos = BufferPtr + 1;
  while (Pos != BufferEnd) {
    const bool IsUnescapedQuote = *Pos == Quote && Pos[-1] != '\\';
    if (IsUnescapedQuote)
      return Pos;
    ++Pos;
  }
  return BufferEnd;
}
209 
210 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
211  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212  if (!isWhitespace(*BufferPtr))
213  return BufferPtr;
214  }
215  return BufferEnd;
216 }
217 
218 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
219  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220 }
221 
222 bool isCommandNameStartCharacter(char C) {
223  return isLetter(C);
224 }
225 
226 bool isCommandNameCharacter(char C) {
227  return isAlphanumeric(C);
228 }
229 
230 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
231  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232  if (!isCommandNameCharacter(*BufferPtr))
233  return BufferPtr;
234  }
235  return BufferEnd;
236 }
237 
238 /// Return the one past end pointer for BCPL comments.
239 /// Handles newlines escaped with backslash or trigraph for backslahs.
240 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
241  const char *CurPtr = BufferPtr;
242  while (CurPtr != BufferEnd) {
243  while (!isVerticalWhitespace(*CurPtr)) {
244  CurPtr++;
245  if (CurPtr == BufferEnd)
246  return BufferEnd;
247  }
248  // We found a newline, check if it is escaped.
249  const char *EscapePtr = CurPtr - 1;
250  while(isHorizontalWhitespace(*EscapePtr))
251  EscapePtr--;
252 
253  if (*EscapePtr == '\\' ||
254  (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
255  EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
256  // We found an escaped newline.
257  CurPtr = skipNewline(CurPtr, BufferEnd);
258  } else
259  return CurPtr; // Not an escaped newline.
260  }
261  return BufferEnd;
262 }
263 
264 /// Return the one past end pointer for C comments.
265 /// Very dumb, does not handle escaped newlines or trigraphs.
266 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
267  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268  if (*BufferPtr == '*') {
269  assert(BufferPtr + 1 != BufferEnd);
270  if (*(BufferPtr + 1) == '/')
271  return BufferPtr;
272  }
273  }
274  llvm_unreachable("buffer end hit before '*/' was seen");
275 }
276 
277 } // end anonymous namespace
278 
279 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
281  const unsigned TokLen = TokEnd - BufferPtr;
282  Result.setLocation(getSourceLocation(BufferPtr));
283  Result.setKind(Kind);
284  Result.setLength(TokLen);
285 #ifndef NDEBUG
286  Result.TextPtr = "<UNSET>";
287  Result.IntVal = 7;
288 #endif
289  BufferPtr = TokEnd;
290 }
291 
292 void Lexer::lexCommentText(Token &T) {
293  assert(CommentState == LCS_InsideBCPLComment ||
294  CommentState == LCS_InsideCComment);
295 
296  // Handles lexing non-command text, i.e. text and newline.
297  auto HandleNonCommandToken = [&]() -> void {
298  assert(State == LS_Normal);
299 
300  const char *TokenPtr = BufferPtr;
301  assert(TokenPtr < CommentEnd);
302  switch (*TokenPtr) {
303  case '\n':
304  case '\r':
305  TokenPtr = skipNewline(TokenPtr, CommentEnd);
306  formTokenWithChars(T, TokenPtr, tok::newline);
307 
308  if (CommentState == LCS_InsideCComment)
309  skipLineStartingDecorations();
310  return;
311 
312  default: {
313  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
314  size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
315  .find_first_of(TokStartSymbols);
316  if (End != StringRef::npos)
317  TokenPtr += End;
318  else
319  TokenPtr = CommentEnd;
320  formTextToken(T, TokenPtr);
321  return;
322  }
323  }
324  };
325 
326  if (!ParseCommands)
327  return HandleNonCommandToken();
328 
329  switch (State) {
330  case LS_Normal:
331  break;
332  case LS_VerbatimBlockFirstLine:
333  lexVerbatimBlockFirstLine(T);
334  return;
335  case LS_VerbatimBlockBody:
336  lexVerbatimBlockBody(T);
337  return;
338  case LS_VerbatimLineText:
339  lexVerbatimLineText(T);
340  return;
341  case LS_HTMLStartTag:
342  lexHTMLStartTag(T);
343  return;
344  case LS_HTMLEndTag:
345  lexHTMLEndTag(T);
346  return;
347  }
348 
349  assert(State == LS_Normal);
350  const char *TokenPtr = BufferPtr;
351  assert(TokenPtr < CommentEnd);
352  switch(*TokenPtr) {
353  case '\\':
354  case '@': {
355  // Commands that start with a backslash and commands that start with
356  // 'at' have equivalent semantics. But we keep information about the
357  // exact syntax in AST for comments.
358  tok::TokenKind CommandKind =
359  (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
360  TokenPtr++;
361  if (TokenPtr == CommentEnd) {
362  formTextToken(T, TokenPtr);
363  return;
364  }
365  char C = *TokenPtr;
366  switch (C) {
367  default:
368  break;
369 
370  case '\\': case '@': case '&': case '$':
371  case '#': case '<': case '>': case '%':
372  case '\"': case '.': case ':':
373  // This is one of \\ \@ \& \$ etc escape sequences.
374  TokenPtr++;
375  if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
376  // This is the \:: escape sequence.
377  TokenPtr++;
378  }
379  StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
380  formTokenWithChars(T, TokenPtr, tok::text);
381  T.setText(UnescapedText);
382  return;
383  }
384 
385  // Don't make zero-length commands.
386  if (!isCommandNameStartCharacter(*TokenPtr)) {
387  formTextToken(T, TokenPtr);
388  return;
389  }
390 
391  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
392  unsigned Length = TokenPtr - (BufferPtr + 1);
393 
394  // Hardcoded support for lexing LaTeX formula commands
395  // \f$ \f[ \f] \f{ \f} as a single command.
396  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
397  C = *TokenPtr;
398  if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
399  TokenPtr++;
400  Length++;
401  }
402  }
403 
404  StringRef CommandName(BufferPtr + 1, Length);
405 
406  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
407  if (!Info) {
408  if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
409  StringRef CorrectedName = Info->Name;
410  SourceLocation Loc = getSourceLocation(BufferPtr);
411  SourceLocation EndLoc = getSourceLocation(TokenPtr);
412  SourceRange FullRange = SourceRange(Loc, EndLoc);
413  SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
414  Diag(Loc, diag::warn_correct_comment_command_name)
415  << FullRange << CommandName << CorrectedName
416  << FixItHint::CreateReplacement(CommandRange, CorrectedName);
417  } else {
418  formTokenWithChars(T, TokenPtr, tok::unknown_command);
419  T.setUnknownCommandName(CommandName);
420  Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
422  return;
423  }
424  }
425  if (Info->IsVerbatimBlockCommand) {
426  setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
427  return;
428  }
429  if (Info->IsVerbatimLineCommand) {
430  setupAndLexVerbatimLine(T, TokenPtr, Info);
431  return;
432  }
433  formTokenWithChars(T, TokenPtr, CommandKind);
434  T.setCommandID(Info->getID());
435  return;
436  }
437 
438  case '&':
439  lexHTMLCharacterReference(T);
440  return;
441 
442  case '<': {
443  TokenPtr++;
444  if (TokenPtr == CommentEnd) {
445  formTextToken(T, TokenPtr);
446  return;
447  }
448  const char C = *TokenPtr;
449  if (isHTMLIdentifierStartingCharacter(C))
450  setupAndLexHTMLStartTag(T);
451  else if (C == '/')
452  setupAndLexHTMLEndTag(T);
453  else
454  formTextToken(T, TokenPtr);
455  return;
456  }
457 
458  default:
459  return HandleNonCommandToken();
460  }
461 }
462 
463 void Lexer::setupAndLexVerbatimBlock(Token &T,
464  const char *TextBegin,
465  char Marker, const CommandInfo *Info) {
466  assert(Info->IsVerbatimBlockCommand);
467 
468  VerbatimBlockEndCommandName.clear();
469  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
470  VerbatimBlockEndCommandName.append(Info->EndCommandName);
471 
472  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473  T.setVerbatimBlockID(Info->getID());
474 
475  // If there is a newline following the verbatim opening command, skip the
476  // newline so that we don't create an tok::verbatim_block_line with empty
477  // text content.
478  if (BufferPtr != CommentEnd &&
479  isVerticalWhitespace(*BufferPtr)) {
480  BufferPtr = skipNewline(BufferPtr, CommentEnd);
481  State = LS_VerbatimBlockBody;
482  return;
483  }
484 
485  State = LS_VerbatimBlockFirstLine;
486 }
487 
488 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
489 again:
490  assert(BufferPtr < CommentEnd);
491 
492  // FIXME: It would be better to scan the text once, finding either the block
493  // end command or newline.
494  //
495  // Extract current line.
496  const char *Newline = findNewline(BufferPtr, CommentEnd);
497  StringRef Line(BufferPtr, Newline - BufferPtr);
498 
499  // Look for end command in current line.
500  size_t Pos = Line.find(VerbatimBlockEndCommandName);
501  const char *TextEnd;
502  const char *NextLine;
503  if (Pos == StringRef::npos) {
504  // Current line is completely verbatim.
505  TextEnd = Newline;
506  NextLine = skipNewline(Newline, CommentEnd);
507  } else if (Pos == 0) {
508  // Current line contains just an end command.
509  const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
510  StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
511  formTokenWithChars(T, End, tok::verbatim_block_end);
512  T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
513  State = LS_Normal;
514  return;
515  } else {
516  // There is some text, followed by end command. Extract text first.
517  TextEnd = BufferPtr + Pos;
518  NextLine = TextEnd;
519  // If there is only whitespace before end command, skip whitespace.
520  if (isWhitespace(BufferPtr, TextEnd)) {
521  BufferPtr = TextEnd;
522  goto again;
523  }
524  }
525 
526  StringRef Text(BufferPtr, TextEnd - BufferPtr);
527  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
528  T.setVerbatimBlockText(Text);
529 
530  State = LS_VerbatimBlockBody;
531 }
532 
533 void Lexer::lexVerbatimBlockBody(Token &T) {
534  assert(State == LS_VerbatimBlockBody);
535 
536  if (CommentState == LCS_InsideCComment)
537  skipLineStartingDecorations();
538 
539  if (BufferPtr == CommentEnd) {
540  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
541  T.setVerbatimBlockText("");
542  return;
543  }
544 
545  lexVerbatimBlockFirstLine(T);
546 }
547 
548 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
549  const CommandInfo *Info) {
550  assert(Info->IsVerbatimLineCommand);
551  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
552  T.setVerbatimLineID(Info->getID());
553 
554  State = LS_VerbatimLineText;
555 }
556 
557 void Lexer::lexVerbatimLineText(Token &T) {
558  assert(State == LS_VerbatimLineText);
559 
560  // Extract current line.
561  const char *Newline = findNewline(BufferPtr, CommentEnd);
562  StringRef Text(BufferPtr, Newline - BufferPtr);
563  formTokenWithChars(T, Newline, tok::verbatim_line_text);
564  T.setVerbatimLineText(Text);
565 
566  State = LS_Normal;
567 }
568 
569 void Lexer::lexHTMLCharacterReference(Token &T) {
570  const char *TokenPtr = BufferPtr;
571  assert(*TokenPtr == '&');
572  TokenPtr++;
573  if (TokenPtr == CommentEnd) {
574  formTextToken(T, TokenPtr);
575  return;
576  }
577  const char *NamePtr;
578  bool isNamed = false;
579  bool isDecimal = false;
580  char C = *TokenPtr;
582  NamePtr = TokenPtr;
583  TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
584  isNamed = true;
585  } else if (C == '#') {
586  TokenPtr++;
587  if (TokenPtr == CommentEnd) {
588  formTextToken(T, TokenPtr);
589  return;
590  }
591  C = *TokenPtr;
593  NamePtr = TokenPtr;
594  TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
595  isDecimal = true;
596  } else if (C == 'x' || C == 'X') {
597  TokenPtr++;
598  NamePtr = TokenPtr;
599  TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
600  } else {
601  formTextToken(T, TokenPtr);
602  return;
603  }
604  } else {
605  formTextToken(T, TokenPtr);
606  return;
607  }
608  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
609  *TokenPtr != ';') {
610  formTextToken(T, TokenPtr);
611  return;
612  }
613  StringRef Name(NamePtr, TokenPtr - NamePtr);
614  TokenPtr++; // Skip semicolon.
615  StringRef Resolved;
616  if (isNamed)
617  Resolved = resolveHTMLNamedCharacterReference(Name);
618  else if (isDecimal)
619  Resolved = resolveHTMLDecimalCharacterReference(Name);
620  else
621  Resolved = resolveHTMLHexCharacterReference(Name);
622 
623  if (Resolved.empty()) {
624  formTextToken(T, TokenPtr);
625  return;
626  }
627  formTokenWithChars(T, TokenPtr, tok::text);
628  T.setText(Resolved);
629 }
630 
631 void Lexer::setupAndLexHTMLStartTag(Token &T) {
632  assert(BufferPtr[0] == '<' &&
633  isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636  if (!isHTMLTagName(Name)) {
637  formTextToken(T, TagNameEnd);
638  return;
639  }
640 
641  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
642  T.setHTMLTagStartName(Name);
643 
644  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
645 
646  const char C = *BufferPtr;
647  if (BufferPtr != CommentEnd &&
648  (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
649  State = LS_HTMLStartTag;
650 }
651 
652 void Lexer::lexHTMLStartTag(Token &T) {
653  assert(State == LS_HTMLStartTag);
654 
655  const char *TokenPtr = BufferPtr;
656  char C = *TokenPtr;
657  if (isHTMLIdentifierCharacter(C)) {
658  TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659  StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
660  formTokenWithChars(T, TokenPtr, tok::html_ident);
661  T.setHTMLIdent(Ident);
662  } else {
663  switch (C) {
664  case '=':
665  TokenPtr++;
666  formTokenWithChars(T, TokenPtr, tok::html_equals);
667  break;
668  case '\"':
669  case '\'': {
670  const char *OpenQuote = TokenPtr;
671  TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672  const char *ClosingQuote = TokenPtr;
673  if (TokenPtr != CommentEnd) // Skip closing quote.
674  TokenPtr++;
675  formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
676  T.setHTMLQuotedString(StringRef(OpenQuote + 1,
677  ClosingQuote - (OpenQuote + 1)));
678  break;
679  }
680  case '>':
681  TokenPtr++;
682  formTokenWithChars(T, TokenPtr, tok::html_greater);
683  State = LS_Normal;
684  return;
685  case '/':
686  TokenPtr++;
687  if (TokenPtr != CommentEnd && *TokenPtr == '>') {
688  TokenPtr++;
689  formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
690  } else
691  formTextToken(T, TokenPtr);
692 
693  State = LS_Normal;
694  return;
695  }
696  }
697 
698  // Now look ahead and return to normal state if we don't see any HTML tokens
699  // ahead.
700  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
701  if (BufferPtr == CommentEnd) {
702  State = LS_Normal;
703  return;
704  }
705 
706  C = *BufferPtr;
707  if (!isHTMLIdentifierStartingCharacter(C) &&
708  C != '=' && C != '\"' && C != '\'' && C != '>') {
709  State = LS_Normal;
710  return;
711  }
712 }
713 
714 void Lexer::setupAndLexHTMLEndTag(Token &T) {
715  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
716 
717  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
718  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720  if (!isHTMLTagName(Name)) {
721  formTextToken(T, TagNameEnd);
722  return;
723  }
724 
725  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
726 
727  formTokenWithChars(T, End, tok::html_end_tag);
728  T.setHTMLTagEndName(Name);
729 
730  if (BufferPtr != CommentEnd && *BufferPtr == '>')
731  State = LS_HTMLEndTag;
732 }
733 
734 void Lexer::lexHTMLEndTag(Token &T) {
735  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
736 
737  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
738  State = LS_Normal;
739 }
740 
741 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
742  const CommandTraits &Traits, SourceLocation FileLoc,
743  const char *BufferStart, const char *BufferEnd,
744  bool ParseCommands)
745  : Allocator(Allocator), Diags(Diags), Traits(Traits),
746  BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
747  BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
748  ParseCommands(ParseCommands) {}
749 
750 void Lexer::lex(Token &T) {
751 again:
752  switch (CommentState) {
753  case LCS_BeforeComment:
754  if (BufferPtr == BufferEnd) {
755  formTokenWithChars(T, BufferPtr, tok::eof);
756  return;
757  }
758 
759  assert(*BufferPtr == '/');
760  BufferPtr++; // Skip first slash.
761  switch(*BufferPtr) {
762  case '/': { // BCPL comment.
763  BufferPtr++; // Skip second slash.
764 
765  if (BufferPtr != BufferEnd) {
766  // Skip Doxygen magic marker, if it is present.
767  // It might be missing because of a typo //< or /*<, or because we
768  // merged this non-Doxygen comment into a bunch of Doxygen comments
769  // around it: /** ... */ /* ... */ /** ... */
770  const char C = *BufferPtr;
771  if (C == '/' || C == '!')
772  BufferPtr++;
773  }
774 
775  // Skip less-than symbol that marks trailing comments.
776  // Skip it even if the comment is not a Doxygen one, because //< and /*<
777  // are frequent typos.
778  if (BufferPtr != BufferEnd && *BufferPtr == '<')
779  BufferPtr++;
780 
781  CommentState = LCS_InsideBCPLComment;
782  if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
783  State = LS_Normal;
784  CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
785  goto again;
786  }
787  case '*': { // C comment.
788  BufferPtr++; // Skip star.
789 
790  // Skip Doxygen magic marker.
791  const char C = *BufferPtr;
792  if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
793  BufferPtr++;
794 
795  // Skip less-than symbol that marks trailing comments.
796  if (BufferPtr != BufferEnd && *BufferPtr == '<')
797  BufferPtr++;
798 
799  CommentState = LCS_InsideCComment;
800  State = LS_Normal;
801  CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
802  goto again;
803  }
804  default:
805  llvm_unreachable("second character of comment should be '/' or '*'");
806  }
807 
808  case LCS_BetweenComments: {
809  // Consecutive comments are extracted only if there is only whitespace
810  // between them. So we can search for the start of the next comment.
811  const char *EndWhitespace = BufferPtr;
812  while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
813  EndWhitespace++;
814 
815  // Turn any whitespace between comments (and there is only whitespace
816  // between them -- guaranteed by comment extraction) into a newline. We
817  // have two newlines between C comments in total (first one was synthesized
818  // after a comment).
819  formTokenWithChars(T, EndWhitespace, tok::newline);
820 
821  CommentState = LCS_BeforeComment;
822  break;
823  }
824 
825  case LCS_InsideBCPLComment:
826  case LCS_InsideCComment:
827  if (BufferPtr != CommentEnd) {
828  lexCommentText(T);
829  break;
830  } else {
831  // Skip C comment closing sequence.
832  if (CommentState == LCS_InsideCComment) {
833  assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
834  BufferPtr += 2;
835  assert(BufferPtr <= BufferEnd);
836 
837  // Synthenize newline just after the C comment, regardless if there is
838  // actually a newline.
839  formTokenWithChars(T, BufferPtr, tok::newline);
840 
841  CommentState = LCS_BetweenComments;
842  break;
843  } else {
844  // Don't synthesized a newline after BCPL comment.
845  CommentState = LCS_BetweenComments;
846  goto again;
847  }
848  }
849  }
850 }
851 
852 StringRef Lexer::getSpelling(const Token &Tok,
853  const SourceManager &SourceMgr) const {
854  SourceLocation Loc = Tok.getLocation();
855  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
856 
857  bool InvalidTemp = false;
858  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
859  if (InvalidTemp)
860  return StringRef();
861 
862  const char *Begin = File.data() + LocInfo.second;
863  return StringRef(Begin, Tok.getLength());
864 }
865 
866 } // end namespace comments
867 } // end namespace clang
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
void setLength(unsigned L)
Definition: CommentLexer.h:96
void setText(StringRef Text)
Definition: CommentLexer.h:103
const char * EndCommandName
Name of the command that ends the verbatim block.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:70
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
Information about a single command.
void print(raw_ostream &OS, const SourceManager &SM) const
LineState State
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition: CharInfo.h:111
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Definition: CharInfo.h:87
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
const FormatToken & Tok
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:149
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
void dump(const Lexer &L, const SourceManager &SM) const
unsigned IsVerbatimLineCommand
True if this command is a verbatim line command.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static unsigned skipNewline(const char *&First, const char *End)
static bool isHTMLHexCharacterReferenceCharacter(char C)
SourceLocation End
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
const AnnotatedLine * Line
SourceLocation Begin
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:117
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
static bool isHTMLNamedCharacterReferenceCharacter(char C)
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
const SourceManager & SM
Definition: Format.cpp:1667
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition: Decl.cpp:2871
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
This class provides information about commands that can be used in comments.
Kind
Encodes a location in the source.
Comment lexer.
Definition: CommentLexer.h:220
Dataflow Directional Tag Classes.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:78
unsigned IsVerbatimBlockCommand
True if this command is a verbatim-like block command.
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:123
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition: CharInfo.h:93
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
Comment token.
Definition: CommentLexer.h:55
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
Definition: Diagnostic.h:129
StringRef Text
Definition: Format.cpp:1808
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.