clang  9.0.0svn
CommentLexer.cpp
Go to the documentation of this file.
1 //===--- CommentLexer.cpp -------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
12 #include "clang/Basic/CharInfo.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringSwitch.h"
15 #include "llvm/Support/ConvertUTF.h"
16 #include "llvm/Support/ErrorHandling.h"
17 
18 namespace clang {
19 namespace comments {
20 
21 void Token::dump(const Lexer &L, const SourceManager &SM) const {
22  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23  Loc.print(llvm::errs(), SM);
24  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25 }
26 
27 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28  return isLetter(C);
29 }
30 
31 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32  return isDigit(C);
33 }
34 
35 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36  return isHexDigit(C);
37 }
38 
39 static inline StringRef convertCodePointToUTF8(
40  llvm::BumpPtrAllocator &Allocator,
41  unsigned CodePoint) {
42  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43  char *ResolvedPtr = Resolved;
44  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45  return StringRef(Resolved, ResolvedPtr - Resolved);
46  else
47  return StringRef();
48 }
49 
50 namespace {
51 
52 #include "clang/AST/CommentHTMLTags.inc"
53 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54 
55 } // end anonymous namespace
56 
57 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58  // Fast path, first check a few most widely used named character references.
59  return llvm::StringSwitch<StringRef>(Name)
60  .Case("amp", "&")
61  .Case("lt", "<")
62  .Case("gt", ">")
63  .Case("quot", "\"")
64  .Case("apos", "\'")
65  // Slow path.
66  .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67 }
68 
69 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70  unsigned CodePoint = 0;
71  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73  CodePoint *= 10;
74  CodePoint += Name[i] - '0';
75  }
76  return convertCodePointToUTF8(Allocator, CodePoint);
77 }
78 
79 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80  unsigned CodePoint = 0;
81  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82  CodePoint *= 16;
83  const char C = Name[i];
85  CodePoint += llvm::hexDigitValue(C);
86  }
87  return convertCodePointToUTF8(Allocator, CodePoint);
88 }
89 
90 void Lexer::skipLineStartingDecorations() {
91  // This function should be called only for C comments
92  assert(CommentState == LCS_InsideCComment);
93 
94  if (BufferPtr == CommentEnd)
95  return;
96 
97  switch (*BufferPtr) {
98  case ' ':
99  case '\t':
100  case '\f':
101  case '\v': {
102  const char *NewBufferPtr = BufferPtr;
103  NewBufferPtr++;
104  if (NewBufferPtr == CommentEnd)
105  return;
106 
107  char C = *NewBufferPtr;
108  while (isHorizontalWhitespace(C)) {
109  NewBufferPtr++;
110  if (NewBufferPtr == CommentEnd)
111  return;
112  C = *NewBufferPtr;
113  }
114  if (C == '*')
115  BufferPtr = NewBufferPtr + 1;
116  break;
117  }
118  case '*':
119  BufferPtr++;
120  break;
121  }
122 }
123 
124 namespace {
125 /// Returns pointer to the first newline character in the string.
126 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
127  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
128  if (isVerticalWhitespace(*BufferPtr))
129  return BufferPtr;
130  }
131  return BufferEnd;
132 }
133 
/// Advances past one newline sequence ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr and returns the new position. Returns \p BufferPtr unchanged
/// when the range is empty.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  // Anything that is not a line feed must be a carriage return, which may be
  // the first half of a CR LF pair.
  assert(First == '\r');
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
148 
149 const char *skipNamedCharacterReference(const char *BufferPtr,
150  const char *BufferEnd) {
151  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153  return BufferPtr;
154  }
155  return BufferEnd;
156 }
157 
158 const char *skipDecimalCharacterReference(const char *BufferPtr,
159  const char *BufferEnd) {
160  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162  return BufferPtr;
163  }
164  return BufferEnd;
165 }
166 
167 const char *skipHexCharacterReference(const char *BufferPtr,
168  const char *BufferEnd) {
169  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
170  if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
171  return BufferPtr;
172  }
173  return BufferEnd;
174 }
175 
176 bool isHTMLIdentifierStartingCharacter(char C) {
177  return isLetter(C);
178 }
179 
180 bool isHTMLIdentifierCharacter(char C) {
181  return isAlphanumeric(C);
182 }
183 
184 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
185  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
186  if (!isHTMLIdentifierCharacter(*BufferPtr))
187  return BufferPtr;
188  }
189  return BufferEnd;
190 }
191 
/// Skips an HTML string delimited by single or double quotes. \p BufferPtr
/// must point at the opening quote. A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool IsDelimiter = *Cursor == Delimiter;
    if (IsDelimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
209 
210 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
211  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
212  if (!isWhitespace(*BufferPtr))
213  return BufferPtr;
214  }
215  return BufferEnd;
216 }
217 
218 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
219  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
220 }
221 
222 bool isCommandNameStartCharacter(char C) {
223  return isLetter(C);
224 }
225 
226 bool isCommandNameCharacter(char C) {
227  return isAlphanumeric(C);
228 }
229 
230 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
231  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
232  if (!isCommandNameCharacter(*BufferPtr))
233  return BufferPtr;
234  }
235  return BufferEnd;
236 }
237 
238 /// Return the one past end pointer for BCPL comments.
239 /// Handles newlines escaped with backslash or trigraph for backslahs.
240 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
241  const char *CurPtr = BufferPtr;
242  while (CurPtr != BufferEnd) {
243  while (!isVerticalWhitespace(*CurPtr)) {
244  CurPtr++;
245  if (CurPtr == BufferEnd)
246  return BufferEnd;
247  }
248  // We found a newline, check if it is escaped.
249  const char *EscapePtr = CurPtr - 1;
250  while(isHorizontalWhitespace(*EscapePtr))
251  EscapePtr--;
252 
253  if (*EscapePtr == '\\' ||
254  (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
255  EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
256  // We found an escaped newline.
257  CurPtr = skipNewline(CurPtr, BufferEnd);
258  } else
259  return CurPtr; // Not an escaped newline.
260  }
261  return BufferEnd;
262 }
263 
264 /// Return the one past end pointer for C comments.
265 /// Very dumb, does not handle escaped newlines or trigraphs.
266 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
267  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
268  if (*BufferPtr == '*') {
269  assert(BufferPtr + 1 != BufferEnd);
270  if (*(BufferPtr + 1) == '/')
271  return BufferPtr;
272  }
273  }
274  llvm_unreachable("buffer end hit before '*/' was seen");
275 }
276 
277 } // end anonymous namespace
278 
279 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
281  const unsigned TokLen = TokEnd - BufferPtr;
282  Result.setLocation(getSourceLocation(BufferPtr));
283  Result.setKind(Kind);
284  Result.setLength(TokLen);
285 #ifndef NDEBUG
286  Result.TextPtr = "<UNSET>";
287  Result.IntVal = 7;
288 #endif
289  BufferPtr = TokEnd;
290 }
291 
292 void Lexer::lexCommentText(Token &T) {
293  assert(CommentState == LCS_InsideBCPLComment ||
294  CommentState == LCS_InsideCComment);
295 
296  // Handles lexing non-command text, i.e. text and newline.
297  auto HandleNonCommandToken = [&]() -> void {
298  assert(State == LS_Normal);
299 
300  const char *TokenPtr = BufferPtr;
301  assert(TokenPtr < CommentEnd);
302  switch (*TokenPtr) {
303  case '\n':
304  case '\r':
305  TokenPtr = skipNewline(TokenPtr, CommentEnd);
306  formTokenWithChars(T, TokenPtr, tok::newline);
307 
308  if (CommentState == LCS_InsideCComment)
309  skipLineStartingDecorations();
310  return;
311 
312  default: {
313  StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
314  size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
315  .find_first_of(TokStartSymbols);
316  if (End != StringRef::npos)
317  TokenPtr += End;
318  else
319  TokenPtr = CommentEnd;
320  formTextToken(T, TokenPtr);
321  return;
322  }
323  }
324  };
325 
326  if (!ParseCommands)
327  return HandleNonCommandToken();
328 
329  switch (State) {
330  case LS_Normal:
331  break;
332  case LS_VerbatimBlockFirstLine:
333  lexVerbatimBlockFirstLine(T);
334  return;
335  case LS_VerbatimBlockBody:
336  lexVerbatimBlockBody(T);
337  return;
338  case LS_VerbatimLineText:
339  lexVerbatimLineText(T);
340  return;
341  case LS_HTMLStartTag:
342  lexHTMLStartTag(T);
343  return;
344  case LS_HTMLEndTag:
345  lexHTMLEndTag(T);
346  return;
347  }
348 
349  assert(State == LS_Normal);
350  const char *TokenPtr = BufferPtr;
351  assert(TokenPtr < CommentEnd);
352  switch(*TokenPtr) {
353  case '\\':
354  case '@': {
355  // Commands that start with a backslash and commands that start with
356  // 'at' have equivalent semantics. But we keep information about the
357  // exact syntax in AST for comments.
358  tok::TokenKind CommandKind =
359  (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
360  TokenPtr++;
361  if (TokenPtr == CommentEnd) {
362  formTextToken(T, TokenPtr);
363  return;
364  }
365  char C = *TokenPtr;
366  switch (C) {
367  default:
368  break;
369 
370  case '\\': case '@': case '&': case '$':
371  case '#': case '<': case '>': case '%':
372  case '\"': case '.': case ':':
373  // This is one of \\ \@ \& \$ etc escape sequences.
374  TokenPtr++;
375  if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
376  // This is the \:: escape sequence.
377  TokenPtr++;
378  }
379  StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
380  formTokenWithChars(T, TokenPtr, tok::text);
381  T.setText(UnescapedText);
382  return;
383  }
384 
385  // Don't make zero-length commands.
386  if (!isCommandNameStartCharacter(*TokenPtr)) {
387  formTextToken(T, TokenPtr);
388  return;
389  }
390 
391  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
392  unsigned Length = TokenPtr - (BufferPtr + 1);
393 
394  // Hardcoded support for lexing LaTeX formula commands
395  // \f$ \f[ \f] \f{ \f} as a single command.
396  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
397  C = *TokenPtr;
398  if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
399  TokenPtr++;
400  Length++;
401  }
402  }
403 
404  StringRef CommandName(BufferPtr + 1, Length);
405 
406  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
407  if (!Info) {
408  if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
409  StringRef CorrectedName = Info->Name;
410  SourceLocation Loc = getSourceLocation(BufferPtr);
411  SourceLocation EndLoc = getSourceLocation(TokenPtr);
412  SourceRange FullRange = SourceRange(Loc, EndLoc);
413  SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
414  Diag(Loc, diag::warn_correct_comment_command_name)
415  << FullRange << CommandName << CorrectedName
416  << FixItHint::CreateReplacement(CommandRange, CorrectedName);
417  } else {
418  formTokenWithChars(T, TokenPtr, tok::unknown_command);
419  T.setUnknownCommandName(CommandName);
420  Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
422  return;
423  }
424  }
425  if (Info->IsVerbatimBlockCommand) {
426  setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
427  return;
428  }
429  if (Info->IsVerbatimLineCommand) {
430  setupAndLexVerbatimLine(T, TokenPtr, Info);
431  return;
432  }
433  formTokenWithChars(T, TokenPtr, CommandKind);
434  T.setCommandID(Info->getID());
435  return;
436  }
437 
438  case '&':
439  lexHTMLCharacterReference(T);
440  return;
441 
442  case '<': {
443  TokenPtr++;
444  if (TokenPtr == CommentEnd) {
445  formTextToken(T, TokenPtr);
446  return;
447  }
448  const char C = *TokenPtr;
449  if (isHTMLIdentifierStartingCharacter(C))
450  setupAndLexHTMLStartTag(T);
451  else if (C == '/')
452  setupAndLexHTMLEndTag(T);
453  else
454  formTextToken(T, TokenPtr);
455  return;
456  }
457 
458  default:
459  return HandleNonCommandToken();
460  }
461 }
462 
463 void Lexer::setupAndLexVerbatimBlock(Token &T,
464  const char *TextBegin,
465  char Marker, const CommandInfo *Info) {
466  assert(Info->IsVerbatimBlockCommand);
467 
468  VerbatimBlockEndCommandName.clear();
469  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
470  VerbatimBlockEndCommandName.append(Info->EndCommandName);
471 
472  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
473  T.setVerbatimBlockID(Info->getID());
474 
475  // If there is a newline following the verbatim opening command, skip the
476  // newline so that we don't create an tok::verbatim_block_line with empty
477  // text content.
478  if (BufferPtr != CommentEnd &&
479  isVerticalWhitespace(*BufferPtr)) {
480  BufferPtr = skipNewline(BufferPtr, CommentEnd);
481  State = LS_VerbatimBlockBody;
482  return;
483  }
484 
485  State = LS_VerbatimBlockFirstLine;
486 }
487 
488 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
489 again:
490  assert(BufferPtr < CommentEnd);
491 
492  // FIXME: It would be better to scan the text once, finding either the block
493  // end command or newline.
494  //
495  // Extract current line.
496  const char *Newline = findNewline(BufferPtr, CommentEnd);
497  StringRef Line(BufferPtr, Newline - BufferPtr);
498 
499  // Look for end command in current line.
500  size_t Pos = Line.find(VerbatimBlockEndCommandName);
501  const char *TextEnd;
502  const char *NextLine;
503  if (Pos == StringRef::npos) {
504  // Current line is completely verbatim.
505  TextEnd = Newline;
506  NextLine = skipNewline(Newline, CommentEnd);
507  } else if (Pos == 0) {
508  // Current line contains just an end command.
509  const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
510  StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
511  formTokenWithChars(T, End, tok::verbatim_block_end);
512  T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
513  State = LS_Normal;
514  return;
515  } else {
516  // There is some text, followed by end command. Extract text first.
517  TextEnd = BufferPtr + Pos;
518  NextLine = TextEnd;
519  // If there is only whitespace before end command, skip whitespace.
520  if (isWhitespace(BufferPtr, TextEnd)) {
521  BufferPtr = TextEnd;
522  goto again;
523  }
524  }
525 
526  StringRef Text(BufferPtr, TextEnd - BufferPtr);
527  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
528  T.setVerbatimBlockText(Text);
529 
530  State = LS_VerbatimBlockBody;
531 }
532 
533 void Lexer::lexVerbatimBlockBody(Token &T) {
534  assert(State == LS_VerbatimBlockBody);
535 
536  if (CommentState == LCS_InsideCComment)
537  skipLineStartingDecorations();
538 
539  if (BufferPtr == CommentEnd) {
540  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
541  T.setVerbatimBlockText("");
542  return;
543  }
544 
545  lexVerbatimBlockFirstLine(T);
546 }
547 
548 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
549  const CommandInfo *Info) {
550  assert(Info->IsVerbatimLineCommand);
551  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
552  T.setVerbatimLineID(Info->getID());
553 
554  State = LS_VerbatimLineText;
555 }
556 
557 void Lexer::lexVerbatimLineText(Token &T) {
558  assert(State == LS_VerbatimLineText);
559 
560  // Extract current line.
561  const char *Newline = findNewline(BufferPtr, CommentEnd);
562  StringRef Text(BufferPtr, Newline - BufferPtr);
563  formTokenWithChars(T, Newline, tok::verbatim_line_text);
564  T.setVerbatimLineText(Text);
565 
566  State = LS_Normal;
567 }
568 
569 void Lexer::lexHTMLCharacterReference(Token &T) {
570  const char *TokenPtr = BufferPtr;
571  assert(*TokenPtr == '&');
572  TokenPtr++;
573  if (TokenPtr == CommentEnd) {
574  formTextToken(T, TokenPtr);
575  return;
576  }
577  const char *NamePtr;
578  bool isNamed = false;
579  bool isDecimal = false;
580  char C = *TokenPtr;
582  NamePtr = TokenPtr;
583  TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
584  isNamed = true;
585  } else if (C == '#') {
586  TokenPtr++;
587  if (TokenPtr == CommentEnd) {
588  formTextToken(T, TokenPtr);
589  return;
590  }
591  C = *TokenPtr;
593  NamePtr = TokenPtr;
594  TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
595  isDecimal = true;
596  } else if (C == 'x' || C == 'X') {
597  TokenPtr++;
598  NamePtr = TokenPtr;
599  TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
600  } else {
601  formTextToken(T, TokenPtr);
602  return;
603  }
604  } else {
605  formTextToken(T, TokenPtr);
606  return;
607  }
608  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
609  *TokenPtr != ';') {
610  formTextToken(T, TokenPtr);
611  return;
612  }
613  StringRef Name(NamePtr, TokenPtr - NamePtr);
614  TokenPtr++; // Skip semicolon.
615  StringRef Resolved;
616  if (isNamed)
617  Resolved = resolveHTMLNamedCharacterReference(Name);
618  else if (isDecimal)
619  Resolved = resolveHTMLDecimalCharacterReference(Name);
620  else
621  Resolved = resolveHTMLHexCharacterReference(Name);
622 
623  if (Resolved.empty()) {
624  formTextToken(T, TokenPtr);
625  return;
626  }
627  formTokenWithChars(T, TokenPtr, tok::text);
628  T.setText(Resolved);
629 }
630 
631 void Lexer::setupAndLexHTMLStartTag(Token &T) {
632  assert(BufferPtr[0] == '<' &&
633  isHTMLIdentifierStartingCharacter(BufferPtr[1]));
634  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
635  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
636  if (!isHTMLTagName(Name)) {
637  formTextToken(T, TagNameEnd);
638  return;
639  }
640 
641  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
642  T.setHTMLTagStartName(Name);
643 
644  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
645 
646  const char C = *BufferPtr;
647  if (BufferPtr != CommentEnd &&
648  (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
649  State = LS_HTMLStartTag;
650 }
651 
652 void Lexer::lexHTMLStartTag(Token &T) {
653  assert(State == LS_HTMLStartTag);
654 
655  const char *TokenPtr = BufferPtr;
656  char C = *TokenPtr;
657  if (isHTMLIdentifierCharacter(C)) {
658  TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
659  StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
660  formTokenWithChars(T, TokenPtr, tok::html_ident);
661  T.setHTMLIdent(Ident);
662  } else {
663  switch (C) {
664  case '=':
665  TokenPtr++;
666  formTokenWithChars(T, TokenPtr, tok::html_equals);
667  break;
668  case '\"':
669  case '\'': {
670  const char *OpenQuote = TokenPtr;
671  TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
672  const char *ClosingQuote = TokenPtr;
673  if (TokenPtr != CommentEnd) // Skip closing quote.
674  TokenPtr++;
675  formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
676  T.setHTMLQuotedString(StringRef(OpenQuote + 1,
677  ClosingQuote - (OpenQuote + 1)));
678  break;
679  }
680  case '>':
681  TokenPtr++;
682  formTokenWithChars(T, TokenPtr, tok::html_greater);
683  State = LS_Normal;
684  return;
685  case '/':
686  TokenPtr++;
687  if (TokenPtr != CommentEnd && *TokenPtr == '>') {
688  TokenPtr++;
689  formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
690  } else
691  formTextToken(T, TokenPtr);
692 
693  State = LS_Normal;
694  return;
695  }
696  }
697 
698  // Now look ahead and return to normal state if we don't see any HTML tokens
699  // ahead.
700  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
701  if (BufferPtr == CommentEnd) {
702  State = LS_Normal;
703  return;
704  }
705 
706  C = *BufferPtr;
707  if (!isHTMLIdentifierStartingCharacter(C) &&
708  C != '=' && C != '\"' && C != '\'' && C != '>') {
709  State = LS_Normal;
710  return;
711  }
712 }
713 
714 void Lexer::setupAndLexHTMLEndTag(Token &T) {
715  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
716 
717  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
718  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
719  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
720  if (!isHTMLTagName(Name)) {
721  formTextToken(T, TagNameEnd);
722  return;
723  }
724 
725  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
726 
727  formTokenWithChars(T, End, tok::html_end_tag);
728  T.setHTMLTagEndName(Name);
729 
730  if (BufferPtr != CommentEnd && *BufferPtr == '>')
731  State = LS_HTMLEndTag;
732 }
733 
734 void Lexer::lexHTMLEndTag(Token &T) {
735  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
736 
737  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
738  State = LS_Normal;
739 }
740 
741 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
742  const CommandTraits &Traits, SourceLocation FileLoc,
743  const char *BufferStart, const char *BufferEnd,
744  bool ParseCommands)
745  : Allocator(Allocator), Diags(Diags), Traits(Traits),
746  BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
747  BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
748  ParseCommands(ParseCommands) {}
749 
750 void Lexer::lex(Token &T) {
751 again:
752  switch (CommentState) {
753  case LCS_BeforeComment:
754  if (BufferPtr == BufferEnd) {
755  formTokenWithChars(T, BufferPtr, tok::eof);
756  return;
757  }
758 
759  assert(*BufferPtr == '/');
760  BufferPtr++; // Skip first slash.
761  switch(*BufferPtr) {
762  case '/': { // BCPL comment.
763  BufferPtr++; // Skip second slash.
764 
765  if (BufferPtr != BufferEnd) {
766  // Skip Doxygen magic marker, if it is present.
767  // It might be missing because of a typo //< or /*<, or because we
768  // merged this non-Doxygen comment into a bunch of Doxygen comments
769  // around it: /** ... */ /* ... */ /** ... */
770  const char C = *BufferPtr;
771  if (C == '/' || C == '!')
772  BufferPtr++;
773  }
774 
775  // Skip less-than symbol that marks trailing comments.
776  // Skip it even if the comment is not a Doxygen one, because //< and /*<
777  // are frequent typos.
778  if (BufferPtr != BufferEnd && *BufferPtr == '<')
779  BufferPtr++;
780 
781  CommentState = LCS_InsideBCPLComment;
782  if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
783  State = LS_Normal;
784  CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
785  goto again;
786  }
787  case '*': { // C comment.
788  BufferPtr++; // Skip star.
789 
790  // Skip Doxygen magic marker.
791  const char C = *BufferPtr;
792  if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
793  BufferPtr++;
794 
795  // Skip less-than symbol that marks trailing comments.
796  if (BufferPtr != BufferEnd && *BufferPtr == '<')
797  BufferPtr++;
798 
799  CommentState = LCS_InsideCComment;
800  State = LS_Normal;
801  CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
802  goto again;
803  }
804  default:
805  llvm_unreachable("second character of comment should be '/' or '*'");
806  }
807 
808  case LCS_BetweenComments: {
809  // Consecutive comments are extracted only if there is only whitespace
810  // between them. So we can search for the start of the next comment.
811  const char *EndWhitespace = BufferPtr;
812  while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
813  EndWhitespace++;
814 
815  // Turn any whitespace between comments (and there is only whitespace
816  // between them -- guaranteed by comment extraction) into a newline. We
817  // have two newlines between C comments in total (first one was synthesized
818  // after a comment).
819  formTokenWithChars(T, EndWhitespace, tok::newline);
820 
821  CommentState = LCS_BeforeComment;
822  break;
823  }
824 
825  case LCS_InsideBCPLComment:
826  case LCS_InsideCComment:
827  if (BufferPtr != CommentEnd) {
828  lexCommentText(T);
829  break;
830  } else {
831  // Skip C comment closing sequence.
832  if (CommentState == LCS_InsideCComment) {
833  assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
834  BufferPtr += 2;
835  assert(BufferPtr <= BufferEnd);
836 
837  // Synthenize newline just after the C comment, regardless if there is
838  // actually a newline.
839  formTokenWithChars(T, BufferPtr, tok::newline);
840 
841  CommentState = LCS_BetweenComments;
842  break;
843  } else {
844  // Don't synthesized a newline after BCPL comment.
845  CommentState = LCS_BetweenComments;
846  goto again;
847  }
848  }
849  }
850 }
851 
852 StringRef Lexer::getSpelling(const Token &Tok,
853  const SourceManager &SourceMgr,
854  bool *Invalid) const {
855  SourceLocation Loc = Tok.getLocation();
856  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
857 
858  bool InvalidTemp = false;
859  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
860  if (InvalidTemp) {
861  *Invalid = true;
862  return StringRef();
863  }
864 
865  const char *Begin = File.data() + LocInfo.second;
866  return StringRef(Begin, Tok.getLength());
867 }
868 
869 } // end namespace comments
870 } // end namespace clang
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
static StringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, unsigned CodePoint)
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
void setLength(unsigned L)
Definition: CommentLexer.h:96
void setText(StringRef Text)
Definition: CommentLexer.h:103
const char * EndCommandName
Name of the command that ends the verbatim block.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:70
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
Information about a single command.
void print(raw_ostream &OS, const SourceManager &SM) const
LineState State
LLVM_READONLY bool isLetter(unsigned char c)
Return true if this character is an ASCII letter: [a-zA-Z].
Definition: CharInfo.h:111
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
Definition: CharInfo.h:87
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
const FormatToken & Tok
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:148
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
void dump(const Lexer &L, const SourceManager &SM) const
unsigned IsVerbatimLineCommand
True if this command is a verbatim line command.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
static bool isHTMLHexCharacterReferenceCharacter(char C)
SourceLocation End
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
const AnnotatedLine * Line
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr, bool *Invalid=nullptr) const
SourceLocation Begin
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:117
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
static bool isHTMLNamedCharacterReferenceCharacter(char C)
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
const SourceManager & SM
Definition: Format.cpp:1489
static bool isNamed(const NamedDecl *ND, const char(&Str)[Len])
Definition: Decl.cpp:2754
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
This class provides information about commands that can be used in comments.
Kind
Encodes a location in the source.
Comment lexer.
Definition: CommentLexer.h:220
Dataflow Directional Tag Classes.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:78
unsigned IsVerbatimBlockCommand
True if this command is a verbatim-like block command.
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
LLVM_READONLY bool isHexDigit(unsigned char c)
Return true if this character is an ASCII hex digit: [0-9a-fA-F].
Definition: CharInfo.h:123
LLVM_READONLY bool isDigit(unsigned char c)
Return true if this character is an ASCII digit: [0-9].
Definition: CharInfo.h:93
static bool isHTMLDecimalCharacterReferenceCharacter(char C)
Comment token.
Definition: CommentLexer.h:55
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string...
Definition: Diagnostic.h:128
StringRef Text
Definition: Format.cpp:1629
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
A trivial tuple used to represent a source range.
This class handles loading and caching of source files into memory.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.