clang  16.0.0git
Lexer.cpp
Go to the documentation of this file.
1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Lexer and Token interfaces.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "clang/Lex/Lexer.h"
14 #include "UnicodeCharSets.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/Diagnostic.h"
18 #include "clang/Basic/LLVM.h"
22 #include "clang/Basic/TokenKinds.h"
26 #include "clang/Lex/Preprocessor.h"
28 #include "clang/Lex/Token.h"
29 #include "llvm/ADT/None.h"
30 #include "llvm/ADT/Optional.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/StringExtras.h"
33 #include "llvm/ADT/StringRef.h"
34 #include "llvm/ADT/StringSwitch.h"
35 #include "llvm/Support/Compiler.h"
36 #include "llvm/Support/ConvertUTF.h"
37 #include "llvm/Support/MathExtras.h"
38 #include "llvm/Support/MemoryBufferRef.h"
39 #include "llvm/Support/NativeFormatting.h"
40 #include "llvm/Support/Unicode.h"
41 #include "llvm/Support/UnicodeCharRanges.h"
42 #include <algorithm>
43 #include <cassert>
44 #include <cstddef>
45 #include <cstdint>
46 #include <cstring>
47 #include <string>
48 #include <tuple>
49 #include <utility>
50 
51 using namespace clang;
52 
53 //===----------------------------------------------------------------------===//
54 // Token Class Implementation
55 //===----------------------------------------------------------------------===//
56 
57 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
59  if (isAnnotation())
60  return false;
62  return II->getObjCKeywordID() == objcKey;
63  return false;
64 }
65 
66 /// getObjCKeywordID - Return the ObjC keyword kind.
68  if (isAnnotation())
69  return tok::objc_not_keyword;
71  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
72 }
73 
74 //===----------------------------------------------------------------------===//
75 // Lexer Class Implementation
76 //===----------------------------------------------------------------------===//
77 
// Out-of-line definition used as a "key function" anchor (standard LLVM
// idiom): pins the Lexer vtable/type info to this translation unit.
void Lexer::anchor() {}
79 
80 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81  const char *BufEnd) {
82  BufferStart = BufStart;
83  BufferPtr = BufPtr;
84  BufferEnd = BufEnd;
85 
86  assert(BufEnd[0] == 0 &&
87  "We assume that the input buffer has a null character at the end"
88  " to simplify lexing!");
89 
90  // Check whether we have a BOM in the beginning of the buffer. If yes - act
91  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92  // skip the UTF-8 BOM if it's present.
93  if (BufferStart == BufferPtr) {
94  // Determine the size of the BOM.
95  StringRef Buf(BufferStart, BufferEnd - BufferStart);
96  size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97  .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98  .Default(0);
99 
100  // Skip the BOM.
101  BufferPtr += BOMLength;
102  }
103 
104  Is_PragmaLexer = false;
105  CurrentConflictMarkerState = CMK_None;
106 
107  // Start of the file is a start of line.
108  IsAtStartOfLine = true;
109  IsAtPhysicalStartOfLine = true;
110 
111  HasLeadingSpace = false;
112  HasLeadingEmptyMacro = false;
113 
114  // We are not after parsing a #.
116 
117  // We are not after parsing #include.
118  ParsingFilename = false;
119 
120  // We are not in raw mode. Raw mode disables diagnostics and interpretation
121  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
122  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123  // or otherwise skipping over tokens.
124  LexingRawMode = false;
125 
126  // Default to not keeping comments.
127  ExtendedTokenMode = 0;
128 
129  NewLinePtr = nullptr;
130 }
131 
132 /// Lexer constructor - Create a new lexer object for the specified buffer
133 /// with the specified preprocessor managing the lexing process. This lexer
134 /// assumes that the associated file buffer and Preprocessor objects will
135 /// outlive it, so it doesn't take ownership of either of them.
136 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
137  Preprocessor &PP, bool IsFirstIncludeOfFile)
138  : PreprocessorLexer(&PP, FID),
139  FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
140  LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
141  IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
142  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
143  InputFile.getBufferEnd());
144 
146 }
147 
/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// \param fileloc Base source location for all tokens produced.
/// \param BufStart/BufPtr/BufEnd Buffer bounds; lexing starts at BufPtr.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode (InitLexer defaults it off): diagnostics and token
  // interpretation are disabled for this lexer.
  LexingRawMode = true;
}
161 
/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// Convenience overload: derives the start location and buffer bounds from
/// the given FileID and delegates to the raw-buffer constructor above.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
171 
173  assert(PP && "Cannot reset token mode without a preprocessor");
174  if (LangOpts.TraditionalCPP)
175  SetKeepWhitespaceMode(true);
176  else
178 }
179 
180 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
181 /// _Pragma expansion. This has a variety of magic semantics that this method
182 /// sets up. It returns a new'd Lexer that must be delete'd when done.
183 ///
184 /// On entrance to this routine, TokStartLoc is a macro location which has a
185 /// spelling loc that indicates the bytes to be lexed for the token and an
186 /// expansion location that indicates where all lexed tokens should be
187 /// "expanded from".
188 ///
189 /// TODO: It would really be nice to make _Pragma just be a wrapper around a
190 /// normal lexer that remaps tokens as they fly by. This would require making
191 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
192 /// interface that could handle this stuff. This would pull GetMappedTokenLoc
193 /// out of the critical path of the lexer!
194 ///
196  SourceLocation ExpansionLocStart,
197  SourceLocation ExpansionLocEnd,
198  unsigned TokLen, Preprocessor &PP) {
200 
201  // Create the lexer as if we were going to lex the file normally.
202  FileID SpellingFID = SM.getFileID(SpellingLoc);
203  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
204  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
205 
206  // Now that the lexer is created, change the start/end locations so that we
207  // just lex the subsection of the file that we want. This is lexing from a
208  // scratch buffer.
209  const char *StrData = SM.getCharacterData(SpellingLoc);
210 
211  L->BufferPtr = StrData;
212  L->BufferEnd = StrData+TokLen;
213  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
214 
215  // Set the SourceLocation with the remapping information. This ensures that
216  // GetMappedTokenLoc will remap the tokens as they are lexed.
217  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
218  ExpansionLocStart,
219  ExpansionLocEnd, TokLen);
220 
221  // Ensure that the lexer thinks it is inside a directive, so that end \n will
222  // return an EOD token.
224 
225  // This lexer really is for _Pragma.
226  L->Is_PragmaLexer = true;
227  return L;
228 }
229 
230 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
231  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
232  this->IsAtStartOfLine = IsAtStartOfLine;
233  assert((BufferStart + Offset) <= BufferEnd);
234  BufferPtr = BufferStart + Offset;
235 }
236 
/// Escape \p Str in place so it can appear inside a quoted literal:
/// backslashes and \p Quote characters are backslash-escaped, and vertical
/// whitespace becomes the two characters "\n" (a "\r\n" or "\n\r" pair
/// collapses to a single "\n").
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  for (typename T::size_type Idx = 0; Idx < Str.size();) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Prefix the character with a backslash; skip past both.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
    } else if (C == '\n' || C == '\r') {
      // A mixed CR/LF pair counts as one newline and is rewritten in place;
      // a lone newline grows the string by one character.
      const bool IsCrLfPair = Idx + 1 < Str.size() &&
                              (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
                              Str[Idx + 1] != C;
      Str[Idx] = '\\';
      if (IsCrLfPair)
        Str[Idx + 1] = 'n';
      else
        Str.insert(Str.begin() + Idx + 1, 'n');
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
261 
262 std::string Lexer::Stringify(StringRef Str, bool Charify) {
263  std::string Result = std::string(Str);
264  char Quote = Charify ? '\'' : '"';
265  StringifyImpl(Result, Quote);
266  return Result;
267 }
268 
270 
271 //===----------------------------------------------------------------------===//
272 // Token Spelling
273 //===----------------------------------------------------------------------===//
274 
/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
///
/// Re-lexes the token character by character so trigraphs and escaped
/// newlines are folded away. \p Spelling must be a caller-allocated buffer
/// of at least Tok.getLength() bytes. Returns the number of bytes written;
/// the trailing assert shows this is strictly less than Tok.getLength()
/// (otherwise the token would not have needed cleaning).
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      unsigned Size;
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
      BufPtr += Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remainder one (possibly multi-byte) source character at a time,
  // letting getCharAndSizeNoWarn fold trigraphs/escaped newlines.
  while (BufPtr < BufEnd) {
    unsigned Size;
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    BufPtr += Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}
325 
326 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
327 /// token are the characters used to represent the token in the source file
328 /// after trigraph expansion and escaped-newline folding. In particular, this
329 /// wants to get the true, uncanonicalized, spelling of things like digraphs
330 /// UCNs, etc.
332  SmallVectorImpl<char> &buffer,
333  const SourceManager &SM,
334  const LangOptions &options,
335  bool *invalid) {
336  // Break down the source location.
337  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
338 
339  // Try to the load the file buffer.
340  bool invalidTemp = false;
341  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
342  if (invalidTemp) {
343  if (invalid) *invalid = true;
344  return {};
345  }
346 
347  const char *tokenBegin = file.data() + locInfo.second;
348 
349  // Lex from the start of the given location.
350  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
351  file.begin(), tokenBegin, file.end());
352  Token token;
353  lexer.LexFromRawLexer(token);
354 
355  unsigned length = token.getLength();
356 
357  // Common case: no need for cleaning.
358  if (!token.needsCleaning())
359  return StringRef(tokenBegin, length);
360 
361  // Hard case, we need to relex the characters into the string.
362  buffer.resize(length);
363  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
364  return StringRef(buffer.data(), buffer.size());
365 }
366 
367 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
368 /// token are the characters used to represent the token in the source file
369 /// after trigraph expansion and escaped-newline folding. In particular, this
370 /// wants to get the true, uncanonicalized, spelling of things like digraphs
371 /// UCNs, etc.
372 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
373  const LangOptions &LangOpts, bool *Invalid) {
374  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
375 
376  bool CharDataInvalid = false;
377  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
378  &CharDataInvalid);
379  if (Invalid)
380  *Invalid = CharDataInvalid;
381  if (CharDataInvalid)
382  return {};
383 
384  // If this token contains nothing interesting, return it directly.
385  if (!Tok.needsCleaning())
386  return std::string(TokStart, TokStart + Tok.getLength());
387 
388  std::string Result;
389  Result.resize(Tok.getLength());
390  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
391  return Result;
392 }
393 
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long. The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      // Invalid location: report failure and hand back an empty spelling.
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  // NOTE(review): this writes *through* Buffer, so on this path the caller
  // must have pointed Buffer at writable storage of >= Tok.getLength() bytes
  // before calling (per the contract documented above).
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}
446 
447 /// MeasureTokenLength - Relex the token at the specified location and return
448 /// its length in bytes in the input file. If the token needs cleaning (e.g.
449 /// includes a trigraph or an escaped newline) then this count includes bytes
450 /// that are part of that.
452  const SourceManager &SM,
453  const LangOptions &LangOpts) {
454  Token TheTok;
455  if (getRawToken(Loc, TheTok, SM, LangOpts))
456  return 0;
457  return TheTok.getLength();
458 }
459 
460 /// Relex the token at the specified location.
461 /// \returns true if there was a failure, false on success.
463  const SourceManager &SM,
464  const LangOptions &LangOpts,
465  bool IgnoreWhiteSpace) {
466  // TODO: this could be special cased for common tokens like identifiers, ')',
467  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
468  // all obviously single-char tokens. This could use
469  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
470  // something.
471 
472  // If this comes from a macro expansion, we really do want the macro name, not
473  // the token this macro expanded to.
474  Loc = SM.getExpansionLoc(Loc);
475  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
476  bool Invalid = false;
477  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
478  if (Invalid)
479  return true;
480 
481  const char *StrData = Buffer.data()+LocInfo.second;
482 
483  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
484  return true;
485 
486  // Create a lexer starting at the beginning of this token.
487  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
488  Buffer.begin(), StrData, Buffer.end());
489  TheLexer.SetCommentRetentionState(true);
490  TheLexer.LexFromRawLexer(Result);
491  return false;
492 }
493 
494 /// Returns the pointer that points to the beginning of line that contains
495 /// the given offset, or null if the offset if invalid.
496 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
497  const char *BufStart = Buffer.data();
498  if (Offset >= Buffer.size())
499  return nullptr;
500 
501  const char *LexStart = BufStart + Offset;
502  for (; LexStart != BufStart; --LexStart) {
503  if (isVerticalWhitespace(LexStart[0]) &&
504  !Lexer::isNewLineEscaped(BufStart, LexStart)) {
505  // LexStart should point at first character of logical line.
506  ++LexStart;
507  break;
508  }
509  }
510  return LexStart;
511 }
512 
514  const SourceManager &SM,
515  const LangOptions &LangOpts) {
516  assert(Loc.isFileID());
517  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
518  if (LocInfo.first.isInvalid())
519  return Loc;
520 
521  bool Invalid = false;
522  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
523  if (Invalid)
524  return Loc;
525 
526  // Back up from the current location until we hit the beginning of a line
527  // (or the buffer). We'll relex from that point.
528  const char *StrData = Buffer.data() + LocInfo.second;
529  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
530  if (!LexStart || LexStart == StrData)
531  return Loc;
532 
533  // Create a lexer starting at the beginning of this token.
534  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
535  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
536  Buffer.end());
537  TheLexer.SetCommentRetentionState(true);
538 
539  // Lex tokens until we find the token that contains the source location.
540  Token TheTok;
541  do {
542  TheLexer.LexFromRawLexer(TheTok);
543 
544  if (TheLexer.getBufferLocation() > StrData) {
545  // Lexing this token has taken the lexer past the source location we're
546  // looking for. If the current token encompasses our source location,
547  // return the beginning of that token.
548  if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
549  return TheTok.getLocation();
550 
551  // We ended up skipping over the source location entirely, which means
552  // that it points into whitespace. We're done here.
553  break;
554  }
555  } while (TheTok.getKind() != tok::eof);
556 
557  // We've passed our source location; just return the original source location.
558  return Loc;
559 }
560 
562  const SourceManager &SM,
563  const LangOptions &LangOpts) {
564  if (Loc.isFileID())
565  return getBeginningOfFileToken(Loc, SM, LangOpts);
566 
567  if (!SM.isMacroArgExpansion(Loc))
568  return Loc;
569 
570  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
571  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
572  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
573  std::pair<FileID, unsigned> BeginFileLocInfo =
574  SM.getDecomposedLoc(BeginFileLoc);
575  assert(FileLocInfo.first == BeginFileLocInfo.first &&
576  FileLocInfo.second >= BeginFileLocInfo.second);
577  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
578 }
579 
namespace {

// Classification of a preprocessor directive seen while scanning a preamble:
// PDK_Skipped for directives allowed in a preamble (the scan continues past
// them), PDK_Unknown for anything unrecognized (the scan stops at the '#').
enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace
588 
590  const LangOptions &LangOpts,
591  unsigned MaxLines) {
592  // Create a lexer starting at the beginning of the file. Note that we use a
593  // "fake" file source location at offset 1 so that the lexer will track our
594  // position within the file.
595  const SourceLocation::UIntTy StartOffset = 1;
596  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
597  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
598  Buffer.end());
599  TheLexer.SetCommentRetentionState(true);
600 
601  bool InPreprocessorDirective = false;
602  Token TheTok;
603  SourceLocation ActiveCommentLoc;
604 
605  unsigned MaxLineOffset = 0;
606  if (MaxLines) {
607  const char *CurPtr = Buffer.begin();
608  unsigned CurLine = 0;
609  while (CurPtr != Buffer.end()) {
610  char ch = *CurPtr++;
611  if (ch == '\n') {
612  ++CurLine;
613  if (CurLine == MaxLines)
614  break;
615  }
616  }
617  if (CurPtr != Buffer.end())
618  MaxLineOffset = CurPtr - Buffer.begin();
619  }
620 
621  do {
622  TheLexer.LexFromRawLexer(TheTok);
623 
624  if (InPreprocessorDirective) {
625  // If we've hit the end of the file, we're done.
626  if (TheTok.getKind() == tok::eof) {
627  break;
628  }
629 
630  // If we haven't hit the end of the preprocessor directive, skip this
631  // token.
632  if (!TheTok.isAtStartOfLine())
633  continue;
634 
635  // We've passed the end of the preprocessor directive, and will look
636  // at this token again below.
637  InPreprocessorDirective = false;
638  }
639 
640  // Keep track of the # of lines in the preamble.
641  if (TheTok.isAtStartOfLine()) {
642  unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
643 
644  // If we were asked to limit the number of lines in the preamble,
645  // and we're about to exceed that limit, we're done.
646  if (MaxLineOffset && TokOffset >= MaxLineOffset)
647  break;
648  }
649 
650  // Comments are okay; skip over them.
651  if (TheTok.getKind() == tok::comment) {
652  if (ActiveCommentLoc.isInvalid())
653  ActiveCommentLoc = TheTok.getLocation();
654  continue;
655  }
656 
657  if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
658  // This is the start of a preprocessor directive.
659  Token HashTok = TheTok;
660  InPreprocessorDirective = true;
661  ActiveCommentLoc = SourceLocation();
662 
663  // Figure out which directive this is. Since we're lexing raw tokens,
664  // we don't have an identifier table available. Instead, just look at
665  // the raw identifier to recognize and categorize preprocessor directives.
666  TheLexer.LexFromRawLexer(TheTok);
667  if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
668  StringRef Keyword = TheTok.getRawIdentifier();
669  PreambleDirectiveKind PDK
670  = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
671  .Case("include", PDK_Skipped)
672  .Case("__include_macros", PDK_Skipped)
673  .Case("define", PDK_Skipped)
674  .Case("undef", PDK_Skipped)
675  .Case("line", PDK_Skipped)
676  .Case("error", PDK_Skipped)
677  .Case("pragma", PDK_Skipped)
678  .Case("import", PDK_Skipped)
679  .Case("include_next", PDK_Skipped)
680  .Case("warning", PDK_Skipped)
681  .Case("ident", PDK_Skipped)
682  .Case("sccs", PDK_Skipped)
683  .Case("assert", PDK_Skipped)
684  .Case("unassert", PDK_Skipped)
685  .Case("if", PDK_Skipped)
686  .Case("ifdef", PDK_Skipped)
687  .Case("ifndef", PDK_Skipped)
688  .Case("elif", PDK_Skipped)
689  .Case("elifdef", PDK_Skipped)
690  .Case("elifndef", PDK_Skipped)
691  .Case("else", PDK_Skipped)
692  .Case("endif", PDK_Skipped)
693  .Default(PDK_Unknown);
694 
695  switch (PDK) {
696  case PDK_Skipped:
697  continue;
698 
699  case PDK_Unknown:
700  // We don't know what this directive is; stop at the '#'.
701  break;
702  }
703  }
704 
705  // We only end up here if we didn't recognize the preprocessor
706  // directive or it was one that can't occur in the preamble at this
707  // point. Roll back the current token to the location of the '#'.
708  TheTok = HashTok;
709  }
710 
711  // We hit a token that we don't recognize as being in the
712  // "preprocessing only" part of the file, so we're no longer in
713  // the preamble.
714  break;
715  } while (true);
716 
718  if (ActiveCommentLoc.isValid())
719  End = ActiveCommentLoc; // don't truncate a decl comment.
720  else
721  End = TheTok.getLocation();
722 
723  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
724  TheTok.isAtStartOfLine());
725 }
726 
/// getTokenPrefixLength - Return the offset in *physical* source bytes of the
/// CharNo'th lexed character of the token starting at TokStart. Because of
/// trigraphs and escaped newlines, one lexed character may span several
/// physical bytes, so the result can exceed CharNo.
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    // Each call consumes one lexed character, which may be several bytes.
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}
771 
772 /// Computes the source location just past the end of the
773 /// token at this source location.
774 ///
775 /// This routine can be used to produce a source location that
776 /// points just past the end of the token referenced by \p Loc, and
777 /// is generally used when a diagnostic needs to point just after a
778 /// token where it expected something different that it received. If
779 /// the returned source location would not be meaningful (e.g., if
780 /// it points into a macro), this routine returns an invalid
781 /// source location.
782 ///
783 /// \param Offset an offset from the end of the token, where the source
784 /// location should refer to. The default offset (0) produces a source
785 /// location pointing just past the end of the token; an offset of 1 produces
786 /// a source location pointing to the last character in the token, etc.
788  const SourceManager &SM,
789  const LangOptions &LangOpts) {
790  if (Loc.isInvalid())
791  return {};
792 
793  if (Loc.isMacroID()) {
794  if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
795  return {}; // Points inside the macro expansion.
796  }
797 
798  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
799  if (Len > Offset)
800  Len = Len - Offset;
801  else
802  return Loc;
803 
804  return Loc.getLocWithOffset(Len);
805 }
806 
807 /// Returns true if the given MacroID location points at the first
808 /// token of the macro expansion.
810  const SourceManager &SM,
811  const LangOptions &LangOpts,
812  SourceLocation *MacroBegin) {
813  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
814 
815  SourceLocation expansionLoc;
816  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
817  return false;
818 
819  if (expansionLoc.isFileID()) {
820  // No other macro expansions, this is the first.
821  if (MacroBegin)
822  *MacroBegin = expansionLoc;
823  return true;
824  }
825 
826  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
827 }
828 
829 /// Returns true if the given MacroID location points at the last
830 /// token of the macro expansion.
832  const SourceManager &SM,
833  const LangOptions &LangOpts,
834  SourceLocation *MacroEnd) {
835  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
836 
837  SourceLocation spellLoc = SM.getSpellingLoc(loc);
838  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
839  if (tokLen == 0)
840  return false;
841 
842  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
843  SourceLocation expansionLoc;
844  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
845  return false;
846 
847  if (expansionLoc.isFileID()) {
848  // No other macro expansions.
849  if (MacroEnd)
850  *MacroEnd = expansionLoc;
851  return true;
852  }
853 
854  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
855 }
856 
858  const SourceManager &SM,
859  const LangOptions &LangOpts) {
860  SourceLocation Begin = Range.getBegin();
861  SourceLocation End = Range.getEnd();
862  assert(Begin.isFileID() && End.isFileID());
863  if (Range.isTokenRange()) {
864  End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
865  if (End.isInvalid())
866  return {};
867  }
868 
869  // Break down the source locations.
870  FileID FID;
871  unsigned BeginOffs;
872  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
873  if (FID.isInvalid())
874  return {};
875 
876  unsigned EndOffs;
877  if (!SM.isInFileID(End, FID, &EndOffs) ||
878  BeginOffs > EndOffs)
879  return {};
880 
882 }
883 
884 // Assumes that `Loc` is in an expansion.
886  const SourceManager &SM) {
887  return SM.getSLocEntry(SM.getFileID(Loc))
888  .getExpansion()
889  .isExpansionTokenRange();
890 }
891 
893  const SourceManager &SM,
894  const LangOptions &LangOpts) {
895  SourceLocation Begin = Range.getBegin();
896  SourceLocation End = Range.getEnd();
897  if (Begin.isInvalid() || End.isInvalid())
898  return {};
899 
900  if (Begin.isFileID() && End.isFileID())
901  return makeRangeFromFileLocs(Range, SM, LangOpts);
902 
903  if (Begin.isMacroID() && End.isFileID()) {
904  if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
905  return {};
906  Range.setBegin(Begin);
907  return makeRangeFromFileLocs(Range, SM, LangOpts);
908  }
909 
910  if (Begin.isFileID() && End.isMacroID()) {
911  if (Range.isTokenRange()) {
912  if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
913  return {};
914  // Use the *original* end, not the expanded one in `End`.
915  Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
916  } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
917  return {};
918  Range.setEnd(End);
919  return makeRangeFromFileLocs(Range, SM, LangOpts);
920  }
921 
922  assert(Begin.isMacroID() && End.isMacroID());
923  SourceLocation MacroBegin, MacroEnd;
924  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
925  ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
926  &MacroEnd)) ||
927  (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
928  &MacroEnd)))) {
929  Range.setBegin(MacroBegin);
930  Range.setEnd(MacroEnd);
931  // Use the *original* `End`, not the expanded one in `MacroEnd`.
932  if (Range.isTokenRange())
933  Range.setTokenRange(isInExpansionTokenRange(End, SM));
934  return makeRangeFromFileLocs(Range, SM, LangOpts);
935  }
936 
937  bool Invalid = false;
938  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
939  &Invalid);
940  if (Invalid)
941  return {};
942 
943  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
944  const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
945  &Invalid);
946  if (Invalid)
947  return {};
948 
949  if (EndEntry.getExpansion().isMacroArgExpansion() &&
950  BeginEntry.getExpansion().getExpansionLocStart() ==
951  EndEntry.getExpansion().getExpansionLocStart()) {
952  Range.setBegin(SM.getImmediateSpellingLoc(Begin));
953  Range.setEnd(SM.getImmediateSpellingLoc(End));
954  return makeFileCharRange(Range, SM, LangOpts);
955  }
956  }
957 
958  return {};
959 }
960 
962  const SourceManager &SM,
963  const LangOptions &LangOpts,
964  bool *Invalid) {
965  Range = makeFileCharRange(Range, SM, LangOpts);
966  if (Range.isInvalid()) {
967  if (Invalid) *Invalid = true;
968  return {};
969  }
970 
971  // Break down the source location.
972  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
973  if (beginInfo.first.isInvalid()) {
974  if (Invalid) *Invalid = true;
975  return {};
976  }
977 
978  unsigned EndOffs;
979  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
980  beginInfo.second > EndOffs) {
981  if (Invalid) *Invalid = true;
982  return {};
983  }
984 
985  // Try to the load the file buffer.
986  bool invalidTemp = false;
987  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
988  if (invalidTemp) {
989  if (Invalid) *Invalid = true;
990  return {};
991  }
992 
993  if (Invalid) *Invalid = false;
994  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
995 }
996 
998  const SourceManager &SM,
999  const LangOptions &LangOpts) {
1000  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1001 
1002  // Find the location of the immediate macro expansion.
1003  while (true) {
1004  FileID FID = SM.getFileID(Loc);
1005  const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1006  const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1007  Loc = Expansion.getExpansionLocStart();
1008  if (!Expansion.isMacroArgExpansion())
1009  break;
1010 
1011  // For macro arguments we need to check that the argument did not come
1012  // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1013 
1014  // Loc points to the argument id of the macro definition, move to the
1015  // macro expansion.
1016  Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1017  SourceLocation SpellLoc = Expansion.getSpellingLoc();
1018  if (SpellLoc.isFileID())
1019  break; // No inner macro.
1020 
1021  // If spelling location resides in the same FileID as macro expansion
1022  // location, it means there is no inner macro.
1023  FileID MacroFID = SM.getFileID(Loc);
1024  if (SM.isInFileID(SpellLoc, MacroFID))
1025  break;
1026 
1027  // Argument came from inner macro.
1028  Loc = SpellLoc;
1029  }
1030 
1031  // Find the spelling location of the start of the non-argument expansion
1032  // range. This is where the macro name was spelled in order to begin
1033  // expanding this macro.
1034  Loc = SM.getSpellingLoc(Loc);
1035 
1036  // Dig out the buffer where the macro name was spelled and the extents of the
1037  // name so that we can render it into the expansion note.
1038  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1039  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1040  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1041  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1042 }
1043 
1045  SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1046  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1047  // Walk past macro argument expansions.
1048  while (SM.isMacroArgExpansion(Loc))
1049  Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1050 
1051  // If the macro's spelling has no FileID, then it's actually a token paste
1052  // or stringization (or similar) and not a macro at all.
1053  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1054  return {};
1055 
1056  // Find the spelling location of the start of the non-argument expansion
1057  // range. This is where the macro name was spelled in order to begin
1058  // expanding this macro.
1059  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1060 
1061  // Dig out the buffer where the macro name was spelled and the extents of the
1062  // name so that we can render it into the expansion note.
1063  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1064  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1065  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1066  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1067 }
1068 
1070  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1071 }
1072 
1073 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1074  assert(isVerticalWhitespace(Str[0]));
1075  if (Str - 1 < BufferStart)
1076  return false;
1077 
1078  if ((Str[0] == '\n' && Str[-1] == '\r') ||
1079  (Str[0] == '\r' && Str[-1] == '\n')) {
1080  if (Str - 2 < BufferStart)
1081  return false;
1082  --Str;
1083  }
1084  --Str;
1085 
1086  // Rewind to first non-space character:
1087  while (Str > BufferStart && isHorizontalWhitespace(*Str))
1088  --Str;
1089 
1090  return *Str == '\\';
1091 }
1092 
1094  const SourceManager &SM) {
1095  if (Loc.isInvalid() || Loc.isMacroID())
1096  return {};
1097  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1098  if (LocInfo.first.isInvalid())
1099  return {};
1100  bool Invalid = false;
1101  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1102  if (Invalid)
1103  return {};
1104  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1105  if (!Line)
1106  return {};
1107  StringRef Rest = Buffer.substr(Line - Buffer.data());
1108  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1109  return NumWhitespaceChars == StringRef::npos
1110  ? ""
1111  : Rest.take_front(NumWhitespaceChars);
1112 }
1113 
1114 //===----------------------------------------------------------------------===//
1115 // Diagnostics forwarding code.
1116 //===----------------------------------------------------------------------===//
1117 
1118 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1119 /// lexer buffer was all expanded at a single point, perform the mapping.
1120 /// This is currently only used for _Pragma implementation, so it is the slow
1121 /// path of the hot getSourceLocation method. Do not allow it to be inlined.
1122 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1123  Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1125  SourceLocation FileLoc,
1126  unsigned CharNo, unsigned TokLen) {
1127  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1128 
1129  // Otherwise, we're lexing "mapped tokens". This is used for things like
1130  // _Pragma handling. Combine the expansion location of FileLoc with the
1131  // spelling location.
1133 
1134  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1135  // characters come from spelling(FileLoc)+Offset.
1136  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1137  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1138 
1139  // Figure out the expansion loc range, which is the range covered by the
1140  // original _Pragma(...) sequence.
1141  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1142 
1143  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1144 }
1145 
1146 /// getSourceLocation - Return a source location identifier for the specified
1147 /// offset in the current file.
1149  unsigned TokLen) const {
1150  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1151  "Location out of range for this buffer!");
1152 
1153  // In the normal case, we're just lexing from a simple file buffer, return
1154  // the file id from FileLoc with the offset specified.
1155  unsigned CharNo = Loc-BufferStart;
1156  if (FileLoc.isFileID())
1157  return FileLoc.getLocWithOffset(CharNo);
1158 
1159  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1160  // tokens are lexed from where the _Pragma was defined.
1161  assert(PP && "This doesn't work on raw lexers");
1162  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1163 }
1164 
1165 /// Diag - Forwarding function for diagnostics. This translate a source
1166 /// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  // Dereferences PP unconditionally, so this must not be called on a raw
  // lexer (which has no preprocessor).
  return PP->Diag(getSourceLocation(Loc), DiagID);
}
1170 
1171 //===----------------------------------------------------------------------===//
1172 // Trigraph and Escaped Newline Handling Code.
1173 //===----------------------------------------------------------------------===//
1174 
1175 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1176 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: the character following "??" and the punctuation it
  // decodes to.  E.g. "??=" -> '#', "??/" -> '\\'.
  static const char TrigraphLetters[] = "=)(!'>/<-";
  static const char DecodedChars[]    = "#][|^}\\{~";

  // Guard against the NUL terminator matching itself in strchr.
  if (Letter == '\0')
    return 0;

  const char *Pos = strchr(TrigraphLetters, Letter);
  return Pos ? DecodedChars[Pos - TrigraphLetters] : 0;
}
1191 
1192 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1193 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1194 /// return the result character. Finally, emit a warning about trigraph use
1195 /// whether trigraphs are enabled or not.
1196 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1197  char Res = GetTrigraphCharForLetter(*CP);
1198  if (!Res || !L) return Res;
1199 
1200  if (!Trigraphs) {
1201  if (!L->isLexingRawMode())
1202  L->Diag(CP-2, diag::trigraph_ignored);
1203  return 0;
1204  }
1205 
1206  if (!L->isLexingRawMode())
1207  L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1208  return Res;
1209 }
1210 
1211 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1212 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1213 /// trigraph equivalent on entry to this function.
1214 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1215  unsigned Size = 0;
1216  while (isWhitespace(Ptr[Size])) {
1217  ++Size;
1218 
1219  if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1220  continue;
1221 
1222  // If this is a \r\n or \n\r, skip the other half.
1223  if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1224  Ptr[Size-1] != Ptr[Size])
1225  ++Size;
1226 
1227  return Size;
1228  }
1229 
1230  // Not an escaped newline, must be a \t or something else.
1231  return 0;
1232 }
1233 
1234 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1235 /// them), skip over them and return the first non-escaped-newline found,
1236 /// otherwise return P.
1237 const char *Lexer::SkipEscapedNewLines(const char *P) {
1238  while (true) {
1239  const char *AfterEscape;
1240  if (*P == '\\') {
1241  AfterEscape = P+1;
1242  } else if (*P == '?') {
1243  // If not a trigraph for escape, bail out.
1244  if (P[1] != '?' || P[2] != '/')
1245  return P;
1246  // FIXME: Take LangOpts into account; the language might not
1247  // support trigraphs.
1248  AfterEscape = P+3;
1249  } else {
1250  return P;
1251  }
1252 
1253  unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1254  if (NewLineSize == 0) return P;
1255  P = AfterEscape+NewLineSize;
1256  }
1257 }
1258 
1260  const SourceManager &SM,
1261  const LangOptions &LangOpts) {
1262  if (Loc.isMacroID()) {
1263  if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1264  return std::nullopt;
1265  }
1266  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1267 
1268  // Break down the source location.
1269  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1270 
1271  // Try to load the file buffer.
1272  bool InvalidTemp = false;
1273  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1274  if (InvalidTemp)
1275  return std::nullopt;
1276 
1277  const char *TokenBegin = File.data() + LocInfo.second;
1278 
1279  // Lex from the start of the given location.
1280  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1281  TokenBegin, File.end());
1282  // Find the token.
1283  Token Tok;
1284  lexer.LexFromRawLexer(Tok);
1285  return Tok;
1286 }
1287 
1288 /// Checks that the given token is the first token that occurs after the
1289 /// given location (this excludes comments and whitespace). Returns the location
1290 /// immediately after the specified token. If the token is not found or the
1291 /// location is inside a macro, the returned source location will be invalid.
1293  SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1294  const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1295  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1296  if (!Tok || Tok->isNot(TKind))
1297  return {};
1298  SourceLocation TokenLoc = Tok->getLocation();
1299 
1300  // Calculate how much whitespace needs to be skipped if any.
1301  unsigned NumWhitespaceChars = 0;
1302  if (SkipTrailingWhitespaceAndNewLine) {
1303  const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1304  unsigned char C = *TokenEnd;
1305  while (isHorizontalWhitespace(C)) {
1306  C = *(++TokenEnd);
1307  NumWhitespaceChars++;
1308  }
1309 
1310  // Skip \r, \n, \r\n, or \n\r
1311  if (C == '\n' || C == '\r') {
1312  char PrevC = C;
1313  C = *(++TokenEnd);
1314  NumWhitespaceChars++;
1315  if ((C == '\n' || C == '\r') && C != PrevC)
1316  NumWhitespaceChars++;
1317  }
1318  }
1319 
1320  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1321 }
1322 
1323 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1324 /// get its size, and return it. This is tricky in several cases:
1325 /// 1. If currently at the start of a trigraph, we warn about the trigraph,
1326 /// then either return the trigraph (skipping 3 chars) or the '?',
1327 /// depending on whether trigraphs are enabled or not.
1328 /// 2. If this is an escaped newline (potentially with whitespace between
1329 /// the backslash and newline), implicitly skip the newline and return
1330 /// the char after it.
1331 ///
1332 /// This handles the slow/uncommon case of the getCharAndSize method. Here we
1333 /// know that we can accumulate into Size, and that we have already incremented
1334 /// Ptr by Size bytes.
1335 ///
1336 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1337 /// be updated to match.
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      // (Recursion handles stacked escapes like "\\\n\\\n".)
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // Passing a null Lexer when Tok is null suppresses the diagnostics.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" decodes to '\\', which may in turn begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1390 
1391 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1392 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1393 /// and that we have already incremented Ptr by Size bytes.
1394 ///
1395 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1396 /// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &LangOpts) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      // (Recursion handles stacked escapes like "\\\n\\\n".)
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // "??/" decodes to '\\', which may in turn begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1437 
1438 //===----------------------------------------------------------------------===//
1439 // Helper methods for lexing.
1440 //===----------------------------------------------------------------------===//
1441 
1442 /// Routine that indiscriminately sets the offset into the source file.
1443 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1444  BufferPtr = BufferStart + Offset;
1445  if (BufferPtr > BufferEnd)
1446  BufferPtr = BufferEnd;
1447  // FIXME: What exactly does the StartOfLine bit mean? There are two
1448  // possible meanings for the "start" of the line: the first token on the
1449  // unexpanded line, or the first token on the expanded line.
1450  IsAtStartOfLine = StartOfLine;
1451  IsAtPhysicalStartOfLine = StartOfLine;
1452 }
1453 
1454 static bool isUnicodeWhitespace(uint32_t Codepoint) {
1455  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1457  return UnicodeWhitespaceChars.contains(Codepoint);
1458 }
1459 
1460 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1461  if (LangOpts.AsmPreprocessor) {
1462  return false;
1463  } else if (LangOpts.DollarIdents && '$' == C) {
1464  return true;
1465  } else if (LangOpts.CPlusPlus || LangOpts.C2x) {
1466  // A non-leading codepoint must have the XID_Continue property.
1467  // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1468  // so we need to check both tables.
1469  // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1470  static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1471  static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1472  return C == '_' || XIDStartChars.contains(C) ||
1473  XIDContinueChars.contains(C);
1474  } else if (LangOpts.C11) {
1475  static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1477  return C11AllowedIDChars.contains(C);
1478  } else {
1479  static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1481  return C99AllowedIDChars.contains(C);
1482  }
1483 }
1484 
1485 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1486  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1487  if (LangOpts.AsmPreprocessor) {
1488  return false;
1489  }
1490  if (LangOpts.CPlusPlus || LangOpts.C2x) {
1491  static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1492  return XIDStartChars.contains(C);
1493  }
1494  if (!isAllowedIDChar(C, LangOpts))
1495  return false;
1496  if (LangOpts.C11) {
1497  static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1499  return !C11DisallowedInitialIDChars.contains(C);
1500  }
1501  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1503  return !C99DisallowedInitialIDChars.contains(C);
1504 }
1505 
1506 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1507  const char *End) {
1509  L.getSourceLocation(End));
1510 }
1511 
1512 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1513  CharSourceRange Range, bool IsFirst) {
1514  // Check C99 compatibility.
1515  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1516  enum {
1517  CannotAppearInIdentifier = 0,
1518  CannotStartIdentifier
1519  };
1520 
1521  static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1523  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1525  if (!C99AllowedIDChars.contains(C)) {
1526  Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1527  << Range
1528  << CannotAppearInIdentifier;
1529  } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1530  Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1531  << Range
1532  << CannotStartIdentifier;
1533  }
1534  }
1535 }
1536 
1537 /// After encountering UTF-8 character C and interpreting it as an identifier
1538 /// character, check whether it's a homoglyph for a common non-identifier
1539 /// source character that is unlikely to be an intentional identifier
1540 /// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // NOTE: This table must remain sorted by codepoint; it is searched with
  // std::lower_bound below.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}            // Sentinel; excluded from the search range below.
  };
  // Binary search over the table minus the sentinel entry.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    // Render the codepoint as a 4-digit uppercase hex string for the note.
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      // LooksLike == 0 marks invisible/zero-width characters.
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
1620 
1622  DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1623  CharSourceRange Range, bool IsFirst) {
1624  if (isASCII(CodePoint))
1625  return;
1626 
1627  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
1628  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
1629 
1630  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1631  return;
1632 
1633  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1634 
1635  llvm::SmallString<5> CharBuf;
1636  llvm::raw_svector_ostream CharOS(CharBuf);
1637  llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
1638 
1639  if (!IsFirst || InvalidOnlyAtStart) {
1640  Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1641  << Range << CharBuf << int(InvalidOnlyAtStart)
1642  << FixItHint::CreateRemoval(Range);
1643  } else {
1644  Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1645  << Range << CharBuf << FixItHint::CreateRemoval(Range);
1646  }
1647 }
1648 
1649 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1650  Token &Result) {
1651  const char *UCNPtr = CurPtr + Size;
1652  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1653  if (CodePoint == 0) {
1654  return false;
1655  }
1656 
1657  if (!isAllowedIDChar(CodePoint, LangOpts)) {
1658  if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1659  return false;
1663  PP->getDiagnostics(), LangOpts, CodePoint,
1664  makeCharRange(*this, CurPtr, UCNPtr),
1665  /*IsFirst=*/false);
1666 
1667  // We got a unicode codepoint that is neither a space nor a
1668  // a valid identifier part.
1669  // Carry on as if the codepoint was valid for recovery purposes.
1670  } else if (!isLexingRawMode())
1672  makeCharRange(*this, CurPtr, UCNPtr),
1673  /*IsFirst=*/false);
1674 
1675  Result.setFlag(Token::HasUCN);
1676  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1677  (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1678  CurPtr = UCNPtr;
1679  else
1680  while (CurPtr != UCNPtr)
1681  (void)getAndAdvanceChar(CurPtr, Result);
1682  return true;
1683 }
1684 
1685 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1686  const char *UnicodePtr = CurPtr;
1687  llvm::UTF32 CodePoint;
1688  llvm::ConversionResult Result =
1689  llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1690  (const llvm::UTF8 *)BufferEnd,
1691  &CodePoint,
1692  llvm::strictConversion);
1693  if (Result != llvm::conversionOK)
1694  return false;
1695 
1696  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
1697  if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1698  return false;
1699 
1703  PP->getDiagnostics(), LangOpts, CodePoint,
1704  makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1705  // We got a unicode codepoint that is neither a space nor a
1706  // a valid identifier part. Carry on as if the codepoint was
1707  // valid for recovery purposes.
1708  } else if (!isLexingRawMode()) {
1710  makeCharRange(*this, CurPtr, UnicodePtr),
1711  /*IsFirst=*/false);
1713  makeCharRange(*this, CurPtr, UnicodePtr));
1714  }
1715 
1716  CurPtr = UnicodePtr;
1717  return true;
1718 }
1719 
// Lex a token whose first character is a non-ASCII codepoint (already decoded
// into C, with CurPtr past it). If C can start an identifier, continue lexing
// it; otherwise either silently drop the stray character (returning false so
// the caller re-lexes) or form a tok::unknown token.
// NOTE(review): this extract is missing several continuation lines
// (original lines 1723, 1725, 1728, 1736, 1748) — they appear to be
// diagnostic-helper calls; confirm against upstream Lexer.cpp before
// relying on this copy.
1720 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1721  const char *CurPtr) {
1722  if (isAllowedInitiallyIDChar(C, LangOpts)) {
// NOTE(review): the condition head on the dropped line above presumably
// checks isLexingRawMode() — verify upstream.
1724  !PP->isPreprocessedOutput()) {
1726  makeCharRange(*this, BufferPtr, CurPtr),
1727  /*IsFirst=*/true);
1729  makeCharRange(*this, BufferPtr, CurPtr));
1730  }
1731 
// Valid identifier start: note that a real token was read (for the
// multiple-include optimization) and lex the rest of the identifier.
1732  MIOpt.ReadToken();
1733  return LexIdentifierContinue(Result, CurPtr);
1734  }
1735 
1737  !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1738  !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
1739  // Non-ASCII characters tend to creep into source code unintentionally.
1740  // Instead of letting the parser complain about the unknown token,
1741  // just drop the character.
1742  // Note that we can /only/ do this when the non-ASCII character is actually
1743  // spelled as Unicode, not written as a UCN. The standard requires that
1744  // we not throw away any possible preprocessor tokens, but there's a
1745  // loophole in the mapping of Unicode characters to basic character set
1746  // characters that allows us to map these particular characters to, say,
1747  // whitespace.
1749  PP->getDiagnostics(), LangOpts, C,
1750  makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
// Skip the character entirely and tell the caller no token was formed.
1751  BufferPtr = CurPtr;
1752  return false;
1753  }
1754 
1755  // Otherwise, we have an explicit UCN or a character that's unlikely to show
1756  // up by accident.
1757  MIOpt.ReadToken();
1758  FormTokenWithChars(Result, CurPtr, tok::unknown);
1759  return true;
1760 }
1761 
1762 bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1763  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1764  while (true) {
1765  unsigned char C = *CurPtr;
1766  // Fast path.
1767  if (isAsciiIdentifierContinue(C)) {
1768  ++CurPtr;
1769  continue;
1770  }
1771 
1772  unsigned Size;
1773  // Slow path: handle trigraph, unicode codepoints, UCNs.
1774  C = getCharAndSize(CurPtr, Size);
1775  if (isAsciiIdentifierContinue(C)) {
1776  CurPtr = ConsumeChar(CurPtr, Size, Result);
1777  continue;
1778  }
1779  if (C == '$') {
1780  // If we hit a $ and they are not supported in identifiers, we are done.
1781  if (!LangOpts.DollarIdents)
1782  break;
1783  // Otherwise, emit a diagnostic and continue.
1784  if (!isLexingRawMode())
1785  Diag(CurPtr, diag::ext_dollar_in_identifier);
1786  CurPtr = ConsumeChar(CurPtr, Size, Result);
1787  continue;
1788  }
1789  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1790  continue;
1791  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1792  continue;
1793  // Neither an expected Unicode codepoint nor a UCN.
1794  break;
1795  }
1796 
1797  const char *IdStart = BufferPtr;
1798  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1799  Result.setRawIdentifierData(IdStart);
1800 
1801  // If we are in raw mode, return this identifier raw. There is no need to
1802  // look up identifier information or attempt to macro expand it.
1803  if (LexingRawMode)
1804  return true;
1805 
1806  // Fill in Result.IdentifierInfo and update the token kind,
1807  // looking up the identifier in the identifier table.
1808  IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1809  // Note that we have to call PP->LookUpIdentifierInfo() even for code
1810  // completion, it writes IdentifierInfo into Result, and callers rely on it.
1811 
1812  // If the completion point is at the end of an identifier, we want to treat
1813  // the identifier as incomplete even if it resolves to a macro or a keyword.
1814  // This allows e.g. 'class^' to complete to 'classifier'.
1815  if (isCodeCompletionPoint(CurPtr)) {
1816  // Return the code-completion token.
1817  Result.setKind(tok::code_completion);
1818  // Skip the code-completion char and all immediate identifier characters.
1819  // This ensures we get consistent behavior when completing at any point in
1820  // an identifier (i.e. at the start, in the middle, at the end). Note that
1821  // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1822  // simpler.
1823  assert(*CurPtr == 0 && "Completion character must be 0");
1824  ++CurPtr;
1825  // Note that code completion token is not added as a separate character
1826  // when the completion point is at the end of the buffer. Therefore, we need
1827  // to check if the buffer has ended.
1828  if (CurPtr < BufferEnd) {
1829  while (isAsciiIdentifierContinue(*CurPtr))
1830  ++CurPtr;
1831  }
1832  BufferPtr = CurPtr;
1833  return true;
1834  }
1835 
1836  // Finally, now that we know we have an identifier, pass this off to the
1837  // preprocessor, which may macro expand it or something.
1838  if (II->isHandleIdentifierCase())
1839  return PP->HandleIdentifier(Result);
1840 
1841  return true;
1842 }
1843 
1844 /// isHexaLiteral - Return true if Start points to a hex constant.
1845 /// in microsoft mode (where this is supposed to be several different tokens).
1846 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1847  unsigned Size;
1848  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1849  if (C1 != '0')
1850  return false;
1851  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1852  return (C2 == 'x' || C2 == 'X');
1853 }
1854 
1855 /// LexNumericConstant - Lex the remainder of a integer or floating point
1856 /// constant. From[-1] is the first character lexed. Return the end of the
1857 /// constant.
1858 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1859  unsigned Size;
1860  char C = getCharAndSize(CurPtr, Size);
1861  char PrevCh = 0;
1862  while (isPreprocessingNumberBody(C)) {
1863  CurPtr = ConsumeChar(CurPtr, Size, Result);
1864  PrevCh = C;
1865  C = getCharAndSize(CurPtr, Size);
1866  }
1867 
1868  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1869  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1870  // If we are in Microsoft mode, don't continue if the constant is hex.
1871  // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1872  if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1873  return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1874  }
1875 
1876  // If we have a hex FP constant, continue.
1877  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1878  // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1879  // not-quite-conforming extension. Only do so if this looks like it's
1880  // actually meant to be a hexfloat, and not if it has a ud-suffix.
1881  bool IsHexFloat = true;
1882  if (!LangOpts.C99) {
1883  if (!isHexaLiteral(BufferPtr, LangOpts))
1884  IsHexFloat = false;
1885  else if (!LangOpts.CPlusPlus17 &&
1886  std::find(BufferPtr, CurPtr, '_') != CurPtr)
1887  IsHexFloat = false;
1888  }
1889  if (IsHexFloat)
1890  return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1891  }
1892 
1893  // If we have a digit separator, continue.
1894  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) {
1895  unsigned NextSize;
1896  char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
1897  if (isAsciiIdentifierContinue(Next)) {
1898  if (!isLexingRawMode())
1899  Diag(CurPtr, LangOpts.CPlusPlus
1900  ? diag::warn_cxx11_compat_digit_separator
1901  : diag::warn_c2x_compat_digit_separator);
1902  CurPtr = ConsumeChar(CurPtr, Size, Result);
1903  CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1904  return LexNumericConstant(Result, CurPtr);
1905  }
1906  }
1907 
1908  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1909  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1910  return LexNumericConstant(Result, CurPtr);
1911  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1912  return LexNumericConstant(Result, CurPtr);
1913 
1914  // Update the location of token as well as BufferPtr.
1915  const char *TokStart = BufferPtr;
1916  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1917  Result.setLiteralData(TokStart);
1918  return true;
1919 }
1920 
1921 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1922 /// in C++11, or warn on a ud-suffix in C++98.
// Returns a pointer past the consumed suffix (or CurPtr unchanged when no
// suffix is taken). Sets Token::HasUDSuffix when a suffix is consumed.
// NOTE(review): this extract is missing original lines 1946 and 1992 —
// most likely the stream continuations of the Diag(...) expressions just
// above them; confirm against upstream Lexer.cpp.
1923 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1924  bool IsStringLiteral) {
1925  assert(LangOpts.CPlusPlus);
1926 
1927  // Maximally munch an identifier.
1928  unsigned Size;
1929  char C = getCharAndSize(CurPtr, Size);
1930  bool Consumed = false;
1931 
// If the next character is not a simple ASCII identifier start, it can still
// begin a suffix as a UCN or a UTF-8 codepoint; otherwise there is no suffix.
1932  if (!isAsciiIdentifierStart(C)) {
1933  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1934  Consumed = true;
1935  else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1936  Consumed = true;
1937  else
1938  return CurPtr;
1939  }
1940 
// Pre-C++11: a literal immediately followed by an identifier is only a
// compatibility concern, not a ud-suffix. Warn and stop.
1941  if (!LangOpts.CPlusPlus11) {
1942  if (!isLexingRawMode())
1943  Diag(CurPtr,
1944  C == '_' ? diag::warn_cxx11_compat_user_defined_literal
1945  : diag::warn_cxx11_compat_reserved_user_defined_literal)
1947  return CurPtr;
1948  }
1949 
1950  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1951  // that does not start with an underscore is ill-formed. As a conforming
1952  // extension, we treat all such suffixes as if they had whitespace before
1953  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1954  // likely to be a ud-suffix than a macro, however, and accept that.
1955  if (!Consumed) {
1956  bool IsUDSuffix = false;
1957  if (C == '_')
1958  IsUDSuffix = true;
1959  else if (IsStringLiteral && LangOpts.CPlusPlus14) {
1960  // In C++1y, we need to look ahead a few characters to see if this is a
1961  // valid suffix for a string literal or a numeric literal (this could be
1962  // the 'operator""if' defining a numeric literal operator).
1963  const unsigned MaxStandardSuffixLength = 3;
1964  char Buffer[MaxStandardSuffixLength] = { C };
// This inner 'Consumed' (a byte count) deliberately shadows the outer
// 'Consumed' flag; it tracks how far ahead we have peeked without
// committing to consuming anything.
1965  unsigned Consumed = Size;
1966  unsigned Chars = 1;
1967  while (true) {
1968  unsigned NextSize;
1969  char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
1970  if (!isAsciiIdentifierContinue(Next)) {
1971  // End of suffix. Check whether this is on the allowed list.
1972  const StringRef CompleteSuffix(Buffer, Chars);
1973  IsUDSuffix =
1974  StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
1975  break;
1976  }
1977 
1978  if (Chars == MaxStandardSuffixLength)
1979  // Too long: can't be a standard suffix.
1980  break;
1981 
1982  Buffer[Chars++] = Next;
1983  Consumed += NextSize;
1984  }
1985  }
1986 
// Reserved (non-underscore, non-standard) suffix: diagnose and treat it as
// if whitespace preceded it, i.e. do not consume it.
1987  if (!IsUDSuffix) {
1988  if (!isLexingRawMode())
1989  Diag(CurPtr, LangOpts.MSVCCompat
1990  ? diag::ext_ms_reserved_user_defined_literal
1991  : diag::ext_reserved_user_defined_literal)
1993  return CurPtr;
1994  }
1995 
1996  CurPtr = ConsumeChar(CurPtr, Size, Result);
1997  }
1998 
// Consume the rest of the suffix: identifier characters, UCNs, or UTF-8
// codepoints, maximal munch.
1999  Result.setFlag(Token::HasUDSuffix);
2000  while (true) {
2001  C = getCharAndSize(CurPtr, Size);
2002  if (isAsciiIdentifierContinue(C)) {
2003  CurPtr = ConsumeChar(CurPtr, Size, Result);
2004  } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2005  } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
2006  } else
2007  break;
2008  }
2009 
2010  return CurPtr;
2011 }
2012 
2013 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2014 /// either " or L" or u8" or u" or U".
// Forms either a string-literal token of the given Kind, or tok::unknown on
// an unterminated literal. Always returns true (a token was produced).
// NOTE(review): this extract is missing original line 2048 — presumably the
// statement taken on the 'else' branch (likely a natural-language code
// completion call); confirm against upstream Lexer.cpp.
2015 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2016  tok::TokenKind Kind) {
2017  const char *AfterQuote = CurPtr;
2018  // Does this string contain the \0 character?
2019  const char *NulCharacter = nullptr;
2020 
// Compatibility warning for u8"/u"/U" literals in older dialects.
2021  if (!isLexingRawMode() &&
2022  (Kind == tok::utf8_string_literal ||
2023  Kind == tok::utf16_string_literal ||
2024  Kind == tok::utf32_string_literal))
2025  Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2026  : diag::warn_c99_compat_unicode_literal);
2027 
2028  char C = getAndAdvanceChar(CurPtr, Result);
2029  while (C != '"') {
2030  // Skip escaped characters. Escaped newlines will already be processed by
2031  // getAndAdvanceChar.
2032  if (C == '\\')
2033  C = getAndAdvanceChar(CurPtr, Result);
2034 
2035  if (C == '\n' || C == '\r' || // Newline.
2036  (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2037  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2038  Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2039  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2040  return true;
2041  }
2042 
// A NUL byte is either the code-completion marker or an embedded NUL to
// warn about once the literal is complete.
2043  if (C == 0) {
2044  if (isCodeCompletionPoint(CurPtr-1)) {
2045  if (ParsingFilename)
2046  codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2047  else
2049  FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2050  cutOffLexing();
2051  return true;
2052  }
2053 
2054  NulCharacter = CurPtr-1;
2055  }
2056  C = getAndAdvanceChar(CurPtr, Result);
2057  }
2058 
2059  // If we are in C++11, lex the optional ud-suffix.
2060  if (LangOpts.CPlusPlus)
2061  CurPtr = LexUDSuffix(Result, CurPtr, true);
2062 
2063  // If a nul character existed in the string, warn about it.
2064  if (NulCharacter && !isLexingRawMode())
2065  Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2066 
2067  // Update the location of the token as well as the BufferPtr instance var.
2068  const char *TokStart = BufferPtr;
2069  FormTokenWithChars(Result, CurPtr, Kind);
2070  Result.setLiteralData(TokStart);
2071  return true;
2072 }
2073 
2074 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2075 /// having lexed R", LR", u8R", uR", or UR".
2076 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2077  tok::TokenKind Kind) {
2078  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2079  // Between the initial and final double quote characters of the raw string,
2080  // any transformations performed in phases 1 and 2 (trigraphs,
2081  // universal-character-names, and line splicing) are reverted.
2082 
2083  if (!isLexingRawMode())
2084  Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2085 
2086  unsigned PrefixLen = 0;
2087 
2088  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2089  ++PrefixLen;
2090 
2091  // If the last character was not a '(', then we didn't lex a valid delimiter.
2092  if (CurPtr[PrefixLen] != '(') {
2093  if (!isLexingRawMode()) {
2094  const char *PrefixEnd = &CurPtr[PrefixLen];
2095  if (PrefixLen == 16) {
2096  Diag(PrefixEnd, diag::err_raw_delim_too_long);
2097  } else {
2098  Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2099  << StringRef(PrefixEnd, 1);
2100  }
2101  }
2102 
2103  // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2104  // it's possible the '"' was intended to be part of the raw string, but
2105  // there's not much we can do about that.
2106  while (true) {
2107  char C = *CurPtr++;
2108 
2109  if (C == '"')
2110  break;
2111  if (C == 0 && CurPtr-1 == BufferEnd) {
2112  --CurPtr;
2113  break;
2114  }
2115  }
2116 
2117  FormTokenWithChars(Result, CurPtr, tok::unknown);
2118  return true;
2119  }
2120 
2121  // Save prefix and move CurPtr past it
2122  const char *Prefix = CurPtr;
2123  CurPtr += PrefixLen + 1; // skip over prefix and '('
2124 
2125  while (true) {
2126  char C = *CurPtr++;
2127 
2128  if (C == ')') {
2129  // Check for prefix match and closing quote.
2130  if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2131  CurPtr += PrefixLen + 1; // skip over prefix and '"'
2132  break;
2133  }
2134  } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2135  if (!isLexingRawMode())
2136  Diag(BufferPtr, diag::err_unterminated_raw_string)
2137  << StringRef(Prefix, PrefixLen);
2138  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2139  return true;
2140  }
2141  }
2142 
2143  // If we are in C++11, lex the optional ud-suffix.
2144  if (LangOpts.CPlusPlus)
2145  CurPtr = LexUDSuffix(Result, CurPtr, true);
2146 
2147  // Update the location of token as well as BufferPtr.
2148  const char *TokStart = BufferPtr;
2149  FormTokenWithChars(Result, CurPtr, Kind);
2150  Result.setLiteralData(TokStart);
2151  return true;
2152 }
2153 
2154 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2155 /// after having lexed the '<' character. This is used for #include filenames.
2156 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2157  // Does this string contain the \0 character?
2158  const char *NulCharacter = nullptr;
2159  const char *AfterLessPos = CurPtr;
2160  char C = getAndAdvanceChar(CurPtr, Result);
2161  while (C != '>') {
2162  // Skip escaped characters. Escaped newlines will already be processed by
2163  // getAndAdvanceChar.
2164  if (C == '\\')
2165  C = getAndAdvanceChar(CurPtr, Result);
2166 
2167  if (isVerticalWhitespace(C) || // Newline.
2168  (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2169  // If the filename is unterminated, then it must just be a lone <
2170  // character. Return this as such.
2171  FormTokenWithChars(Result, AfterLessPos, tok::less);
2172  return true;
2173  }
2174 
2175  if (C == 0) {
2176  if (isCodeCompletionPoint(CurPtr - 1)) {
2177  codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2178  cutOffLexing();
2179  FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2180  return true;
2181  }
2182  NulCharacter = CurPtr-1;
2183  }
2184  C = getAndAdvanceChar(CurPtr, Result);
2185  }
2186 
2187  // If a nul character existed in the string, warn about it.
2188  if (NulCharacter && !isLexingRawMode())
2189  Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2190 
2191  // Update the location of token as well as BufferPtr.
2192  const char *TokStart = BufferPtr;
2193  FormTokenWithChars(Result, CurPtr, tok::header_name);
2194  Result.setLiteralData(TokStart);
2195  return true;
2196 }
2197 
// Set up code completion for a partially-typed #include filename spanning
// [PathStart, CompletionPoint). IsAngled distinguishes <...> from "..."
// includes.
// NOTE(review): this extract is missing original lines 2210 and 2225 —
// they appear to be the opening portions of the two PP-> calls whose
// continuation lines remain below; confirm against upstream Lexer.cpp.
2198 void Lexer::codeCompleteIncludedFile(const char *PathStart,
2199  const char *CompletionPoint,
2200  bool IsAngled) {
2201  // Completion only applies to the filename, after the last slash.
2202  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
// MSVC additionally accepts '\\' as a path separator in include directives.
2203  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2204  auto Slash = PartialPath.find_last_of(SlashChars);
2205  StringRef Dir =
2206  (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2207  const char *StartOfFilename =
2208  (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2209  // Code completion filter range is the filename only, up to completion point.
2211  StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2212  // We should replace the characters up to the closing quote or closest slash,
2213  // if any.
2214  while (CompletionPoint < BufferEnd) {
2215  char Next = *(CompletionPoint + 1);
2216  if (Next == 0 || Next == '\r' || Next == '\n')
2217  break;
2218  ++CompletionPoint;
2219  if (Next == (IsAngled ? '>' : '"'))
2220  break;
// Stop at a path separator as well: only the final component is replaced.
2221  if (llvm::is_contained(SlashChars, Next))
2222  break;
2223  }
2224 
2226  FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2227  FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2228  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2229 }
2230 
2231 /// LexCharConstant - Lex the remainder of a character constant, after having
2232 /// lexed either ' or L' or u8' or u' or U'.
// Forms either a character-constant token of the given Kind, or tok::unknown
// for empty/unterminated constants. Always returns true (a token was formed).
// NOTE(review): this extract is missing original line 2270 — presumably a
// code-completion call made before forming the unknown token; confirm
// against upstream Lexer.cpp.
2233 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2234  tok::TokenKind Kind) {
2235  // Does this character contain the \0 character?
2236  const char *NulCharacter = nullptr;
2237 
// Compatibility warnings for u'/U'/u8' constants in older dialects.
2238  if (!isLexingRawMode()) {
2239  if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2240  Diag(BufferPtr, LangOpts.CPlusPlus
2241  ? diag::warn_cxx98_compat_unicode_literal
2242  : diag::warn_c99_compat_unicode_literal);
2243  else if (Kind == tok::utf8_char_constant)
2244  Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2245  }
2246 
// An immediately-closing quote means an empty character constant (invalid).
2247  char C = getAndAdvanceChar(CurPtr, Result);
2248  if (C == '\'') {
2249  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2250  Diag(BufferPtr, diag::ext_empty_character);
2251  FormTokenWithChars(Result, CurPtr, tok::unknown);
2252  return true;
2253  }
2254 
2255  while (C != '\'') {
2256  // Skip escaped characters.
2257  if (C == '\\')
2258  C = getAndAdvanceChar(CurPtr, Result);
2259 
2260  if (C == '\n' || C == '\r' || // Newline.
2261  (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2262  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2263  Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2264  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2265  return true;
2266  }
2267 
// A NUL byte is either the code-completion marker or an embedded NUL to
// warn about once the constant is complete.
2268  if (C == 0) {
2269  if (isCodeCompletionPoint(CurPtr-1)) {
2271  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2272  cutOffLexing();
2273  return true;
2274  }
2275 
2276  NulCharacter = CurPtr-1;
2277  }
2278  C = getAndAdvanceChar(CurPtr, Result);
2279  }
2280 
2281  // If we are in C++11, lex the optional ud-suffix.
2282  if (LangOpts.CPlusPlus)
2283  CurPtr = LexUDSuffix(Result, CurPtr, false);
2284 
2285  // If a nul character existed in the character, warn about it.
2286  if (NulCharacter && !isLexingRawMode())
2287  Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2288 
2289  // Update the location of token as well as BufferPtr.
2290  const char *TokStart = BufferPtr;
2291  FormTokenWithChars(Result, CurPtr, Kind);
2292  Result.setLiteralData(TokStart);
2293  return true;
2294 }
2295 
2296 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2297 /// Update BufferPtr to point to the next non-whitespace character and return.
2298 ///
2299 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
// NOTE(review): this extract is missing original line 2325 — by the comment
// on the following line it is presumably an
// 'if (ParsingPreprocessorDirective) {' guard; confirm against upstream
// Lexer.cpp.
2300 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2301  bool &TokAtPhysicalStartOfLine) {
2302  // Whitespace - Skip it, then return the token after the whitespace.
2303  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2304 
2305  unsigned char Char = *CurPtr;
2306 
// Track the last newline seen, and record the first one in the member
// NewLinePtr; the pair is later used to report empty-line ranges.
2307  const char *lastNewLine = nullptr;
2308  auto setLastNewLine = [&](const char *Ptr) {
2309  lastNewLine = Ptr;
2310  if (!NewLinePtr)
2311  NewLinePtr = Ptr;
2312  };
2313  if (SawNewline)
2314  setLastNewLine(CurPtr - 1);
2315 
2316  // Skip consecutive spaces efficiently.
2317  while (true) {
2318  // Skip horizontal whitespace very aggressively.
2319  while (isHorizontalWhitespace(Char))
2320  Char = *++CurPtr;
2321 
2322  // Otherwise if we have something other than whitespace, we're done.
2323  if (!isVerticalWhitespace(Char))
2324  break;
2325 
2327  // End of preprocessor directive line, let LexTokenInternal handle this.
2328  BufferPtr = CurPtr;
2329  return false;
2330  }
2331 
2332  // OK, but handle newline.
2333  if (*CurPtr == '\n')
2334  setLastNewLine(CurPtr);
2335  SawNewline = true;
2336  Char = *++CurPtr;
2337  }
2338 
2339  // If the client wants us to return whitespace, return it now.
2340  if (isKeepWhitespaceMode()) {
2341  FormTokenWithChars(Result, CurPtr, tok::unknown);
2342  if (SawNewline) {
2343  IsAtStartOfLine = true;
2344  IsAtPhysicalStartOfLine = true;
2345  }
2346  // FIXME: The next token will not have LeadingSpace set.
2347  return true;
2348  }
2349 
2350  // If this isn't immediately after a newline, there is leading space.
2351  char PrevChar = CurPtr[-1];
2352  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2353 
2354  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2355  if (SawNewline) {
2356  Result.setFlag(Token::StartOfLine);
2357  TokAtPhysicalStartOfLine = true;
2358 
// More than one newline was skipped: report the blank-line range to any
// registered empty-line handler.
2359  if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2360  if (auto *Handler = PP->getEmptylineHandler())
2361  Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2362  getSourceLocation(lastNewLine)));
2363  }
2364  }
2365 
2366  BufferPtr = CurPtr;
2367  return false;
2368 }
2369 
2370 /// We have just read the // characters from input. Skip until we find the
2371 /// newline character that terminates the comment. Then update BufferPtr and
2372 /// return.
2373 ///
2374 /// If we're in KeepCommentMode or any CommentHandler has inserted
2375 /// some tokens, this will store the first token and return true.
// NOTE(review): this extract is missing original line 2498 (inside the
// code-completion branch near the end of the scanning loop) — presumably a
// completion call made before cutOffLexing(); confirm against upstream
// Lexer.cpp.
2376 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2377  bool &TokAtPhysicalStartOfLine) {
2378  // If Line comments aren't explicitly enabled for this language, emit an
2379  // extension warning.
2380  if (!LineComment) {
2381  if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2382  Diag(BufferPtr, diag::ext_line_comment);
2383 
2384  // Mark them enabled so we only emit one warning for this translation
2385  // unit.
2386  LineComment = true;
2387  }
2388 
2389  // Scan over the body of the comment. The common case, when scanning, is that
2390  // the comment contains normal ascii characters with nothing interesting in
2391  // them. As such, optimize for this case with the inner loop.
2392  //
2393  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2394  // character that ends the line comment.
2395 
2396  // C++23 [lex.phases] p1
2397  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2398  // diagnostic only once per entire ill-formed subsequence to avoid
2399  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2400  bool UnicodeDecodingAlreadyDiagnosed = false;
2401 
2402  char C;
2403  while (true) {
2404  C = *CurPtr;
2405  // Skip over characters in the fast loop.
2406  while (isASCII(C) && C != 0 && // Potentially EOF.
2407  C != '\n' && C != '\r') { // Newline or DOS-style newline.
2408  C = *++CurPtr;
2409  UnicodeDecodingAlreadyDiagnosed = false;
2410  }
2411 
// Non-ASCII byte: validate the UTF-8 sequence, diagnosing each ill-formed
// subsequence only once, then keep scanning.
2412  if (!isASCII(C)) {
2413  unsigned Length = llvm::getUTF8SequenceSize(
2414  (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2415  if (Length == 0) {
2416  if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2417  Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2418  UnicodeDecodingAlreadyDiagnosed = true;
2419  ++CurPtr;
2420  } else {
2421  UnicodeDecodingAlreadyDiagnosed = false;
2422  CurPtr += Length;
2423  }
2424  continue;
2425  }
2426 
2427  const char *NextLine = CurPtr;
2428  if (C != 0) {
2429  // We found a newline, see if it's escaped.
2430  const char *EscapePtr = CurPtr-1;
2431  bool HasSpace = false;
2432  while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2433  --EscapePtr;
2434  HasSpace = true;
2435  }
2436 
2437  if (*EscapePtr == '\\')
2438  // Escaped newline.
2439  CurPtr = EscapePtr;
2440  else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2441  EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2442  // Trigraph-escaped newline.
2443  CurPtr = EscapePtr-2;
2444  else
2445  break; // This is a newline, we're done.
2446 
2447  // If there was space between the backslash and newline, warn about it.
2448  if (HasSpace && !isLexingRawMode())
2449  Diag(EscapePtr, diag::backslash_newline_space);
2450  }
2451 
2452  // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2453  // properly decode the character. Read it in raw mode to avoid emitting
2454  // diagnostics about things like trigraphs. If we see an escaped newline,
2455  // we'll handle it below.
2456  const char *OldPtr = CurPtr;
2457  bool OldRawMode = isLexingRawMode();
2458  LexingRawMode = true;
2459  C = getAndAdvanceChar(CurPtr, Result);
2460  LexingRawMode = OldRawMode;
2461 
2462  // If we only read only one character, then no special handling is needed.
2463  // We're done and can skip forward to the newline.
2464  if (C != 0 && CurPtr == OldPtr+1) {
2465  CurPtr = NextLine;
2466  break;
2467  }
2468 
2469  // If we read multiple characters, and one of those characters was a \r or
2470  // \n, then we had an escaped newline within the comment. Emit diagnostic
2471  // unless the next line is also a // comment.
2472  if (CurPtr != OldPtr + 1 && C != '/' &&
2473  (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2474  for (; OldPtr != CurPtr; ++OldPtr)
2475  if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2476  // Okay, we found a // comment that ends in a newline, if the next
2477  // line is also a // comment, but has spaces, don't emit a diagnostic.
2478  if (isWhitespace(C)) {
2479  const char *ForwardPtr = CurPtr;
2480  while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2481  ++ForwardPtr;
2482  if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2483  break;
2484  }
2485 
2486  if (!isLexingRawMode())
2487  Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2488  break;
2489  }
2490  }
2491 
2492  if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2493  --CurPtr;
2494  break;
2495  }
2496 
2497  if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2499  cutOffLexing();
2500  return false;
2501  }
2502  }
2503 
2504  // Found but did not consume the newline. Notify comment handlers about the
2505  // comment unless we're in a #if 0 block.
2506  if (PP && !isLexingRawMode() &&
2507  PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2508  getSourceLocation(CurPtr)))) {
2509  BufferPtr = CurPtr;
2510  return true; // A token has to be returned.
2511  }
2512 
2513  // If we are returning comments as tokens, return this comment as a token.
2514  if (inKeepCommentMode())
2515  return SaveLineComment(Result, CurPtr);
2516 
2517  // If we are inside a preprocessor directive and we see the end of line,
2518  // return immediately, so that the lexer can return this as an EOD token.
2519  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2520  BufferPtr = CurPtr;
2521  return false;
2522  }
2523 
2524  // Otherwise, eat the \n character. We don't care if this is a \n\r or
2525  // \r\n sequence. This is an efficiency hack (because we know the \n can't
2526  // contribute to another token), it isn't needed for correctness. Note that
2527  // this is ok even in KeepWhitespaceMode, because we would have returned the
2528  /// comment above in that mode.
2529  NewLinePtr = CurPtr++;
2530 
2531  // The next returned token is at the start of the line.
2532  Result.setFlag(Token::StartOfLine);
2533  TokAtPhysicalStartOfLine = true;
2534  // No leading whitespace seen so far.
2535  Result.clearFlag(Token::LeadingSpace);
2536  BufferPtr = CurPtr;
2537  return false;
2538 }
2539 
2540 /// If in save-comment mode, package up this Line comment in an appropriate
2541 /// way and return it.
// Always returns true (a comment token is produced either way).
// NOTE(review): this extract is missing original line 2547 — presumably the
// guard condition (likely '!ParsingPreprocessorDirective || LexingRawMode')
// for the early 'return true' below; confirm against upstream Lexer.cpp.
2542 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2543  // If we're not in a preprocessor directive, just return the // comment
2544  // directly.
2545  FormTokenWithChars(Result, CurPtr, tok::comment);
2546 
2548  return true;
2549 
2550  // If this Line-style comment is in a macro definition, transmogrify it into
2551  // a C-style block comment.
2552  bool Invalid = false;
2553  std::string Spelling = PP->getSpelling(Result, &Invalid);
2554  if (Invalid)
2555  return true;
2556 
2557  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2558  Spelling[1] = '*'; // Change prefix to "/*".
2559  Spelling += "*/"; // add suffix.
2560 
// Re-issue the rewritten spelling as the comment token's text, keeping the
// original source location.
2561  Result.setKind(tok::comment);
2562  PP->CreateString(Spelling, Result,
2563  Result.getLocation(), Result.getLocation());
2564  return true;
2565 }
2566 
2567 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2568 /// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2569 /// a diagnostic if so. We know that the newline is inside of a block comment.
// Walks BACKWARDS from the newline through (possibly repeated) escaped
// newlines — '\' or the '??/' trigraph, optionally with trailing spaces —
// to see whether a '*' precedes the sequence, i.e. whether line splicing
// produces a '*/' that terminates the block comment.
2570 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2571  bool Trigraphs) {
2572  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2573 
2574  // Position of the first trigraph in the ending sequence.
2575  const char *TrigraphPos = nullptr;
2576  // Position of the first whitespace after a '\' in the ending sequence.
2577  const char *SpacePos = nullptr;
2578 
// Each iteration steps back over one escaped newline; the loop repeats
// because several escaped newlines may be stacked before the '*'.
2579  while (true) {
2580  // Back up off the newline.
2581  --CurPtr;
2582 
2583  // If this is a two-character newline sequence, skip the other character.
2584  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2585  // \n\n or \r\r -> not escaped newline.
2586  if (CurPtr[0] == CurPtr[1])
2587  return false;
2588  // \n\r or \r\n -> skip the newline.
2589  --CurPtr;
2590  }
2591 
2592  // If we have horizontal whitespace, skip over it. We allow whitespace
2593  // between the slash and newline.
2594  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2595  SpacePos = CurPtr;
2596  --CurPtr;
2597  }
2598 
2599  // If we have a slash, this is an escaped newline.
2600  if (*CurPtr == '\\') {
2601  --CurPtr;
2602  } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2603  // This is a trigraph encoding of a slash.
2604  TrigraphPos = CurPtr - 2;
2605  CurPtr -= 3;
2606  } else {
2607  return false;
2608  }
2609 
2610  // If the character preceding the escaped newline is a '*', then after line
2611  // splicing we have a '*/' ending the comment.
2612  if (*CurPtr == '*')
2613  break;
2614 
// Anything other than a '*' or yet another newline means this escape
// sequence does not end the comment.
2615  if (*CurPtr != '\n' && *CurPtr != '\r')
2616  return false;
2617  }
2618 
2619  if (TrigraphPos) {
2620  // If no trigraphs are enabled, warn that we ignored this trigraph and
2621  // ignore this * character.
2622  if (!Trigraphs) {
2623  if (!L->isLexingRawMode())
2624  L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2625  return false;
2626  }
2627  if (!L->isLexingRawMode())
2628  L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2629  }
2630 
2631  // Warn about having an escaped newline between the */ characters.
2632  if (!L->isLexingRawMode())
2633  L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2634 
2635  // If there was space between the backslash and newline, warn about it.
2636  if (SpacePos && !L->isLexingRawMode())
2637  L->Diag(SpacePos, diag::backslash_newline_space);
2638 
2639  return true;
2640 }
2641 
2642 #ifdef __SSE2__
2643 #include <emmintrin.h>
2644 #elif __ALTIVEC__
2645 #include <altivec.h>
2646 #undef bool
2647 #endif
2648 
/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end. The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // A NUL one past BufferEnd means we hit end-of-file while still inside the
  // comment: it is unterminated.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Vectorized scan: 16 bytes at a time, bailing out to the slow path on
      // any byte with the high bit set (potential UTF-8 lead/continuation).
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: scan 16 bytes at a time for non-ASCII or '/'.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*') // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // NOTE(review): upstream notifies the preprocessor of natural-language
      // code completion before cutting off lexing; that call appears to be
      // missing from this view of the file — confirm against upstream.
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
2871 
2872 //===----------------------------------------------------------------------===//
2873 // Primary Lexing Entry Points
2874 //===----------------------------------------------------------------------===//
2875 
2876 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2877 /// uninterpreted string. This switches the lexer out of directive mode.
2879  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2880  "Must be in a preprocessing directive!");
2881  Token Tmp;
2882  Tmp.startToken();
2883 
2884  // CurPtr - Cache BufferPtr in an automatic variable.
2885  const char *CurPtr = BufferPtr;
2886  while (true) {
2887  char Char = getAndAdvanceChar(CurPtr, Tmp);
2888  switch (Char) {
2889  default:
2890  if (Result)
2891  Result->push_back(Char);
2892  break;
2893  case 0: // Null.
2894  // Found end of file?
2895  if (CurPtr-1 != BufferEnd) {
2896  if (isCodeCompletionPoint(CurPtr-1)) {
2898  cutOffLexing();
2899  return;
2900  }
2901 
2902  // Nope, normal character, continue.
2903  if (Result)
2904  Result->push_back(Char);
2905  break;
2906  }
2907  // FALL THROUGH.
2908  [[fallthrough]];
2909  case '\r':
2910  case '\n':
2911  // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2912  assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2913  BufferPtr = CurPtr-1;
2914 
2915  // Next, lex the character, which should handle the EOD transition.
2916  Lex(Tmp);
2917  if (Tmp.is(tok::code_completion)) {
2918  if (PP)
2920  Lex(Tmp);
2921  }
2922  assert(Tmp.is(tok::eod) && "Unexpected token!");
2923 
2924  // Finally, we're done;
2925  return;
2926  }
2927  }
2928 }
2929 
2930 /// LexEndOfFile - CurPtr points to the end of this file. Handle this
2931 /// condition, reporting diagnostics and handling other edge cases as required.
2932 /// This returns true if Result contains a token, false if PP.Lex should be
2933 /// called again.
2934 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2935  // If we hit the end of the file while parsing a preprocessor directive,
2936  // end the preprocessor directive first. The next token returned will
2937  // then be the end of file.
2939  // Done parsing the "line".
2941  // Update the location of token as well as BufferPtr.
2942  FormTokenWithChars(Result, CurPtr, tok::eod);
2943 
2944  // Restore comment saving mode, in case it was disabled for directive.
2945  if (PP)
2947  return true; // Have a token.
2948  }
2949 
2950  // If we are in raw mode, return this event as an EOF token. Let the caller
2951  // that put us in raw mode handle the event.
2952  if (isLexingRawMode()) {
2953  Result.startToken();
2954  BufferPtr = BufferEnd;
2955  FormTokenWithChars(Result, BufferEnd, tok::eof);
2956  return true;
2957  }
2958 
2959  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
2961  // If the preamble cuts off the end of a header guard, consider it guarded.
2962  // The guard is valid for the preamble content itself, and for tools the
2963  // most useful answer is "yes, this file has a header guard".
2964  if (!ConditionalStack.empty())
2966  ConditionalStack.clear();
2967  }
2968 
2969  // Issue diagnostics for unterminated #if and missing newline.
2970 
2971  // If we are in a #if directive, emit an error.
2972  while (!ConditionalStack.empty()) {
2973  if (PP->getCodeCompletionFileLoc() != FileLoc)
2974  PP->Diag(ConditionalStack.back().IfLoc,
2975  diag::err_pp_unterminated_conditional);
2976  ConditionalStack.pop_back();
2977  }
2978 
2979  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2980  // a pedwarn.
2981  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
2982  DiagnosticsEngine &Diags = PP->getDiagnostics();
2983  SourceLocation EndLoc = getSourceLocation(BufferEnd);
2984  unsigned DiagID;
2985 
2986  if (LangOpts.CPlusPlus11) {
2987  // C++11 [lex.phases] 2.2 p2
2988  // Prefer the C++98 pedantic compatibility warning over the generic,
2989  // non-extension, user-requested "missing newline at EOF" warning.
2990  if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2991  DiagID = diag::warn_cxx98_compat_no_newline_eof;
2992  } else {
2993  DiagID = diag::warn_no_newline_eof;
2994  }
2995  } else {
2996  DiagID = diag::ext_no_newline_eof;
2997  }
2998 
2999  Diag(BufferEnd, DiagID)
3000  << FixItHint::CreateInsertion(EndLoc, "\n");
3001  }
3002 
3003  BufferPtr = CurPtr;
3004 
3005  // Finally, let the preprocessor handle this.
3006  return PP->HandleEndOfFile(Result, isPragmaLexer());
3007 }
3008 
3009 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3010 /// the specified lexer will return a tok::l_paren token, 0 if it is something
3011 /// else and 2 if there are no more tokens in the buffer controlled by the
3012 /// lexer.
3013 unsigned Lexer::isNextPPTokenLParen() {
3014  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3015 
3016  if (isDependencyDirectivesLexer()) {
3017  if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3018  return 2;
3019  return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3020  tok::l_paren);
3021  }
3022 
3023  // Switch to 'skipping' mode. This will ensure that we can lex a token
3024  // without emitting diagnostics, disables macro expansion, and will cause EOF
3025  // to return an EOF token instead of popping the include stack.
3026  LexingRawMode = true;
3027 
3028  // Save state that can be changed while lexing so that we can restore it.
3029  const char *TmpBufferPtr = BufferPtr;
3030  bool inPPDirectiveMode = ParsingPreprocessorDirective;
3031  bool atStartOfLine = IsAtStartOfLine;
3032  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3033  bool leadingSpace = HasLeadingSpace;
3034 
3035  Token Tok;
3036  Lex(Tok);
3037 
3038  // Restore state that may have changed.
3039  BufferPtr = TmpBufferPtr;
3040  ParsingPreprocessorDirective = inPPDirectiveMode;
3041  HasLeadingSpace = leadingSpace;
3042  IsAtStartOfLine = atStartOfLine;
3043  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3044 
3045  // Restore the lexer back to non-skipping mode.
3046  LexingRawMode = false;
3047 
3048  if (Tok.is(tok::eof))
3049  return 2;
3050  return Tok.is(tok::l_paren);
3051 }
3052 
3053 /// Find the end of a version control conflict marker.
3054 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3055  ConflictMarkerKind CMK) {
3056  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3057  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3058  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3059  size_t Pos = RestOfBuffer.find(Terminator);
3060  while (Pos != StringRef::npos) {
3061  // Must occur at start of line.
3062  if (Pos == 0 ||
3063  (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3064  RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3065  Pos = RestOfBuffer.find(Terminator);
3066  continue;
3067  }
3068  return RestOfBuffer.data()+Pos;
3069  }
3070  return nullptr;
3071 }
3072 
3073 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
3074 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3075 /// and recover nicely. This returns true if it is a conflict marker and false
3076 /// if not.
3077 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3078  // Only a conflict marker if it starts at the beginning of a line.
3079  if (CurPtr != BufferStart &&
3080  CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3081  return false;
3082 
3083  // Check to see if we have <<<<<<< or >>>>.
3084  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3085  !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3086  return false;
3087 
3088  // If we have a situation where we don't care about conflict markers, ignore
3089  // it.
3090  if (CurrentConflictMarkerState || isLexingRawMode())
3091  return false;
3092 
3093  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3094 
3095  // Check to see if there is an ending marker somewhere in the buffer at the
3096  // start of a line to terminate this conflict marker.
3097  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3098  // We found a match. We are really in a conflict marker.
3099  // Diagnose this, and ignore to the end of line.
3100  Diag(CurPtr, diag::err_conflict_marker);
3101  CurrentConflictMarkerState = Kind;
3102 
3103  // Skip ahead to the end of line. We know this exists because the
3104  // end-of-conflict marker starts with \r or \n.
3105  while (*CurPtr != '\r' && *CurPtr != '\n') {
3106  assert(CurPtr != BufferEnd && "Didn't find end of line");
3107  ++CurPtr;
3108  }
3109  BufferPtr = CurPtr;
3110  return true;
3111  }
3112 
3113  // No end of conflict marker found.
3114  return false;
3115 }
3116 
3117 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3118 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3119 /// is the end of a conflict marker. Handle it by ignoring up until the end of
3120 /// the line. This returns true if it is a conflict marker and false if not.
3121 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3122  // Only a conflict marker if it starts at the beginning of a line.
3123  if (CurPtr != BufferStart &&
3124  CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3125  return false;
3126 
3127  // If we have a situation where we don't care about conflict markers, ignore
3128  // it.
3129  if (!CurrentConflictMarkerState || isLexingRawMode())
3130  return false;
3131 
3132  // Check to see if we have the marker (4 characters in a row).
3133  for (unsigned i = 1; i != 4; ++i)
3134  if (CurPtr[i] != CurPtr[0])
3135  return false;
3136 
3137  // If we do have it, search for the end of the conflict marker. This could
3138  // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3139  // be the end of conflict marker.
3140  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3141  CurrentConflictMarkerState)) {
3142  CurPtr = End;
3143 
3144  // Skip ahead to the end of line.
3145  while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3146  ++CurPtr;
3147 
3148  BufferPtr = CurPtr;
3149 
3150  // No longer in the conflict marker.
3151  CurrentConflictMarkerState = CMK_None;
3152  return true;
3153  }
3154 
3155  return false;
3156 }
3157 
/// Scan for the "#>" that closes an editor placeholder; returns the pointer
/// just past the '>' on success, or nullptr if no terminator exists before
/// BufferEnd.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one byte short of the end so reading P[1] below stays in bounds.
  for (const char *P = CurPtr, *Last = BufferEnd - 1; P != Last; ++P)
    if (P[0] == '#' && P[1] == '>')
      return P + 2;
  return nullptr;
}
3169 
// Lex an editor placeholder of the form "<#...#>", producing a raw-identifier
// token flagged as IsEditorPlaceholder. CurPtr points just past the '<'.
// Returns false when no closing "#>" is found before the end of the buffer.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  // NOTE(review): the guard condition for this early return appears to be
  // missing from this view of the file — confirm against upstream.
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  // Placeholders are only acceptable input when explicitly allowed (e.g. for
  // IDE tooling); otherwise diagnose their presence in real source.
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  // Form one token covering the whole "<#...#>" range and mark it.
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}
3188 
3189 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3190  if (PP && PP->isCodeCompletionEnabled()) {
3191  SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3192  return Loc == PP->getCodeCompletionLoc();
3193  }
3194 
3195  return false;
3196 }
3197 
/// Try to read a numeric universal character name: \uXXXX, \UXXXXXXXX, or the
/// delimited form \u{...}. StartPtr points at the kind character ('u'/'U'),
/// just past the backslash at SlashLoc. On success, returns the decoded code
/// point and advances StartPtr past the escape; on failure returns
/// std::nullopt and leaves StartPtr unchanged. Diagnostics are emitted only
/// when Result is non-null and we are not lexing in raw mode.
llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                  const char *SlashLoc,
                                                  Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \uXXXX takes exactly 4 hex digits, \UXXXXXXXX exactly 8.
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;         // Saw the '{' of the \u{...} form.
  bool FoundEndDelimiter = false; // Saw the closing '}'.
  unsigned Count = 0;             // Number of hex digits consumed so far.
  bool Diagnose = Result && !isLexingRawMode();

  // UCNs are a C99/C++ feature; reject (with a warning) in C89-only mode.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  // Consume exactly NumHexDigits digits, or an arbitrary number when the
  // escape is brace-delimited.
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      // Non-hex-digit: a delimited escape with no closing brace is
      // incomplete; a non-delimited one just stops early (diagnosed below).
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Reject values that would overflow 32 bits after the next shift.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  // No digits at all: "\u" followed by a non-digit, or "\u{}".
  if (Count == 0) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The delimited form is only valid with 'u', not 'U'.
  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // A non-delimited escape must supply the exact digit count for its kind.
  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  // \u{...} is a C++2b feature; note it as such (or as an extension).
  if (Delimited && PP) {
    Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
                        ? diag::warn_cxx2b_delimited_escape_sequence
                        : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // Fast path: the escape occupied exactly its canonical spelled length,
    // so nothing inside needed cleaning and StartPtr can jump directly to
    // the end. Otherwise re-lex it char by char so the token records that
    // it needs cleaning.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}
3308 
/// Try to read a named universal character name: \N{SOME NAME}. StartPtr
/// points at the 'N'. On success returns the named code point (possibly the
/// result of Unicode loose matching, after a diagnostic) and advances
/// StartPtr past the escape; on failure returns std::nullopt. Diagnostics
/// are emitted only when Result is non-null and we are not lexing raw.
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                                Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  // The character name must be brace-delimited.
  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  // Accumulate the name: alphanumerics, '_', '-', and spaces are the only
  // characters accepted; anything else aborts the scan.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
      break;
    Buffer.push_back(C);
  }

  // "\N{}" is empty; an unterminated name is incomplete.
  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  // First try an exact (strict) Unicode name lookup; on failure, fall back
  // to loose matching so we can emit a fixit with the canonical name.
  llvm::Optional<char32_t> Res =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Res) {
    if (!isLexingRawMode()) {
      Diag(StartPtr, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size());
      LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // When finding a match using Unicode loose matching rules
    // recover after having emitted a diagnostic.
    if (!LooseMatch)
      return std::nullopt;
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  // \N{...} is a C++2b feature; note it as such (or as an extension).
  if (Diagnose && PP && !LooseMatch)
    Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
                        ? diag::warn_cxx2b_delimited_escape_sequence
                        : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  if (LooseMatch)
    Res = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // Fast path: the escape occupied exactly its canonical spelled length
    // ("N{" + name + "}"), so no cleaning occurred and StartPtr can jump to
    // the end. Otherwise re-lex it char by char so the token records that
    // it needs cleaning.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return *Res;
}
3397 
3398 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3399  Token *Result) {
3400 
3401  unsigned CharSize;
3402  llvm::Optional<uint32_t> CodePointOpt;
3403  char Kind = getCharAndSize(StartPtr, CharSize);
3404  if (Kind == 'u' || Kind == 'U')
3405  CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3406  else if (Kind == 'N')
3407  CodePointOpt = tryReadNamedUCN(StartPtr, Result);
3408 
3409  if (!CodePointOpt)
3410  return 0;
3411 
3412  uint32_t CodePoint = *CodePointOpt;
3413 
3414  // Don't apply C family restrictions to UCNs in assembly mode
3415  if (LangOpts.AsmPreprocessor)
3416  return CodePoint;
3417 
3418  // C99 6.4.3p2: A universal character name shall not specify a character whose
3419  // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3420  // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3421  // C++11 [lex.charset]p2: If the hexadecimal value for a
3422  // universal-character-name corresponds to a surrogate code point (in the
3423  // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3424  // if the hexadecimal value for a universal-character-name outside the
3425  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3426  // string literal corresponds to a control character (in either of the
3427  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3428  // basic source character set, the program is ill-formed.
3429  if (CodePoint < 0xA0) {
3430  if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3431  return CodePoint;
3432 
3433  // We don't use isLexingRawMode() here because we need to warn about bad
3434  // UCNs even when skipping preprocessing tokens in a #if block.
3435  if (Result && PP) {
3436  if (CodePoint < 0x20 || CodePoint >= 0x7F)
3437  Diag(BufferPtr, diag::err_ucn_control_character);
3438  else {
3439  char C = static_cast<char>(CodePoint);
3440  Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3441  }
3442  }
3443 
3444  return 0;
3445  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3446  // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3447  // We don't use isLexingRawMode() here because we need to diagnose bad
3448  // UCNs even when skipping preprocessing tokens in a #if block.
3449  if (Result && PP) {
3450  if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3451  Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3452  else
3453  Diag(BufferPtr, diag::err_ucn_escape_invalid);
3454  }
3455  return 0;
3456  }
3457 
3458  return CodePoint;
3459 }
3460 
3461 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3462  const char *CurPtr) {
3463  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3464  isUnicodeWhitespace(C)) {
3465  Diag(BufferPtr, diag::ext_unicode_whitespace)
3466  << makeCharRange(*this, BufferPtr, CurPtr);
3467 
3468  Result.setFlag(Token::LeadingSpace);
3469  return true;
3470  }
3471  return false;
3472 }
3473 
3474 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3475  IsAtStartOfLine = Result.isAtStartOfLine();
3476  HasLeadingSpace = Result.hasLeadingSpace();
3477  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3478  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3479 }
3480 
3481 bool Lexer::Lex(Token &Result) {
3482  assert(!isDependencyDirectivesLexer());
3483 
3484  // Start a new token.
3485  Result.startToken();
3486 
3487  // Set up misc whitespace flags for LexTokenInternal.
3488  if (IsAtStartOfLine) {
3489  Result.setFlag(Token::StartOfLine);
3490  IsAtStartOfLine = false;
3491  }
3492 
3493  if (HasLeadingSpace) {
3494  Result.setFlag(Token::LeadingSpace);
3495  HasLeadingSpace = false;
3496  }
3497 
3498  if (HasLeadingEmptyMacro) {
3499  Result.setFlag(Token::LeadingEmptyMacro);
3500  HasLeadingEmptyMacro = false;
3501  }
3502 
3503  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3504  IsAtPhysicalStartOfLine = false;
3505  bool isRawLex = isLexingRawMode();
3506  (void) isRawLex;
3507  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3508  // (After the LexTokenInternal call, the lexer might be destroyed.)
3509  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3510  return returnedToken;
3511 }
3512 
3513 /// LexTokenInternal - This implements a simple C family lexer. It is an
3514 /// extremely performance critical piece of code. This assumes that the buffer
3515 /// has a null character at the end of the file. This returns a preprocessing
3516 /// token, not a normal token, as such, it is an internal interface. It assumes
3517 /// that the Flags of result have been cleared before calling this.
3518 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3519 LexStart:
3520  assert(!Result.needsCleaning() && "Result needs cleaning");
3521  assert(!Result.hasPtrData() && "Result has not been reset");
3522 
3523  // CurPtr - Cache BufferPtr in an automatic variable.
3524  const char *CurPtr = BufferPtr;
3525 
3526  // Small amounts of horizontal whitespace is very common between tokens.
3527  if (isHorizontalWhitespace(*CurPtr)) {
3528  do {
3529  ++CurPtr;
3530  } while (isHorizontalWhitespace(*CurPtr));
3531 
3532  // If we are keeping whitespace and other tokens, just return what we just
3533  // skipped. The next lexer invocation will return the token after the
3534  // whitespace.
3535  if (isKeepWhitespaceMode()) {
3536  FormTokenWithChars(Result, CurPtr, tok::unknown);
3537  // FIXME: The next token will not have LeadingSpace set.
3538  return true;
3539  }
3540 
3541  BufferPtr = CurPtr;
3542  Result.setFlag(Token::LeadingSpace);
3543  }
3544 
3545  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3546 
3547  // Read a character, advancing over it.
3548  char Char = getAndAdvanceChar(CurPtr, Result);
3550 
3551  if (!isVerticalWhitespace(Char))
3552  NewLinePtr = nullptr;
3553 
3554  switch (Char) {
3555  case 0: // Null.
3556  // Found end of file?
3557  if (CurPtr-1 == BufferEnd)
3558  return LexEndOfFile(Result, CurPtr-1);
3559 
3560  // Check if we are performing code completion.
3561  if (isCodeCompletionPoint(CurPtr-1)) {
3562  // Return the code-completion token.
3563  Result.startToken();
3564  FormTokenWithChars(Result, CurPtr, tok::code_completion);
3565  return true;
3566  }
3567 
3568  if (!isLexingRawMode())
3569  Diag(CurPtr-1, diag::null_in_file);
3570  Result.setFlag(Token::LeadingSpace);
3571  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3572  return true; // KeepWhitespaceMode
3573 
3574  // We know the lexer hasn't changed, so just try again with this lexer.
3575  // (We manually eliminate the tail call to avoid recursion.)
3576  goto LexNextToken;
3577 
3578  case 26: // DOS & CP/M EOF: "^Z".
3579  // If we're in Microsoft extensions mode, treat this as end of file.
3580  if (LangOpts.MicrosoftExt) {
3581  if (!isLexingRawMode())
3582  Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3583  return LexEndOfFile(Result, CurPtr-1);
3584  }
3585 
3586  // If Microsoft extensions are disabled, this is just random garbage.
3587  Kind = tok::unknown;
3588  break;
3589 
3590  case '\r':
3591  if (CurPtr[0] == '\n')
3592  (void)getAndAdvanceChar(CurPtr, Result);
3593  [[fallthrough]];
3594  case '\n':
3595  // If we are inside a preprocessor directive and we see the end of line,
3596  // we know we are done with the directive, so return an EOD token.
3598  // Done parsing the "line".
3600 
3601  // Restore comment saving mode, in case it was disabled for directive.
3602  if (PP)
3604 
3605  // Since we consumed a newline, we are back at the start of a line.
3606  IsAtStartOfLine = true;
3607  IsAtPhysicalStartOfLine = true;
3608  NewLinePtr = CurPtr - 1;
3609 
3610  Kind = tok::eod;
3611  break;
3612  }
3613 
3614  // No leading whitespace seen so far.
3615  Result.clearFlag(Token::LeadingSpace);
3616 
3617  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3618  return true; // KeepWhitespaceMode
3619 
3620  // We only saw whitespace, so just try again with this lexer.
3621  // (We manually eliminate the tail call to avoid recursion.)
3622  goto LexNextToken;
3623  case ' ':
3624  case '\t':
3625  case '\f':
3626  case '\v':
3627  SkipHorizontalWhitespace:
3628  Result.setFlag(Token::LeadingSpace);
3629  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3630  return true; // KeepWhitespaceMode
3631 
3632  SkipIgnoredUnits:
3633  CurPtr = BufferPtr;
3634 
3635  // If the next token is obviously a // or /* */ comment, skip it efficiently
3636  // too (without going through the big switch stmt).
3637  if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3638  LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3639  if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3640  return true; // There is a token to return.
3641  goto SkipIgnoredUnits;
3642  } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3643  if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3644  return true; // There is a token to return.
3645  goto SkipIgnoredUnits;
3646  } else if (isHorizontalWhitespace(*CurPtr)) {
3647  goto SkipHorizontalWhitespace;
3648  }
3649  // We only saw whitespace, so just try again with this lexer.
3650  // (We manually eliminate the tail call to avoid recursion.)
3651  goto LexNextToken;
3652 
3653  // C99 6.4.4.1: Integer Constants.
3654  // C99 6.4.4.2: Floating Constants.
3655  case '0': case '1': case '2': case '3': case '4':
3656  case '5': case '6': case '7': case '8': case '9':
3657  // Notify MIOpt that we read a non-whitespace/non-comment token.
3658  MIOpt.ReadToken();
3659  return LexNumericConstant(Result, CurPtr);
3660 
3661  // Identifier (e.g., uber), or
3662  // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or
3663  // UTF-8 or UTF-16 string literal (C11/C++11).
3664  case 'u':
3665  // Notify MIOpt that we read a non-whitespace/non-comment token.
3666  MIOpt.ReadToken();
3667 
3668  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3669  Char = getCharAndSize(CurPtr, SizeTmp);
3670 
3671  // UTF-16 string literal
3672  if (Char == '"')
3673  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3674  tok::utf16_string_literal);
3675 
3676  // UTF-16 character constant
3677  if (Char == '\'')
3678  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3679  tok::utf16_char_constant);
3680 
3681  // UTF-16 raw string literal
3682  if (Char == 'R' && LangOpts.CPlusPlus11 &&
3683  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3684  return LexRawStringLiteral(Result,
3685  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3686  SizeTmp2, Result),
3687  tok::utf16_string_literal);
3688 
3689  if (Char == '8') {
3690  char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3691 
3692  // UTF-8 string literal
3693  if (Char2 == '"')
3694  return LexStringLiteral(Result,
3695  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3696  SizeTmp2, Result),
3697  tok::utf8_string_literal);
3698  if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x))
3699  return LexCharConstant(
3700  Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3701  SizeTmp2, Result),
3702  tok::utf8_char_constant);
3703 
3704  if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3705  unsigned SizeTmp3;
3706  char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3707  // UTF-8 raw string literal
3708  if (Char3 == '"') {
3709  return LexRawStringLiteral(Result,
3710  ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3711  SizeTmp2, Result),
3712  SizeTmp3, Result),
3713  tok::utf8_string_literal);
3714  }
3715  }
3716  }
3717  }
3718 
3719  // treat u like the start of an identifier.
3720  return LexIdentifierContinue(Result, CurPtr);
3721 
3722  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3723  // Notify MIOpt that we read a non-whitespace/non-comment token.
3724  MIOpt.ReadToken();
3725 
3726  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3727  Char = getCharAndSize(CurPtr, SizeTmp);
3728 
3729  // UTF-32 string literal
3730  if (Char == '"')
3731  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3732  tok::utf32_string_literal);
3733 
3734  // UTF-32 character constant
3735  if (Char == '\'')
3736  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3737  tok::utf32_char_constant);
3738 
3739  // UTF-32 raw string literal
3740  if (Char == 'R' && LangOpts.CPlusPlus11 &&
3741  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3742  return LexRawStringLiteral(Result,
3743  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3744  SizeTmp2, Result),
3745  tok::utf32_string_literal);
3746  }
3747 
3748  // treat U like the start of an identifier.
3749  return LexIdentifierContinue(Result, CurPtr);
3750 
3751  case 'R': // Identifier or C++0x raw string literal
3752  // Notify MIOpt that we read a non-whitespace/non-comment token.
3753  MIOpt.ReadToken();
3754 
3755  if (LangOpts.CPlusPlus11) {
3756  Char = getCharAndSize(CurPtr, SizeTmp);
3757 
3758  if (Char == '"')
3759  return LexRawStringLiteral(Result,
3760  ConsumeChar(CurPtr, SizeTmp, Result),
3761  tok::string_literal);
3762  }
3763 
3764  // treat R like the start of an identifier.
3765  return LexIdentifierContinue(Result, CurPtr);
3766 
3767  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3768  // Notify MIOpt that we read a non-whitespace/non-comment token.
3769  MIOpt.ReadToken();
3770  Char = getCharAndSize(CurPtr, SizeTmp);
3771 
3772  // Wide string literal.
3773  if (Char == '"')
3774  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3775  tok::wide_string_literal);
3776 
3777  // Wide raw string literal.
3778  if (LangOpts.CPlusPlus11 && Char == 'R' &&
3779  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3780  return LexRawStringLiteral(Result,
3781  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3782  SizeTmp2, Result),
3783  tok::wide_string_literal);
3784 
3785  // Wide character constant.
3786  if (Char == '\'')
3787  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3788  tok::wide_char_constant);
3789  // FALL THROUGH, treating L like the start of an identifier.
3790  [[fallthrough]];
3791 
3792  // C99 6.4.2: Identifiers.
3793  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3794  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3795  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3796  case 'V': case 'W': case 'X': case 'Y': case 'Z':
3797  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3798  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3799  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3800  case 'v': case 'w': case 'x': case 'y': case 'z':
3801  case '_':
3802  // Notify MIOpt that we read a non-whitespace/non-comment token.
3803  MIOpt.ReadToken();
3804  return LexIdentifierContinue(Result, CurPtr);
3805 
3806  case '$': // $ in identifiers.
3807  if (LangOpts.DollarIdents) {
3808  if (!isLexingRawMode())
3809  Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3810  // Notify MIOpt that we read a non-whitespace/non-comment token.
3811  MIOpt.ReadToken();
3812  return LexIdentifierContinue(Result, CurPtr);
3813  }
3814 
3815  Kind = tok::unknown;
3816  break;
3817 
3818  // C99 6.4.4: Character Constants.
3819  case '\'':
3820  // Notify MIOpt that we read a non-whitespace/non-comment token.
3821  MIOpt.ReadToken();
3822  return LexCharConstant(Result, CurPtr, tok::char_constant);
3823 
3824  // C99 6.4.5: String Literals.
3825  case '"':
3826  // Notify MIOpt that we read a non-whitespace/non-comment token.
3827  MIOpt.ReadToken();
3828  return LexStringLiteral(Result, CurPtr,
3829  ParsingFilename ? tok::header_name
3830  : tok::string_literal);
3831 
3832  // C99 6.4.6: Punctuators.
3833  case '?':
3834  Kind = tok::question;
3835  break;
3836  case '[':
3837  Kind = tok::l_square;
3838  break;
3839  case ']':
3840  Kind = tok::r_square;
3841  break;
3842  case '(':
3843  Kind = tok::l_paren;
3844  break;
3845  case ')':
3846  Kind = tok::r_paren;
3847  break;
3848  case '{':
3849  Kind = tok::l_brace;
3850  break;
3851  case '}':
3852  Kind = tok::r_brace;
3853  break;
3854  case '.':
3855  Char = getCharAndSize(CurPtr, SizeTmp);
3856  if (Char >= '0' && Char <= '9') {
3857  // Notify MIOpt that we read a non-whitespace/non-comment token.
3858  MIOpt.ReadToken();
3859 
3860  return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3861  } else if (LangOpts.CPlusPlus && Char == '*') {
3862  Kind = tok::periodstar;
3863  CurPtr += SizeTmp;
3864  } else if (Char == '.' &&
3865  getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3866  Kind = tok::ellipsis;
3867  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3868  SizeTmp2, Result);
3869  } else {
3870  Kind = tok::period;
3871  }
3872  break;
3873  case '&':
3874  Char = getCharAndSize(CurPtr, SizeTmp);
3875  if (Char == '&') {
3876  Kind = tok::ampamp;
3877  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3878  } else if (Char == '=') {
3879  Kind = tok::ampequal;
3880  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3881  } else {
3882  Kind = tok::amp;
3883  }
3884  break;
3885  case '*':
3886  if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3887  Kind = tok::starequal;
3888  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3889  } else {
3890  Kind = tok::star;
3891  }
3892  break;
3893  case '+':
3894  Char = getCharAndSize(CurPtr, SizeTmp);
3895  if (Char == '+') {
3896  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3897  Kind = tok::plusplus;
3898  } else if (Char == '=') {
3899  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3900  Kind = tok::plusequal;
3901  } else {
3902  Kind = tok::plus;
3903  }
3904  break;
3905  case '-':
3906  Char = getCharAndSize(CurPtr, SizeTmp);
3907  if (Char == '-') { // --
3908  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3909  Kind = tok::minusminus;
3910  } else if (Char == '>' && LangOpts.CPlusPlus &&
3911  getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3912  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3913  SizeTmp2, Result);
3914  Kind = tok::arrowstar;
3915  } else if (Char == '>') { // ->
3916  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3917  Kind = tok::arrow;
3918  } else if (Char == '=') { // -=
3919  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3920  Kind = tok::minusequal;
3921  } else {
3922  Kind = tok::minus;
3923  }
3924  break;
3925  case '~':
3926  Kind = tok::tilde;
3927  break;
3928  case '!':
3929  if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3930  Kind = tok::exclaimequal;
3931  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3932  } else {
3933  Kind = tok::exclaim;
3934  }
3935  break;
3936  case '/':
3937  // 6.4.9: Comments
3938  Char = getCharAndSize(CurPtr, SizeTmp);
3939  if (Char == '/') { // Line comment.
3940  // Even if Line comments are disabled (e.g. in C89 mode), we generally
3941  // want to lex this as a comment. There is one problem with this though,
3942  // that in one particular corner case, this can change the behavior of the
3943  // resultant program. For example, In "foo //**/ bar", C89 would lex
3944  // this as "foo / bar" and languages with Line comments would lex it as
3945  // "foo". Check to see if the character after the second slash is a '*'.
3946  // If so, we will lex that as a "/" instead of the start of a comment.
3947  // However, we never do this if we are just preprocessing.
3948  bool TreatAsComment =
3949  LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
3950  if (!TreatAsComment)
3951  if (!(PP && PP->isPreprocessedOutput()))
3952  TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3953 
3954  if (TreatAsComment) {
3955  if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3956  TokAtPhysicalStartOfLine))
3957  return true; // There is a token to return.
3958 
3959  // It is common for the tokens immediately after a // comment to be
3960  // whitespace (indentation for the next line). Instead of going through
3961  // the big switch, handle it efficiently now.
3962  goto SkipIgnoredUnits;
3963  }
3964  }
3965 
3966  if (Char == '*') { // /**/ comment.
3967  if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3968  TokAtPhysicalStartOfLine))
3969  return true; // There is a token to return.
3970 
3971  // We only saw whitespace, so just try again with this lexer.
3972  // (We manually eliminate the tail call to avoid recursion.)
3973  goto LexNextToken;
3974  }
3975 
3976  if (Char == '=') {
3977  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3978  Kind = tok::slashequal;
3979  } else {
3980  Kind = tok::slash;
3981  }
3982  break;
3983  case '%':
3984  Char = getCharAndSize(CurPtr, SizeTmp);
3985  if (Char == '=') {
3986  Kind = tok::percentequal;
3987  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3988  } else if (LangOpts.Digraphs && Char == '>') {
3989  Kind = tok::r_brace; // '%>' -> '}'
3990  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3991  } else if (LangOpts.Digraphs && Char == ':') {
3992  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3993  Char = getCharAndSize(CurPtr, SizeTmp);
3994  if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
3995  Kind = tok::hashhash; // '%:%:' -> '##'
3996  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3997  SizeTmp2, Result);
3998  } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
3999  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4000  if (!isLexingRawMode())
4001  Diag(BufferPtr, diag::ext_charize_microsoft);
4002  Kind = tok::hashat;
4003  } else { // '%:' -> '#'
4004  // We parsed a # character. If this occurs at the start of the line,
4005  // it's actually the start of a preprocessing directive. Callback to
4006  // the preprocessor to handle it.
4007  // TODO: -fpreprocessed mode??
4008  if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4009  goto HandleDirective;
4010 
4011  Kind = tok::hash;
4012  }
4013  } else {
4014  Kind = tok::percent;
4015  }
4016  break;
4017  case '<':
4018  Char = getCharAndSize(CurPtr, SizeTmp);
4019  if (ParsingFilename) {
4020  return LexAngledStringLiteral(Result, CurPtr);
4021  } else if (Char == '<') {
4022  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4023  if (After == '=') {
4024  Kind = tok::lesslessequal;
4025  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4026  SizeTmp2, Result);
4027  } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4028  // If this is actually a '<<<<<<<' version control conflict marker,
4029  // recognize it as such and recover nicely.
4030  goto LexNextToken;
4031  } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4032  // If this is '<<<<' and we're in a Perforce-style conflict marker,
4033  // ignore it.
4034  goto LexNextToken;
4035  } else if (LangOpts.CUDA && After == '<') {
4036  Kind = tok::lesslessless;
4037  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4038  SizeTmp2, Result);
4039  } else {
4040  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4041  Kind = tok::lessless;
4042  }
4043  } else if (Char == '=') {
4044  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4045  if (After == '>') {
4046  if (LangOpts.CPlusPlus20) {
4047  if (!isLexingRawMode())
4048  Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4049  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4050  SizeTmp2, Result);
4051  Kind = tok::spaceship;
4052  break;
4053  }
4054  // Suggest adding a space between the '<=' and the '>' to avoid a
4055  // change in semantics if this turns up in C++ <=17 mode.
4056  if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4057  Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4059  getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4060  }
4061  }
4062  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4063  Kind = tok::lessequal;
4064  } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4065  if (LangOpts.CPlusPlus11 &&
4066  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4067  // C++0x [lex.pptoken]p3:
4068  // Otherwise, if the next three characters are <:: and the subsequent
4069  // character is neither : nor >, the < is treated as a preprocessor
4070  // token by itself and not as the first character of the alternative
4071  // token <:.
4072  unsigned SizeTmp3;
4073  char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4074  if (After != ':' && After != '>') {
4075  Kind = tok::less;
4076  if (!isLexingRawMode())
4077  Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4078  break;
4079  }
4080  }
4081 
4082  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4083  Kind = tok::l_square;
4084  } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4085  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4086  Kind = tok::l_brace;
4087  } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4088  lexEditorPlaceholder(Result, CurPtr)) {
4089  return true;
4090  } else {
4091  Kind = tok::less;
4092  }
4093  break;
4094  case '>':
4095  Char = getCharAndSize(CurPtr, SizeTmp);
4096  if (Char == '=') {
4097  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4098  Kind = tok::greaterequal;
4099  } else if (Char == '>') {
4100  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4101  if (After == '=') {
4102  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4103  SizeTmp2, Result);
4104  Kind = tok::greatergreaterequal;
4105  } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4106  // If this is actually a '>>>>' conflict marker, recognize it as such
4107  // and recover nicely.
4108  goto LexNextToken;
4109  } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4110  // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4111  goto LexNextToken;
4112  } else if (LangOpts.CUDA && After == '>') {
4113  Kind = tok::greatergreatergreater;
4114  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4115  SizeTmp2, Result);
4116  } else {
4117  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4118  Kind = tok::greatergreater;
4119  }
4120  } else {
4121  Kind = tok::greater;
4122  }
4123  break;
4124  case '^':
4125  Char = getCharAndSize(CurPtr, SizeTmp);
4126  if (Char == '=') {
4127  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4128  Kind = tok::caretequal;
4129  } else if (LangOpts.OpenCL && Char == '^') {
4130  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4131  Kind = tok::caretcaret;
4132  } else {
4133  Kind = tok::caret;
4134  }
4135  break;
4136  case '|':
4137  Char = getCharAndSize(CurPtr, SizeTmp);
4138  if (Char == '=') {
4139  Kind = tok::pipeequal;
4140  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4141  } else if (Char == '|') {
4142  // If this is '|||||||' and we're in a conflict marker, ignore it.
4143  if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4144  goto LexNextToken;
4145  Kind = tok::pipepipe;
4146  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4147  } else {
4148  Kind = tok::pipe;
4149  }
4150  break;
4151  case ':':
4152  Char = getCharAndSize(CurPtr, SizeTmp);
4153  if (LangOpts.Digraphs && Char == '>') {
4154  Kind = tok::r_square; // ':>' -> ']'
4155  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4156  } else if ((LangOpts.CPlusPlus ||
4157  LangOpts.DoubleSquareBracketAttributes) &&
4158  Char == ':') {
4159  Kind = tok::coloncolon;
4160  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4161  } else {
4162  Kind = tok::colon;
4163  }
4164  break;
4165  case ';':
4166  Kind = tok::semi;
4167  break;
4168  case '=':
4169  Char = getCharAndSize(CurPtr, SizeTmp);
4170  if (Char == '=') {
4171  // If this is '====' and we're in a conflict marker, ignore it.
4172  if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4173  goto LexNextToken;
4174 
4175  Kind = tok::equalequal;
4176  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4177  } else {
4178  Kind = tok::equal;
4179  }
4180  break;
4181  case ',':
4182  Kind = tok::comma;
4183  break;
4184  case '#':
4185  Char = getCharAndSize(CurPtr, SizeTmp);
4186  if (Char == '#') {
4187  Kind = tok::hashhash;
4188  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4189  } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4190  Kind = tok::hashat;
4191  if (!isLexingRawMode())
4192  Diag(BufferPtr, diag::ext_charize_microsoft);
4193  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4194  } else {
4195  // We parsed a # character. If this occurs at the start of the line,
4196  // it's actually the start of a preprocessing directive. Callback to
4197  // the preprocessor to handle it.
4198  // TODO: -fpreprocessed mode??
4199  if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4200  goto HandleDirective;
4201 
4202  Kind = tok::hash;
4203  }
4204  break;
4205 
4206  case '@':
4207  // Objective C support.
4208  if (CurPtr[-1] == '@' && LangOpts.ObjC)
4209  Kind = tok::at;
4210  else
4211  Kind = tok::unknown;
4212  break;
4213 
4214  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4215  case '\\':
4216  if (!LangOpts.AsmPreprocessor) {
4217  if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4218  if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4219  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4220  return true; // KeepWhitespaceMode
4221 
4222  // We only saw whitespace, so just try again with this lexer.
4223  // (We manually eliminate the tail call to avoid recursion.)
4224  goto LexNextToken;
4225  }
4226 
4227  return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4228  }
4229  }
4230 
4231  Kind = tok::unknown;
4232  break;
4233 
4234  default: {
4235  if (isASCII(Char)) {
4236  Kind = tok::unknown;
4237  break;
4238  }
4239 
4240  llvm::UTF32 CodePoint;
4241 
4242  // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4243  // an escaped newline.
4244  --CurPtr;
4245  llvm::ConversionResult Status =
4246  llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4247  (const llvm::UTF8 *)BufferEnd,
4248  &CodePoint,
4249  llvm::strictConversion);
4250  if (Status == llvm::conversionOK) {
4251  if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4252  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4253  return true; // KeepWhitespaceMode
4254 
4255  // We only saw whitespace, so just try again with this lexer.
4256  // (We manually eliminate the tail call to avoid recursion.)
4257  goto LexNextToken;
4258  }
4259  return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4260  }
4261 
4263  PP->isPreprocessedOutput()) {
4264  ++CurPtr;
4265  Kind = tok::unknown;
4266  break;
4267  }
4268 
4269  // Non-ASCII characters tend to creep into source code unintentionally.
4270  // Instead of letting the parser complain about the unknown token,
4271  // just diagnose the invalid UTF-8, then drop the character.
4272  Diag(CurPtr, diag::err_invalid_utf8);
4273 
4274  BufferPtr = CurPtr+1;
4275  // We're pretending the character didn't exist, so just try again with
4276  // this lexer.
4277  // (We manually eliminate the tail call to avoid recursion.)
4278  goto LexNextToken;
4279  }
4280  }
4281 
4282  // Notify MIOpt that we read a non-whitespace/non-comment token.
4283  MIOpt.ReadToken();
4284 
4285  // Update the location of token as well as BufferPtr.
4286  FormTokenWithChars(Result, CurPtr, Kind);
4287  return true;
4288 
4289 HandleDirective:
4290  // We parsed a # character and it's the start of a preprocessing directive.
4291 
4292  FormTokenWithChars(Result, CurPtr, tok::hash);
4293  PP->HandleDirective(Result);
4294 
4296  // With a fatal failure in the module loader, we abort parsing.
4297  assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
4298  return true;
4299  }
4300 
4301  // We parsed the directive; lex a token with the new state.
4302  return false;
4303 
4304 LexNextToken:
4305  Result.clearFlag(Token::NeedsCleaning);
4306  goto LexStart;
4307 }
4308 
4309 const char *Lexer::convertDependencyDirectiveToken(
4310  const dependency_directives_scan::Token &DDTok, Token &Result) {
4311  const char *TokPtr = BufferStart + DDTok.Offset;
4312  Result.startToken();
4313  Result.setLocation(getSourceLocation(TokPtr));
4314  Result.setKind(DDTok.Kind);
4315  Result.setFlag((Token::TokenFlags)DDTok.Flags);
4316  Result.setLength(DDTok.Length);
4317  BufferPtr = TokPtr + DDTok.Length;
4318  return TokPtr;
4319 }
4320 
4321 bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4322  assert(isDependencyDirectivesLexer());
4323 
4324  using namespace dependency_directives_scan;
4325 
4326  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4327  if (DepDirectives.front().Kind == pp_eof)
4328  return LexEndOfFile(Result, BufferEnd);
4329  if (DepDirectives.front().Kind == tokens_present_before_eof)
4330  MIOpt.ReadToken();
4331  NextDepDirectiveTokenIndex = 0;
4332  DepDirectives = DepDirectives.drop_front();
4333  }
4334 
4335  const dependency_directives_scan::Token &DDTok =
4336  DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4337  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4338  // Read something other than a preprocessor directive hash.
4339  MIOpt.ReadToken();
4340  }
4341 
4342  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4343 
4344  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4345  PP->HandleDirective(Result);
4346  return false;
4347  }
4348  if (Result.is(tok::raw_identifier)) {
4349  Result.setRawIdentifierData(TokPtr);
4350  if (!isLexingRawMode()) {
4351  IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4352  if (II->isHandleIdentifierCase())
4353  return PP->HandleIdentifier(Result);
4354  }
4355  return true;
4356  }
4357  if (Result.isLiteral()) {
4358  Result.setLiteralData(TokPtr);
4359  return true;
4360  }
4361  if (Result.is(tok::colon) &&
4362  (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) {
4363  // Convert consecutive colons to 'tok::coloncolon'.
4364  if (*BufferPtr == ':') {
4365  assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4366  tok::colon));
4367  ++NextDepDirectiveTokenIndex;
4368  Result.setKind(tok::coloncolon);
4369  }
4370  return true;
4371  }
4372  if (Result.is(tok::eod))
4374 
4375  return true;
4376 }
4377 
4378 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4379  assert(isDependencyDirectivesLexer());
4380 
4381  using namespace dependency_directives_scan;
4382 
4383  bool Stop = false;
4384  unsigned NestedIfs = 0;
4385  do {
4386  DepDirectives = DepDirectives.drop_front();
4387  switch (DepDirectives.front().Kind) {
4388  case pp_none:
4389  llvm_unreachable("unexpected 'pp_none'");
4390  case pp_include:
4391  case pp___include_macros:
4392  case pp_define:
4393  case pp_undef:
4394  case pp_import:
4395  case pp_pragma_import:
4396  case pp_pragma_once:
4397  case pp_pragma_push_macro:
4398  case pp_pragma_pop_macro:
4400  case pp_include_next:
4401  case decl_at_import:
4402  case cxx_module_decl:
4403  case cxx_import_decl:
4407  break;
4408  case pp_if:
4409  case pp_ifdef:
4410  case pp_ifndef:
4411  ++NestedIfs;
4412  break;
4413  case pp_elif:
4414  case pp_elifdef:
4415  case pp_elifndef:
4416  case pp_else:
4417  if (!NestedIfs) {
4418  Stop = true;
4419  }
4420  break;
4421  case pp_endif:
4422  if (!NestedIfs) {
4423  Stop = true;
4424  } else {
4425  --NestedIfs;
4426  }
4427  break;
4428  case pp_eof:
4429  NextDepDirectiveTokenIndex = 0;
4430  return LexEndOfFile(Result, BufferEnd);
4431  }
4432  } while (!Stop);
4433 
4434  const dependency_directives_scan::Token &DDTok =
4435  DepDirectives.front().Tokens.front();
4436  assert(DDTok.is(tok::hash));
4437  NextDepDirectiveTokenIndex = 1;
4438 
4439  convertDependencyDirectiveToken(DDTok, Result);
4440  return false;
4441 }
clang::Token::startToken
void startToken()
Reset all flags to cleared.
Definition: Token.h:171
clang::Lexer::isAtEndOfMacroExpansion
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:831
clang::Lexer::makeFileCharRange
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:892
clang::dependency_directives_scan::tokens_present_before_eof
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
Definition: DependencyDirectivesScanner.h:87
clang::Preprocessor::CreateString
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
Definition: Preprocessor.cpp:488
clang::dependency_directives_scan::pp_undef
@ pp_undef
Definition: DependencyDirectivesScanner.h:64
getBeginningOfFileToken
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:513
clang::dependency_directives_scan::cxx_import_decl
@ cxx_import_decl
Definition: DependencyDirectivesScanner.h:82
_mm_cmpeq_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3023
clang::Token::hasUCN
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:298
isEndOfBlockCommentWithEscapedNewLine
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2570
clang::dependency_directives_scan::cxx_export_module_decl
@ cxx_export_module_decl
Definition: DependencyDirectivesScanner.h:83
clang::Lexer::SetCommentRetentionState
void SetCommentRetentionState(bool Mode)
SetCommentRetentionMode - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
clang::DiagnosticBuilder
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1266
isInExpansionTokenRange
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:885
clang::CMK_Perforce
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
clang::CharSourceRange::getBegin
SourceLocation getBegin() const
Definition: SourceLocation.h:283
clang::Preprocessor::setRecordedPreambleConditionalStack
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
Definition: Preprocessor.h:2580
clang::Token::isObjCAtKeyword
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:58
clang::DeclaratorContext::File
@ File
clang::SourceRange
A trivial tuple used to represent a source range.
Definition: SourceLocation.h:210
string
string(SUBSTRING ${CMAKE_CURRENT_BINARY_DIR} 0 ${PATH_LIB_START} PATH_HEAD) string(SUBSTRING $
Definition: CMakeLists.txt:22
clang::Lexer::getRawToken
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:462
clang::Lexer::seek
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:230
clang::Lexer::isAsciiIdentifierContinueChar
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1069
clang::Preprocessor::setCodeCompletionTokenRange
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
Definition: Preprocessor.h:1448
clang::FixItHint::CreateInsertion
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:97
clang::Preprocessor::HandleIdentifier
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
Definition: Preprocessor.cpp:789
clang::ConflictMarkerKind
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
clang::SrcMgr::ExpansionInfo::isMacroArgExpansion
bool isMacroArgExpansion() const
Definition: SourceManager.h:393
GetTrigraphCharForLetter
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1177
clang::Token::getLiteralData
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:219
clang::Lexer::getSpelling
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:404
clang::IdentifierTable::get
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Definition: IdentifierTable.h:597
clang::MultipleIncludeOpt::ReadToken
void ReadToken()
Definition: MultipleIncludeOpt.h:106
clang::if
if(T->getSizeExpr()) TRY_TO(TraverseStmt(T -> getSizeExpr()))
Definition: RecursiveASTVisitor.h:1081
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:86
clang::DiagnosticsEngine::isIgnored
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:911
clang::SourceLocation::getLocWithOffset
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
Definition: SourceLocation.h:134
clang::Preprocessor::getCommentRetentionState
bool getCommentRetentionState() const
Definition: Preprocessor.h:1106
clang::dependency_directives_scan::pp_ifndef
@ pp_ifndef
Definition: DependencyDirectivesScanner.h:74
clang::Lexer::inKeepCommentMode
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
clang::PreprocessorLexer
Definition: PreprocessorLexer.h:30
clang::Lexer::Create_PragmaLexer
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:195
clang::PreprocessorLexer::ConditionalStack
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
Definition: PreprocessorLexer.h:76
FindConflictEnd
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3054
AttributeLangSupport::C
@ C
Definition: SemaDeclAttr.cpp:56
clang::Token::getIdentifierInfo
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:181
clang::PreprocessorLexer::MIOpt
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
Definition: PreprocessorLexer.h:72
clang::DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
clang::dependency_directives_scan::Token::is
bool is(tok::TokenKind K) const
Definition: DependencyDirectivesScanner.h:47
memcpy
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
Definition: __clang_cuda_device_functions.h:1549
int
__device__ int
Definition: __clang_hip_libdevice_declares.h:63
clang::PreprocessorLexer::ParsingFilename
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
Definition: PreprocessorLexer.h:53
clang::Token::isAtStartOfLine
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:270
clang::Preprocessor::CodeCompleteIncludedFile
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
Definition: Preprocessor.cpp:450
llvm::Optional
Definition: LLVM.h:40
clang::Lexer::getSourceText
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:961
SourceManager.h
XIDContinueRanges
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
Definition: UnicodeCharSets.h:242
clang::PreprocessorLexer::LexingRawMode
bool LexingRawMode
True if in raw mode.
Definition: PreprocessorLexer.h:68
clang::Token::IsEditorPlaceholder
@ IsEditorPlaceholder
Definition: Token.h:87
vec_any_eq
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16021
clang::Lexer::isKeepWhitespaceMode
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
clang::Lexer::MeasureTokenLength
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:451
clang::Lexer::LexFromRawLexer
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
clang::Token
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
clang::tok::isStringLiteral
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:81
LiteralSupport.h
End
SourceLocation End
Definition: USRLocFinder.cpp:167
clang::StringLiteralParser::isValidUDSuffix
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Definition: LiteralSupport.cpp:2290
clang::Preprocessor::getIdentifierTable
IdentifierTable & getIdentifierTable()
Definition: Preprocessor.h:1068
clang::isPreprocessingNumberBody
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:153
clang::SourceManager
This class handles loading and caching of source files into memory.
Definition: SourceManager.h:637
clang::Lexer::Stringify
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:262
Preprocessor.h
clang::dependency_directives_scan::Token::Offset
unsigned Offset
Offset into the original source input.
Definition: DependencyDirectivesScanner.h:36
clang::Lexer
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
Offset
unsigned Offset
Definition: Format.cpp:2717
TokenKinds.h
U
clang::CharSourceRange::getCharRange
static CharSourceRange getCharRange(SourceRange R)
Definition: SourceLocation.h:265
diagnoseInvalidUnicodeCodepointInIdentifier
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1621
clang::dependency_directives_scan::pp_endif
@ pp_endif
Definition: DependencyDirectivesScanner.h:79
length
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x2 + p.y 2 + ...)
clang::CMK_Normal
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
clang::dependency_directives_scan::pp_if
@ pp_if
Definition: DependencyDirectivesScanner.h:72
clang::dependency_directives_scan::Token::Kind
tok::TokenKind Kind
Definition: DependencyDirectivesScanner.h:38
clang::dependency_directives_scan::pp_eof
@ pp_eof
Definition: DependencyDirectivesScanner.h:88
clang::Lexer::Diag
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1167
isAllowedIDChar
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts)
Definition: Lexer.cpp:1460
clang::SrcMgr::ExpansionInfo
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
Definition: SourceManager.h:354
MultipleIncludeOpt.h
clang::dependency_directives_scan::Token::Length
unsigned Length
Definition: DependencyDirectivesScanner.h:37
clang::Token::getKind
tok::TokenKind getKind() const
Definition: Token.h:93
LangOptions.h
clang::Lexer::isNewLineEscaped
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1073
makeRangeFromFileLocs
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:857
Diagnostic.h
LexDiagnostic.h
clang::dependency_directives_scan::pp_define
@ pp_define
Definition: DependencyDirectivesScanner.h:63
_mm_set1_epi8
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3652
intptr_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
Definition: opencl-c-base.h:133
clang::dependency_directives_scan::pp_elif
@ pp_elif
Definition: DependencyDirectivesScanner.h:75
clang::Preprocessor::Diag
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Definition: Preprocessor.h:1898
GetMappedTokenLoc
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1124
clang::Token::is
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:98
clang::Lexer::Lexer
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:136
IdentifierTable.h
clang::Token::TokenFlags
TokenFlags
Definition: Token.h:73
clang::SrcMgr::ExpansionInfo::getSpellingLoc
SourceLocation getSpellingLoc() const
Definition: SourceManager.h:372
clang::Lexer::getTokenPrefixLength
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:727
llvm::SmallString
Definition: LLVM.h:37
clang::Token::needsCleaning
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:287
clang::Lexer::getIndentationForLine
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1093
clang::Token::LeadingEmptyMacro
@ LeadingEmptyMacro
Definition: Token.h:80
clang::PreprocessorLexer::ParsingPreprocessorDirective
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
Definition: PreprocessorLexer.h:50
clang::isRawStringDelimBody
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:160
clang::PreambleBounds
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
clang::PreprocessorLexer::FID
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
Definition: PreprocessorLexer.h:40
C11DisallowedInitialIDCharRanges
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
Definition: UnicodeCharSets.h:619
clang::dependency_directives_scan::pp_elifdef
@ pp_elifdef
Definition: DependencyDirectivesScanner.h:76
clang::SourceLocation::isFileID
bool isFileID() const
Definition: SourceLocation.h:102
clang::tok::ObjCKeywordKind
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
clang::Lexer::ReadToEndOfLine
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:2878
clang::PreprocessorLexer::PP
Preprocessor * PP
Definition: PreprocessorLexer.h:37
getSpellingSlow
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:277
vec_any_ge
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16229
clang::Lexer::findNextToken
static Optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1259
altivec.h
clang::Lexer::getImmediateMacroName
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:997
clang::dependency_directives_scan::pp___include_macros
@ pp___include_macros
Definition: DependencyDirectivesScanner.h:62
clang::dependency_directives_scan::cxx_module_decl
@ cxx_module_decl
Definition: DependencyDirectivesScanner.h:81
clang::IdentifierInfo::getObjCKeywordID
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
Definition: IdentifierTable.h:292
Line
const AnnotatedLine * Line
Definition: UsingDeclarationsSorter.cpp:68
clang::PreprocessorLexer::isLexingRawMode
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
Definition: PreprocessorLexer.h:142
clang::Lexer::getSourceLocation
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1148
findPlaceholderEnd
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3158
clang::Token::getRawIdentifier
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:207
clang::dependency_directives_scan::pp_pragma_once
@ pp_pragma_once
Definition: DependencyDirectivesScanner.h:67
clang::Token::isLiteral
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:115
clang::isAsciiIdentifierContinue
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c, bool AllowDollar=false)
Returns true if this is a body character of a C identifier, which is [a-zA-Z0-9_].
Definition: CharInfo.h:64
clang::Token::getLength
unsigned getLength() const
Definition: Token.h:129
clang::Token::LeadingSpace
@ LeadingSpace
Definition: Token.h:76
makeCharRange
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1506
clang::Preprocessor::getSourceManager
SourceManager & getSourceManager() const
Definition: Preprocessor.h:1065
clang::Preprocessor::HandleComment
bool HandleComment(Token &result, SourceRange Comment)
Definition: Preprocessor.cpp:1408
clang::Preprocessor::getSpelling
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
Definition: Preprocessor.h:1913
clang::Decl::setLocation
void setLocation(SourceLocation L)
Definition: DeclBase.h:433
clang::isWhitespace
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:93
UnicodeWhitespaceCharRanges
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
Definition: UnicodeCharSets.h:636
clang::SrcMgr::SLocEntry::getExpansion
const ExpansionInfo & getExpansion() const
Definition: SourceManager.h:496
SourceLocation.h
clang::tok::TokenKind
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
clang::Token::HasUDSuffix
@ HasUDSuffix
Definition: Token.h:81
clang::Preprocessor::CodeCompleteNaturalLanguage
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
Definition: Preprocessor.cpp:457
P
StringRef P
Definition: ASTMatchersInternal.cpp:564
clang::dependency_directives_scan::Token::Flags
unsigned short Flags
Definition: DependencyDirectivesScanner.h:39
clang::Preprocessor::HandleDirective
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
Definition: PPDirectives.cpp:1135
CharInfo.h
clang::Lexer::getBufferLocation
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
clang::syntax::NodeRole::Size
@ Size
clang::dependency_directives_scan::pp_pragma_import
@ pp_pragma_import
Definition: DependencyDirectivesScanner.h:66
clang::Preprocessor::hadModuleLoaderFatalFailure
bool hadModuleLoaderFatalFailure() const
Definition: Preprocessor.h:1085
clang::Preprocessor::getCodeCompletionFileLoc
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
Definition: Preprocessor.h:1817
clang::dependency_directives_scan::pp_elifndef
@ pp_elifndef
Definition: DependencyDirectivesScanner.h:77
clang::dependency_directives_scan::pp_pragma_push_macro
@ pp_pragma_push_macro
Definition: DependencyDirectivesScanner.h:68
StringifyImpl
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:237
clang::Preprocessor::isCodeCompletionEnabled
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
Definition: Preprocessor.h:1805
clang::dependency_directives_scan::pp_else
@ pp_else
Definition: DependencyDirectivesScanner.h:78
clang::SrcMgr::ExpansionInfo::getExpansionLocStart
SourceLocation getExpansionLocStart() const
Definition: SourceManager.h:376
Begin
SourceLocation Begin
Definition: USRLocFinder.cpp:165
clang::dependency_directives_scan::pp_import
@ pp_import
Definition: DependencyDirectivesScanner.h:65
Lexer.h
Value
Value
Definition: UninitializedValues.cpp:103
clang::Preprocessor::setCodeCompletionIdentifierInfo
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
Definition: Preprocessor.h:1442
clang::SourceManager::getCharacterData
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
Definition: SourceManager.cpp:1160
isAllowedInitiallyIDChar
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts)
Definition: Lexer.cpp:1485
clang::frontend::After
@ After
Like System, but searched after the system directories.
Definition: HeaderSearchOptions.h:61
clang::CharSourceRange
Represents a character-granular source range.
Definition: SourceLocation.h:253
findBeginningOfLine
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:496
LLVM.h
clang::Token::StartOfLine
@ StartOfLine
Definition: Token.h:74
C99DisallowedInitialIDCharRanges
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
Definition: UnicodeCharSets.h:627
clang::dependency_directives_scan::pp_include
@ pp_include
Definition: DependencyDirectivesScanner.h:61
clang::CharSourceRange::getEnd
SourceLocation getEnd() const
Definition: SourceLocation.h:284
clang::Preprocessor::isPreprocessedOutput
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
Definition: Preprocessor.h:1127
clang::IdentifierInfo
One of these records is kept for each identifier that is lexed.
Definition: IdentifierTable.h:85
clang::Token::getLocation
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:126
clang::SourceLocation::getFromRawEncoding
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
Definition: SourceLocation.h:152
clang::SourceLocation::isMacroID
bool isMacroID() const
Definition: SourceLocation.h:103
clang::IdentifierInfo::isHandleIdentifierCase
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
Definition: IdentifierTable.h:381
clang::LangOptions
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:81
clang::Lexer::SetKeepWhitespaceMode
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::Lexer::getImmediateMacroNameForDiagnostics
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1044
clang::Token::isAnnotation
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:120
clang::Preprocessor::getPreprocessorOpts
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
Definition: Preprocessor.h:1056
clang::dependency_directives_scan::Token
Token lexed as part of dependency directive scanning.
Definition: DependencyDirectivesScanner.h:34
clang::dependency_directives_scan::decl_at_import
@ decl_at_import
Definition: DependencyDirectivesScanner.h:80
clang::dependency_directives_scan::pp_include_next
@ pp_include_next
Definition: DependencyDirectivesScanner.h:71
maybeDiagnoseUTF8Homoglyph
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1541
clang::SourceLocation::isInvalid
bool isInvalid() const
Definition: SourceLocation.h:111
clang::Preprocessor::getCodeCompletionLoc
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
Definition: Preprocessor.h:1811
clang::isVerticalWhitespace
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:84
clang
Definition: CalledOnceCheck.h:17
clang::Lexer::isAtStartOfMacroExpansion
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:809
clang::PreprocessorOptions::LexEditorPlaceholders
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Definition: PreprocessorOptions.h:149
clang::Preprocessor::LookUpIdentifierInfo
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
Definition: Preprocessor.cpp:712
_mm_movemask_epi8
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4204
maybeDiagnoseIDCharCompat
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1512
clang::dependency_directives_scan::pp_ifdef
@ pp_ifdef
Definition: DependencyDirectivesScanner.h:73
isUnicodeWhitespace
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1454
clang::Preprocessor::getDiagnostics
DiagnosticsEngine & getDiagnostics() const
Definition: Preprocessor.h:1058
clang::Lexer::GetBeginningOfToken
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location any where in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:561
clang::Lexer::isPragmaLexer
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
clang::operator<
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
Definition: DeclarationName.h:549
clang::FileID
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Definition: SourceLocation.h:38
C99AllowedIDCharRanges
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
Definition: UnicodeCharSets.h:400
clang::SourceLocation::isValid
bool isValid() const
Return true if this is a valid SourceLocation object.
Definition: SourceLocation.h:110
clang::FileID::isInvalid
bool isInvalid() const
Definition: SourceLocation.h:45
clang::dependency_directives_scan::pp_pragma_include_alias
@ pp_pragma_include_alias
Definition: DependencyDirectivesScanner.h:70
clang::FixItHint::CreateRemoval
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:123
clang::Preprocessor::isInPrimaryFile
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
Definition: PPLexerChange.cpp:34
clang::Token::NeedsCleaning
@ NeedsCleaning
Definition: Token.h:79
UnicodeCharSets.h
clang::MultipleIncludeOpt::ExitTopLevelConditional
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
Definition: MultipleIncludeOpt.h:149
clang::Lexer::resetExtendedTokenMode
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:172
clang::dependency_directives_scan::cxx_export_import_decl
@ cxx_export_import_decl
Definition: DependencyDirectivesScanner.h:84
c
__device__ __2f16 float c
Definition: __clang_hip_libdevice_declares.h:320
clang::isHorizontalWhitespace
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:76
DecodeTrigraphChar
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1196
clang::Token::setFlag
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:238
clang::Lexer::getLocForEndOfToken
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:787
clang::comments::tok::eof
@ eof
Definition: CommentLexer.h:33
clang::Lexer::ComputePreamble
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:589
llvm::SmallVectorImpl< char >
clang::Token::HasUCN
@ HasUCN
Definition: Token.h:82
clang::Lexer::findLocationAfterToken
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1292
clang::Lexer::getCharAndSizeNoWarn
static char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:580
PreprocessorOptions.h
clang::Preprocessor
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:129
clang::Preprocessor::isRecordingPreamble
bool isRecordingPreamble() const
Definition: Preprocessor.h:2568
clang::LineComment
@ LineComment
Definition: LangStandard.h:48
clang::Token::getObjCKeywordID
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:67
SM
#define SM(sm)
Definition: Cuda.cpp:79
C11AllowedIDCharRanges
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
Definition: UnicodeCharSets.h:370
clang::Lexer::getSourceLocation
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
Token.h
clang::Preprocessor::getEmptylineHandler
EmptylineHandler * getEmptylineHandler() const
Definition: Preprocessor.h:1408
XIDStartRanges
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
Definition: UnicodeCharSets.h:14
clang::isAsciiIdentifierStart
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:54
clang::Decl::getLocation
SourceLocation getLocation() const
Definition: DeclBase.h:432
clang::RISCV::Invalid
@ Invalid
Definition: RISCVVIntrinsicUtils.h:171
clang::dependency_directives_scan::pp_pragma_pop_macro
@ pp_pragma_pop_macro
Definition: DependencyDirectivesScanner.h:69
clang::SourceLocation::UIntTy
uint32_t UIntTy
Definition: SourceLocation.h:93
clang::isASCII
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:42
clang::SourceLocation::getRawEncoding
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
Definition: SourceLocation.h:146
clang::DiagnosticsEngine::Report
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1537
clang::CMK_None
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
clang::Preprocessor::HandleEndOfFile
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
Definition: PPLexerChange.cpp:331
clang::dependency_directives_scan::pp_none
@ pp_none
Definition: DependencyDirectivesScanner.h:60
clang::SrcMgr::SLocEntry
This is a discriminated union of FileInfo and ExpansionInfo.
Definition: SourceManager.h:474