clang 17.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MathExtras.h"
36#include "llvm/Support/MemoryBufferRef.h"
37#include "llvm/Support/NativeFormatting.h"
38#include "llvm/Support/Unicode.h"
39#include "llvm/Support/UnicodeCharRanges.h"
40#include <algorithm>
41#include <cassert>
42#include <cstddef>
43#include <cstdint>
44#include <cstring>
45#include <optional>
46#include <string>
47#include <tuple>
48#include <utility>
49
50using namespace clang;
51
52//===----------------------------------------------------------------------===//
53// Token Class Implementation
54//===----------------------------------------------------------------------===//
55
// NOTE(review): embedded source lines 57 and 60 are missing from this listing
// (the function signature and the statement that introduces `II`). Restore
// them from the upstream file before relying on this definition.
56/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
/// Annotation tokens always yield false.
58 if (isAnnotation())
59 return false;
61 return II->getObjCKeywordID() == objcKey;
62 return false;
63}
64
// NOTE(review): embedded source lines 66 and 69 are missing from this listing
// (the function signature and the statement that introduces `specId`).
// Restore them from the upstream file before relying on this definition.
65/// getObjCKeywordID - Return the ObjC keyword kind.
/// Annotation tokens, and tokens for which `specId` is null, report
/// tok::objc_not_keyword.
67 if (isAnnotation())
68 return tok::objc_not_keyword;
70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71}
72
73//===----------------------------------------------------------------------===//
74// Lexer Class Implementation
75//===----------------------------------------------------------------------===//
76
77void Lexer::anchor() {}
78
/// Point the lexer at the buffer [BufStart, BufEnd] with the cursor at BufPtr
/// and reset the per-buffer lexing state to its defaults.
///
/// \p BufEnd must point at a NUL character (asserted below). When lexing
/// starts at the very beginning of the buffer, a leading UTF-8 byte order
/// mark is skipped.
79void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80 const char *BufEnd) {
81 BufferStart = BufStart;
82 BufferPtr = BufPtr;
83 BufferEnd = BufEnd;
84
85 assert(BufEnd[0] == 0 &&
86 "We assume that the input buffer has a null character at the end"
87 " to simplify lexing!");
88
89 // Check whether we have a BOM at the beginning of the buffer. If so, act
90 // accordingly. Right now we support only UTF-8 with and without BOM, so just
91 // skip the UTF-8 BOM if it's present.
92 if (BufferStart == BufferPtr) {
93 // Determine the size of the BOM.
94 StringRef Buf(BufferStart, BufferEnd - BufferStart);
95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97 .Default(0);
98
99 // Skip the BOM.
100 BufferPtr += BOMLength;
101 }
102
103 Is_PragmaLexer = false;
104 CurrentConflictMarkerState = CMK_None;
105
106 // Start of the file is a start of line.
107 IsAtStartOfLine = true;
108 IsAtPhysicalStartOfLine = true;
109
110 HasLeadingSpace = false;
111 HasLeadingEmptyMacro = false;
112
113 // We are not after parsing a #.
// NOTE(review): embedded source line 114 is missing from this listing, so the
// statement the comment above describes is absent. Restore it from upstream.
115
116 // We are not after parsing #include.
117 ParsingFilename = false;
118
119 // We are not in raw mode. Raw mode disables diagnostics and interpretation
120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122 // or otherwise skipping over tokens.
123 LexingRawMode = false;
124
125 // Default to not keeping comments.
126 ExtendedTokenMode = 0;
127
128 NewLinePtr = nullptr;
129}
130
131/// Lexer constructor - Create a new lexer object for the specified buffer
132/// with the specified preprocessor managing the lexing process. This lexer
133/// assumes that the associated file buffer and Preprocessor objects will
134/// outlive it, so it doesn't take ownership of either of them.
///
/// Lexing starts at the first byte of \p InputFile; token locations are
/// based on the start-of-file location of \p FID.
135Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
136 Preprocessor &PP, bool IsFirstIncludeOfFile)
137 : PreprocessorLexer(&PP, FID),
138 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
139 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
140 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
141 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
142 InputFile.getBufferEnd());
143
// NOTE(review): embedded source line 144 is missing from this listing; a
// statement that followed InitLexer() is absent. Restore it from upstream.
145}
146
147/// Lexer constructor - Create a new raw lexer object. This object is only
148/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
149/// range will outlive it, so it doesn't take ownership of it.
150Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
151 const char *BufStart, const char *BufPtr, const char *BufEnd,
152 bool IsFirstIncludeOfFile)
153 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
154 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
155 InitLexer(BufStart, BufPtr, BufEnd);
156
157 // We *are* in raw mode.
158 LexingRawMode = true;
159}
160
161/// Lexer constructor - Create a new raw lexer object. This object is only
162/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
163/// range will outlive it, so it doesn't take ownership of it.
164Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
165 const SourceManager &SM, const LangOptions &langOpts,
166 bool IsFirstIncludeOfFile)
167 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
168 FromFile.getBufferStart(), FromFile.getBufferEnd(),
169 IsFirstIncludeOfFile) {}
170
// NOTE(review): this definition is only partially present. Embedded source
// lines 171, 174 and 176 are missing from this listing: the function
// signature and the statement of each branch of the if/else below. Restore
// them from the upstream file; as written the fragment is not valid C++.
// Requires a non-null PP (asserted) and branches on LangOpts.TraditionalCPP.
172 assert(PP && "Cannot reset token mode without a preprocessor");
173 if (LangOpts.TraditionalCPP)
175 else
177}
178
// NOTE(review): embedded source lines 194, 198 and 222 are missing from this
// listing: the start of the signature (including the `SpellingLoc`
// parameter), the declaration of `SM`, and the statement that follows the
// "inside a directive" comment near the end. Restore them from upstream.
179/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
180/// _Pragma expansion. This has a variety of magic semantics that this method
181/// sets up. It returns a new'd Lexer that must be delete'd when done.
182///
183/// On entrance to this routine, TokStartLoc is a macro location which has a
184/// spelling loc that indicates the bytes to be lexed for the token and an
185/// expansion location that indicates where all lexed tokens should be
186/// "expanded from".
187///
188/// TODO: It would really be nice to make _Pragma just be a wrapper around a
189/// normal lexer that remaps tokens as they fly by. This would require making
190/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
191/// interface that could handle this stuff. This would pull GetMappedTokenLoc
192/// out of the critical path of the lexer!
193///
195 SourceLocation ExpansionLocStart,
196 SourceLocation ExpansionLocEnd,
197 unsigned TokLen, Preprocessor &PP) {
199
200 // Create the lexer as if we were going to lex the file normally.
201 FileID SpellingFID = SM.getFileID(SpellingLoc);
202 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
203 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
204
205 // Now that the lexer is created, change the start/end locations so that we
206 // just lex the subsection of the file that we want. This is lexing from a
207 // scratch buffer.
208 const char *StrData = SM.getCharacterData(SpellingLoc);
209
210 L->BufferPtr = StrData;
211 L->BufferEnd = StrData+TokLen;
212 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
213
214 // Set the SourceLocation with the remapping information. This ensures that
215 // GetMappedTokenLoc will remap the tokens as they are lexed.
216 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
217 ExpansionLocStart,
218 ExpansionLocEnd, TokLen);
219
220 // Ensure that the lexer thinks it is inside a directive, so that end \n will
221 // return an EOD token.
223
224 // This lexer really is for _Pragma.
225 L->Is_PragmaLexer = true;
226 return L;
227}
228
229void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
230 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
231 this->IsAtStartOfLine = IsAtStartOfLine;
232 assert((BufferStart + Offset) <= BufferEnd);
233 BufferPtr = BufferStart + Offset;
234}
235
/// Escape \p Str in place so that it can be placed between \p Quote
/// characters in a literal.
///
/// Every backslash and every occurrence of \p Quote gets a backslash inserted
/// in front of it. Every line ending -- a lone '\n', a lone '\r', or a mixed
/// two-character pair ("\r\n" or "\n\r") -- is replaced by the two characters
/// '\\' and 'n'. Two identical newline characters in a row count as two
/// separate line endings.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;

  SizeTy Len = Str.size();
  for (SizeTy Pos = 0; Pos < Len;) {
    const auto Cur = Str[Pos];

    if (Cur == '\\' || Cur == Quote) {
      // Insert the escaping backslash, then step over it and the character.
      Str.insert(Str.begin() + Pos, '\\');
      ++Len;
      Pos += 2;
      continue;
    }

    if (Cur != '\n' && Cur != '\r') {
      ++Pos;
      continue;
    }

    // A newline character followed by the *other* newline character forms a
    // single line ending that is rewritten without changing the length.
    const bool IsMixedPair =
        Pos + 1 < Len && (Str[Pos + 1] == '\n' || Str[Pos + 1] == '\r') &&
        Str[Pos + 1] != Cur;

    Str[Pos] = '\\';
    if (IsMixedPair) {
      Str[Pos + 1] = 'n';
    } else {
      // A lone newline character needs one extra slot for the 'n'.
      Str.insert(Str.begin() + Pos + 1, 'n');
      ++Len;
    }
    Pos += 2;
  }
}
260
261std::string Lexer::Stringify(StringRef Str, bool Charify) {
262 std::string Result = std::string(Str);
263 char Quote = Charify ? '\'' : '"';
264 StringifyImpl(Result, Quote);
265 return Result;
266}
267
269
270//===----------------------------------------------------------------------===//
271// Token Spelling
272//===----------------------------------------------------------------------===//
273
274/// Slow case of getSpelling. Extract the characters comprising the
275/// spelling of this token from the provided input buffer.
276static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
277 const LangOptions &LangOpts, char *Spelling) {
278 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
279
280 size_t Length = 0;
281 const char *BufEnd = BufPtr + Tok.getLength();
282
283 if (tok::isStringLiteral(Tok.getKind())) {
284 // Munch the encoding-prefix and opening double-quote.
285 while (BufPtr < BufEnd) {
286 unsigned Size;
287 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
288 BufPtr += Size;
289
290 if (Spelling[Length - 1] == '"')
291 break;
292 }
293
294 // Raw string literals need special handling; trigraph expansion and line
295 // splicing do not occur within their d-char-sequence nor within their
296 // r-char-sequence.
297 if (Length >= 2 &&
298 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
299 // Search backwards from the end of the token to find the matching closing
300 // quote.
301 const char *RawEnd = BufEnd;
302 do --RawEnd; while (*RawEnd != '"');
303 size_t RawLength = RawEnd - BufPtr + 1;
304
305 // Everything between the quotes is included verbatim in the spelling.
306 memcpy(Spelling + Length, BufPtr, RawLength);
307 Length += RawLength;
308 BufPtr += RawLength;
309
310 // The rest of the token is lexed normally.
311 }
312 }
313
314 while (BufPtr < BufEnd) {
315 unsigned Size;
316 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
317 BufPtr += Size;
318 }
319
320 assert(Length < Tok.getLength() &&
321 "NeedsCleaning flag set on token that didn't need cleaning!");
322 return Length;
323}
324
// NOTE(review): embedded source line 330 (the start of the signature,
// including the `loc` parameter) is missing from this listing. Restore it
// from the upstream file.
325/// getSpelling() - Return the 'spelling' of this token. The spelling of a
326/// token are the characters used to represent the token in the source file
327/// after trigraph expansion and escaped-newline folding. In particular, this
328/// wants to get the true, uncanonicalized, spelling of things like digraphs
329/// UCNs, etc.
///
/// The token is relexed in raw mode starting at `loc`. When it needs no
/// cleaning the result points straight into the file buffer; otherwise the
/// cleaned spelling is written into `buffer` and the result refers to that.
/// If the file buffer cannot be loaded, `*invalid` (when non-null) is set to
/// true and an empty result is returned.
331 SmallVectorImpl<char> &buffer,
332 const SourceManager &SM,
333 const LangOptions &options,
334 bool *invalid) {
335 // Break down the source location.
336 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
337
338 // Try to load the file buffer.
339 bool invalidTemp = false;
340 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
341 if (invalidTemp) {
342 if (invalid) *invalid = true;
343 return {};
344 }
345
346 const char *tokenBegin = file.data() + locInfo.second;
347
348 // Lex from the start of the given location.
349 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
350 file.begin(), tokenBegin, file.end());
351 Token token;
352 lexer.LexFromRawLexer(token);
353
354 unsigned length = token.getLength();
355
356 // Common case: no need for cleaning.
357 if (!token.needsCleaning())
358 return StringRef(tokenBegin, length);
359
360 // Hard case, we need to relex the characters into the string.
361 buffer.resize(length);
362 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
363 return StringRef(buffer.data(), buffer.size());
364}
365
366/// getSpelling() - Return the 'spelling' of this token. The spelling of a
367/// token are the characters used to represent the token in the source file
368/// after trigraph expansion and escaped-newline folding. In particular, this
369/// wants to get the true, uncanonicalized, spelling of things like digraphs
370/// UCNs, etc.
371std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
372 const LangOptions &LangOpts, bool *Invalid) {
373 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
374
375 bool CharDataInvalid = false;
376 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
377 &CharDataInvalid);
378 if (Invalid)
379 *Invalid = CharDataInvalid;
380 if (CharDataInvalid)
381 return {};
382
383 // If this token contains nothing interesting, return it directly.
384 if (!Tok.needsCleaning())
385 return std::string(TokStart, TokStart + Tok.getLength());
386
387 std::string Result;
388 Result.resize(Tok.getLength());
389 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
390 return Result;
391}
392
393/// getSpelling - This method is used to get the spelling of a token into a
394/// preallocated buffer, instead of as an std::string. The caller is required
395/// to allocate enough space for the token, which is guaranteed to be at least
396/// Tok.getLength() bytes long. The actual length of the token is returned.
397///
398/// Note that this method may do two possible things: it may either fill in
399/// the buffer specified with characters, or it may *change the input pointer*
400/// to point to a constant buffer with the data already in it (avoiding a
401/// copy). The caller is not allowed to modify the returned buffer pointer
402/// if an internal buffer is returned.
403unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
404 const SourceManager &SourceMgr,
405 const LangOptions &LangOpts, bool *Invalid) {
406 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
407
408 const char *TokStart = nullptr;
409 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
410 if (Tok.is(tok::raw_identifier))
411 TokStart = Tok.getRawIdentifier().data();
412 else if (!Tok.hasUCN()) {
413 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
414 // Just return the string from the identifier table, which is very quick.
415 Buffer = II->getNameStart();
416 return II->getLength();
417 }
418 }
419
420 // NOTE: this can be checked even after testing for an IdentifierInfo.
421 if (Tok.isLiteral())
422 TokStart = Tok.getLiteralData();
423
424 if (!TokStart) {
425 // Compute the start of the token in the input lexer buffer.
426 bool CharDataInvalid = false;
427 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
428 if (Invalid)
429 *Invalid = CharDataInvalid;
430 if (CharDataInvalid) {
431 Buffer = "";
432 return 0;
433 }
434 }
435
436 // If this token contains nothing interesting, return it directly.
437 if (!Tok.needsCleaning()) {
438 Buffer = TokStart;
439 return Tok.getLength();
440 }
441
442 // Otherwise, hard case, relex the characters into the string.
443 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
444}
445
// NOTE(review): embedded source line 450 (the start of the signature,
// including the `Loc` parameter) is missing from this listing. Restore it
// from the upstream file.
446/// MeasureTokenLength - Relex the token at the specified location and return
447/// its length in bytes in the input file. If the token needs cleaning (e.g.
448/// includes a trigraph or an escaped newline) then this count includes bytes
449/// that are part of that.
///
/// Returns 0 when getRawToken reports a failure.
451 const SourceManager &SM,
452 const LangOptions &LangOpts) {
453 Token TheTok;
454 if (getRawToken(Loc, TheTok, SM, LangOpts))
455 return 0;
456 return TheTok.getLength();
457}
458
// NOTE(review): embedded source line 461 (the start of the signature,
// including the `Loc` and `Result` parameters) is missing from this listing.
// Restore it from the upstream file.
459/// Relex the token at the specified location.
///
/// The location is first mapped to its expansion location. Fails when the
/// buffer cannot be loaded, or when the location points at whitespace and
/// IgnoreWhiteSpace is false. Comments are retained while relexing, so the
/// resulting token may be a comment.
460/// \returns true if there was a failure, false on success.
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 bool IgnoreWhiteSpace) {
465 // TODO: this could be special cased for common tokens like identifiers, ')',
466 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
467 // all obviously single-char tokens. This could use
468 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
469 // something.
470
471 // If this comes from a macro expansion, we really do want the macro name, not
472 // the token this macro expanded to.
473 Loc = SM.getExpansionLoc(Loc);
474 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
475 bool Invalid = false;
476 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
477 if (Invalid)
478 return true;
479
480 const char *StrData = Buffer.data()+LocInfo.second;
481
482 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
483 return true;
484
485 // Create a lexer starting at the beginning of this token.
486 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
487 Buffer.begin(), StrData, Buffer.end());
488 TheLexer.SetCommentRetentionState(true);
489 TheLexer.LexFromRawLexer(Result);
490 return false;
491}
492
493/// Returns the pointer that points to the beginning of line that contains
494/// the given offset, or null if the offset if invalid.
495static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
496 const char *BufStart = Buffer.data();
497 if (Offset >= Buffer.size())
498 return nullptr;
499
500 const char *LexStart = BufStart + Offset;
501 for (; LexStart != BufStart; --LexStart) {
502 if (isVerticalWhitespace(LexStart[0]) &&
503 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
504 // LexStart should point at first character of logical line.
505 ++LexStart;
506 break;
507 }
508 }
509 return LexStart;
510}
511
// NOTE(review): embedded source line 512 (the start of the signature,
// including the `Loc` parameter) is missing from this listing. Restore it
// from the upstream file.
//
// Given a file location, relex the line that contains it and return the
// start of the token covering that location. `Loc` itself is returned when
// the file or buffer is invalid, when `Loc` already sits at the start of
// its line, or when it points into whitespace between tokens.
513 const SourceManager &SM,
514 const LangOptions &LangOpts) {
515 assert(Loc.isFileID());
516 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
517 if (LocInfo.first.isInvalid())
518 return Loc;
519
520 bool Invalid = false;
521 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
522 if (Invalid)
523 return Loc;
524
525 // Back up from the current location until we hit the beginning of a line
526 // (or the buffer). We'll relex from that point.
527 const char *StrData = Buffer.data() + LocInfo.second;
528 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
529 if (!LexStart || LexStart == StrData)
530 return Loc;
531
532 // Create a lexer starting at the beginning of this token.
533 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
534 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
535 Buffer.end());
536 TheLexer.SetCommentRetentionState(true);
537
538 // Lex tokens until we find the token that contains the source location.
539 Token TheTok;
540 do {
541 TheLexer.LexFromRawLexer(TheTok);
542
543 if (TheLexer.getBufferLocation() > StrData) {
544 // Lexing this token has taken the lexer past the source location we're
545 // looking for. If the current token encompasses our source location,
546 // return the beginning of that token.
547 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
548 return TheTok.getLocation();
549
550 // We ended up skipping over the source location entirely, which means
551 // that it points into whitespace. We're done here.
552 break;
553 }
554 } while (TheTok.getKind() != tok::eof);
555
556 // We've passed our source location; just return the original source location.
557 return Loc;
558}
559
// NOTE(review): embedded source line 560 (the start of the signature,
// including the `Loc` parameter) is missing from this listing. Restore it
// from the upstream file.
//
// Return the start of the token that contains `Loc`. File locations are
// handled by getBeginningOfFileToken. A macro location that is not a macro
// argument expansion is returned unchanged. For a macro argument expansion
// the token start is computed on the spelling location, and `Loc` is moved
// back by the same number of bytes.
561 const SourceManager &SM,
562 const LangOptions &LangOpts) {
563 if (Loc.isFileID())
564 return getBeginningOfFileToken(Loc, SM, LangOpts);
565
566 if (!SM.isMacroArgExpansion(Loc))
567 return Loc;
568
569 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
570 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
571 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
572 std::pair<FileID, unsigned> BeginFileLocInfo =
573 SM.getDecomposedLoc(BeginFileLoc);
574 assert(FileLocInfo.first == BeginFileLocInfo.first &&
575 FileLocInfo.second >= BeginFileLocInfo.second);
// The difference is zero or negative (see the assert above): it moves Loc
// backwards to the token start.
576 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
577}
578
namespace {

/// Classification of a preprocessor directive encountered while scanning for
/// the end of the preamble.
enum PreambleDirectiveKind {
  /// A recognized directive; it is skipped and scanning continues.
  PDK_Skipped,
  /// Any other directive; scanning stops at its '#'.
  PDK_Unknown
};

} // namespace
587
// NOTE(review): embedded source lines 588 and 595 are missing from this
// listing: the start of the signature (including the `Buffer` parameter) and
// the declaration of `FileLoc`. Restore them from the upstream file.
//
// Scan the start of `Buffer` in raw mode and work out where its preamble
// ends. Comments and the preprocessor directives listed in the StringSwitch
// below belong to the preamble. Scanning stops at the first other token, at
// an unrecognized directive (at its '#'), at end of file inside a directive,
// or -- when `MaxLines` is non-zero -- at the first line-starting token at or
// beyond the `MaxLines`-th newline. Returns a PreambleBounds built from the
// byte offset of the stopping point and whether the stopping token starts a
// line.
589 const LangOptions &LangOpts,
590 unsigned MaxLines) {
591 // Create a lexer starting at the beginning of the file. Note that we use a
592 // "fake" file source location at offset 1 so that the lexer will track our
593 // position within the file.
594 const SourceLocation::UIntTy StartOffset = 1;
596 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
597 Buffer.end());
598 TheLexer.SetCommentRetentionState(true);
599
600 bool InPreprocessorDirective = false;
601 Token TheTok;
602 SourceLocation ActiveCommentLoc;
603
// Translate the line limit into a byte offset: the position just past the
// MaxLines-th '\n'. It stays 0 (no limit) when MaxLines is 0 or when that
// position is the end of the buffer.
604 unsigned MaxLineOffset = 0;
605 if (MaxLines) {
606 const char *CurPtr = Buffer.begin();
607 unsigned CurLine = 0;
608 while (CurPtr != Buffer.end()) {
609 char ch = *CurPtr++;
610 if (ch == '\n') {
611 ++CurLine;
612 if (CurLine == MaxLines)
613 break;
614 }
615 }
616 if (CurPtr != Buffer.end())
617 MaxLineOffset = CurPtr - Buffer.begin();
618 }
619
620 do {
621 TheLexer.LexFromRawLexer(TheTok);
622
623 if (InPreprocessorDirective) {
624 // If we've hit the end of the file, we're done.
625 if (TheTok.getKind() == tok::eof) {
626 break;
627 }
628
629 // If we haven't hit the end of the preprocessor directive, skip this
630 // token.
631 if (!TheTok.isAtStartOfLine())
632 continue;
633
634 // We've passed the end of the preprocessor directive, and will look
635 // at this token again below.
636 InPreprocessorDirective = false;
637 }
638
639 // Keep track of the # of lines in the preamble.
640 if (TheTok.isAtStartOfLine()) {
641 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
642
643 // If we were asked to limit the number of lines in the preamble,
644 // and we're about to exceed that limit, we're done.
645 if (MaxLineOffset && TokOffset >= MaxLineOffset)
646 break;
647 }
648
649 // Comments are okay; skip over them.
650 if (TheTok.getKind() == tok::comment) {
// Remember only the first comment of a run; a recognized directive
// resets this below.
651 if (ActiveCommentLoc.isInvalid())
652 ActiveCommentLoc = TheTok.getLocation();
653 continue;
654 }
655
656 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
657 // This is the start of a preprocessor directive.
658 Token HashTok = TheTok;
659 InPreprocessorDirective = true;
660 ActiveCommentLoc = SourceLocation();
661
662 // Figure out which directive this is. Since we're lexing raw tokens,
663 // we don't have an identifier table available. Instead, just look at
664 // the raw identifier to recognize and categorize preprocessor directives.
665 TheLexer.LexFromRawLexer(TheTok);
666 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
667 StringRef Keyword = TheTok.getRawIdentifier();
668 PreambleDirectiveKind PDK
669 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
670 .Case("include", PDK_Skipped)
671 .Case("__include_macros", PDK_Skipped)
672 .Case("define", PDK_Skipped)
673 .Case("undef", PDK_Skipped)
674 .Case("line", PDK_Skipped)
675 .Case("error", PDK_Skipped)
676 .Case("pragma", PDK_Skipped)
677 .Case("import", PDK_Skipped)
678 .Case("include_next", PDK_Skipped)
679 .Case("warning", PDK_Skipped)
680 .Case("ident", PDK_Skipped)
681 .Case("sccs", PDK_Skipped)
682 .Case("assert", PDK_Skipped)
683 .Case("unassert", PDK_Skipped)
684 .Case("if", PDK_Skipped)
685 .Case("ifdef", PDK_Skipped)
686 .Case("ifndef", PDK_Skipped)
687 .Case("elif", PDK_Skipped)
688 .Case("elifdef", PDK_Skipped)
689 .Case("elifndef", PDK_Skipped)
690 .Case("else", PDK_Skipped)
691 .Case("endif", PDK_Skipped)
692 .Default(PDK_Unknown);
693
694 switch (PDK) {
695 case PDK_Skipped:
// `continue` applies to the enclosing do/while: go on to the next token.
696 continue;
697
698 case PDK_Unknown:
699 // We don't know what this directive is; stop at the '#'.
700 break;
701 }
702 }
703
704 // We only end up here if we didn't recognize the preprocessor
705 // directive or it was one that can't occur in the preamble at this
706 // point. Roll back the current token to the location of the '#'.
707 TheTok = HashTok;
708 }
709
710 // We hit a token that we don't recognize as being in the
711 // "preprocessing only" part of the file, so we're no longer in
712 // the preamble.
713 break;
714 } while (true);
715
716 SourceLocation End;
717 if (ActiveCommentLoc.isValid())
718 End = ActiveCommentLoc; // don't truncate a decl comment.
719 else
720 End = TheTok.getLocation();
721
722 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
723 TheTok.isAtStartOfLine());
724}
725
726unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
727 const SourceManager &SM,
728 const LangOptions &LangOpts) {
729 // Figure out how many physical characters away the specified expansion
730 // character is. This needs to take into consideration newlines and
731 // trigraphs.
732 bool Invalid = false;
733 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
734
735 // If they request the first char of the token, we're trivially done.
736 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
737 return 0;
738
739 unsigned PhysOffset = 0;
740
741 // The usual case is that tokens don't contain anything interesting. Skip
742 // over the uninteresting characters. If a token only consists of simple
743 // chars, this method is extremely fast.
744 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
745 if (CharNo == 0)
746 return PhysOffset;
747 ++TokPtr;
748 --CharNo;
749 ++PhysOffset;
750 }
751
752 // If we have a character that may be a trigraph or escaped newline, use a
753 // lexer to parse it correctly.
754 for (; CharNo; --CharNo) {
755 unsigned Size;
756 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
757 TokPtr += Size;
758 PhysOffset += Size;
759 }
760
761 // Final detail: if we end up on an escaped newline, we want to return the
762 // location of the actual byte of the token. For example foo<newline>bar
763 // advanced by 3 should return the location of b, not of \\. One compounding
764 // detail of this is that the escape may be made by a trigraph.
765 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
766 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
767
768 return PhysOffset;
769}
770
// NOTE(review): embedded source line 786 (the start of the signature,
// including the `Loc` and `Offset` parameters) is missing from this listing.
// Restore it from the upstream file.
771/// Computes the source location just past the end of the
772/// token at this source location.
773///
774/// This routine can be used to produce a source location that
775/// points just past the end of the token referenced by \p Loc, and
776/// is generally used when a diagnostic needs to point just after a
777/// token where it expected something different than it received. If
778/// the returned source location would not be meaningful (e.g., if
779/// it points into a macro), this routine returns an invalid
780/// source location.
781///
782/// \param Offset an offset from the end of the token, where the source
783/// location should refer to. The default offset (0) produces a source
784/// location pointing just past the end of the token; an offset of 1 produces
785/// a source location pointing to the last character in the token, etc.
787 const SourceManager &SM,
788 const LangOptions &LangOpts) {
789 if (Loc.isInvalid())
790 return {};
791
792 if (Loc.isMacroID()) {
793 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
794 return {}; // Points inside the macro expansion.
795 }
796
797 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
// When Offset is at least the token length, Loc itself is returned.
798 if (Len > Offset)
799 Len = Len - Offset;
800 else
801 return Loc;
802
803 return Loc.getLocWithOffset(Len);
804}
805
// NOTE(review): embedded source line 808 (the start of the signature,
// including the `loc` parameter) is missing from this listing. Restore it
// from the upstream file.
806/// Returns true if the given MacroID location points at the first
807/// token of the macro expansion.
///
/// Nested expansions are followed recursively. On success, `*MacroBegin`
/// (when non-null) receives the file location where the outermost expansion
/// begins.
809 const SourceManager &SM,
810 const LangOptions &LangOpts,
811 SourceLocation *MacroBegin) {
812 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
813
814 SourceLocation expansionLoc;
815 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
816 return false;
817
818 if (expansionLoc.isFileID()) {
819 // No other macro expansions, this is the first.
820 if (MacroBegin)
821 *MacroBegin = expansionLoc;
822 return true;
823 }
824
825 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
826}
827
// NOTE(review): embedded source line 830 (the start of the signature,
// including the `loc` parameter) is missing from this listing. Restore it
// from the upstream file.
828/// Returns true if the given MacroID location points at the last
829/// token of the macro expansion.
///
/// Returns false when the token at the spelling location measures as zero
/// length. Nested expansions are followed recursively. On success,
/// `*MacroEnd` (when non-null) receives the file location reported for the
/// end of the outermost expansion.
831 const SourceManager &SM,
832 const LangOptions &LangOpts,
833 SourceLocation *MacroEnd) {
834 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
835
836 SourceLocation spellLoc = SM.getSpellingLoc(loc);
837 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
838 if (tokLen == 0)
839 return false;
840
841 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
842 SourceLocation expansionLoc;
843 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
844 return false;
845
846 if (expansionLoc.isFileID()) {
847 // No other macro expansions.
848 if (MacroEnd)
849 *MacroEnd = expansionLoc;
850 return true;
851 }
852
853 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
854}
855
// NOTE(review): embedded source lines 856 and 880 are missing from this
// listing: the start of the signature (including the `Range` parameter) and
// the final return statement. Restore them from the upstream file.
//
// Both ends of `Range` must be file locations (asserted). For a token range
// the end is first moved to just past its last token. An empty result is
// returned when that fails, when the begin location has no valid file, when
// the end is not in the same file, or when the end precedes the begin.
857 const SourceManager &SM,
858 const LangOptions &LangOpts) {
859 SourceLocation Begin = Range.getBegin();
860 SourceLocation End = Range.getEnd();
861 assert(Begin.isFileID() && End.isFileID());
862 if (Range.isTokenRange()) {
863 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
864 if (End.isInvalid())
865 return {};
866 }
867
868 // Break down the source locations.
869 FileID FID;
870 unsigned BeginOffs;
871 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
872 if (FID.isInvalid())
873 return {};
874
875 unsigned EndOffs;
876 if (!SM.isInFileID(End, FID, &EndOffs) ||
877 BeginOffs > EndOffs)
878 return {};
879
881}
882
// NOTE(review): embedded source line 884 (the start of the signature,
// including the `Loc` parameter) is missing from this listing. Restore it
// from the upstream file.
//
// Reports whether the expansion entry that contains `Loc` is flagged as an
// expansion token range.
883// Assumes that `Loc` is in an expansion.
885 const SourceManager &SM) {
886 return SM.getSLocEntry(SM.getFileID(Loc))
887 .getExpansion()
888 .isExpansionTokenRange();
889}
890
// NOTE(review): embedded source line 891 (the start of the signature,
// including the `Range` parameter) is missing from this listing. Restore it
// from the upstream file.
//
// Convert `Range`, whose ends may be macro locations, into a range made of
// file locations. A macro begin must sit at the start of its expansion. A
// macro end must sit at the end of its expansion for token ranges and at
// the start for character ranges. When both ends lie in the same macro
// argument expansion, the conversion is retried on their immediate spelling
// locations. An empty result means the range could not be mapped.
892 const SourceManager &SM,
893 const LangOptions &LangOpts) {
894 SourceLocation Begin = Range.getBegin();
895 SourceLocation End = Range.getEnd();
896 if (Begin.isInvalid() || End.isInvalid())
897 return {};
898
// Case 1: both ends are already file locations.
899 if (Begin.isFileID() && End.isFileID())
900 return makeRangeFromFileLocs(Range, SM, LangOpts);
901
// Case 2: only the begin is a macro location.
902 if (Begin.isMacroID() && End.isFileID()) {
903 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
904 return {};
905 Range.setBegin(Begin);
906 return makeRangeFromFileLocs(Range, SM, LangOpts);
907 }
908
// Case 3: only the end is a macro location.
909 if (Begin.isFileID() && End.isMacroID()) {
910 if (Range.isTokenRange()) {
911 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
912 return {};
913 // Use the *original* end, not the expanded one in `End`.
914 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
915 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
916 return {};
917 Range.setEnd(End);
918 return makeRangeFromFileLocs(Range, SM, LangOpts);
919 }
920
// Case 4: both ends are macro locations.
921 assert(Begin.isMacroID() && End.isMacroID());
922 SourceLocation MacroBegin, MacroEnd;
923 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
924 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
925 &MacroEnd)) ||
926 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
927 &MacroEnd)))) {
928 Range.setBegin(MacroBegin);
929 Range.setEnd(MacroEnd);
930 // Use the *original* `End`, not the expanded one in `MacroEnd`.
931 if (Range.isTokenRange())
932 Range.setTokenRange(isInExpansionTokenRange(End, SM));
933 return makeRangeFromFileLocs(Range, SM, LangOpts);
934 }
935
936 bool Invalid = false;
937 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
938 &Invalid);
939 if (Invalid)
940 return {};
941
942 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
943 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
944 &Invalid);
945 if (Invalid)
946 return {};
947
// Both ends come from macro argument expansions with the same expansion
// start: retry one level down, on the immediate spelling locations.
948 if (EndEntry.getExpansion().isMacroArgExpansion() &&
949 BeginEntry.getExpansion().getExpansionLocStart() ==
950 EndEntry.getExpansion().getExpansionLocStart()) {
951 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
952 Range.setEnd(SM.getImmediateSpellingLoc(End));
953 return makeFileCharRange(Range, SM, LangOpts);
954 }
955 }
956
957 return {};
958}
959
961 const SourceManager &SM,
962 const LangOptions &LangOpts,
963 bool *Invalid) {
964 Range = makeFileCharRange(Range, SM, LangOpts);
965 if (Range.isInvalid()) {
966 if (Invalid) *Invalid = true;
967 return {};
968 }
969
970 // Break down the source location.
971 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
972 if (beginInfo.first.isInvalid()) {
973 if (Invalid) *Invalid = true;
974 return {};
975 }
976
977 unsigned EndOffs;
978 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
979 beginInfo.second > EndOffs) {
980 if (Invalid) *Invalid = true;
981 return {};
982 }
983
984 // Try to the load the file buffer.
985 bool invalidTemp = false;
986 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
987 if (invalidTemp) {
988 if (Invalid) *Invalid = true;
989 return {};
990 }
991
992 if (Invalid) *Invalid = false;
993 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
994}
995
997 const SourceManager &SM,
998 const LangOptions &LangOpts) {
999 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1000
1001 // Find the location of the immediate macro expansion.
1002 while (true) {
1003 FileID FID = SM.getFileID(Loc);
1004 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1005 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1006 Loc = Expansion.getExpansionLocStart();
1007 if (!Expansion.isMacroArgExpansion())
1008 break;
1009
1010 // For macro arguments we need to check that the argument did not come
1011 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1012
1013 // Loc points to the argument id of the macro definition, move to the
1014 // macro expansion.
1015 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1016 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1017 if (SpellLoc.isFileID())
1018 break; // No inner macro.
1019
1020 // If spelling location resides in the same FileID as macro expansion
1021 // location, it means there is no inner macro.
1022 FileID MacroFID = SM.getFileID(Loc);
1023 if (SM.isInFileID(SpellLoc, MacroFID))
1024 break;
1025
1026 // Argument came from inner macro.
1027 Loc = SpellLoc;
1028 }
1029
1030 // Find the spelling location of the start of the non-argument expansion
1031 // range. This is where the macro name was spelled in order to begin
1032 // expanding this macro.
1033 Loc = SM.getSpellingLoc(Loc);
1034
1035 // Dig out the buffer where the macro name was spelled and the extents of the
1036 // name so that we can render it into the expansion note.
1037 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1038 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1039 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1040 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1041}
1042
1044 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1045 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1046 // Walk past macro argument expansions.
1047 while (SM.isMacroArgExpansion(Loc))
1048 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1049
1050 // If the macro's spelling isn't FileID or from scratch space, then it's
1051 // actually a token paste or stringization (or similar) and not a macro at
1052 // all.
1053 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1054 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1055 return {};
1056
1057 // Find the spelling location of the start of the non-argument expansion
1058 // range. This is where the macro name was spelled in order to begin
1059 // expanding this macro.
1060 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1061
1062 // Dig out the buffer where the macro name was spelled and the extents of the
1063 // name so that we can render it into the expansion note.
1064 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1065 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1066 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1067 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1068}
1069
1071 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1072}
1073
1074bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1075 assert(isVerticalWhitespace(Str[0]));
1076 if (Str - 1 < BufferStart)
1077 return false;
1078
1079 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1080 (Str[0] == '\r' && Str[-1] == '\n')) {
1081 if (Str - 2 < BufferStart)
1082 return false;
1083 --Str;
1084 }
1085 --Str;
1086
1087 // Rewind to first non-space character:
1088 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1089 --Str;
1090
1091 return *Str == '\\';
1092}
1093
1095 const SourceManager &SM) {
1096 if (Loc.isInvalid() || Loc.isMacroID())
1097 return {};
1098 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1099 if (LocInfo.first.isInvalid())
1100 return {};
1101 bool Invalid = false;
1102 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1103 if (Invalid)
1104 return {};
1105 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1106 if (!Line)
1107 return {};
1108 StringRef Rest = Buffer.substr(Line - Buffer.data());
1109 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1110 return NumWhitespaceChars == StringRef::npos
1111 ? ""
1112 : Rest.take_front(NumWhitespaceChars);
1113}
1114
1115//===----------------------------------------------------------------------===//
1116// Diagnostics forwarding code.
1117//===----------------------------------------------------------------------===//
1118
1119/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1120/// lexer buffer was all expanded at a single point, perform the mapping.
1121/// This is currently only used for _Pragma implementation, so it is the slow
1122/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1123static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1124 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1126 SourceLocation FileLoc,
1127 unsigned CharNo, unsigned TokLen) {
1128 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1129
1130 // Otherwise, we're lexing "mapped tokens". This is used for things like
1131 // _Pragma handling. Combine the expansion location of FileLoc with the
1132 // spelling location.
1134
1135 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1136 // characters come from spelling(FileLoc)+Offset.
1137 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1138 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1139
1140 // Figure out the expansion loc range, which is the range covered by the
1141 // original _Pragma(...) sequence.
1142 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1143
1144 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1145}
1146
1147/// getSourceLocation - Return a source location identifier for the specified
1148/// offset in the current file.
1150 unsigned TokLen) const {
1151 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1152 "Location out of range for this buffer!");
1153
1154 // In the normal case, we're just lexing from a simple file buffer, return
1155 // the file id from FileLoc with the offset specified.
1156 unsigned CharNo = Loc-BufferStart;
1157 if (FileLoc.isFileID())
1158 return FileLoc.getLocWithOffset(CharNo);
1159
1160 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1161 // tokens are lexed from where the _Pragma was defined.
1162 assert(PP && "This doesn't work on raw lexers");
1163 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1164}
1165
1166/// Diag - Forwarding function for diagnostics. This translate a source
1167/// position in the current buffer into a SourceLocation object for rendering.
1168DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1169 return PP->Diag(getSourceLocation(Loc), DiagID);
1170}
1171
1172//===----------------------------------------------------------------------===//
1173// Trigraph and Escaped Newline Handling Code.
1174//===----------------------------------------------------------------------===//
1175
/// GetTrigraphCharForLetter - Given the character that follows a "??" pair,
/// return the character the trigraph decodes to, or '\0' if \p Letter does not
/// complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  struct TrigraphMapping {
    char Suffix;  // Third character of the trigraph.
    char Decoded; // Character the trigraph stands for.
  };
  static constexpr TrigraphMapping Mappings[] = {
      {'=', '#'},  {')', ']'},  {'(', '['}, {'!', '|'}, {'\'', '^'},
      {'>', '}'},  {'/', '\\'}, {'<', '{'}, {'-', '~'},
  };

  for (const TrigraphMapping &Entry : Mappings)
    if (Entry.Suffix == Letter)
      return Entry.Decoded;
  return 0;
}
1192
1193/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1194/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1195/// return the result character. Finally, emit a warning about trigraph use
1196/// whether trigraphs are enabled or not.
1197static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1198 char Res = GetTrigraphCharForLetter(*CP);
1199 if (!Res)
1200 return Res;
1201
1202 if (!Trigraphs) {
1203 if (L && !L->isLexingRawMode())
1204 L->Diag(CP-2, diag::trigraph_ignored);
1205 return 0;
1206 }
1207
1208 if (L && !L->isLexingRawMode())
1209 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1210 return Res;
1211}
1212
1213/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1214/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1215/// trigraph equivalent on entry to this function.
1216unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1217 unsigned Size = 0;
1218 while (isWhitespace(Ptr[Size])) {
1219 ++Size;
1220
1221 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1222 continue;
1223
1224 // If this is a \r\n or \n\r, skip the other half.
1225 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1226 Ptr[Size-1] != Ptr[Size])
1227 ++Size;
1228
1229 return Size;
1230 }
1231
1232 // Not an escaped newline, must be a \t or something else.
1233 return 0;
1234}
1235
1236/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1237/// them), skip over them and return the first non-escaped-newline found,
1238/// otherwise return P.
1239const char *Lexer::SkipEscapedNewLines(const char *P) {
1240 while (true) {
1241 const char *AfterEscape;
1242 if (*P == '\\') {
1243 AfterEscape = P+1;
1244 } else if (*P == '?') {
1245 // If not a trigraph for escape, bail out.
1246 if (P[1] != '?' || P[2] != '/')
1247 return P;
1248 // FIXME: Take LangOpts into account; the language might not
1249 // support trigraphs.
1250 AfterEscape = P+3;
1251 } else {
1252 return P;
1253 }
1254
1255 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1256 if (NewLineSize == 0) return P;
1257 P = AfterEscape+NewLineSize;
1258 }
1259}
1260
1261std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1262 const SourceManager &SM,
1263 const LangOptions &LangOpts) {
1264 if (Loc.isMacroID()) {
1265 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1266 return std::nullopt;
1267 }
1268 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1269
1270 // Break down the source location.
1271 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1272
1273 // Try to load the file buffer.
1274 bool InvalidTemp = false;
1275 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1276 if (InvalidTemp)
1277 return std::nullopt;
1278
1279 const char *TokenBegin = File.data() + LocInfo.second;
1280
1281 // Lex from the start of the given location.
1282 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1283 TokenBegin, File.end());
1284 // Find the token.
1285 Token Tok;
1286 lexer.LexFromRawLexer(Tok);
1287 return Tok;
1288}
1289
1290/// Checks that the given token is the first token that occurs after the
1291/// given location (this excludes comments and whitespace). Returns the location
1292/// immediately after the specified token. If the token is not found or the
1293/// location is inside a macro, the returned source location will be invalid.
1295 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1296 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1297 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1298 if (!Tok || Tok->isNot(TKind))
1299 return {};
1300 SourceLocation TokenLoc = Tok->getLocation();
1301
1302 // Calculate how much whitespace needs to be skipped if any.
1303 unsigned NumWhitespaceChars = 0;
1304 if (SkipTrailingWhitespaceAndNewLine) {
1305 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1306 unsigned char C = *TokenEnd;
1307 while (isHorizontalWhitespace(C)) {
1308 C = *(++TokenEnd);
1309 NumWhitespaceChars++;
1310 }
1311
1312 // Skip \r, \n, \r\n, or \n\r
1313 if (C == '\n' || C == '\r') {
1314 char PrevC = C;
1315 C = *(++TokenEnd);
1316 NumWhitespaceChars++;
1317 if ((C == '\n' || C == '\r') && C != PrevC)
1318 NumWhitespaceChars++;
1319 }
1320 }
1321
1322 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1323}
1324
1325/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1326/// get its size, and return it. This is tricky in several cases:
1327/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1328/// then either return the trigraph (skipping 3 chars) or the '?',
1329/// depending on whether trigraphs are enabled or not.
1330/// 2. If this is an escaped newline (potentially with whitespace between
1331/// the backslash and newline), implicitly skip the newline and return
1332/// the char after it.
1333///
1334/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1335/// know that we can accumulate into Size, and that we have already incremented
1336/// Ptr by Size bytes.
1337///
1338/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1339/// be updated to match.
1340char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1341 Token *Tok) {
1342 // If we have a slash, look for an escaped newline.
1343 if (Ptr[0] == '\\') {
1344 ++Size;
1345 ++Ptr;
1346Slash:
1347 // Common case, backslash-char where the char is not whitespace.
1348 if (!isWhitespace(Ptr[0])) return '\\';
1349
1350 // See if we have optional whitespace characters between the slash and
1351 // newline.
1352 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1353 // Remember that this token needs to be cleaned.
1354 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1355
1356 // Warn if there was whitespace between the backslash and newline.
1357 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1358 Diag(Ptr, diag::backslash_newline_space);
1359
1360 // Found backslash<whitespace><newline>. Parse the char after it.
1361 Size += EscapedNewLineSize;
1362 Ptr += EscapedNewLineSize;
1363
1364 // Use slow version to accumulate a correct size field.
1365 return getCharAndSizeSlow(Ptr, Size, Tok);
1366 }
1367
1368 // Otherwise, this is not an escaped newline, just return the slash.
1369 return '\\';
1370 }
1371
1372 // If this is a trigraph, process it.
1373 if (Ptr[0] == '?' && Ptr[1] == '?') {
1374 // If this is actually a legal trigraph (not something like "??x"), emit
1375 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1376 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1377 LangOpts.Trigraphs)) {
1378 // Remember that this token needs to be cleaned.
1379 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1380
1381 Ptr += 3;
1382 Size += 3;
1383 if (C == '\\') goto Slash;
1384 return C;
1385 }
1386 }
1387
1388 // If this is neither, return a single character.
1389 ++Size;
1390 return *Ptr;
1391}
1392
1393/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1394/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1395/// and that we have already incremented Ptr by Size bytes.
1396///
1397/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1398/// be updated to match.
1399char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1400 const LangOptions &LangOpts) {
1401 // If we have a slash, look for an escaped newline.
1402 if (Ptr[0] == '\\') {
1403 ++Size;
1404 ++Ptr;
1405Slash:
1406 // Common case, backslash-char where the char is not whitespace.
1407 if (!isWhitespace(Ptr[0])) return '\\';
1408
1409 // See if we have optional whitespace characters followed by a newline.
1410 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1411 // Found backslash<whitespace><newline>. Parse the char after it.
1412 Size += EscapedNewLineSize;
1413 Ptr += EscapedNewLineSize;
1414
1415 // Use slow version to accumulate a correct size field.
1416 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1417 }
1418
1419 // Otherwise, this is not an escaped newline, just return the slash.
1420 return '\\';
1421 }
1422
1423 // If this is a trigraph, process it.
1424 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1425 // If this is actually a legal trigraph (not something like "??x"), return
1426 // it.
1427 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1428 Ptr += 3;
1429 Size += 3;
1430 if (C == '\\') goto Slash;
1431 return C;
1432 }
1433 }
1434
1435 // If this is neither, return a single character.
1436 ++Size;
1437 return *Ptr;
1438}
1439
1440//===----------------------------------------------------------------------===//
1441// Helper methods for lexing.
1442//===----------------------------------------------------------------------===//
1443
1444/// Routine that indiscriminately sets the offset into the source file.
1445void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1446 BufferPtr = BufferStart + Offset;
1447 if (BufferPtr > BufferEnd)
1448 BufferPtr = BufferEnd;
1449 // FIXME: What exactly does the StartOfLine bit mean? There are two
1450 // possible meanings for the "start" of the line: the first token on the
1451 // unexpanded line, or the first token on the expanded line.
1452 IsAtStartOfLine = StartOfLine;
1453 IsAtPhysicalStartOfLine = StartOfLine;
1454}
1455
1456static bool isUnicodeWhitespace(uint32_t Codepoint) {
1457 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1459 return UnicodeWhitespaceChars.contains(Codepoint);
1460}
1461
1463 llvm::SmallString<5> CharBuf;
1464 llvm::raw_svector_ostream CharOS(CharBuf);
1465 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1466 return CharBuf;
1467}
1468
1469// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1470// we allow "Mathematical Notation Characters" in identifiers.
1471// This is a proposed profile that extends the XID_Start/XID_continue
1472// with mathematical symbols, superscipts and subscripts digits
1473// found in some production software.
1474// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1475static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1476 bool IsStart, bool &IsExtension) {
1477 static const llvm::sys::UnicodeCharSet MathStartChars(
1479 static const llvm::sys::UnicodeCharSet MathContinueChars(
1481 if (MathStartChars.contains(C) ||
1482 (!IsStart && MathContinueChars.contains(C))) {
1483 IsExtension = true;
1484 return true;
1485 }
1486 return false;
1487}
1488
1489static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1490 bool &IsExtension) {
1491 if (LangOpts.AsmPreprocessor) {
1492 return false;
1493 } else if (LangOpts.DollarIdents && '$' == C) {
1494 return true;
1495 } else if (LangOpts.CPlusPlus || LangOpts.C2x) {
1496 // A non-leading codepoint must have the XID_Continue property.
1497 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1498 // so we need to check both tables.
1499 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1500 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1501 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1502 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1503 return true;
1504 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1505 IsExtension);
1506 } else if (LangOpts.C11) {
1507 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1509 return C11AllowedIDChars.contains(C);
1510 } else {
1511 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1513 return C99AllowedIDChars.contains(C);
1514 }
1515}
1516
1517static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1518 bool &IsExtension) {
1519 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1520 IsExtension = false;
1521 if (LangOpts.AsmPreprocessor) {
1522 return false;
1523 }
1524 if (LangOpts.CPlusPlus || LangOpts.C2x) {
1525 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1526 if (XIDStartChars.contains(C))
1527 return true;
1528 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1529 IsExtension);
1530 }
1531 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1532 return false;
1533 if (LangOpts.C11) {
1534 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1536 return !C11DisallowedInitialIDChars.contains(C);
1537 }
1538 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1540 return !C99DisallowedInitialIDChars.contains(C);
1541}
1542
1544 CharSourceRange Range) {
1545
1546 static const llvm::sys::UnicodeCharSet MathStartChars(
1548 static const llvm::sys::UnicodeCharSet MathContinueChars(
1550
1551 (void)MathStartChars;
1552 (void)MathContinueChars;
1553 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1554 "Unexpected mathematical notation codepoint");
1555 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1556 << codepointAsHexString(C) << Range;
1557}
1558
1559static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1560 const char *End) {
1562 L.getSourceLocation(End));
1563}
1564
1565static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1566 CharSourceRange Range, bool IsFirst) {
1567 // Check C99 compatibility.
1568 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1569 enum {
1570 CannotAppearInIdentifier = 0,
1571 CannotStartIdentifier
1572 };
1573
1574 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1576 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1578 if (!C99AllowedIDChars.contains(C)) {
1579 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1580 << Range
1581 << CannotAppearInIdentifier;
1582 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1583 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1584 << Range
1585 << CannotStartIdentifier;
1586 }
1587 }
1588}
1589
1590/// After encountering UTF-8 character C and interpreting it as an identifier
1591/// character, check whether it's a homoglyph for a common non-identifier
1592/// source character that is unlikely to be an intentional identifier
1593/// character and warn if so.
1595 CharSourceRange Range) {
1596 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1597 struct HomoglyphPair {
1598 uint32_t Character;
1599 char LooksLike;
1600 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1601 };
1602 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1603 {U'\u00ad', 0}, // SOFT HYPHEN
1604 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1605 {U'\u037e', ';'}, // GREEK QUESTION MARK
1606 {U'\u200b', 0}, // ZERO WIDTH SPACE
1607 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1608 {U'\u200d', 0}, // ZERO WIDTH JOINER
1609 {U'\u2060', 0}, // WORD JOINER
1610 {U'\u2061', 0}, // FUNCTION APPLICATION
1611 {U'\u2062', 0}, // INVISIBLE TIMES
1612 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1613 {U'\u2064', 0}, // INVISIBLE PLUS
1614 {U'\u2212', '-'}, // MINUS SIGN
1615 {U'\u2215', '/'}, // DIVISION SLASH
1616 {U'\u2216', '\\'}, // SET MINUS
1617 {U'\u2217', '*'}, // ASTERISK OPERATOR
1618 {U'\u2223', '|'}, // DIVIDES
1619 {U'\u2227', '^'}, // LOGICAL AND
1620 {U'\u2236', ':'}, // RATIO
1621 {U'\u223c', '~'}, // TILDE OPERATOR
1622 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1623 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1624 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1625 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1626 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1627 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1628 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1629 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1630 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1631 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1632 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1633 {U'\uff0c', ','}, // FULLWIDTH COMMA
1634 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1635 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1636 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1637 {U'\uff1a', ':'}, // FULLWIDTH COLON
1638 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1639 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1640 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1641 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1642 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1643 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1644 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1645 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1646 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1647 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1648 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1649 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1650 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1651 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1652 {0, 0}
1653 };
1654 auto Homoglyph =
1655 std::lower_bound(std::begin(SortedHomoglyphs),
1656 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1657 if (Homoglyph->Character == C) {
1658 if (Homoglyph->LooksLike) {
1659 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1660 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1661 << Range << codepointAsHexString(C) << LooksLikeStr;
1662 } else {
1663 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1664 << Range << codepointAsHexString(C);
1665 }
1666 }
1667}
1668
1670 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1671 CharSourceRange Range, bool IsFirst) {
1672 if (isASCII(CodePoint))
1673 return;
1674
1675 bool IsExtension;
1676 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1677 bool IsIDContinue =
1678 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1679
1680 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1681 return;
1682
1683 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1684
1685 if (!IsFirst || InvalidOnlyAtStart) {
1686 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1687 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1688 << FixItHint::CreateRemoval(Range);
1689 } else {
1690 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1691 << Range << codepointAsHexString(CodePoint)
1692 << FixItHint::CreateRemoval(Range);
1693 }
1694}
1695
1696bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1697 Token &Result) {
1698 const char *UCNPtr = CurPtr + Size;
1699 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1700 if (CodePoint == 0) {
1701 return false;
1702 }
1703 bool IsExtension = false;
1704 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1705 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1706 return false;
1710 PP->getDiagnostics(), LangOpts, CodePoint,
1711 makeCharRange(*this, CurPtr, UCNPtr),
1712 /*IsFirst=*/false);
1713
1714 // We got a unicode codepoint that is neither a space nor a
1715 // a valid identifier part.
1716 // Carry on as if the codepoint was valid for recovery purposes.
1717 } else if (!isLexingRawMode()) {
1718 if (IsExtension)
1720 makeCharRange(*this, CurPtr, UCNPtr));
1721
1723 makeCharRange(*this, CurPtr, UCNPtr),
1724 /*IsFirst=*/false);
1725 }
1726
1727 Result.setFlag(Token::HasUCN);
1728 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1729 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1730 CurPtr = UCNPtr;
1731 else
1732 while (CurPtr != UCNPtr)
1733 (void)getAndAdvanceChar(CurPtr, Result);
1734 return true;
1735}
1736
1737bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1738 const char *UnicodePtr = CurPtr;
1739 llvm::UTF32 CodePoint;
1740 llvm::ConversionResult Result =
1741 llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1742 (const llvm::UTF8 *)BufferEnd,
1743 &CodePoint,
1744 llvm::strictConversion);
1745 if (Result != llvm::conversionOK)
1746 return false;
1747
1748 bool IsExtension = false;
1749 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1750 IsExtension)) {
1751 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1752 return false;
1753
1757 PP->getDiagnostics(), LangOpts, CodePoint,
1758 makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1759 // We got a unicode codepoint that is neither a space nor a
1760 // a valid identifier part. Carry on as if the codepoint was
1761 // valid for recovery purposes.
1762 } else if (!isLexingRawMode()) {
1763 if (IsExtension)
1765 makeCharRange(*this, CurPtr, UnicodePtr));
1767 makeCharRange(*this, CurPtr, UnicodePtr),
1768 /*IsFirst=*/false);
1770 makeCharRange(*this, CurPtr, UnicodePtr));
1771 }
1772
1773 CurPtr = UnicodePtr;
1774 return true;
1775}
1776
/// Lex a token that begins with the already-decoded code point \p C, whose
/// spelling occupies [BufferPtr, CurPtr) (see the makeCharRange calls below).
///
/// \returns the result of LexIdentifierContinue() when \p C may start an
/// identifier; false when the character is dropped (BufferPtr is advanced past
/// it and no token is formed); true after forming a tok::unknown token.
///
/// NOTE(review): the embedded listing numbers skip 1781-1782, 1784, 1786, 1789,
/// 1797, 1799 and 1809, so the heads of the diagnostic calls and part of the
/// second `if` condition are not visible in this copy. Restore them from the
/// upstream file before building.
1777bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1778 const char *CurPtr) {
1779 bool IsExtension = false;
1780 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
// NOTE(review): the guard that opens the diagnostic block closed at 1791, and
// the callees that receive the makeCharRange arguments, are missing here.
1783 if (IsExtension)
1785 makeCharRange(*this, BufferPtr, CurPtr));
1787 makeCharRange(*this, BufferPtr, CurPtr),
1788 /*IsFirst=*/true);
1790 makeCharRange(*this, BufferPtr, CurPtr));
1791 }
1792
// The character is acceptable as an identifier start: note that a token was
// read and lex the rest of the identifier.
1793 MIOpt.ReadToken();
1794 return LexIdentifierContinue(Result, CurPtr);
1795 }
1796
// NOTE(review): only the middle of this condition survives (lines 1797 and
// 1799 are missing); the visible part requires non-preprocessed output and a
// non-ASCII first byte at BufferPtr.
1798 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1800 // Non-ASCII characters tend to creep into source code unintentionally.
1801 // Instead of letting the parser complain about the unknown token,
1802 // just drop the character.
1803 // Note that we can /only/ do this when the non-ASCII character is actually
1804 // spelled as Unicode, not written as a UCN. The standard requires that
1805 // we not throw away any possible preprocessor tokens, but there's a
1806 // loophole in the mapping of Unicode characters to basic character set
1807 // characters that allows us to map these particular characters to, say,
1808 // whitespace.
// NOTE(review): the callee of the following argument list (line 1809) is
// missing from this copy.
1810 PP->getDiagnostics(), LangOpts, C,
1811 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
// Skip the character without forming a token.
1812 BufferPtr = CurPtr;
1813 return false;
1814 }
1815
1816 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1817 // up by accident.
1818 MIOpt.ReadToken();
1819 FormTokenWithChars(Result, CurPtr, tok::unknown);
1820 return true;
1821}
1822
/// Lex the remainder of an identifier whose start has already been matched.
/// \p CurPtr points at the first character not yet consumed. The token is
/// formed as tok::raw_identifier spanning [BufferPtr, CurPtr).
///
/// \returns true when the raw identifier (raw mode) or a tok::code_completion
/// token is returned directly, or when the identifier needs no further
/// handling; otherwise the result of PP->HandleIdentifier().
///
/// NOTE(review): the embedded listing numbers skip 1828, 1836 and 1869, so the
/// conditions opening the two blocks closed at 1831 and 1839, and the
/// declaration of `II`, are not visible in this copy. Restore them from the
/// upstream file before building.
1823bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1824 // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1825 while (true) {
1826 unsigned char C = *CurPtr;
1827 // Fast path.
// NOTE(review): the test guarding this fast path (line 1828) is missing.
1829 ++CurPtr;
1830 continue;
1831 }
1832
1833 unsigned Size;
1834 // Slow path: handle trigraph, unicode codepoints, UCNs.
1835 C = getCharAndSize(CurPtr, Size);
// NOTE(review): the test guarding this block (line 1836) is missing.
1837 CurPtr = ConsumeChar(CurPtr, Size, Result);
1838 continue;
1839 }
1840 if (C == '$') {
1841 // If we hit a $ and they are not supported in identifiers, we are done.
1842 if (!LangOpts.DollarIdents)
1843 break;
1844 // Otherwise, emit a diagnostic and continue.
1845 if (!isLexingRawMode())
1846 Diag(CurPtr, diag::ext_dollar_in_identifier);
1847 CurPtr = ConsumeChar(CurPtr, Size, Result);
1848 continue;
1849 }
1850 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1851 continue;
1852 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1853 continue;
1854 // Neither an expected Unicode codepoint nor a UCN.
1855 break;
1856 }
1857
// Remember the start before FormTokenWithChars runs, so the raw identifier
// data can point at the first character of the identifier.
1858 const char *IdStart = BufferPtr;
1859 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1860 Result.setRawIdentifierData(IdStart);
1861
1862 // If we are in raw mode, return this identifier raw. There is no need to
1863 // look up identifier information or attempt to macro expand it.
1864 if (LexingRawMode)
1865 return true;
1866
1867 // Fill in Result.IdentifierInfo and update the token kind,
1868 // looking up the identifier in the identifier table.
// NOTE(review): the statement that performs the lookup and declares `II`
// (line 1869) is missing from this copy.
1870 // Note that we have to call PP->LookUpIdentifierInfo() even for code
1871 // completion, it writes IdentifierInfo into Result, and callers rely on it.
1872
1873 // If the completion point is at the end of an identifier, we want to treat
1874 // the identifier as incomplete even if it resolves to a macro or a keyword.
1875 // This allows e.g. 'class^' to complete to 'classifier'.
1876 if (isCodeCompletionPoint(CurPtr)) {
1877 // Return the code-completion token.
1878 Result.setKind(tok::code_completion);
1879 // Skip the code-completion char and all immediate identifier characters.
1880 // This ensures we get consistent behavior when completing at any point in
1881 // an identifier (i.e. at the start, in the middle, at the end). Note that
1882 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1883 // simpler.
1884 assert(*CurPtr == 0 && "Completion character must be 0");
1885 ++CurPtr;
1886 // Note that code completion token is not added as a separate character
1887 // when the completion point is at the end of the buffer. Therefore, we need
1888 // to check if the buffer has ended.
1889 if (CurPtr < BufferEnd) {
1890 while (isAsciiIdentifierContinue(*CurPtr))
1891 ++CurPtr;
1892 }
1893 BufferPtr = CurPtr;
1894 return true;
1895 }
1896
1897 // Finally, now that we know we have an identifier, pass this off to the
1898 // preprocessor, which may macro expand it or something.
1899 if (II->isHandleIdentifierCase())
1900 return PP->HandleIdentifier(Result);
1901
1902 return true;
1903}
1904
1905/// isHexaLiteral - Return true if Start points to a hex constant.
1906/// in microsoft mode (where this is supposed to be several different tokens).
1907bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1908 unsigned Size;
1909 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1910 if (C1 != '0')
1911 return false;
1912 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1913 return (C2 == 'x' || C2 == 'X');
1914}
1915
1916/// LexNumericConstant - Lex the remainder of a integer or floating point
1917/// constant. From[-1] is the first character lexed. Return the end of the
1918/// constant.
1919bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1920 unsigned Size;
1921 char C = getCharAndSize(CurPtr, Size);
1922 char PrevCh = 0;
1923 while (isPreprocessingNumberBody(C)) {
1924 CurPtr = ConsumeChar(CurPtr, Size, Result);
1925 PrevCh = C;
1926 C = getCharAndSize(CurPtr, Size);
1927 }
1928
1929 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1930 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1931 // If we are in Microsoft mode, don't continue if the constant is hex.
1932 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1933 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1934 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1935 }
1936
1937 // If we have a hex FP constant, continue.
1938 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1939 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1940 // not-quite-conforming extension. Only do so if this looks like it's
1941 // actually meant to be a hexfloat, and not if it has a ud-suffix.
1942 bool IsHexFloat = true;
1943 if (!LangOpts.C99) {
1944 if (!isHexaLiteral(BufferPtr, LangOpts))
1945 IsHexFloat = false;
1946 else if (!LangOpts.CPlusPlus17 &&
1947 std::find(BufferPtr, CurPtr, '_') != CurPtr)
1948 IsHexFloat = false;
1949 }
1950 if (IsHexFloat)
1951 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1952 }
1953
1954 // If we have a digit separator, continue.
1955 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) {
1956 unsigned NextSize;
1957 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
1958 if (isAsciiIdentifierContinue(Next)) {
1959 if (!isLexingRawMode())
1960 Diag(CurPtr, LangOpts.CPlusPlus
1961 ? diag::warn_cxx11_compat_digit_separator
1962 : diag::warn_c2x_compat_digit_separator);
1963 CurPtr = ConsumeChar(CurPtr, Size, Result);
1964 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1965 return LexNumericConstant(Result, CurPtr);
1966 }
1967 }
1968
1969 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1970 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1971 return LexNumericConstant(Result, CurPtr);
1972 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1973 return LexNumericConstant(Result, CurPtr);
1974
1975 // Update the location of token as well as BufferPtr.
1976 const char *TokStart = BufferPtr;
1977 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1978 Result.setLiteralData(TokStart);
1979 return true;
1980}
1981
1982/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1983/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result          Token being built; receives Token::HasUDSuffix when a
///                        suffix is accepted.
/// \param CurPtr          Points just past the literal lexed so far.
/// \param IsStringLiteral Enables the C++14 look-ahead for standard string
///                        literal suffixes.
/// \returns the position after the suffix, or \p CurPtr unchanged when no
/// suffix is consumed.
///
/// NOTE(review): the embedded listing numbers skip 2007, 2053 and 2063, so the
/// tails of both Diag statements and the condition opening the block at 2064
/// are not visible in this copy. Restore them from the upstream file before
/// building.
1984const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1985 bool IsStringLiteral) {
1986 assert(LangOpts.CPlusPlus);
1987
1988 // Maximally munch an identifier.
1989 unsigned Size;
1990 char C = getCharAndSize(CurPtr, Size);
// Set when the first suffix character was a UCN or UTF-8 character that has
// already been consumed below.
1991 bool Consumed = false;
1992
1993 if (!isAsciiIdentifierStart(C)) {
1994 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1995 Consumed = true;
1996 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
1997 Consumed = true;
1998 else
1999 return CurPtr;
2000 }
2001
2002 if (!LangOpts.CPlusPlus11) {
2003 if (!isLexingRawMode())
2004 Diag(CurPtr,
2005 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2006 : diag::warn_cxx11_compat_reserved_user_defined_literal)
// NOTE(review): the continuation of this Diag statement (line 2007) is missing.
2008 return CurPtr;
2009 }
2010
2011 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2012 // that does not start with an underscore is ill-formed. As a conforming
2013 // extension, we treat all such suffixes as if they had whitespace before
2014 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2015 // likely to be a ud-suffix than a macro, however, and accept that.
2016 if (!Consumed) {
2017 bool IsUDSuffix = false;
2018 if (C == '_')
2019 IsUDSuffix = true;
2020 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2021 // In C++1y, we need to look ahead a few characters to see if this is a
2022 // valid suffix for a string literal or a numeric literal (this could be
2023 // the 'operator""if' defining a numeric literal operator).
2024 const unsigned MaxStandardSuffixLength = 3;
2025 char Buffer[MaxStandardSuffixLength] = { C };
// Note: this `Consumed` is a byte offset and shadows the outer bool of the
// same name for the rest of this branch.
2026 unsigned Consumed = Size;
2027 unsigned Chars = 1;
2028 while (true) {
2029 unsigned NextSize;
2030 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
2031 if (!isAsciiIdentifierContinue(Next)) {
2032 // End of suffix. Check whether this is on the allowed list.
2033 const StringRef CompleteSuffix(Buffer, Chars);
2034 IsUDSuffix =
2035 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2036 break;
2037 }
2038
2039 if (Chars == MaxStandardSuffixLength)
2040 // Too long: can't be a standard suffix.
2041 break;
2042
2043 Buffer[Chars++] = Next;
2044 Consumed += NextSize;
2045 }
2046 }
2047
2048 if (!IsUDSuffix) {
2049 if (!isLexingRawMode())
2050 Diag(CurPtr, LangOpts.MSVCCompat
2051 ? diag::ext_ms_reserved_user_defined_literal
2052 : diag::ext_reserved_user_defined_literal)
// NOTE(review): the continuation of this Diag statement (line 2053) is missing.
2054 return CurPtr;
2055 }
2056
// Consume the first suffix character, which was only peeked at so far.
2057 CurPtr = ConsumeChar(CurPtr, Size, Result);
2058 }
2059
2060 Result.setFlag(Token::HasUDSuffix);
// Consume the rest of the suffix: plain characters, UCNs and UTF-8 characters.
2061 while (true) {
2062 C = getCharAndSize(CurPtr, Size);
// NOTE(review): the `if` that opens this chain (line 2063) is missing.
2064 CurPtr = ConsumeChar(CurPtr, Size, Result);
2065 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2066 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
2067 } else
2068 break;
2069 }
2070
2071 return CurPtr;
2072}
2073
2074/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2075/// either " or L" or u8" or u" or U".
///
/// \param Result Token to fill in.
/// \param CurPtr Points just past the opening quote.
/// \param Kind   Token kind to give a well-formed literal.
/// \returns Always true. An unterminated literal, or one cut short by the
/// code-completion point, is returned as tok::unknown.
///
/// NOTE(review): the embedded listing number 2109 is skipped, so the statement
/// belonging to the `else` at 2108 is not visible in this copy. Restore it
/// from the upstream file before building.
2076bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2077 tok::TokenKind Kind) {
2078 const char *AfterQuote = CurPtr;
2079 // Does this string contain the \0 character?
2080 const char *NulCharacter = nullptr;
2081
2082 if (!isLexingRawMode() &&
2083 (Kind == tok::utf8_string_literal ||
2084 Kind == tok::utf16_string_literal ||
2085 Kind == tok::utf32_string_literal))
2086 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2087 : diag::warn_c99_compat_unicode_literal);
2088
2089 char C = getAndAdvanceChar(CurPtr, Result);
2090 while (C != '"') {
2091 // Skip escaped characters. Escaped newlines will already be processed by
2092 // getAndAdvanceChar.
2093 if (C == '\\')
2094 C = getAndAdvanceChar(CurPtr, Result);
2095
2096 if (C == '\n' || C == '\r' || // Newline.
2097 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2098 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2099 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
// The token stops before the newline / end-of-file character.
2100 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2101 return true;
2102 }
2103
// A nul byte that is not the end of the buffer: either the code-completion
// point or a stray nul inside the literal.
2104 if (C == 0) {
2105 if (isCodeCompletionPoint(CurPtr-1)) {
2106 if (ParsingFilename)
2107 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2108 else
// NOTE(review): the statement for this `else` (line 2109) is missing.
2110 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2111 cutOffLexing();
2112 return true;
2113 }
2114
2115 NulCharacter = CurPtr-1;
2116 }
2117 C = getAndAdvanceChar(CurPtr, Result);
2118 }
2119
2120 // If we are in C++11, lex the optional ud-suffix.
2121 if (LangOpts.CPlusPlus)
2122 CurPtr = LexUDSuffix(Result, CurPtr, true);
2123
2124 // If a nul character existed in the string, warn about it.
2125 if (NulCharacter && !isLexingRawMode())
2126 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2127
2128 // Update the location of the token as well as the BufferPtr instance var.
2129 const char *TokStart = BufferPtr;
2130 FormTokenWithChars(Result, CurPtr, Kind);
2131 Result.setLiteralData(TokStart);
2132 return true;
2133}
2134
2135/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2136/// having lexed R", LR", u8R", uR", or UR".
2137bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2138 tok::TokenKind Kind) {
2139 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2140 // Between the initial and final double quote characters of the raw string,
2141 // any transformations performed in phases 1 and 2 (trigraphs,
2142 // universal-character-names, and line splicing) are reverted.
2143
2144 if (!isLexingRawMode())
2145 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2146
2147 unsigned PrefixLen = 0;
2148
2149 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2150 ++PrefixLen;
2151
2152 // If the last character was not a '(', then we didn't lex a valid delimiter.
2153 if (CurPtr[PrefixLen] != '(') {
2154 if (!isLexingRawMode()) {
2155 const char *PrefixEnd = &CurPtr[PrefixLen];
2156 if (PrefixLen == 16) {
2157 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2158 } else {
2159 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2160 << StringRef(PrefixEnd, 1);
2161 }
2162 }
2163
2164 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2165 // it's possible the '"' was intended to be part of the raw string, but
2166 // there's not much we can do about that.
2167 while (true) {
2168 char C = *CurPtr++;
2169
2170 if (C == '"')
2171 break;
2172 if (C == 0 && CurPtr-1 == BufferEnd) {
2173 --CurPtr;
2174 break;
2175 }
2176 }
2177
2178 FormTokenWithChars(Result, CurPtr, tok::unknown);
2179 return true;
2180 }
2181
2182 // Save prefix and move CurPtr past it
2183 const char *Prefix = CurPtr;
2184 CurPtr += PrefixLen + 1; // skip over prefix and '('
2185
2186 while (true) {
2187 char C = *CurPtr++;
2188
2189 if (C == ')') {
2190 // Check for prefix match and closing quote.
2191 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2192 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2193 break;
2194 }
2195 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2196 if (!isLexingRawMode())
2197 Diag(BufferPtr, diag::err_unterminated_raw_string)
2198 << StringRef(Prefix, PrefixLen);
2199 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2200 return true;
2201 }
2202 }
2203
2204 // If we are in C++11, lex the optional ud-suffix.
2205 if (LangOpts.CPlusPlus)
2206 CurPtr = LexUDSuffix(Result, CurPtr, true);
2207
2208 // Update the location of token as well as BufferPtr.
2209 const char *TokStart = BufferPtr;
2210 FormTokenWithChars(Result, CurPtr, Kind);
2211 Result.setLiteralData(TokStart);
2212 return true;
2213}
2214
2215/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2216/// after having lexed the '<' character. This is used for #include filenames.
2217bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2218 // Does this string contain the \0 character?
2219 const char *NulCharacter = nullptr;
2220 const char *AfterLessPos = CurPtr;
2221 char C = getAndAdvanceChar(CurPtr, Result);
2222 while (C != '>') {
2223 // Skip escaped characters. Escaped newlines will already be processed by
2224 // getAndAdvanceChar.
2225 if (C == '\\')
2226 C = getAndAdvanceChar(CurPtr, Result);
2227
2228 if (isVerticalWhitespace(C) || // Newline.
2229 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2230 // If the filename is unterminated, then it must just be a lone <
2231 // character. Return this as such.
2232 FormTokenWithChars(Result, AfterLessPos, tok::less);
2233 return true;
2234 }
2235
2236 if (C == 0) {
2237 if (isCodeCompletionPoint(CurPtr - 1)) {
2238 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2239 cutOffLexing();
2240 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2241 return true;
2242 }
2243 NulCharacter = CurPtr-1;
2244 }
2245 C = getAndAdvanceChar(CurPtr, Result);
2246 }
2247
2248 // If a nul character existed in the string, warn about it.
2249 if (NulCharacter && !isLexingRawMode())
2250 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2251
2252 // Update the location of token as well as BufferPtr.
2253 const char *TokStart = BufferPtr;
2254 FormTokenWithChars(Result, CurPtr, tok::header_name);
2255 Result.setLiteralData(TokStart);
2256 return true;
2257}
2258
/// Run code completion for a partially typed #include filename.
///
/// \param PathStart       First character of the path, just after the opening
///                        quote or '<'.
/// \param CompletionPoint Position of the code-completion point; the partial
///                        path is [PathStart, CompletionPoint).
/// \param IsAngled        True for <...> names, false for "..." names.
///
/// The directory part (everything before the last slash) is handed to
/// PP->CodeCompleteIncludedFile().
///
/// NOTE(review): the embedded listing numbers skip 2271 and 2286, so the
/// callees that receive the argument lists at 2272 and 2287-2288 are not
/// visible in this copy. Restore them from the upstream file before building.
2259void Lexer::codeCompleteIncludedFile(const char *PathStart,
2260 const char *CompletionPoint,
2261 bool IsAngled) {
2262 // Completion only applies to the filename, after the last slash.
2263 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
// Backslash counts as a separator only in MSVC-compatible mode.
2264 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2265 auto Slash = PartialPath.find_last_of(SlashChars);
2266 StringRef Dir =
2267 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2268 const char *StartOfFilename =
2269 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2270 // Code completion filter range is the filename only, up to completion point.
// NOTE(review): the head of this call (line 2271) is missing.
2272 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2273 // We should replace the characters up to the closing quote or closest slash,
2274 // if any.
2275 while (CompletionPoint < BufferEnd) {
// Peek at the character after the current position; a nul or newline ends the
// scan without being included.
2276 char Next = *(CompletionPoint + 1);
2277 if (Next == 0 || Next == '\r' || Next == '\n')
2278 break;
2279 ++CompletionPoint;
// The closing delimiter or a slash is included, then the scan stops.
2280 if (Next == (IsAngled ? '>' : '"'))
2281 break;
2282 if (SlashChars.contains(Next))
2283 break;
2284 }
2285
// NOTE(review): the head of this call (line 2286) is missing.
2287 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2288 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2289 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2290}
2291
2292/// LexCharConstant - Lex the remainder of a character constant, after having
2293/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result Token to fill in.
/// \param CurPtr Points just past the opening quote.
/// \param Kind   Token kind to give a well-formed constant.
/// \returns Always true. An empty or unterminated constant, or one cut short by
/// the code-completion point, is returned as tok::unknown.
///
/// NOTE(review): the embedded listing number 2331 is skipped, so one statement
/// in the code-completion branch is not visible in this copy. Restore it from
/// the upstream file before building.
2294bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2295 tok::TokenKind Kind) {
2296 // Does this character contain the \0 character?
2297 const char *NulCharacter = nullptr;
2298
2299 if (!isLexingRawMode()) {
2300 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2301 Diag(BufferPtr, LangOpts.CPlusPlus
2302 ? diag::warn_cxx98_compat_unicode_literal
2303 : diag::warn_c99_compat_unicode_literal);
2304 else if (Kind == tok::utf8_char_constant)
2305 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2306 }
2307
2308 char C = getAndAdvanceChar(CurPtr, Result);
// An immediately closing quote means an empty character constant ('').
2309 if (C == '\'') {
2310 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2311 Diag(BufferPtr, diag::ext_empty_character);
2312 FormTokenWithChars(Result, CurPtr, tok::unknown);
2313 return true;
2314 }
2315
2316 while (C != '\'') {
2317 // Skip escaped characters.
2318 if (C == '\\')
2319 C = getAndAdvanceChar(CurPtr, Result);
2320
2321 if (C == '\n' || C == '\r' || // Newline.
2322 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2323 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2324 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
// The token stops before the newline / end-of-file character.
2325 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2326 return true;
2327 }
2328
// A nul byte that is not the end of the buffer: either the code-completion
// point or a stray nul inside the constant.
2329 if (C == 0) {
2330 if (isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): one statement (line 2331) is missing here.
2332 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2333 cutOffLexing();
2334 return true;
2335 }
2336
2337 NulCharacter = CurPtr-1;
2338 }
2339 C = getAndAdvanceChar(CurPtr, Result);
2340 }
2341
2342 // If we are in C++11, lex the optional ud-suffix.
2343 if (LangOpts.CPlusPlus)
2344 CurPtr = LexUDSuffix(Result, CurPtr, false);
2345
2346 // If a nul character existed in the character, warn about it.
2347 if (NulCharacter && !isLexingRawMode())
2348 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2349
2350 // Update the location of token as well as BufferPtr.
2351 const char *TokStart = BufferPtr;
2352 FormTokenWithChars(Result, CurPtr, Kind);
2353 Result.setLiteralData(TokStart);
2354 return true;
2355}
2356
2357/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2358/// Update BufferPtr to point to the next non-whitespace character and return.
2359///
2360/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// \param Result  Receives the LeadingSpace / StartOfLine flags for the next
///                token, or the whitespace token in keep-whitespace mode.
/// \param CurPtr  Points just past the first whitespace character (CurPtr[-1]
///                is inspected to see whether it was a newline).
/// \param TokAtPhysicalStartOfLine Set to true when a newline was skipped.
/// \returns true only when a whitespace token was formed; false otherwise.
///
/// NOTE(review): the embedded listing number 2387 is skipped, so the condition
/// opening the block closed at 2391 is not visible in this copy. Restore it
/// from the upstream file before building.
2361bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2362 bool &TokAtPhysicalStartOfLine) {
2363 // Whitespace - Skip it, then return the token after the whitespace.
2364 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2365
2366 unsigned char Char = *CurPtr;
2367
// Track the most recent newline seen in this call; NewLinePtr is only set if
// it has no value yet.
2368 const char *lastNewLine = nullptr;
2369 auto setLastNewLine = [&](const char *Ptr) {
2370 lastNewLine = Ptr;
2371 if (!NewLinePtr)
2372 NewLinePtr = Ptr;
2373 };
2374 if (SawNewline)
2375 setLastNewLine(CurPtr - 1);
2376
2377 // Skip consecutive spaces efficiently.
2378 while (true) {
2379 // Skip horizontal whitespace very aggressively.
2380 while (isHorizontalWhitespace(Char))
2381 Char = *++CurPtr;
2382
2383 // Otherwise if we have something other than whitespace, we're done.
2384 if (!isVerticalWhitespace(Char))
2385 break;
2386
// NOTE(review): the condition guarding this early return (line 2387) is
// missing from this copy.
2388 // End of preprocessor directive line, let LexTokenInternal handle this.
2389 BufferPtr = CurPtr;
2390 return false;
2391 }
2392
2393 // OK, but handle newline.
2394 if (*CurPtr == '\n')
2395 setLastNewLine(CurPtr);
2396 SawNewline = true;
2397 Char = *++CurPtr;
2398 }
2399
2400 // If the client wants us to return whitespace, return it now.
2401 if (isKeepWhitespaceMode()) {
2402 FormTokenWithChars(Result, CurPtr, tok::unknown);
2403 if (SawNewline) {
2404 IsAtStartOfLine = true;
2405 IsAtPhysicalStartOfLine = true;
2406 }
2407 // FIXME: The next token will not have LeadingSpace set.
2408 return true;
2409 }
2410
2411 // If this isn't immediately after a newline, there is leading space.
2412 char PrevChar = CurPtr[-1];
2413 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2414
2415 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2416 if (SawNewline) {
2417 Result.setFlag(Token::StartOfLine);
2418 TokAtPhysicalStartOfLine = true;
2419
// Report the range between the first recorded newline and the last one seen
// here to the empty-line handler, if the preprocessor has one.
2420 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2421 if (auto *Handler = PP->getEmptylineHandler())
2422 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2423 getSourceLocation(lastNewLine)));
2424 }
2425 }
2426
2427 BufferPtr = CurPtr;
2428 return false;
2429}
2430
2431/// We have just read the // characters from input. Skip until we find the
2432/// newline character that terminates the comment. Then update BufferPtr and
2433/// return.
2434///
2435/// If we're in KeepCommentMode or any CommentHandler has inserted
2436/// some tokens, this will store the first token and return true.
///
/// \param Result  Receives the comment token or the flags for the next token.
/// \param CurPtr  Points just past the "//".
/// \param TokAtPhysicalStartOfLine Set to true when the terminating newline is
///                consumed here.
///
/// NOTE(review): the embedded listing numbers skip 2559 and 2568, so one
/// statement in the code-completion branch and the head of the comment-handler
/// call are not visible in this copy. Restore them from the upstream file
/// before building.
2437bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2438 bool &TokAtPhysicalStartOfLine) {
2439 // If Line comments aren't explicitly enabled for this language, emit an
2440 // extension warning.
2441 if (!LineComment) {
2442 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2443 Diag(BufferPtr, diag::ext_line_comment);
2444
2445 // Mark them enabled so we only emit one warning for this translation
2446 // unit.
2447 LineComment = true;
2448 }
2449
2450 // Scan over the body of the comment. The common case, when scanning, is that
2451 // the comment contains normal ascii characters with nothing interesting in
2452 // them. As such, optimize for this case with the inner loop.
2453 //
2454 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2455 // character that ends the line comment.
2456
2457 // C++23 [lex.phases] p1
2458 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2459 // diagnostic only once per entire ill-formed subsequence to avoid
2460 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2461 bool UnicodeDecodingAlreadyDiagnosed = false;
2462
2463 char C;
2464 while (true) {
2465 C = *CurPtr;
2466 // Skip over characters in the fast loop.
2467 while (isASCII(C) && C != 0 && // Potentially EOF.
2468 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2469 C = *++CurPtr;
2470 UnicodeDecodingAlreadyDiagnosed = false;
2471 }
2472
// Non-ASCII byte: step over a whole well-formed UTF-8 sequence, or over a
// single byte (with at most one diagnostic per ill-formed run) otherwise.
2473 if (!isASCII(C)) {
2474 unsigned Length = llvm::getUTF8SequenceSize(
2475 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2476 if (Length == 0) {
2477 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2478 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2479 UnicodeDecodingAlreadyDiagnosed = true;
2480 ++CurPtr;
2481 } else {
2482 UnicodeDecodingAlreadyDiagnosed = false;
2483 CurPtr += Length;
2484 }
2485 continue;
2486 }
2487
// Remember where the newline (or nul) is; CurPtr may be moved back to an
// escaping backslash below.
2488 const char *NextLine = CurPtr;
2489 if (C != 0) {
2490 // We found a newline, see if it's escaped.
2491 const char *EscapePtr = CurPtr-1;
2492 bool HasSpace = false;
2493 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2494 --EscapePtr;
2495 HasSpace = true;
2496 }
2497
2498 if (*EscapePtr == '\\')
2499 // Escaped newline.
2500 CurPtr = EscapePtr;
2501 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2502 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2503 // Trigraph-escaped newline.
2504 CurPtr = EscapePtr-2;
2505 else
2506 break; // This is a newline, we're done.
2507
2508 // If there was space between the backslash and newline, warn about it.
2509 if (HasSpace && !isLexingRawMode())
2510 Diag(EscapePtr, diag::backslash_newline_space);
2511 }
2512
2513 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2514 // properly decode the character. Read it in raw mode to avoid emitting
2515 // diagnostics about things like trigraphs. If we see an escaped newline,
2516 // we'll handle it below.
2517 const char *OldPtr = CurPtr;
2518 bool OldRawMode = isLexingRawMode();
2519 LexingRawMode = true;
2520 C = getAndAdvanceChar(CurPtr, Result);
2521 LexingRawMode = OldRawMode;
2522
2523 // If we only read only one character, then no special handling is needed.
2524 // We're done and can skip forward to the newline.
2525 if (C != 0 && CurPtr == OldPtr+1) {
2526 CurPtr = NextLine;
2527 break;
2528 }
2529
2530 // If we read multiple characters, and one of those characters was a \r or
2531 // \n, then we had an escaped newline within the comment. Emit diagnostic
2532 // unless the next line is also a // comment.
2533 if (CurPtr != OldPtr + 1 && C != '/' &&
2534 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2535 for (; OldPtr != CurPtr; ++OldPtr)
2536 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2537 // Okay, we found a // comment that ends in a newline, if the next
2538 // line is also a // comment, but has spaces, don't emit a diagnostic.
2539 if (isWhitespace(C)) {
2540 const char *ForwardPtr = CurPtr;
2541 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2542 ++ForwardPtr;
2543 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2544 break;
2545 }
2546
2547 if (!isLexingRawMode())
2548 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2549 break;
2550 }
2551 }
2552
// Stop on a newline or at the end of the buffer, un-consuming that character.
2553 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2554 --CurPtr;
2555 break;
2556 }
2557
2558 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): one statement (line 2559) is missing here.
2560 cutOffLexing();
2561 return false;
2562 }
2563 }
2564
2565 // Found but did not consume the newline. Notify comment handlers about the
2566 // comment unless we're in a #if 0 block.
// NOTE(review): the middle line of this condition, containing the head of the
// handler call (line 2568), is missing from this copy.
2567 if (PP && !isLexingRawMode() &&
2569 getSourceLocation(CurPtr)))) {
2570 BufferPtr = CurPtr;
2571 return true; // A token has to be returned.
2572 }
2573
2574 // If we are returning comments as tokens, return this comment as a token.
2575 if (inKeepCommentMode())
2576 return SaveLineComment(Result, CurPtr);
2577
2578 // If we are inside a preprocessor directive and we see the end of line,
2579 // return immediately, so that the lexer can return this as an EOD token.
2580 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2581 BufferPtr = CurPtr;
2582 return false;
2583 }
2584
2585 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2586 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2587 // contribute to another token), it isn't needed for correctness. Note that
2588 // this is ok even in KeepWhitespaceMode, because we would have returned the
2589 // comment above in that mode.
2590 NewLinePtr = CurPtr++;
2591
2592 // The next returned token is at the start of the line.
2593 Result.setFlag(Token::StartOfLine);
2594 TokAtPhysicalStartOfLine = true;
2595 // No leading whitespace seen so far.
2596 Result.clearFlag(Token::LeadingSpace);
2597 BufferPtr = CurPtr;
2598 return false;
2599}
2600
2601/// If in save-comment mode, package up this Line comment in an appropriate
2602/// way and return it.
///
/// \param Result Receives the tok::comment token.
/// \param CurPtr Points at the end of the comment.
/// \returns Always true.
///
/// NOTE(review): the embedded listing number 2608 is skipped, so the condition
/// guarding the first `return true;` is not visible in this copy. Restore it
/// from the upstream file before building.
2603bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2604 // If we're not in a preprocessor directive, just return the // comment
2605 // directly.
2606 FormTokenWithChars(Result, CurPtr, tok::comment);
2607
// NOTE(review): the `if` for this early return (line 2608) is missing.
2609 return true;
2610
2611 // If this Line-style comment is in a macro definition, transmogrify it into
2612 // a C-style block comment.
2613 bool Invalid = false;
2614 std::string Spelling = PP->getSpelling(Result, &Invalid);
2615 if (Invalid)
2616 return true;
2617
2618 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2619 Spelling[1] = '*'; // Change prefix to "/*".
2620 Spelling += "*/"; // add suffix.
2621
// Replace the token with one whose spelling is the rewritten block comment,
// keeping the original location.
2622 Result.setKind(tok::comment);
2623 PP->CreateString(Spelling, Result,
2624 Result.getLocation(), Result.getLocation());
2625 return true;
2626}
2627
2628/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2629/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2630/// a diagnostic if so. We know that the newline is inside of a block comment.
2631static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2632 bool Trigraphs) {
2633 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2634
2635 // Position of the first trigraph in the ending sequence.
2636 const char *TrigraphPos = nullptr;
2637 // Position of the first whitespace after a '\' in the ending sequence.
2638 const char *SpacePos = nullptr;
2639
2640 while (true) {
2641 // Back up off the newline.
2642 --CurPtr;
2643
2644 // If this is a two-character newline sequence, skip the other character.
2645 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2646 // \n\n or \r\r -> not escaped newline.
2647 if (CurPtr[0] == CurPtr[1])
2648 return false;
2649 // \n\r or \r\n -> skip the newline.
2650 --CurPtr;
2651 }
2652
2653 // If we have horizontal whitespace, skip over it. We allow whitespace
2654 // between the slash and newline.
2655 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2656 SpacePos = CurPtr;
2657 --CurPtr;
2658 }
2659
2660 // If we have a slash, this is an escaped newline.
2661 if (*CurPtr == '\\') {
2662 --CurPtr;
2663 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2664 // This is a trigraph encoding of a slash.
2665 TrigraphPos = CurPtr - 2;
2666 CurPtr -= 3;
2667 } else {
2668 return false;
2669 }
2670
2671 // If the character preceding the escaped newline is a '*', then after line
2672 // splicing we have a '*/' ending the comment.
2673 if (*CurPtr == '*')
2674 break;
2675
2676 if (*CurPtr != '\n' && *CurPtr != '\r')
2677 return false;
2678 }
2679
2680 if (TrigraphPos) {
2681 // If no trigraphs are enabled, warn that we ignored this trigraph and
2682 // ignore this * character.
2683 if (!Trigraphs) {
2684 if (!L->isLexingRawMode())
2685 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2686 return false;
2687 }
2688 if (!L->isLexingRawMode())
2689 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2690 }
2691
2692 // Warn about having an escaped newline between the */ characters.
2693 if (!L->isLexingRawMode())
2694 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2695
2696 // If there was space between the backslash and newline, warn about it.
2697 if (SpacePos && !L->isLexingRawMode())
2698 L->Diag(SpacePos, diag::backslash_newline_space);
2699
2700 return true;
2701}
2702
2703#ifdef __SSE2__
2704#include <emmintrin.h>
2705#elif __ALTIVEC__
2706#include <altivec.h>
2707#undef bool
2708#endif
2709
2710/// We have just read from input the / and * characters that started a comment.
2711/// Read until we find the * and / characters that terminate the comment.
2712/// Note that we don't bother decoding trigraphs or escaped newlines in block
2713/// comments, because they cannot cause the comment to end. The only thing
2714/// that can happen is the comment could end with an escaped newline between
2715/// the terminating * and /.
2716///
2717/// If we're in KeepCommentMode or any CommentHandler has inserted
2718/// some tokens, this will store the first token and return true.
///
/// \param Result Token that is filled in when a token has to be returned.
/// \param CurPtr Position just past the opening '/' and '*'.
/// \param TokAtPhysicalStartOfLine Forwarded to SkipWhitespace when the
/// horizontal whitespace following the comment is skipped.
/// \returns true if Result holds a token the caller must return; false if the
/// comment was skipped, was unterminated outside keep-whitespace mode, or
/// lexing was cut off at a code-completion point.
2719bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2720 bool &TokAtPhysicalStartOfLine) {
2721 // Scan one character past where we should, looking for a '/' character. Once
2722 // we find it, check to see if it was preceded by a *. This common
2723 // optimization helps people who like to put a lot of * characters in their
2724 // comments.
2725
2726 // The first character we get with newlines and trigraphs skipped to handle
2727 // the degenerate /*/ case below correctly if the * has an escaped newline
2728 // after it.
2729 unsigned CharSize;
2730 unsigned char C = getCharAndSize(CurPtr, CharSize);
2731 CurPtr += CharSize;
// A NUL read exactly at BufferEnd means the buffer ended right after the
// opening of the comment.
2732 if (C == 0 && CurPtr == BufferEnd+1) {
2733 if (!isLexingRawMode())
2734 Diag(BufferPtr, diag::err_unterminated_block_comment);
2735 --CurPtr;
2736
2737 // KeepWhitespaceMode should return this broken comment as a token. Since
2738 // it isn't a well formed comment, just return it as an 'unknown' token.
2739 if (isKeepWhitespaceMode()) {
2740 FormTokenWithChars(Result, CurPtr, tok::unknown);
2741 return true;
2742 }
2743
2744 BufferPtr = CurPtr;
2745 return false;
2746 }
2747
2748 // Check to see if the first character after the '/*' is another /. If so,
2749 // then this slash does not end the block comment, it is part of it.
2750 if (C == '/')
2751 C = *CurPtr++;
2752
2753 // C++23 [lex.phases] p1
2754 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2755 // diagnostic only once per entire ill-formed subsequence to avoid
2756 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2757 bool UnicodeDecodingAlreadyDiagnosed = false;
2758
2759 while (true) {
2760 // Skip over all non-interesting characters until we find end of buffer or a
2761 // (probably ending) '/' character.
2762 if (CurPtr + 24 < BufferEnd &&
2763 // If there is a code-completion point avoid the fast scan because it
2764 // doesn't check for '\0'.
2765 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2766 // While not aligned to a 16-byte boundary.
2767 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2768 if (!isASCII(C))
2769 goto MultiByteUTF8;
2770 C = *CurPtr++;
2771 }
2772 if (C == '/') goto FoundSlash;
2773
2774#ifdef __SSE2__
2775 __m128i Slashes = _mm_set1_epi8('/');
2776 while (CurPtr + 16 < BufferEnd) {
// The mask collects the top bit of each of the 16 bytes; any set bit means
// a non-ASCII byte, which is handed to the scalar UTF-8 path.
2777 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2778 if (LLVM_UNLIKELY(Mask != 0)) {
2779 goto MultiByteUTF8;
2780 }
2781 // look for slashes
2782 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2783 Slashes));
2784 if (cmp != 0) {
2785 // Adjust the pointer to point directly after the first slash. It's
2786 // not necessary to set C here, it will be overwritten at the end of
2787 // the outer loop.
2788 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2789 goto FoundSlash;
2790 }
2791 CurPtr += 16;
2792 }
2793#elif __ALTIVEC__
2794 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2795 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2796 0x80, 0x80, 0x80, 0x80};
2797 __vector unsigned char Slashes = {
2798 '/', '/', '/', '/', '/', '/', '/', '/',
2799 '/', '/', '/', '/', '/', '/', '/', '/'
2800 };
2801 while (CurPtr + 16 < BufferEnd) {
2802 if (LLVM_UNLIKELY(
2803 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2804 goto MultiByteUTF8;
2805 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2806 break;
2807 }
2808 CurPtr += 16;
2809 }
2810
2811#else
// Portable fallback: examine 16 bytes per iteration without intrinsics.
2812 while (CurPtr + 16 < BufferEnd) {
2813 bool HasNonASCII = false;
2814 for (unsigned I = 0; I < 16; ++I)
2815 HasNonASCII |= !isASCII(CurPtr[I]);
2816
2817 if (LLVM_UNLIKELY(HasNonASCII))
2818 goto MultiByteUTF8;
2819
2820 bool HasSlash = false;
2821 for (unsigned I = 0; I < 16; ++I)
2822 HasSlash |= CurPtr[I] == '/';
2823 if (HasSlash)
2824 break;
2825 CurPtr += 16;
2826 }
2827#endif
2828
2829 // It has to be one of the bytes scanned, increment to it and read one.
2830 C = *CurPtr++;
2831 }
2832
2833 // Loop to scan the remainder, warning on invalid UTF-8
2834 // if the corresponding warning is enabled, emitting a diagnostic only once
2835 // per sequence that cannot be decoded.
2836 while (C != '/' && C != '\0') {
2837 if (isASCII(C)) {
2838 UnicodeDecodingAlreadyDiagnosed = false;
2839 C = *CurPtr++;
2840 continue;
2841 }
2842 MultiByteUTF8:
2843 // CurPtr is 1 code unit past C, so to decode
2844 // the codepoint, we need to read from the previous position.
2845 unsigned Length = llvm::getUTF8SequenceSize(
2846 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2847 if (Length == 0) {
2848 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2849 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2850 UnicodeDecodingAlreadyDiagnosed = true;
2851 } else {
2852 UnicodeDecodingAlreadyDiagnosed = false;
// One byte of the sequence was already consumed when C was read.
2853 CurPtr += Length - 1;
2854 }
2855 C = *CurPtr++;
2856 }
2857
2858 if (C == '/') {
2859 FoundSlash:
2860 if (CurPtr[-2] == '*') // We found the final */. We're done!
2861 break;
2862
2863 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2864 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2865 LangOpts.Trigraphs)) {
2866 // We found the final */, though it had an escaped newline between the
2867 // * and /. We're done!
2868 break;
2869 }
2870 }
2871 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2872 // If this is a /* inside of the comment, emit a warning. Don't do this
2873 // if this is a /*/, which will end the comment. This misses cases with
2874 // embedded escaped newlines, but oh well.
2875 if (!isLexingRawMode())
2876 Diag(CurPtr-1, diag::warn_nested_block_comment);
2877 }
2878 } else if (C == 0 && CurPtr == BufferEnd+1) {
2879 if (!isLexingRawMode())
2880 Diag(BufferPtr, diag::err_unterminated_block_comment);
2881 // Note: the user probably forgot a */. We could continue immediately
2882 // after the /*, but this would involve lexing a lot of what really is the
2883 // comment, which surely would confuse the parser.
2884 --CurPtr;
2885
2886 // KeepWhitespaceMode should return this broken comment as a token. Since
2887 // it isn't a well formed comment, just return it as an 'unknown' token.
2888 if (isKeepWhitespaceMode()) {
2889 FormTokenWithChars(Result, CurPtr, tok::unknown);
2890 return true;
2891 }
2892
2893 BufferPtr = CurPtr;
2894 return false;
2895 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): listing line 2896 is absent from this extract; a statement
// preceding cutOffLexing() may have been dropped. Confirm against upstream.
2897 cutOffLexing();
2898 return false;
2899 }
2900
2901 C = *CurPtr++;
2902 }
2903
2904 // Notify comment handlers about the comment unless we're in a #if 0 block.
2905 if (PP && !isLexingRawMode() &&
// NOTE(review): listing line 2906 is absent from this extract, leaving this
// condition incomplete (unbalanced parentheses). Restore it from upstream.
2907 getSourceLocation(CurPtr)))) {
2908 BufferPtr = CurPtr;
2909 return true; // A token has to be returned.
2910 }
2911
2912 // If we are returning comments as tokens, return this comment as a token.
2913 if (inKeepCommentMode()) {
2914 FormTokenWithChars(Result, CurPtr, tok::comment);
2915 return true;
2916 }
2917
2918 // It is common for the tokens immediately after a /**/ comment to be
2919 // whitespace. Instead of going through the big switch, handle it
2920 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2921 // have already returned above with the comment as a token.
2922 if (isHorizontalWhitespace(*CurPtr)) {
2923 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2924 return false;
2925 }
2926
2927 // Otherwise, just return so that the next character will be lexed as a token.
2928 BufferPtr = CurPtr;
2929 Result.setFlag(Token::LeadingSpace);
2930 return false;
2931}
2932
2933//===----------------------------------------------------------------------===//
2934// Primary Lexing Entry Points
2935//===----------------------------------------------------------------------===//
2936
2937/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2938/// uninterpreted string. This switches the lexer out of directive mode.
///
/// Each character read is appended to *Result when Result is non-null.
// NOTE(review): listing line 2939 (presumably the function signature and
// opening brace) is absent from this extract. Restore it from upstream.
2940 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2941 "Must be in a preprocessing directive!");
2942 Token Tmp;
2943 Tmp.startToken();
2944
2945 // CurPtr - Cache BufferPtr in an automatic variable.
2946 const char *CurPtr = BufferPtr;
2947 while (true) {
2948 char Char = getAndAdvanceChar(CurPtr, Tmp);
2949 switch (Char) {
2950 default:
2951 if (Result)
2952 Result->push_back(Char);
2953 break;
2954 case 0: // Null.
2955 // Found end of file?
2956 if (CurPtr-1 != BufferEnd) {
2957 if (isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): listing line 2958 is absent from this extract; a statement
// preceding cutOffLexing() may have been dropped.
2959 cutOffLexing();
2960 return;
2961 }
2962
2963 // Nope, normal character, continue.
2964 if (Result)
2965 Result->push_back(Char);
2966 break;
2967 }
2968 // FALL THROUGH.
2969 [[fallthrough]];
2970 case '\r':
2971 case '\n':
2972 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2973 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2974 BufferPtr = CurPtr-1;
2975
2976 // Next, lex the character, which should handle the EOD transition.
2977 Lex(Tmp);
2978 if (Tmp.is(tok::code_completion)) {
2979 if (PP)
// NOTE(review): listing line 2980 is absent from this extract; as shown, the
// 'if (PP)' guards the Lex(Tmp) call below, which may not be the intent.
2981 Lex(Tmp);
2982 }
2983 assert(Tmp.is(tok::eod) && "Unexpected token!");
2984
2985 // Finally, we're done;
2986 return;
2987 }
2988 }
2989}
2990
2991/// LexEndOfFile - CurPtr points to the end of this file. Handle this
2992/// condition, reporting diagnostics and handling other edge cases as required.
2993/// This returns true if Result contains a token, false if PP.Lex should be
2994/// called again.
2995bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2996 // If we hit the end of the file while parsing a preprocessor directive,
2997 // end the preprocessor directive first. The next token returned will
2998 // then be the end of file.
// NOTE(review): listing line 2999 is absent from this extract; the condition
// opening this block (its closing brace is visible below) is missing.
3000 // Done parsing the "line".
// NOTE(review): listing line 3001 is absent from this extract.
3002 // Update the location of token as well as BufferPtr.
3003 FormTokenWithChars(Result, CurPtr, tok::eod);
3004
3005 // Restore comment saving mode, in case it was disabled for directive.
3006 if (PP)
// NOTE(review): listing line 3007 is absent from this extract; as shown, the
// 'if (PP)' guards the return below, which may not be the intent.
3008 return true; // Have a token.
3009 }
3010
3011 // If we are in raw mode, return this event as an EOF token. Let the caller
3012 // that put us in raw mode handle the event.
3013 if (isLexingRawMode()) {
3014 Result.startToken();
3015 BufferPtr = BufferEnd;
3016 FormTokenWithChars(Result, BufferEnd, tok::eof);
3017 return true;
3018 }
3019
// NOTE(review): listing lines 3020-3021 are absent from this extract; the
// condition opening the preamble-handling block below is missing.
3022 // If the preamble cuts off the end of a header guard, consider it guarded.
3023 // The guard is valid for the preamble content itself, and for tools the
3024 // most useful answer is "yes, this file has a header guard".
3025 if (!ConditionalStack.empty())
// NOTE(review): listing line 3026 is absent from this extract; as shown, the
// 'if' above guards ConditionalStack.clear(), which may not be the intent.
3027 ConditionalStack.clear();
3028 }
3029
3030 // Issue diagnostics for unterminated #if and missing newline.
3031
3032 // If we are in a #if directive, emit an error.
3033 while (!ConditionalStack.empty()) {
// The diagnostic is suppressed for the file that contains the
// code-completion point.
3034 if (PP->getCodeCompletionFileLoc() != FileLoc)
3035 PP->Diag(ConditionalStack.back().IfLoc,
3036 diag::err_pp_unterminated_conditional);
3037 ConditionalStack.pop_back();
3038 }
3039
3040 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3041 // a pedwarn.
3042 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
// NOTE(review): listing line 3043 is absent from this extract; 'Diags', used
// below, is presumably declared there.
3044 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3045 unsigned DiagID;
3046
3047 if (LangOpts.CPlusPlus11) {
3048 // C++11 [lex.phases] 2.2 p2
3049 // Prefer the C++98 pedantic compatibility warning over the generic,
3050 // non-extension, user-requested "missing newline at EOF" warning.
3051 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3052 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3053 } else {
3054 DiagID = diag::warn_no_newline_eof;
3055 }
3056 } else {
3057 DiagID = diag::ext_no_newline_eof;
3058 }
3059
// The fix-it inserts the missing newline at the end of the buffer.
3060 Diag(BufferEnd, DiagID)
3061 << FixItHint::CreateInsertion(EndLoc, "\n");
3062 }
3063
3064 BufferPtr = CurPtr;
3065
3066 // Finally, let the preprocessor handle this.
// NOTE(review): listing line 3067 (presumably the final return statement) is
// absent from this extract.
3068}
3069
3070/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3071/// the specified lexer will return a tok::l_paren token, 0 if it is something
3072/// else and 2 if there are no more tokens in the buffer controlled by the
3073/// lexer.
3074unsigned Lexer::isNextPPTokenLParen() {
3075 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3076
3077 if (isDependencyDirectivesLexer()) {
3078 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3079 return 2;
3080 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3081 tok::l_paren);
3082 }
3083
3084 // Switch to 'skipping' mode. This will ensure that we can lex a token
3085 // without emitting diagnostics, disables macro expansion, and will cause EOF
3086 // to return an EOF token instead of popping the include stack.
3087 LexingRawMode = true;
3088
3089 // Save state that can be changed while lexing so that we can restore it.
3090 const char *TmpBufferPtr = BufferPtr;
3091 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3092 bool atStartOfLine = IsAtStartOfLine;
3093 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3094 bool leadingSpace = HasLeadingSpace;
3095
3096 Token Tok;
3097 Lex(Tok);
3098
3099 // Restore state that may have changed.
3100 BufferPtr = TmpBufferPtr;
3101 ParsingPreprocessorDirective = inPPDirectiveMode;
3102 HasLeadingSpace = leadingSpace;
3103 IsAtStartOfLine = atStartOfLine;
3104 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3105
3106 // Restore the lexer back to non-skipping mode.
3107 LexingRawMode = false;
3108
3109 if (Tok.is(tok::eof))
3110 return 2;
3111 return Tok.is(tok::l_paren);
3112}
3113
3114/// Find the end of a version control conflict marker.
3115static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3116 ConflictMarkerKind CMK) {
3117 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3118 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3119 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3120 size_t Pos = RestOfBuffer.find(Terminator);
3121 while (Pos != StringRef::npos) {
3122 // Must occur at start of line.
3123 if (Pos == 0 ||
3124 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3125 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3126 Pos = RestOfBuffer.find(Terminator);
3127 continue;
3128 }
3129 return RestOfBuffer.data()+Pos;
3130 }
3131 return nullptr;
3132}
3133
3134/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3135/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3136/// and recover nicely. This returns true if it is a conflict marker and false
3137/// if not.
3138bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3139 // Only a conflict marker if it starts at the beginning of a line.
3140 if (CurPtr != BufferStart &&
3141 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3142 return false;
3143
3144 // Check to see if we have <<<<<<< or >>>>.
3145 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3146 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3147 return false;
3148
3149 // If we have a situation where we don't care about conflict markers, ignore
3150 // it.
3151 if (CurrentConflictMarkerState || isLexingRawMode())
3152 return false;
3153
3154 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3155
3156 // Check to see if there is an ending marker somewhere in the buffer at the
3157 // start of a line to terminate this conflict marker.
3158 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3159 // We found a match. We are really in a conflict marker.
3160 // Diagnose this, and ignore to the end of line.
3161 Diag(CurPtr, diag::err_conflict_marker);
3162 CurrentConflictMarkerState = Kind;
3163
3164 // Skip ahead to the end of line. We know this exists because the
3165 // end-of-conflict marker starts with \r or \n.
3166 while (*CurPtr != '\r' && *CurPtr != '\n') {
3167 assert(CurPtr != BufferEnd && "Didn't find end of line");
3168 ++CurPtr;
3169 }
3170 BufferPtr = CurPtr;
3171 return true;
3172 }
3173
3174 // No end of conflict marker found.
3175 return false;
3176}
3177
3178/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3179/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3180/// is the end of a conflict marker. Handle it by ignoring up until the end of
3181/// the line. This returns true if it is a conflict marker and false if not.
3182bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3183 // Only a conflict marker if it starts at the beginning of a line.
3184 if (CurPtr != BufferStart &&
3185 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3186 return false;
3187
3188 // If we have a situation where we don't care about conflict markers, ignore
3189 // it.
3190 if (!CurrentConflictMarkerState || isLexingRawMode())
3191 return false;
3192
3193 // Check to see if we have the marker (4 characters in a row).
3194 for (unsigned i = 1; i != 4; ++i)
3195 if (CurPtr[i] != CurPtr[0])
3196 return false;
3197
3198 // If we do have it, search for the end of the conflict marker. This could
3199 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3200 // be the end of conflict marker.
3201 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3202 CurrentConflictMarkerState)) {
3203 CurPtr = End;
3204
3205 // Skip ahead to the end of line.
3206 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3207 ++CurPtr;
3208
3209 BufferPtr = CurPtr;
3210
3211 // No longer in the conflict marker.
3212 CurrentConflictMarkerState = CMK_None;
3213 return true;
3214 }
3215
3216 return false;
3217}
3218
/// Find the end of an editor placeholder, i.e. the first "#>" that lies
/// entirely within [CurPtr, BufferEnd).
///
/// \returns a pointer just past the '>' of the first "#>", or nullptr if the
/// range is empty or contains no "#>".
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // A two-character match cannot start on the last character, so scan only
  // up to the second last one; this also keeps Pos[1] inside the range.
  const char *const LastStart = BufferEnd - 1;
  for (const char *Pos = CurPtr; Pos != LastStart; ++Pos)
    if (Pos[0] == '#' && Pos[1] == '>')
      return Pos + 2;
  return nullptr;
}
3230
/// Lex an editor placeholder that starts with "<#"; \p CurPtr points at the
/// '#'. Returns true, with \p Result formed as a tok::raw_identifier that
/// starts at the '<' and runs through the closing "#>", or false if no "#>"
/// is found before BufferEnd.
3231bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3232 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
// NOTE(review): listing line 3233 is absent from this extract; it presumably
// held the condition guarding the 'return false' below.
3234 return false;
3235 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3236 if (!End)
3237 return false;
// Start is the '<' that precedes the '#'.
3238 const char *Start = CurPtr - 1;
3239 if (!LangOpts.AllowEditorPlaceholders)
3240 Diag(Start, diag::err_placeholder_in_source);
3241 Result.startToken();
3242 FormTokenWithChars(Result, End, tok::raw_identifier);
3243 Result.setRawIdentifierData(Start);
// NOTE(review): listing lines 3244-3245 are absent from this extract.
3246 BufferPtr = End;
3247 return true;
3248}
3249
3250bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3251 if (PP && PP->isCodeCompletionEnabled()) {
3252 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3253 return Loc == PP->getCodeCompletionLoc();
3254 }
3255
3256 return false;
3257}
3258
3259std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3260 const char *SlashLoc,
3261 Token *Result) {
3262 unsigned CharSize;
3263 char Kind = getCharAndSize(StartPtr, CharSize);
3264 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3265
3266 unsigned NumHexDigits;
3267 if (Kind == 'u')
3268 NumHexDigits = 4;
3269 else if (Kind == 'U')
3270 NumHexDigits = 8;
3271
3272 bool Delimited = false;
3273 bool FoundEndDelimiter = false;
3274 unsigned Count = 0;
3275 bool Diagnose = Result && !isLexingRawMode();
3276
3277 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3278 if (Diagnose)
3279 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3280 return std::nullopt;
3281 }
3282
3283 const char *CurPtr = StartPtr + CharSize;
3284 const char *KindLoc = &CurPtr[-1];
3285
3286 uint32_t CodePoint = 0;
3287 while (Count != NumHexDigits || Delimited) {
3288 char C = getCharAndSize(CurPtr, CharSize);
3289 if (!Delimited && Count == 0 && C == '{') {
3290 Delimited = true;
3291 CurPtr += CharSize;
3292 continue;
3293 }
3294
3295 if (Delimited && C == '}') {
3296 CurPtr += CharSize;
3297 FoundEndDelimiter = true;
3298 break;
3299 }
3300
3301 unsigned Value = llvm::hexDigitValue(C);
3302 if (Value == -1U) {
3303 if (!Delimited)
3304 break;
3305 if (Diagnose)
3306 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3307 << StringRef(KindLoc, 1);
3308 return std::nullopt;
3309 }
3310
3311 if (CodePoint & 0xF000'0000) {
3312 if (Diagnose)
3313 Diag(KindLoc, diag::err_escape_too_large) << 0;
3314 return std::nullopt;
3315 }
3316
3317 CodePoint <<= 4;
3318 CodePoint |= Value;
3319 CurPtr += CharSize;
3320 Count++;
3321 }
3322
3323 if (Count == 0) {
3324 if (Diagnose)
3325 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3326 : diag::warn_ucn_escape_no_digits)
3327 << StringRef(KindLoc, 1);
3328 return std::nullopt;
3329 }
3330
3331 if (Delimited && Kind == 'U') {
3332 if (Diagnose)
3333 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3334 return std::nullopt;
3335 }
3336
3337 if (!Delimited && Count != NumHexDigits) {
3338 if (Diagnose) {
3339 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3340 // If the user wrote \U1234, suggest a fixit to \u.
3341 if (Count == 4 && NumHexDigits == 8) {
3342 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3343 Diag(KindLoc, diag::note_ucn_four_not_eight)
3344 << FixItHint::CreateReplacement(URange, "u");
3345 }
3346 }
3347 return std::nullopt;
3348 }
3349
3350 if (Delimited && PP) {
3351 Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
3352 ? diag::warn_cxx2b_delimited_escape_sequence
3353 : diag::ext_delimited_escape_sequence)
3354 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3355 }
3356
3357 if (Result) {
3358 Result->setFlag(Token::HasUCN);
3359 // If the UCN contains either a trigraph or a line splicing,
3360 // we need to call getAndAdvanceChar again to set the appropriate flags
3361 // on Result.
3362 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3363 StartPtr = CurPtr;
3364 else
3365 while (StartPtr != CurPtr)
3366 (void)getAndAdvanceChar(StartPtr, *Result);
3367 } else {
3368 StartPtr = CurPtr;
3369 }
3370 return CodePoint;
3371}
3372
/// Try to read a named universal character name of the form \N{...};
/// \p StartPtr points at the 'N'. Returns std::nullopt when the '{' is
/// missing, when the name is empty or unterminated, or when no code point was
/// selected for the name. \p StartPtr is advanced only on the paths that reach
/// the end of the function.
3373std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3374 const char *SlashLoc,
3375 Token *Result) {
3376 unsigned CharSize;
3377 bool Diagnose = Result && !isLexingRawMode();
3378
3379 char C = getCharAndSize(StartPtr, CharSize);
3380 assert(C == 'N' && "expected \\N{...}");
3381
3382 const char *CurPtr = StartPtr + CharSize;
3383 const char *KindLoc = &CurPtr[-1];
3384
3385 C = getCharAndSize(CurPtr, CharSize);
3386 if (C != '{') {
3387 if (Diagnose)
3388 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3389 return std::nullopt;
3390 }
3391 CurPtr += CharSize;
3392 const char *StartName = CurPtr;
3393 bool FoundEndDelimiter = false;
// NOTE(review): listing line 3394 is absent from this extract; 'Buffer', used
// below, is presumably declared there.
3395 while (C) {
3396 C = getCharAndSize(CurPtr, CharSize);
3397 CurPtr += CharSize;
3398 if (C == '}') {
3399 FoundEndDelimiter = true;
3400 break;
3401 }
3402
// NOTE(review): listing line 3403 is absent from this extract; it presumably
// held the condition guarding the 'break' below.
3404 break;
3405 Buffer.push_back(C);
3406 }
3407
3408 if (!FoundEndDelimiter || Buffer.empty()) {
3409 if (Diagnose)
3410 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3411 : diag::warn_delimited_ucn_incomplete)
3412 << StringRef(KindLoc, 1);
3413 return std::nullopt;
3414 }
3415
// Try an exact name match first; loose matching is consulted only when the
// strict lookup fails.
3416 StringRef Name(Buffer.data(), Buffer.size());
3417 std::optional<char32_t> Match =
3418 llvm::sys::unicode::nameToCodepointStrict(Name);
3419 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3420 if (!Match) {
3421 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3422 if (Diagnose) {
3423 Diag(StartName, diag::err_invalid_ucn_name)
3424 << StringRef(Buffer.data(), Buffer.size())
3425 << makeCharRange(*this, StartName, CurPtr - CharSize);
3426 if (LooseMatch) {
3427 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
// NOTE(review): listing line 3428 is absent from this extract, leaving this
// statement incomplete (unbalanced parentheses). Restore it from upstream.
3429 makeCharRange(*this, StartName, CurPtr - CharSize),
3430 LooseMatch->Name);
3431 }
3432 }
3433 // We do not offer misspelled character names suggestions here
3434 // as the set of what would be a valid suggestion depends on context,
3435 // and we should not make invalid suggestions.
3436 }
3437
3438 if (Diagnose && Match)
3439 Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
3440 ? diag::warn_cxx2b_delimited_escape_sequence
3441 : diag::ext_delimited_escape_sequence)
3442 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3443
3444 // If no diagnostic has been emitted yet, likely because we are doing a
3445 // tentative lexing, we do not want to recover here to make sure the token
3446 // will not be incorrectly considered valid. This function will be called
3447 // again and a diagnostic emitted then.
3448 if (LooseMatch && Diagnose)
3449 Match = LooseMatch->CodePoint;
3450
3451 if (Result) {
3452 Result->setFlag(Token::HasUCN);
3453 // If the UCN contains either a trigraph or a line splicing,
3454 // we need to call getAndAdvanceChar again to set the appropriate flags
3455 // on Result.
// The "+ 3" accounts for the 'N' and the two braces around the name.
3456 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3457 StartPtr = CurPtr;
3458 else
3459 while (StartPtr != CurPtr)
3460 (void)getAndAdvanceChar(StartPtr, *Result);
3461 } else {
3462 StartPtr = CurPtr;
3463 }
3464 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3465}
3466
3467uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3468 Token *Result) {
3469
3470 unsigned CharSize;
3471 std::optional<uint32_t> CodePointOpt;
3472 char Kind = getCharAndSize(StartPtr, CharSize);
3473 if (Kind == 'u' || Kind == 'U')
3474 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3475 else if (Kind == 'N')
3476 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3477
3478 if (!CodePointOpt)
3479 return 0;
3480
3481 uint32_t CodePoint = *CodePointOpt;
3482
3483 // Don't apply C family restrictions to UCNs in assembly mode
3484 if (LangOpts.AsmPreprocessor)
3485 return CodePoint;
3486
3487 // C99 6.4.3p2: A universal character name shall not specify a character whose
3488 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3489 // 0060 (`), nor one in the range D800 through DFFF inclusive.)
3490 // C++11 [lex.charset]p2: If the hexadecimal value for a
3491 // universal-character-name corresponds to a surrogate code point (in the
3492 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3493 // if the hexadecimal value for a universal-character-name outside the
3494 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3495 // string literal corresponds to a control character (in either of the
3496 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3497 // basic source character set, the program is ill-formed.
3498 if (CodePoint < 0xA0) {
3499 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
3500 return CodePoint;
3501
3502 // We don't use isLexingRawMode() here because we need to warn about bad
3503 // UCNs even when skipping preprocessing tokens in a #if block.
3504 if (Result && PP) {
3505 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3506 Diag(BufferPtr, diag::err_ucn_control_character);
3507 else {
3508 char C = static_cast<char>(CodePoint);
3509 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3510 }
3511 }
3512
3513 return 0;
3514 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3515 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3516 // We don't use isLexingRawMode() here because we need to diagnose bad
3517 // UCNs even when skipping preprocessing tokens in a #if block.
3518 if (Result && PP) {
3519 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3520 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3521 else
3522 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3523 }
3524 return 0;
3525 }
3526
3527 return CodePoint;
3528}
3529
/// When the condition below holds, diagnose the characters in
/// [BufferPtr, CurPtr) with ext_unicode_whitespace, mark \p Result as having
/// leading space and return true; otherwise return false.
3530bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3531 const char *CurPtr) {
3532 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
// NOTE(review): listing line 3533 is absent from this extract, leaving the
// condition incomplete; it presumably tests the code point C, which is
// otherwise unused here. Restore it from upstream.
3534 Diag(BufferPtr, diag::ext_unicode_whitespace)
3535 << makeCharRange(*this, BufferPtr, CurPtr);
3536
3537 Result.setFlag(Token::LeadingSpace);
3538 return true;
3539 }
3540 return false;
3541}
3542
3543void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3544 IsAtStartOfLine = Result.isAtStartOfLine();
3545 HasLeadingSpace = Result.hasLeadingSpace();
3546 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3547 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3548}
3549
/// Lex the next token into \p Result. The lexer's pending start-of-line and
/// leading-space state is moved onto the token and cleared before
/// LexTokenInternal runs. Returns whatever LexTokenInternal returns; the
/// assertion at the end requires that to be true when lexing in raw mode.
3550bool Lexer::Lex(Token &Result) {
3551 assert(!isDependencyDirectivesLexer());
3552
3553 // Start a new token.
3554 Result.startToken();
3555
3556 // Set up misc whitespace flags for LexTokenInternal.
3557 if (IsAtStartOfLine) {
3558 Result.setFlag(Token::StartOfLine);
3559 IsAtStartOfLine = false;
3560 }
3561
3562 if (HasLeadingSpace) {
3563 Result.setFlag(Token::LeadingSpace);
3564 HasLeadingSpace = false;
3565 }
3566
3567 if (HasLeadingEmptyMacro) {
// NOTE(review): listing line 3568 is absent from this extract; by analogy
// with the two blocks above it presumably sets a flag on Result.
3569 HasLeadingEmptyMacro = false;
3570 }
3571
3572 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3573 IsAtPhysicalStartOfLine = false;
// The raw-mode state is captured before the call because the lexer may no
// longer exist afterwards; the void cast keeps isRawLex "used" when the
// assert is compiled out.
3574 bool isRawLex = isLexingRawMode();
3575 (void) isRawLex;
3576 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3577 // (After the LexTokenInternal call, the lexer might be destroyed.)
3578 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3579 return returnedToken;
3580}
3581
3582/// LexTokenInternal - This implements a simple C family lexer. It is an
3583/// extremely performance critical piece of code. This assumes that the buffer
3584/// has a null character at the end of the file. This returns a preprocessing
3585/// token, not a normal token, as such, it is an internal interface. It assumes
3586/// that the Flags of result have been cleared before calling this.
3587bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3588LexStart:
3589 assert(!Result.needsCleaning() && "Result needs cleaning");
3590 assert(!Result.hasPtrData() && "Result has not been reset");
3591
3592 // CurPtr - Cache BufferPtr in an automatic variable.
3593 const char *CurPtr = BufferPtr;
3594
3595 // Small amounts of horizontal whitespace is very common between tokens.
3596 if (isHorizontalWhitespace(*CurPtr)) {
3597 do {
3598 ++CurPtr;
3599 } while (isHorizontalWhitespace(*CurPtr));
3600
3601 // If we are keeping whitespace and other tokens, just return what we just
3602 // skipped. The next lexer invocation will return the token after the
3603 // whitespace.
3604 if (isKeepWhitespaceMode()) {
3605 FormTokenWithChars(Result, CurPtr, tok::unknown);
3606 // FIXME: The next token will not have LeadingSpace set.
3607 return true;
3608 }
3609
3610 BufferPtr = CurPtr;
3611 Result.setFlag(Token::LeadingSpace);
3612 }
3613
3614 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3615
3616 // Read a character, advancing over it.
3617 char Char = getAndAdvanceChar(CurPtr, Result);
3619
3620 if (!isVerticalWhitespace(Char))
3621 NewLinePtr = nullptr;
3622
3623 switch (Char) {
3624 case 0: // Null.
3625 // Found end of file?
3626 if (CurPtr-1 == BufferEnd)
3627 return LexEndOfFile(Result, CurPtr-1);
3628
3629 // Check if we are performing code completion.
3630 if (isCodeCompletionPoint(CurPtr-1)) {
3631 // Return the code-completion token.
3632 Result.startToken();
3633 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3634 return true;
3635 }
3636
3637 if (!isLexingRawMode())
3638 Diag(CurPtr-1, diag::null_in_file);
3639 Result.setFlag(Token::LeadingSpace);
3640 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3641 return true; // KeepWhitespaceMode
3642
3643 // We know the lexer hasn't changed, so just try again with this lexer.
3644 // (We manually eliminate the tail call to avoid recursion.)
3645 goto LexNextToken;
3646
3647 case 26: // DOS & CP/M EOF: "^Z".
3648 // If we're in Microsoft extensions mode, treat this as end of file.
3649 if (LangOpts.MicrosoftExt) {
3650 if (!isLexingRawMode())
3651 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3652 return LexEndOfFile(Result, CurPtr-1);
3653 }
3654
3655 // If Microsoft extensions are disabled, this is just random garbage.
3656 Kind = tok::unknown;
3657 break;
3658
3659 case '\r':
3660 if (CurPtr[0] == '\n')
3661 (void)getAndAdvanceChar(CurPtr, Result);
3662 [[fallthrough]];
3663 case '\n':
3664 // If we are inside a preprocessor directive and we see the end of line,
3665 // we know we are done with the directive, so return an EOD token.
3667 // Done parsing the "line".
3669
3670 // Restore comment saving mode, in case it was disabled for directive.
3671 if (PP)
3673
3674 // Since we consumed a newline, we are back at the start of a line.
3675 IsAtStartOfLine = true;
3676 IsAtPhysicalStartOfLine = true;
3677 NewLinePtr = CurPtr - 1;
3678
3679 Kind = tok::eod;
3680 break;
3681 }
3682
3683 // No leading whitespace seen so far.
3684 Result.clearFlag(Token::LeadingSpace);
3685
3686 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3687 return true; // KeepWhitespaceMode
3688
3689 // We only saw whitespace, so just try again with this lexer.
3690 // (We manually eliminate the tail call to avoid recursion.)
3691 goto LexNextToken;
3692 case ' ':
3693 case '\t':
3694 case '\f':
3695 case '\v':
3696 SkipHorizontalWhitespace:
3697 Result.setFlag(Token::LeadingSpace);
3698 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3699 return true; // KeepWhitespaceMode
3700
3701 SkipIgnoredUnits:
3702 CurPtr = BufferPtr;
3703
3704 // If the next token is obviously a // or /* */ comment, skip it efficiently
3705 // too (without going through the big switch stmt).
3706 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3707 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3708 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3709 return true; // There is a token to return.
3710 goto SkipIgnoredUnits;
3711 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3712 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3713 return true; // There is a token to return.
3714 goto SkipIgnoredUnits;
3715 } else if (isHorizontalWhitespace(*CurPtr)) {
3716 goto SkipHorizontalWhitespace;
3717 }
3718 // We only saw whitespace, so just try again with this lexer.
3719 // (We manually eliminate the tail call to avoid recursion.)
3720 goto LexNextToken;
3721
3722 // C99 6.4.4.1: Integer Constants.
3723 // C99 6.4.4.2: Floating Constants.
3724 case '0': case '1': case '2': case '3': case '4':
3725 case '5': case '6': case '7': case '8': case '9':
3726 // Notify MIOpt that we read a non-whitespace/non-comment token.
3727 MIOpt.ReadToken();
3728 return LexNumericConstant(Result, CurPtr);
3729
3730 // Identifier (e.g., uber), or
3731 // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or
3732 // UTF-8 or UTF-16 string literal (C11/C++11).
3733 case 'u':
3734 // Notify MIOpt that we read a non-whitespace/non-comment token.
3735 MIOpt.ReadToken();
3736
3737 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3738 Char = getCharAndSize(CurPtr, SizeTmp);
3739
3740 // UTF-16 string literal
3741 if (Char == '"')
3742 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3743 tok::utf16_string_literal);
3744
3745 // UTF-16 character constant
3746 if (Char == '\'')
3747 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3748 tok::utf16_char_constant);
3749
3750 // UTF-16 raw string literal
3751 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3752 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3753 return LexRawStringLiteral(Result,
3754 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3755 SizeTmp2, Result),
3756 tok::utf16_string_literal);
3757
3758 if (Char == '8') {
3759 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3760
3761 // UTF-8 string literal
3762 if (Char2 == '"')
3763 return LexStringLiteral(Result,
3764 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3765 SizeTmp2, Result),
3766 tok::utf8_string_literal);
3767 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x))
3768 return LexCharConstant(
3769 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3770 SizeTmp2, Result),
3771 tok::utf8_char_constant);
3772
3773 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3774 unsigned SizeTmp3;
3775 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3776 // UTF-8 raw string literal
3777 if (Char3 == '"') {
3778 return LexRawStringLiteral(Result,
3779 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3780 SizeTmp2, Result),
3781 SizeTmp3, Result),
3782 tok::utf8_string_literal);
3783 }
3784 }
3785 }
3786 }
3787
3788 // treat u like the start of an identifier.
3789 return LexIdentifierContinue(Result, CurPtr);
3790
3791 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3792 // Notify MIOpt that we read a non-whitespace/non-comment token.
3793 MIOpt.ReadToken();
3794
3795 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3796 Char = getCharAndSize(CurPtr, SizeTmp);
3797
3798 // UTF-32 string literal
3799 if (Char == '"')
3800 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3801 tok::utf32_string_literal);
3802
3803 // UTF-32 character constant
3804 if (Char == '\'')
3805 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3806 tok::utf32_char_constant);
3807
3808 // UTF-32 raw string literal
3809 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3810 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3811 return LexRawStringLiteral(Result,
3812 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3813 SizeTmp2, Result),
3814 tok::utf32_string_literal);
3815 }
3816
3817 // treat U like the start of an identifier.
3818 return LexIdentifierContinue(Result, CurPtr);
3819
3820 case 'R': // Identifier or C++0x raw string literal
3821 // Notify MIOpt that we read a non-whitespace/non-comment token.
3822 MIOpt.ReadToken();
3823
3824 if (LangOpts.CPlusPlus11) {
3825 Char = getCharAndSize(CurPtr, SizeTmp);
3826
3827 if (Char == '"')
3828 return LexRawStringLiteral(Result,
3829 ConsumeChar(CurPtr, SizeTmp, Result),
3830 tok::string_literal);
3831 }
3832
3833 // treat R like the start of an identifier.
3834 return LexIdentifierContinue(Result, CurPtr);
3835
3836 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3837 // Notify MIOpt that we read a non-whitespace/non-comment token.
3838 MIOpt.ReadToken();
3839 Char = getCharAndSize(CurPtr, SizeTmp);
3840
3841 // Wide string literal.
3842 if (Char == '"')
3843 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3844 tok::wide_string_literal);
3845
3846 // Wide raw string literal.
3847 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3848 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3849 return LexRawStringLiteral(Result,
3850 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3851 SizeTmp2, Result),
3852 tok::wide_string_literal);
3853
3854 // Wide character constant.
3855 if (Char == '\'')
3856 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3857 tok::wide_char_constant);
3858 // FALL THROUGH, treating L like the start of an identifier.
3859 [[fallthrough]];
3860
3861 // C99 6.4.2: Identifiers.
3862 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3863 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3864 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3865 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3866 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3867 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3868 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3869 case 'v': case 'w': case 'x': case 'y': case 'z':
3870 case '_':
3871 // Notify MIOpt that we read a non-whitespace/non-comment token.
3872 MIOpt.ReadToken();
3873 return LexIdentifierContinue(Result, CurPtr);
3874
3875 case '$': // $ in identifiers.
3876 if (LangOpts.DollarIdents) {
3877 if (!isLexingRawMode())
3878 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3879 // Notify MIOpt that we read a non-whitespace/non-comment token.
3880 MIOpt.ReadToken();
3881 return LexIdentifierContinue(Result, CurPtr);
3882 }
3883
3884 Kind = tok::unknown;
3885 break;
3886
3887 // C99 6.4.4: Character Constants.
3888 case '\'':
3889 // Notify MIOpt that we read a non-whitespace/non-comment token.
3890 MIOpt.ReadToken();
3891 return LexCharConstant(Result, CurPtr, tok::char_constant);
3892
3893 // C99 6.4.5: String Literals.
3894 case '"':
3895 // Notify MIOpt that we read a non-whitespace/non-comment token.
3896 MIOpt.ReadToken();
3897 return LexStringLiteral(Result, CurPtr,
3898 ParsingFilename ? tok::header_name
3899 : tok::string_literal);
3900
3901 // C99 6.4.6: Punctuators.
3902 case '?':
3903 Kind = tok::question;
3904 break;
3905 case '[':
3906 Kind = tok::l_square;
3907 break;
3908 case ']':
3909 Kind = tok::r_square;
3910 break;
3911 case '(':
3912 Kind = tok::l_paren;
3913 break;
3914 case ')':
3915 Kind = tok::r_paren;
3916 break;
3917 case '{':
3918 Kind = tok::l_brace;
3919 break;
3920 case '}':
3921 Kind = tok::r_brace;
3922 break;
3923 case '.':
3924 Char = getCharAndSize(CurPtr, SizeTmp);
3925 if (Char >= '0' && Char <= '9') {
3926 // Notify MIOpt that we read a non-whitespace/non-comment token.
3927 MIOpt.ReadToken();
3928
3929 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3930 } else if (LangOpts.CPlusPlus && Char == '*') {
3931 Kind = tok::periodstar;
3932 CurPtr += SizeTmp;
3933 } else if (Char == '.' &&
3934 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3935 Kind = tok::ellipsis;
3936 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3937 SizeTmp2, Result);
3938 } else {
3939 Kind = tok::period;
3940 }
3941 break;
3942 case '&':
3943 Char = getCharAndSize(CurPtr, SizeTmp);
3944 if (Char == '&') {
3945 Kind = tok::ampamp;
3946 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3947 } else if (Char == '=') {
3948 Kind = tok::ampequal;
3949 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3950 } else {
3951 Kind = tok::amp;
3952 }
3953 break;
3954 case '*':
3955 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3956 Kind = tok::starequal;
3957 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3958 } else {
3959 Kind = tok::star;
3960 }
3961 break;
3962 case '+':
3963 Char = getCharAndSize(CurPtr, SizeTmp);
3964 if (Char == '+') {
3965 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3966 Kind = tok::plusplus;
3967 } else if (Char == '=') {
3968 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3969 Kind = tok::plusequal;
3970 } else {
3971 Kind = tok::plus;
3972 }
3973 break;
3974 case '-':
3975 Char = getCharAndSize(CurPtr, SizeTmp);
3976 if (Char == '-') { // --
3977 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3978 Kind = tok::minusminus;
3979 } else if (Char == '>' && LangOpts.CPlusPlus &&
3980 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
3981 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3982 SizeTmp2, Result);
3983 Kind = tok::arrowstar;
3984 } else if (Char == '>') { // ->
3985 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3986 Kind = tok::arrow;
3987 } else if (Char == '=') { // -=
3988 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3989 Kind = tok::minusequal;
3990 } else {
3991 Kind = tok::minus;
3992 }
3993 break;
3994 case '~':
3995 Kind = tok::tilde;
3996 break;
3997 case '!':
3998 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3999 Kind = tok::exclaimequal;
4000 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4001 } else {
4002 Kind = tok::exclaim;
4003 }
4004 break;
4005 case '/':
4006 // 6.4.9: Comments
4007 Char = getCharAndSize(CurPtr, SizeTmp);
4008 if (Char == '/') { // Line comment.
4009 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4010 // want to lex this as a comment. There is one problem with this though,
4011 // that in one particular corner case, this can change the behavior of the
4012 // resultant program. For example, In "foo //**/ bar", C89 would lex
4013 // this as "foo / bar" and languages with Line comments would lex it as
4014 // "foo". Check to see if the character after the second slash is a '*'.
4015 // If so, we will lex that as a "/" instead of the start of a comment.
4016 // However, we never do this if we are just preprocessing.
4017 bool TreatAsComment =
4018 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4019 if (!TreatAsComment)
4020 if (!(PP && PP->isPreprocessedOutput()))
4021 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4022
4023 if (TreatAsComment) {
4024 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4025 TokAtPhysicalStartOfLine))
4026 return true; // There is a token to return.
4027
4028 // It is common for the tokens immediately after a // comment to be
4029 // whitespace (indentation for the next line). Instead of going through
4030 // the big switch, handle it efficiently now.
4031 goto SkipIgnoredUnits;
4032 }
4033 }
4034
4035 if (Char == '*') { // /**/ comment.
4036 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4037 TokAtPhysicalStartOfLine))
4038 return true; // There is a token to return.
4039
4040 // We only saw whitespace, so just try again with this lexer.
4041 // (We manually eliminate the tail call to avoid recursion.)
4042 goto LexNextToken;
4043 }
4044
4045 if (Char == '=') {
4046 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4047 Kind = tok::slashequal;
4048 } else {
4049 Kind = tok::slash;
4050 }
4051 break;
4052 case '%':
4053 Char = getCharAndSize(CurPtr, SizeTmp);
4054 if (Char == '=') {
4055 Kind = tok::percentequal;
4056 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4057 } else if (LangOpts.Digraphs && Char == '>') {
4058 Kind = tok::r_brace; // '%>' -> '}'
4059 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4060 } else if (LangOpts.Digraphs && Char == ':') {
4061 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4062 Char = getCharAndSize(CurPtr, SizeTmp);
4063 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4064 Kind = tok::hashhash; // '%:%:' -> '##'
4065 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4066 SizeTmp2, Result);
4067 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4068 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4069 if (!isLexingRawMode())
4070 Diag(BufferPtr, diag::ext_charize_microsoft);
4071 Kind = tok::hashat;
4072 } else { // '%:' -> '#'
4073 // We parsed a # character. If this occurs at the start of the line,
4074 // it's actually the start of a preprocessing directive. Callback to
4075 // the preprocessor to handle it.
4076 // TODO: -fpreprocessed mode??
4077 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4078 goto HandleDirective;
4079
4080 Kind = tok::hash;
4081 }
4082 } else {
4083 Kind = tok::percent;
4084 }
4085 break;
4086 case '<':
4087 Char = getCharAndSize(CurPtr, SizeTmp);
4088 if (ParsingFilename) {
4089 return LexAngledStringLiteral(Result, CurPtr);
4090 } else if (Char == '<') {
4091 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4092 if (After == '=') {
4093 Kind = tok::lesslessequal;
4094 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4095 SizeTmp2, Result);
4096 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4097 // If this is actually a '<<<<<<<' version control conflict marker,
4098 // recognize it as such and recover nicely.
4099 goto LexNextToken;
4100 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4101 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4102 // ignore it.
4103 goto LexNextToken;
4104 } else if (LangOpts.CUDA && After == '<') {
4105 Kind = tok::lesslessless;
4106 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4107 SizeTmp2, Result);
4108 } else {
4109 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4110 Kind = tok::lessless;
4111 }
4112 } else if (Char == '=') {
4113 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4114 if (After == '>') {
4115 if (LangOpts.CPlusPlus20) {
4116 if (!isLexingRawMode())
4117 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4118 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4119 SizeTmp2, Result);
4120 Kind = tok::spaceship;
4121 break;
4122 }
4123 // Suggest adding a space between the '<=' and the '>' to avoid a
4124 // change in semantics if this turns up in C++ <=17 mode.
4125 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4126 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4128 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4129 }
4130 }
4131 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4132 Kind = tok::lessequal;
4133 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4134 if (LangOpts.CPlusPlus11 &&
4135 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4136 // C++0x [lex.pptoken]p3:
4137 // Otherwise, if the next three characters are <:: and the subsequent
4138 // character is neither : nor >, the < is treated as a preprocessor
4139 // token by itself and not as the first character of the alternative
4140 // token <:.
4141 unsigned SizeTmp3;
4142 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4143 if (After != ':' && After != '>') {
4144 Kind = tok::less;
4145 if (!isLexingRawMode())
4146 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4147 break;
4148 }
4149 }
4150
4151 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4152 Kind = tok::l_square;
4153 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4154 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4155 Kind = tok::l_brace;
4156 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4157 lexEditorPlaceholder(Result, CurPtr)) {
4158 return true;
4159 } else {
4160 Kind = tok::less;
4161 }
4162 break;
4163 case '>':
4164 Char = getCharAndSize(CurPtr, SizeTmp);
4165 if (Char == '=') {
4166 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4167 Kind = tok::greaterequal;
4168 } else if (Char == '>') {
4169 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4170 if (After == '=') {
4171 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4172 SizeTmp2, Result);
4173 Kind = tok::greatergreaterequal;
4174 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4175 // If this is actually a '>>>>' conflict marker, recognize it as such
4176 // and recover nicely.
4177 goto LexNextToken;
4178 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4179 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4180 goto LexNextToken;
4181 } else if (LangOpts.CUDA && After == '>') {
4182 Kind = tok::greatergreatergreater;
4183 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4184 SizeTmp2, Result);
4185 } else {
4186 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4187 Kind = tok::greatergreater;
4188 }
4189 } else {
4190 Kind = tok::greater;
4191 }
4192 break;
4193 case '^':
4194 Char = getCharAndSize(CurPtr, SizeTmp);
4195 if (Char == '=') {
4196 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4197 Kind = tok::caretequal;
4198 } else if (LangOpts.OpenCL && Char == '^') {
4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200 Kind = tok::caretcaret;
4201 } else {
4202 Kind = tok::caret;
4203 }
4204 break;
4205 case '|':
4206 Char = getCharAndSize(CurPtr, SizeTmp);
4207 if (Char == '=') {
4208 Kind = tok::pipeequal;
4209 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4210 } else if (Char == '|') {
4211 // If this is '|||||||' and we're in a conflict marker, ignore it.
4212 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4213 goto LexNextToken;
4214 Kind = tok::pipepipe;
4215 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4216 } else {
4217 Kind = tok::pipe;
4218 }
4219 break;
4220 case ':':
4221 Char = getCharAndSize(CurPtr, SizeTmp);
4222 if (LangOpts.Digraphs && Char == '>') {
4223 Kind = tok::r_square; // ':>' -> ']'
4224 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4225 } else if ((LangOpts.CPlusPlus ||
4226 LangOpts.DoubleSquareBracketAttributes) &&
4227 Char == ':') {
4228 Kind = tok::coloncolon;
4229 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4230 } else {
4231 Kind = tok::colon;
4232 }
4233 break;
4234 case ';':
4235 Kind = tok::semi;
4236 break;
4237 case '=':
4238 Char = getCharAndSize(CurPtr, SizeTmp);
4239 if (Char == '=') {
4240 // If this is '====' and we're in a conflict marker, ignore it.
4241 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4242 goto LexNextToken;
4243
4244 Kind = tok::equalequal;
4245 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4246 } else {
4247 Kind = tok::equal;
4248 }
4249 break;
4250 case ',':
4251 Kind = tok::comma;
4252 break;
4253 case '#':
4254 Char = getCharAndSize(CurPtr, SizeTmp);
4255 if (Char == '#') {
4256 Kind = tok::hashhash;
4257 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4258 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4259 Kind = tok::hashat;
4260 if (!isLexingRawMode())
4261 Diag(BufferPtr, diag::ext_charize_microsoft);
4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4263 } else {
4264 // We parsed a # character. If this occurs at the start of the line,
4265 // it's actually the start of a preprocessing directive. Callback to
4266 // the preprocessor to handle it.
4267 // TODO: -fpreprocessed mode??
4268 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4269 goto HandleDirective;
4270
4271 Kind = tok::hash;
4272 }
4273 break;
4274
4275 case '@':
4276 // Objective C support.
4277 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4278 Kind = tok::at;
4279 else
4280 Kind = tok::unknown;
4281 break;
4282
4283 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4284 case '\\':
4285 if (!LangOpts.AsmPreprocessor) {
4286 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4287 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4288 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4289 return true; // KeepWhitespaceMode
4290
4291 // We only saw whitespace, so just try again with this lexer.
4292 // (We manually eliminate the tail call to avoid recursion.)
4293 goto LexNextToken;
4294 }
4295
4296 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4297 }
4298 }
4299
4300 Kind = tok::unknown;
4301 break;
4302
4303 default: {
4304 if (isASCII(Char)) {
4305 Kind = tok::unknown;
4306 break;
4307 }
4308
4309 llvm::UTF32 CodePoint;
4310
4311 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4312 // an escaped newline.
4313 --CurPtr;
4314 llvm::ConversionResult Status =
4315 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4316 (const llvm::UTF8 *)BufferEnd,
4317 &CodePoint,
4318 llvm::strictConversion);
4319 if (Status == llvm::conversionOK) {
4320 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4321 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4322 return true; // KeepWhitespaceMode
4323
4324 // We only saw whitespace, so just try again with this lexer.
4325 // (We manually eliminate the tail call to avoid recursion.)
4326 goto LexNextToken;
4327 }
4328 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4329 }
4330
4333 ++CurPtr;
4334 Kind = tok::unknown;
4335 break;
4336 }
4337
4338 // Non-ASCII characters tend to creep into source code unintentionally.
4339 // Instead of letting the parser complain about the unknown token,
4340 // just diagnose the invalid UTF-8, then drop the character.
4341 Diag(CurPtr, diag::err_invalid_utf8);
4342
4343 BufferPtr = CurPtr+1;
4344 // We're pretending the character didn't exist, so just try again with
4345 // this lexer.
4346 // (We manually eliminate the tail call to avoid recursion.)
4347 goto LexNextToken;
4348 }
4349 }
4350
4351 // Notify MIOpt that we read a non-whitespace/non-comment token.
4352 MIOpt.ReadToken();
4353
4354 // Update the location of token as well as BufferPtr.
4355 FormTokenWithChars(Result, CurPtr, Kind);
4356 return true;
4357
4358HandleDirective:
4359 // We parsed a # character and it's the start of a preprocessing directive.
4360
4361 FormTokenWithChars(Result, CurPtr, tok::hash);
4363
4365 // With a fatal failure in the module loader, we abort parsing.
4366 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
4367 return true;
4368 }
4369
4370 // We parsed the directive; lex a token with the new state.
4371 return false;
4372
4373LexNextToken:
4374 Result.clearFlag(Token::NeedsCleaning);
4375 goto LexStart;
4376}
4377
/// Fill \p Result from a pre-scanned dependency-directive token (DDTok).
///
/// \p Result is reset, then given the kind, flags and length recorded in
/// DDTok, and a source location computed from the token's offset into this
/// lexer's buffer. BufferPtr is left pointing just past the token.
///
/// \returns a pointer to the first character of the token in the buffer.
//
// NOTE(review): the second line of the signature (embedded listing line 4379)
// is missing from this listing; DDTok and Result are presumably declared
// there -- confirm against the original file.
4378const char *Lexer::convertDependencyDirectiveToken(
// The scanned token stores only an offset; turn it into a buffer pointer.
4380 const char *TokPtr = BufferStart + DDTok.Offset;
// Reset Result before copying the scanned token's properties into it.
4381 Result.startToken();
4382 Result.setLocation(getSourceLocation(TokPtr));
4383 Result.setKind(DDTok.Kind);
4384 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4385 Result.setLength(DDTok.Length);
// Move the lexer's position to the first character after this token.
4386 BufferPtr = TokPtr + DDTok.Length;
4387 return TokPtr;
4388}
4389
/// Produce the next token into \p Result by walking the pre-scanned
/// dependency directives (DepDirectives).
///
/// NextDepDirectiveTokenIndex is the index, within the front directive, of the
/// next token to hand out. Only valid when isDependencyDirectivesLexer() holds
/// (asserted).
///
/// NOTE(review): the return value presumably follows the same convention as
/// the main lexing routine (true: \p Result holds a token to return; false: a
/// directive was handled and the caller should lex again) -- confirm.
4390bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4391 assert(isDependencyDirectivesLexer());
4392
4393 using namespace dependency_directives_scan;
4394
// Every token of the front directive has been handed out: move on to the
// next directive. pp_eof ends the file; tokens_present_before_eof is reported
// to MIOpt as a token having been read.
4395 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4396 if (DepDirectives.front().Kind == pp_eof)
4397 return LexEndOfFile(Result, BufferEnd);
4398 if (DepDirectives.front().Kind == tokens_present_before_eof)
4399 MIOpt.ReadToken();
4400 NextDepDirectiveTokenIndex = 0;
4401 DepDirectives = DepDirectives.drop_front();
4402 }
4403
// NOTE(review): the line that begins this statement (embedded listing line
// 4404) is missing from this listing; it presumably declares DDTok, which is
// initialized from the expression below. The index is post-incremented, so it
// already refers to the following token from here on.
4405 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
// A hash that is the first token of its directive is not reported to MIOpt;
// every other token is.
4406 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4407 // Read something other than a preprocessor directive hash.
4408 MIOpt.ReadToken();
4409 }
4410
// When a filename is expected and the scanned token is '<', lex an angled
// string literal directly from the buffer, starting one character past the
// token's offset.
4411 if (ParsingFilename && DDTok.is(tok::less)) {
4412 BufferPtr = BufferStart + DDTok.Offset;
4413 LexAngledStringLiteral(Result, BufferPtr + 1);
// If no header name was formed, return Result as it stands.
4414 if (Result.isNot(tok::header_name))
4415 return true;
4416 // Advance the index of lexed tokens.
// Skip every scanned token that starts before the current BufferPtr
// (presumably the end of the header name just lexed -- confirm that
// LexAngledStringLiteral advances BufferPtr).
4417 while (true) {
4418 const dependency_directives_scan::Token &NextTok =
4419 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4420 if (BufferStart + NextTok.Offset >= BufferPtr)
4421 break;
4422 ++NextDepDirectiveTokenIndex;
4423 }
4424 return true;
4425 }
4426
// Common path: copy the scanned token into Result; TokPtr points at its
// first character in the buffer.
4427 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4428
// NOTE(review): the first statement of this branch (embedded listing line
// 4430) is missing from this listing; presumably it passes the directive to
// the preprocessor before returning false -- confirm.
4429 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4431 return false;
4432 }
// Raw identifiers record a pointer to their spelling in the buffer.
4433 if (Result.is(tok::raw_identifier)) {
4434 Result.setRawIdentifierData(TokPtr);
4435 if (!isLexingRawMode()) {
// NOTE(review): the declaration of II (embedded listing line 4436) is
// missing from this listing.
4437 if (II->isHandleIdentifierCase())
4438 return PP->HandleIdentifier(Result);
4439 }
4440 return true;
4441 }
// Literals record a pointer to their spelling in the buffer.
4442 if (Result.isLiteral()) {
4443 Result.setLiteralData(TokPtr);
4444 return true;
4445 }
4446 if (Result.is(tok::colon) &&
4447 (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) {
4448 // Convert consecutive colons to 'tok::coloncolon'.
// The next character in the buffer is inspected directly; the following
// scanned token is asserted to be a colon as well and is consumed here.
4449 if (*BufferPtr == ':') {
4450 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4451 tok::colon));
4452 ++NextDepDirectiveTokenIndex;
4453 Result.setKind(tok::coloncolon);
4454 }
4455 return true;
4456 }
// NOTE(review): the statement controlled by this `if` (embedded listing line
// 4458) is missing from this listing; as printed here the `if` would govern
// the `return true;` below.
4457 if (Result.is(tok::eod))
4459
4460 return true;
4461}
4462
/// Advance through the pre-scanned dependency directives until reaching
/// pp_eof, or a pp_elif / pp_elifdef / pp_elifndef / pp_else / pp_endif that is
/// not nested inside a pp_if / pp_ifdef / pp_ifndef seen during this call.
///
/// At pp_eof the token index is reset to 0 and the result of LexEndOfFile is
/// returned. Otherwise the leading hash of the directive that stopped the scan
/// is converted into \p Result, NextDepDirectiveTokenIndex is set to 1, and
/// false is returned.
///
/// Only valid when isDependencyDirectivesLexer() holds (asserted).
4463bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4464 assert(isDependencyDirectivesLexer());
4465
4466 using namespace dependency_directives_scan;
4467
4468 bool Stop = false;
// Number of conditional blocks opened during this call and not yet closed.
4469 unsigned NestedIfs = 0;
4470 do {
// The directive at the front is dropped before anything is examined, so the
// directive that is current on entry is never inspected by this loop.
4471 DepDirectives = DepDirectives.drop_front();
4472 switch (DepDirectives.front().Kind) {
4473 case pp_none:
4474 llvm_unreachable("unexpected 'pp_none'");
// Directive kinds that do not affect conditional nesting: nothing to do.
// NOTE(review): several case labels of this group (embedded listing lines
// 4476, 4482-4484, 4489-4491) are missing from this listing.
4475 case pp_include:
4477 case pp_define:
4478 case pp_undef:
4479 case pp_import:
4480 case pp_pragma_import:
4481 case pp_pragma_once:
4485 case pp_include_next:
4486 case decl_at_import:
4487 case cxx_module_decl:
4488 case cxx_import_decl:
4492 break;
// Opening a conditional increases the nesting depth.
4493 case pp_if:
4494 case pp_ifdef:
4495 case pp_ifndef:
4496 ++NestedIfs;
4497 break;
// An else-style directive stops the scan only at nesting depth zero; inside a
// nested conditional it is ignored.
4498 case pp_elif:
4499 case pp_elifdef:
4500 case pp_elifndef:
4501 case pp_else:
4502 if (!NestedIfs) {
4503 Stop = true;
4504 }
4505 break;
// pp_endif either stops the scan (depth zero) or closes one nested
// conditional.
4506 case pp_endif:
4507 if (!NestedIfs) {
4508 Stop = true;
4509 } else {
4510 --NestedIfs;
4511 }
4512 break;
4513 case pp_eof:
4514 NextDepDirectiveTokenIndex = 0;
4515 return LexEndOfFile(Result, BufferEnd);
4516 }
4517 } while (!Stop);
4518
// NOTE(review): the line that begins this statement (embedded listing line
// 4519) is missing from this listing; it presumably declares DDTok, bound to
// the first token of the directive that stopped the scan.
4520 DepDirectives.front().Tokens.front();
4521 assert(DDTok.is(tok::hash));
// The hash is consumed here, so the next token handed out is the one at
// index 1.
4522 NextDepDirectiveTokenIndex = 1;
4523
4524 convertDependencyDirectiveToken(DDTok, Result);
4525 return false;
4526}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:78
Defines the Diagnostic-related interfaces.
unsigned Offset
Definition: Format.cpp:2776
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:884
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1475
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1669
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1197
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:276
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3115
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1594
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:512
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:236
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1125
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1489
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1559
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1456
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1543
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3219
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1462
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:856
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2631
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1178
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1517
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1565
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of the line that contains the given offset,...
Definition: Lexer.cpp:495
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16229
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16021
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1266
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1542
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:911
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:134
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:123
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:97
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:82
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:960
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1294
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:229
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:996
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:2939
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:808
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1168
static char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:580
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:726
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:135
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:830
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:891
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether the newline pointed to by Str is preceded by an escape sequence.
Definition: Lexer.cpp:1074
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1149
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for the line that corresponds to the given location Loc.
Definition: Lexer.cpp:1094
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:403
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1070
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:450
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:560
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:171
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1043
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:194
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:588
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:461
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1261
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:786
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:261
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:128
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the token's IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:181
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:298
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:115
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:126
unsigned getLength() const
Definition: Token.h:129
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:66
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...}".
Definition: Token.h:98
tok::TokenKind getKind() const
Definition: Token.h:93
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:270
@ HasUCN
Definition: Token.h:82
@ IsEditorPlaceholder
Definition: Token.h:87
@ LeadingEmptyMacro
Definition: Token.h:80
@ LeadingSpace
Definition: Token.h:76
@ StartOfLine
Definition: Token.h:74
@ HasUDSuffix
Definition: Token.h:81
@ NeedsCleaning
Definition: Token.h:79
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:120
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:57
void startToken()
Reset all flags to cleared.
Definition: Token.h:171
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:287
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:207
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:219
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:238
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4204
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3023
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3652
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:81
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:42
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:84
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:48
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:76
@ C
Languages that the frontend can parse and compile.
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:160
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:93
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition: CharInfo.h:153
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c, bool AllowDollar=false)
Returns true if this is a body character of a C identifier, which is [a-zA-Z0-9_].
Definition: CharInfo.h:64
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:54
Definition: Format.h:4664
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
__PTRDIFF_TYPE__ ptrdiff_t
A signed integer type that is the result of subtracting two pointers.
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.