clang 18.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MathExtras.h"
36#include "llvm/Support/MemoryBufferRef.h"
37#include "llvm/Support/NativeFormatting.h"
38#include "llvm/Support/Unicode.h"
39#include "llvm/Support/UnicodeCharRanges.h"
40#include <algorithm>
41#include <cassert>
42#include <cstddef>
43#include <cstdint>
44#include <cstring>
45#include <optional>
46#include <string>
47#include <tuple>
48#include <utility>
49
50using namespace clang;
51
52//===----------------------------------------------------------------------===//
53// Token Class Implementation
54//===----------------------------------------------------------------------===//
55
56/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58 if (isAnnotation())
59 return false;
61 return II->getObjCKeywordID() == objcKey;
62 return false;
63}
64
65/// getObjCKeywordID - Return the ObjC keyword kind.
67 if (isAnnotation())
68 return tok::objc_not_keyword;
70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
71}
72
73//===----------------------------------------------------------------------===//
74// Lexer Class Implementation
75//===----------------------------------------------------------------------===//
76
77void Lexer::anchor() {}
78
/// Point the lexer at a character buffer and reset its per-buffer state.
///
/// \param BufStart first byte of the buffer.
/// \param BufPtr   position at which lexing starts.
/// \param BufEnd   end of the buffer; the byte stored there must be NUL
///                 (asserted below).
///
/// When lexing starts at the very beginning of the buffer, a leading UTF-8
/// byte-order mark is skipped. Every lexing flag touched here is reset to its
/// "fresh buffer" value; in particular raw mode is switched off.
79void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80 const char *BufEnd) {
81 BufferStart = BufStart;
82 BufferPtr = BufPtr;
83 BufferEnd = BufEnd;
84
// The terminating NUL is a precondition, not something this function adds.
85 assert(BufEnd[0] == 0 &&
86 "We assume that the input buffer has a null character at the end"
87 " to simplify lexing!");
88
89 // Check whether we have a BOM in the beginning of the buffer. If yes - act
90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91 // skip the UTF-8 BOM if it's present.
// Only done when lexing starts at the buffer start; a lexer positioned
// mid-buffer never looks for a BOM.
92 if (BufferStart == BufferPtr) {
93 // Determine the size of the BOM.
94 StringRef Buf(BufferStart, BufferEnd - BufferStart);
95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97 .Default(0);
98
99 // Skip the BOM.
100 BufferPtr += BOMLength;
101 }
102
// Not a _Pragma lexer, and not inside a version-control conflict marker.
103 Is_PragmaLexer = false;
104 CurrentConflictMarkerState = CMK_None;
105
106 // Start of the file is a start of line.
107 IsAtStartOfLine = true;
108 IsAtPhysicalStartOfLine = true;
109
110 HasLeadingSpace = false;
111 HasLeadingEmptyMacro = false;
112
113 // We are not after parsing a #.
// NOTE(review): no statement follows the comment above, and the embedded
// listing numbers jump from 113 to 115 -- the reset it describes appears to
// have been dropped from this copy of the file. Confirm against upstream.
115
116 // We are not after parsing #include.
117 ParsingFilename = false;
118
119 // We are not in raw mode. Raw mode disables diagnostics and interpretation
120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122 // or otherwise skipping over tokens.
123 LexingRawMode = false;
124
125 // Default to not keeping comments.
126 ExtendedTokenMode = 0;
127
// No newline position is recorded yet.
128 NewLinePtr = nullptr;
129}
130
131/// Lexer constructor - Create a new lexer object for the specified buffer
132/// with the specified preprocessor managing the lexing process. This lexer
133/// assumes that the associated file buffer and Preprocessor objects will
134/// outlive it, so it doesn't take ownership of either of them.
135Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
136 Preprocessor &PP, bool IsFirstIncludeOfFile)
137 : PreprocessorLexer(&PP, FID),
138 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
139 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
140 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
// Lex the whole buffer: the start pointer and the initial lexing position are
// both the buffer start, so InitLexer() will also look for a UTF-8 BOM.
141 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
142 InputFile.getBufferEnd());
143
// NOTE(review): the embedded listing numbers jump from 143 to 145, so one
// statement of this constructor body appears to be missing from this copy of
// the file. Confirm against upstream before relying on this body.
145}
146
147/// Lexer constructor - Create a new raw lexer object. This object is only
148/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
149/// range will outlive it, so it doesn't take ownership of it.
150Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
151 const char *BufStart, const char *BufPtr, const char *BufEnd,
152 bool IsFirstIncludeOfFile)
153 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
154 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
155 InitLexer(BufStart, BufPtr, BufEnd);
156
157 // We *are* in raw mode.
158 LexingRawMode = true;
159}
160
161/// Lexer constructor - Create a new raw lexer object. This object is only
162/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
163/// range will outlive it, so it doesn't take ownership of it.
164Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
165 const SourceManager &SM, const LangOptions &langOpts,
166 bool IsFirstIncludeOfFile)
167 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
168 FromFile.getBufferStart(), FromFile.getBufferEnd(),
169 IsFirstIncludeOfFile) {}
170
172 assert(PP && "Cannot reset token mode without a preprocessor");
173 if (LangOpts.TraditionalCPP)
175 else
177}
178
179/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
180/// _Pragma expansion. This has a variety of magic semantics that this method
181/// sets up. It returns a new'd Lexer that must be delete'd when done.
182///
183/// On entrance to this routine, TokStartLoc is a macro location which has a
184/// spelling loc that indicates the bytes to be lexed for the token and an
185/// expansion location that indicates where all lexed tokens should be
186/// "expanded from".
187///
188/// TODO: It would really be nice to make _Pragma just be a wrapper around a
189/// normal lexer that remaps tokens as they fly by. This would require making
190/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
191/// interface that could handle this stuff. This would pull GetMappedTokenLoc
192/// out of the critical path of the lexer!
193///
195 SourceLocation ExpansionLocStart,
196 SourceLocation ExpansionLocEnd,
197 unsigned TokLen, Preprocessor &PP) {
199
200 // Create the lexer as if we were going to lex the file normally.
201 FileID SpellingFID = SM.getFileID(SpellingLoc);
202 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
203 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
204
205 // Now that the lexer is created, change the start/end locations so that we
206 // just lex the subsection of the file that we want. This is lexing from a
207 // scratch buffer.
208 const char *StrData = SM.getCharacterData(SpellingLoc);
209
210 L->BufferPtr = StrData;
211 L->BufferEnd = StrData+TokLen;
212 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
213
214 // Set the SourceLocation with the remapping information. This ensures that
215 // GetMappedTokenLoc will remap the tokens as they are lexed.
216 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
217 ExpansionLocStart,
218 ExpansionLocEnd, TokLen);
219
220 // Ensure that the lexer thinks it is inside a directive, so that end \n will
221 // return an EOD token.
223
224 // This lexer really is for _Pragma.
225 L->Is_PragmaLexer = true;
226 return L;
227}
228
229void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
230 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
231 this->IsAtStartOfLine = IsAtStartOfLine;
232 assert((BufferStart + Offset) <= BufferEnd);
233 BufferPtr = BufferStart + Offset;
234}
235
/// Escape \p Str in place so that it can be spelled inside a literal that is
/// delimited by \p Quote.
///
/// - Every backslash and every occurrence of \p Quote gets a backslash
///   inserted in front of it.
/// - Every line break becomes the two characters '\\' and 'n'. A "\r\n" or
///   "\n\r" pair counts as one line break; a lone (or repeated identical)
///   '\n' or '\r' counts as one line break each.
///
/// \p T must provide size(), begin(), operator[] and single-element insert().
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;
  const auto IsLineBreak = [](char C) { return C == '\n' || C == '\r'; };

  // The container grows while we walk it, so the bound is re-read each time.
  for (SizeTy Pos = 0; Pos < Str.size();) {
    const char Cur = Str[Pos];

    if (Cur == '\\' || Cur == Quote) {
      // Step over both the inserted backslash and the escaped character.
      Str.insert(Str.begin() + Pos, '\\');
      Pos += 2;
      continue;
    }

    if (!IsLineBreak(Cur)) {
      ++Pos;
      continue;
    }

    const bool IsTwoCharBreak = Pos + 1 < Str.size() &&
                                IsLineBreak(Str[Pos + 1]) &&
                                Str[Pos + 1] != Cur;
    Str[Pos] = '\\';
    if (IsTwoCharBreak) {
      // "\r\n" / "\n\r": overwrite the pair, the length stays the same.
      Str[Pos + 1] = 'n';
    } else {
      // Single line-break character: make room for the 'n'.
      Str.insert(Str.begin() + Pos + 1, 'n');
    }
    Pos += 2;
  }
}
260
261std::string Lexer::Stringify(StringRef Str, bool Charify) {
262 std::string Result = std::string(Str);
263 char Quote = Charify ? '\'' : '"';
264 StringifyImpl(Result, Quote);
265 return Result;
266}
267
269
270//===----------------------------------------------------------------------===//
271// Token Spelling
272//===----------------------------------------------------------------------===//
273
274/// Slow case of getSpelling. Extract the characters comprising the
275/// spelling of this token from the provided input buffer.
276static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
277 const LangOptions &LangOpts, char *Spelling) {
278 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
279
280 size_t Length = 0;
281 const char *BufEnd = BufPtr + Tok.getLength();
282
283 if (tok::isStringLiteral(Tok.getKind())) {
284 // Munch the encoding-prefix and opening double-quote.
285 while (BufPtr < BufEnd) {
286 unsigned Size;
287 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
288 BufPtr += Size;
289
290 if (Spelling[Length - 1] == '"')
291 break;
292 }
293
294 // Raw string literals need special handling; trigraph expansion and line
295 // splicing do not occur within their d-char-sequence nor within their
296 // r-char-sequence.
297 if (Length >= 2 &&
298 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
299 // Search backwards from the end of the token to find the matching closing
300 // quote.
301 const char *RawEnd = BufEnd;
302 do --RawEnd; while (*RawEnd != '"');
303 size_t RawLength = RawEnd - BufPtr + 1;
304
305 // Everything between the quotes is included verbatim in the spelling.
306 memcpy(Spelling + Length, BufPtr, RawLength);
307 Length += RawLength;
308 BufPtr += RawLength;
309
310 // The rest of the token is lexed normally.
311 }
312 }
313
314 while (BufPtr < BufEnd) {
315 unsigned Size;
316 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
317 BufPtr += Size;
318 }
319
320 assert(Length < Tok.getLength() &&
321 "NeedsCleaning flag set on token that didn't need cleaning!");
322 return Length;
323}
324
325/// getSpelling() - Return the 'spelling' of this token. The spelling of a
326/// token are the characters used to represent the token in the source file
327/// after trigraph expansion and escaped-newline folding. In particular, this
328/// wants to get the true, uncanonicalized, spelling of things like digraphs,
329/// UCNs, etc.
331 SmallVectorImpl<char> &buffer,
332 const SourceManager &SM,
333 const LangOptions &options,
334 bool *invalid) {
335 // Break down the source location.
336 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
337
338 // Try to load the file buffer.
339 bool invalidTemp = false;
340 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
341 if (invalidTemp) {
342 if (invalid) *invalid = true;
343 return {};
344 }
345
346 const char *tokenBegin = file.data() + locInfo.second;
347
348 // Lex from the start of the given location.
349 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
350 file.begin(), tokenBegin, file.end());
351 Token token;
352 lexer.LexFromRawLexer(token);
353
354 unsigned length = token.getLength();
355
356 // Common case: no need for cleaning.
357 if (!token.needsCleaning())
358 return StringRef(tokenBegin, length);
359
360 // Hard case, we need to relex the characters into the string.
361 buffer.resize(length);
362 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
363 return StringRef(buffer.data(), buffer.size());
364}
365
366/// getSpelling() - Return the 'spelling' of this token. The spelling of a
367/// token are the characters used to represent the token in the source file
368/// after trigraph expansion and escaped-newline folding. In particular, this
369/// wants to get the true, uncanonicalized, spelling of things like digraphs
370/// UCNs, etc.
371std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
372 const LangOptions &LangOpts, bool *Invalid) {
373 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
374
375 bool CharDataInvalid = false;
376 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
377 &CharDataInvalid);
378 if (Invalid)
379 *Invalid = CharDataInvalid;
380 if (CharDataInvalid)
381 return {};
382
383 // If this token contains nothing interesting, return it directly.
384 if (!Tok.needsCleaning())
385 return std::string(TokStart, TokStart + Tok.getLength());
386
387 std::string Result;
388 Result.resize(Tok.getLength());
389 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
390 return Result;
391}
392
393/// getSpelling - This method is used to get the spelling of a token into a
394/// preallocated buffer, instead of as an std::string. The caller is required
395/// to allocate enough space for the token, which is guaranteed to be at least
396/// Tok.getLength() bytes long. The actual length of the token is returned.
397///
398/// Note that this method may do two possible things: it may either fill in
399/// the buffer specified with characters, or it may *change the input pointer*
400/// to point to a constant buffer with the data already in it (avoiding a
401/// copy). The caller is not allowed to modify the returned buffer pointer
402/// if an internal buffer is returned.
403unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
404 const SourceManager &SourceMgr,
405 const LangOptions &LangOpts, bool *Invalid) {
406 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
407
408 const char *TokStart = nullptr;
409 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
410 if (Tok.is(tok::raw_identifier))
411 TokStart = Tok.getRawIdentifier().data();
412 else if (!Tok.hasUCN()) {
413 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
414 // Just return the string from the identifier table, which is very quick.
415 Buffer = II->getNameStart();
416 return II->getLength();
417 }
418 }
419
420 // NOTE: this can be checked even after testing for an IdentifierInfo.
421 if (Tok.isLiteral())
422 TokStart = Tok.getLiteralData();
423
424 if (!TokStart) {
425 // Compute the start of the token in the input lexer buffer.
426 bool CharDataInvalid = false;
427 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
428 if (Invalid)
429 *Invalid = CharDataInvalid;
430 if (CharDataInvalid) {
431 Buffer = "";
432 return 0;
433 }
434 }
435
436 // If this token contains nothing interesting, return it directly.
437 if (!Tok.needsCleaning()) {
438 Buffer = TokStart;
439 return Tok.getLength();
440 }
441
442 // Otherwise, hard case, relex the characters into the string.
443 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
444}
445
446/// MeasureTokenLength - Relex the token at the specified location and return
447/// its length in bytes in the input file. If the token needs cleaning (e.g.
448/// includes a trigraph or an escaped newline) then this count includes bytes
449/// that are part of that.
451 const SourceManager &SM,
452 const LangOptions &LangOpts) {
453 Token TheTok;
454 if (getRawToken(Loc, TheTok, SM, LangOpts))
455 return 0;
456 return TheTok.getLength();
457}
458
459/// Relex the token at the specified location.
460/// \returns true if there was a failure, false on success.
462 const SourceManager &SM,
463 const LangOptions &LangOpts,
464 bool IgnoreWhiteSpace) {
465 // TODO: this could be special cased for common tokens like identifiers, ')',
466 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
467 // all obviously single-char tokens. This could use
468 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
469 // something.
470
471 // If this comes from a macro expansion, we really do want the macro name, not
472 // the token this macro expanded to.
473 Loc = SM.getExpansionLoc(Loc);
474 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
475 bool Invalid = false;
476 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
477 if (Invalid)
478 return true;
479
480 const char *StrData = Buffer.data()+LocInfo.second;
481
482 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
483 return true;
484
485 // Create a lexer starting at the beginning of this token.
486 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
487 Buffer.begin(), StrData, Buffer.end());
488 TheLexer.SetCommentRetentionState(true);
489 TheLexer.LexFromRawLexer(Result);
490 return false;
491}
492
493/// Returns the pointer that points to the beginning of line that contains
494/// the given offset, or null if the offset if invalid.
495static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
496 const char *BufStart = Buffer.data();
497 if (Offset >= Buffer.size())
498 return nullptr;
499
500 const char *LexStart = BufStart + Offset;
501 for (; LexStart != BufStart; --LexStart) {
502 if (isVerticalWhitespace(LexStart[0]) &&
503 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
504 // LexStart should point at first character of logical line.
505 ++LexStart;
506 break;
507 }
508 }
509 return LexStart;
510}
511
513 const SourceManager &SM,
514 const LangOptions &LangOpts) {
515 assert(Loc.isFileID());
516 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
517 if (LocInfo.first.isInvalid())
518 return Loc;
519
520 bool Invalid = false;
521 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
522 if (Invalid)
523 return Loc;
524
525 // Back up from the current location until we hit the beginning of a line
526 // (or the buffer). We'll relex from that point.
527 const char *StrData = Buffer.data() + LocInfo.second;
528 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
529 if (!LexStart || LexStart == StrData)
530 return Loc;
531
532 // Create a lexer starting at the beginning of this token.
533 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
534 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
535 Buffer.end());
536 TheLexer.SetCommentRetentionState(true);
537
538 // Lex tokens until we find the token that contains the source location.
539 Token TheTok;
540 do {
541 TheLexer.LexFromRawLexer(TheTok);
542
543 if (TheLexer.getBufferLocation() > StrData) {
544 // Lexing this token has taken the lexer past the source location we're
545 // looking for. If the current token encompasses our source location,
546 // return the beginning of that token.
547 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
548 return TheTok.getLocation();
549
550 // We ended up skipping over the source location entirely, which means
551 // that it points into whitespace. We're done here.
552 break;
553 }
554 } while (TheTok.getKind() != tok::eof);
555
556 // We've passed our source location; just return the original source location.
557 return Loc;
558}
559
561 const SourceManager &SM,
562 const LangOptions &LangOpts) {
563 if (Loc.isFileID())
564 return getBeginningOfFileToken(Loc, SM, LangOpts);
565
566 if (!SM.isMacroArgExpansion(Loc))
567 return Loc;
568
569 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
570 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
571 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
572 std::pair<FileID, unsigned> BeginFileLocInfo =
573 SM.getDecomposedLoc(BeginFileLoc);
574 assert(FileLocInfo.first == BeginFileLocInfo.first &&
575 FileLocInfo.second >= BeginFileLocInfo.second);
576 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
577}
578
namespace {

/// How the preamble scan treats a preprocessor directive, based on the raw
/// identifier that follows the '#'.
enum PreambleDirectiveKind {
  /// A recognized directive: the scan skips over it and keeps going.
  PDK_Skipped,

  /// Not a recognized directive: the scan stops at the '#'.
  PDK_Unknown
};

} // namespace
587
589 const LangOptions &LangOpts,
590 unsigned MaxLines) {
591 // Create a lexer starting at the beginning of the file. Note that we use a
592 // "fake" file source location at offset 1 so that the lexer will track our
593 // position within the file.
594 const SourceLocation::UIntTy StartOffset = 1;
596 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
597 Buffer.end());
598 TheLexer.SetCommentRetentionState(true);
599
600 bool InPreprocessorDirective = false;
601 Token TheTok;
602 SourceLocation ActiveCommentLoc;
603
604 unsigned MaxLineOffset = 0;
605 if (MaxLines) {
606 const char *CurPtr = Buffer.begin();
607 unsigned CurLine = 0;
608 while (CurPtr != Buffer.end()) {
609 char ch = *CurPtr++;
610 if (ch == '\n') {
611 ++CurLine;
612 if (CurLine == MaxLines)
613 break;
614 }
615 }
616 if (CurPtr != Buffer.end())
617 MaxLineOffset = CurPtr - Buffer.begin();
618 }
619
620 do {
621 TheLexer.LexFromRawLexer(TheTok);
622
623 if (InPreprocessorDirective) {
624 // If we've hit the end of the file, we're done.
625 if (TheTok.getKind() == tok::eof) {
626 break;
627 }
628
629 // If we haven't hit the end of the preprocessor directive, skip this
630 // token.
631 if (!TheTok.isAtStartOfLine())
632 continue;
633
634 // We've passed the end of the preprocessor directive, and will look
635 // at this token again below.
636 InPreprocessorDirective = false;
637 }
638
639 // Keep track of the # of lines in the preamble.
640 if (TheTok.isAtStartOfLine()) {
641 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
642
643 // If we were asked to limit the number of lines in the preamble,
644 // and we're about to exceed that limit, we're done.
645 if (MaxLineOffset && TokOffset >= MaxLineOffset)
646 break;
647 }
648
649 // Comments are okay; skip over them.
650 if (TheTok.getKind() == tok::comment) {
651 if (ActiveCommentLoc.isInvalid())
652 ActiveCommentLoc = TheTok.getLocation();
653 continue;
654 }
655
656 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
657 // This is the start of a preprocessor directive.
658 Token HashTok = TheTok;
659 InPreprocessorDirective = true;
660 ActiveCommentLoc = SourceLocation();
661
662 // Figure out which directive this is. Since we're lexing raw tokens,
663 // we don't have an identifier table available. Instead, just look at
664 // the raw identifier to recognize and categorize preprocessor directives.
665 TheLexer.LexFromRawLexer(TheTok);
666 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
667 StringRef Keyword = TheTok.getRawIdentifier();
668 PreambleDirectiveKind PDK
669 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
670 .Case("include", PDK_Skipped)
671 .Case("__include_macros", PDK_Skipped)
672 .Case("define", PDK_Skipped)
673 .Case("undef", PDK_Skipped)
674 .Case("line", PDK_Skipped)
675 .Case("error", PDK_Skipped)
676 .Case("pragma", PDK_Skipped)
677 .Case("import", PDK_Skipped)
678 .Case("include_next", PDK_Skipped)
679 .Case("warning", PDK_Skipped)
680 .Case("ident", PDK_Skipped)
681 .Case("sccs", PDK_Skipped)
682 .Case("assert", PDK_Skipped)
683 .Case("unassert", PDK_Skipped)
684 .Case("if", PDK_Skipped)
685 .Case("ifdef", PDK_Skipped)
686 .Case("ifndef", PDK_Skipped)
687 .Case("elif", PDK_Skipped)
688 .Case("elifdef", PDK_Skipped)
689 .Case("elifndef", PDK_Skipped)
690 .Case("else", PDK_Skipped)
691 .Case("endif", PDK_Skipped)
692 .Default(PDK_Unknown);
693
694 switch (PDK) {
695 case PDK_Skipped:
696 continue;
697
698 case PDK_Unknown:
699 // We don't know what this directive is; stop at the '#'.
700 break;
701 }
702 }
703
704 // We only end up here if we didn't recognize the preprocessor
705 // directive or it was one that can't occur in the preamble at this
706 // point. Roll back the current token to the location of the '#'.
707 TheTok = HashTok;
708 } else if (TheTok.isAtStartOfLine() &&
709 TheTok.getKind() == tok::raw_identifier &&
710 TheTok.getRawIdentifier() == "module" &&
711 LangOpts.CPlusPlusModules) {
712 // The initial global module fragment introducer "module;" is part of
713 // the preamble, which runs up to the module declaration "module foo;".
714 Token ModuleTok = TheTok;
715 do {
716 TheLexer.LexFromRawLexer(TheTok);
717 } while (TheTok.getKind() == tok::comment);
718 if (TheTok.getKind() != tok::semi) {
719 // Not global module fragment, roll back.
720 TheTok = ModuleTok;
721 break;
722 }
723 continue;
724 }
725
726 // We hit a token that we don't recognize as being in the
727 // "preprocessing only" part of the file, so we're no longer in
728 // the preamble.
729 break;
730 } while (true);
731
732 SourceLocation End;
733 if (ActiveCommentLoc.isValid())
734 End = ActiveCommentLoc; // don't truncate a decl comment.
735 else
736 End = TheTok.getLocation();
737
738 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
739 TheTok.isAtStartOfLine());
740}
741
742unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
743 const SourceManager &SM,
744 const LangOptions &LangOpts) {
745 // Figure out how many physical characters away the specified expansion
746 // character is. This needs to take into consideration newlines and
747 // trigraphs.
748 bool Invalid = false;
749 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
750
751 // If they request the first char of the token, we're trivially done.
752 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
753 return 0;
754
755 unsigned PhysOffset = 0;
756
757 // The usual case is that tokens don't contain anything interesting. Skip
758 // over the uninteresting characters. If a token only consists of simple
759 // chars, this method is extremely fast.
760 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
761 if (CharNo == 0)
762 return PhysOffset;
763 ++TokPtr;
764 --CharNo;
765 ++PhysOffset;
766 }
767
768 // If we have a character that may be a trigraph or escaped newline, use a
769 // lexer to parse it correctly.
770 for (; CharNo; --CharNo) {
771 unsigned Size;
772 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
773 TokPtr += Size;
774 PhysOffset += Size;
775 }
776
777 // Final detail: if we end up on an escaped newline, we want to return the
778 // location of the actual byte of the token. For example foo<newline>bar
779 // advanced by 3 should return the location of b, not of \\. One compounding
780 // detail of this is that the escape may be made by a trigraph.
781 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
782 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
783
784 return PhysOffset;
785}
786
787/// Computes the source location just past the end of the
788/// token at this source location.
789///
790/// This routine can be used to produce a source location that
791/// points just past the end of the token referenced by \p Loc, and
792/// is generally used when a diagnostic needs to point just after a
793/// token where it expected something different than it received. If
794/// the returned source location would not be meaningful (e.g., if
795/// it points into a macro), this routine returns an invalid
796/// source location.
797///
798/// \param Offset an offset from the end of the token, where the source
799/// location should refer to. The default offset (0) produces a source
800/// location pointing just past the end of the token; an offset of 1 produces
801/// a source location pointing to the last character in the token, etc.
803 const SourceManager &SM,
804 const LangOptions &LangOpts) {
805 if (Loc.isInvalid())
806 return {};
807
808 if (Loc.isMacroID()) {
809 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
810 return {}; // Points inside the macro expansion.
811 }
812
813 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
814 if (Len > Offset)
815 Len = Len - Offset;
816 else
817 return Loc;
818
819 return Loc.getLocWithOffset(Len);
820}
821
822/// Returns true if the given MacroID location points at the first
823/// token of the macro expansion.
825 const SourceManager &SM,
826 const LangOptions &LangOpts,
827 SourceLocation *MacroBegin) {
828 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
829
830 SourceLocation expansionLoc;
831 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
832 return false;
833
834 if (expansionLoc.isFileID()) {
835 // No other macro expansions, this is the first.
836 if (MacroBegin)
837 *MacroBegin = expansionLoc;
838 return true;
839 }
840
841 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
842}
843
844/// Returns true if the given MacroID location points at the last
845/// token of the macro expansion.
847 const SourceManager &SM,
848 const LangOptions &LangOpts,
849 SourceLocation *MacroEnd) {
850 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
851
852 SourceLocation spellLoc = SM.getSpellingLoc(loc);
853 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
854 if (tokLen == 0)
855 return false;
856
857 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
858 SourceLocation expansionLoc;
859 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
860 return false;
861
862 if (expansionLoc.isFileID()) {
863 // No other macro expansions.
864 if (MacroEnd)
865 *MacroEnd = expansionLoc;
866 return true;
867 }
868
869 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
870}
871
873 const SourceManager &SM,
874 const LangOptions &LangOpts) {
875 SourceLocation Begin = Range.getBegin();
876 SourceLocation End = Range.getEnd();
877 assert(Begin.isFileID() && End.isFileID());
878 if (Range.isTokenRange()) {
879 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
880 if (End.isInvalid())
881 return {};
882 }
883
884 // Break down the source locations.
885 FileID FID;
886 unsigned BeginOffs;
887 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
888 if (FID.isInvalid())
889 return {};
890
891 unsigned EndOffs;
892 if (!SM.isInFileID(End, FID, &EndOffs) ||
893 BeginOffs > EndOffs)
894 return {};
895
897}
898
899// Assumes that `Loc` is in an expansion.
901 const SourceManager &SM) {
902 return SM.getSLocEntry(SM.getFileID(Loc))
903 .getExpansion()
904 .isExpansionTokenRange();
905}
906
908 const SourceManager &SM,
909 const LangOptions &LangOpts) {
910 SourceLocation Begin = Range.getBegin();
911 SourceLocation End = Range.getEnd();
912 if (Begin.isInvalid() || End.isInvalid())
913 return {};
914
915 if (Begin.isFileID() && End.isFileID())
916 return makeRangeFromFileLocs(Range, SM, LangOpts);
917
918 if (Begin.isMacroID() && End.isFileID()) {
919 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
920 return {};
921 Range.setBegin(Begin);
922 return makeRangeFromFileLocs(Range, SM, LangOpts);
923 }
924
925 if (Begin.isFileID() && End.isMacroID()) {
926 if (Range.isTokenRange()) {
927 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
928 return {};
929 // Use the *original* end, not the expanded one in `End`.
930 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
931 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
932 return {};
933 Range.setEnd(End);
934 return makeRangeFromFileLocs(Range, SM, LangOpts);
935 }
936
937 assert(Begin.isMacroID() && End.isMacroID());
938 SourceLocation MacroBegin, MacroEnd;
939 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
940 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
941 &MacroEnd)) ||
942 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
943 &MacroEnd)))) {
944 Range.setBegin(MacroBegin);
945 Range.setEnd(MacroEnd);
946 // Use the *original* `End`, not the expanded one in `MacroEnd`.
947 if (Range.isTokenRange())
948 Range.setTokenRange(isInExpansionTokenRange(End, SM));
949 return makeRangeFromFileLocs(Range, SM, LangOpts);
950 }
951
952 bool Invalid = false;
953 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
954 &Invalid);
955 if (Invalid)
956 return {};
957
958 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
959 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
960 &Invalid);
961 if (Invalid)
962 return {};
963
964 if (EndEntry.getExpansion().isMacroArgExpansion() &&
965 BeginEntry.getExpansion().getExpansionLocStart() ==
966 EndEntry.getExpansion().getExpansionLocStart()) {
967 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
968 Range.setEnd(SM.getImmediateSpellingLoc(End));
969 return makeFileCharRange(Range, SM, LangOpts);
970 }
971 }
972
973 return {};
974}
975
977 const SourceManager &SM,
978 const LangOptions &LangOpts,
979 bool *Invalid) {
980 Range = makeFileCharRange(Range, SM, LangOpts);
981 if (Range.isInvalid()) {
982 if (Invalid) *Invalid = true;
983 return {};
984 }
985
986 // Break down the source location.
987 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
988 if (beginInfo.first.isInvalid()) {
989 if (Invalid) *Invalid = true;
990 return {};
991 }
992
993 unsigned EndOffs;
994 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
995 beginInfo.second > EndOffs) {
996 if (Invalid) *Invalid = true;
997 return {};
998 }
999
1000 // Try to load the file buffer.
1001 bool invalidTemp = false;
1002 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1003 if (invalidTemp) {
1004 if (Invalid) *Invalid = true;
1005 return {};
1006 }
1007
1008 if (Invalid) *Invalid = false;
1009 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1010}
1011
1013 const SourceManager &SM,
1014 const LangOptions &LangOpts) {
1015 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1016
1017 // Find the location of the immediate macro expansion.
1018 while (true) {
1019 FileID FID = SM.getFileID(Loc);
1020 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1021 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1022 Loc = Expansion.getExpansionLocStart();
1023 if (!Expansion.isMacroArgExpansion())
1024 break;
1025
1026 // For macro arguments we need to check that the argument did not come
1027 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1028
1029 // Loc points to the argument id of the macro definition, move to the
1030 // macro expansion.
1031 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1032 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1033 if (SpellLoc.isFileID())
1034 break; // No inner macro.
1035
1036 // If spelling location resides in the same FileID as macro expansion
1037 // location, it means there is no inner macro.
1038 FileID MacroFID = SM.getFileID(Loc);
1039 if (SM.isInFileID(SpellLoc, MacroFID))
1040 break;
1041
1042 // Argument came from inner macro.
1043 Loc = SpellLoc;
1044 }
1045
1046 // Find the spelling location of the start of the non-argument expansion
1047 // range. This is where the macro name was spelled in order to begin
1048 // expanding this macro.
1049 Loc = SM.getSpellingLoc(Loc);
1050
1051 // Dig out the buffer where the macro name was spelled and the extents of the
1052 // name so that we can render it into the expansion note.
1053 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1054 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1055 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1056 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1057}
1058
1060 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1061 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1062 // Walk past macro argument expansions.
1063 while (SM.isMacroArgExpansion(Loc))
1064 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1065
1066 // If the macro's spelling location isn't a file location, or is written in
1067 // scratch space, then it's actually a token paste or stringization (or
1068 // similar) and not a macro at all.
1069 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1070 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1071 return {};
1072
1073 // Find the spelling location of the start of the non-argument expansion
1074 // range. This is where the macro name was spelled in order to begin
1075 // expanding this macro.
1076 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1077
1078 // Dig out the buffer where the macro name was spelled and the extents of the
1079 // name so that we can render it into the expansion note.
1080 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1081 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1082 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1083 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1084}
1085
1087 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1088}
1089
1090bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1091 assert(isVerticalWhitespace(Str[0]));
1092 if (Str - 1 < BufferStart)
1093 return false;
1094
1095 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1096 (Str[0] == '\r' && Str[-1] == '\n')) {
1097 if (Str - 2 < BufferStart)
1098 return false;
1099 --Str;
1100 }
1101 --Str;
1102
1103 // Rewind to first non-space character:
1104 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1105 --Str;
1106
1107 return *Str == '\\';
1108}
1109
1111 const SourceManager &SM) {
1112 if (Loc.isInvalid() || Loc.isMacroID())
1113 return {};
1114 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1115 if (LocInfo.first.isInvalid())
1116 return {};
1117 bool Invalid = false;
1118 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1119 if (Invalid)
1120 return {};
1121 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1122 if (!Line)
1123 return {};
1124 StringRef Rest = Buffer.substr(Line - Buffer.data());
1125 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1126 return NumWhitespaceChars == StringRef::npos
1127 ? ""
1128 : Rest.take_front(NumWhitespaceChars);
1129}
1130
1131//===----------------------------------------------------------------------===//
1132// Diagnostics forwarding code.
1133//===----------------------------------------------------------------------===//
1134
1135/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1136/// lexer buffer was all expanded at a single point, perform the mapping.
1137/// This is currently only used for _Pragma implementation, so it is the slow
1138/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1139static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1140 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1142 SourceLocation FileLoc,
1143 unsigned CharNo, unsigned TokLen) {
1144 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1145
1146 // Otherwise, we're lexing "mapped tokens". This is used for things like
1147 // _Pragma handling. Combine the expansion location of FileLoc with the
1148 // spelling location.
1150
1151 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1152 // characters come from spelling(FileLoc)+Offset.
1153 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1154 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1155
1156 // Figure out the expansion loc range, which is the range covered by the
1157 // original _Pragma(...) sequence.
1158 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1159
1160 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1161}
1162
1163/// getSourceLocation - Return a source location identifier for the specified
1164/// offset in the current file.
1166 unsigned TokLen) const {
1167 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1168 "Location out of range for this buffer!");
1169
1170 // In the normal case, we're just lexing from a simple file buffer, return
1171 // the file id from FileLoc with the offset specified.
1172 unsigned CharNo = Loc-BufferStart;
1173 if (FileLoc.isFileID())
1174 return FileLoc.getLocWithOffset(CharNo);
1175
1176 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1177 // tokens are lexed from where the _Pragma was defined.
1178 assert(PP && "This doesn't work on raw lexers");
1179 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1180}
1181
1182/// Diag - Forwarding function for diagnostics. This translate a source
1183/// position in the current buffer into a SourceLocation object for rendering.
1184DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1185 return PP->Diag(getSourceLocation(Loc), DiagID);
1186}
1187
1188//===----------------------------------------------------------------------===//
1189// Trigraph and Escaped Newline Handling Code.
1190//===----------------------------------------------------------------------===//
1191
/// GetTrigraphCharForLetter - Map the character that follows a "??" pair to
/// the character the complete trigraph stands for.  Returns '\0' when
/// \p Letter does not complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  struct TrigraphMapping {
    char Suffix;      // Character following the "??" pair.
    char Replacement; // Character the whole trigraph decodes to.
  };
  static constexpr TrigraphMapping Mappings[] = {
      {'=', '#'},  {')', ']'}, {'(', '['}, {'!', '|'}, {'\'', '^'},
      {'>', '}'},  {'/', '\\'}, {'<', '{'}, {'-', '~'},
  };

  for (const TrigraphMapping &Entry : Mappings)
    if (Entry.Suffix == Letter)
      return Entry.Replacement;

  // Not the third character of any trigraph.
  return 0;
}
1208
1209/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1210/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1211/// return the result character. Finally, emit a warning about trigraph use
1212/// whether trigraphs are enabled or not.
1213static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1214 char Res = GetTrigraphCharForLetter(*CP);
1215 if (!Res)
1216 return Res;
1217
1218 if (!Trigraphs) {
1219 if (L && !L->isLexingRawMode())
1220 L->Diag(CP-2, diag::trigraph_ignored);
1221 return 0;
1222 }
1223
1224 if (L && !L->isLexingRawMode())
1225 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1226 return Res;
1227}
1228
1229/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1230/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1231/// trigraph equivalent on entry to this function.
1232unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1233 unsigned Size = 0;
1234 while (isWhitespace(Ptr[Size])) {
1235 ++Size;
1236
1237 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1238 continue;
1239
1240 // If this is a \r\n or \n\r, skip the other half.
1241 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1242 Ptr[Size-1] != Ptr[Size])
1243 ++Size;
1244
1245 return Size;
1246 }
1247
1248 // Not an escaped newline, must be a \t or something else.
1249 return 0;
1250}
1251
1252/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1253/// them), skip over them and return the first non-escaped-newline found,
1254/// otherwise return P.
1255const char *Lexer::SkipEscapedNewLines(const char *P) {
1256 while (true) {
1257 const char *AfterEscape;
1258 if (*P == '\\') {
1259 AfterEscape = P+1;
1260 } else if (*P == '?') {
1261 // If not a trigraph for escape, bail out.
1262 if (P[1] != '?' || P[2] != '/')
1263 return P;
1264 // FIXME: Take LangOpts into account; the language might not
1265 // support trigraphs.
1266 AfterEscape = P+3;
1267 } else {
1268 return P;
1269 }
1270
1271 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1272 if (NewLineSize == 0) return P;
1273 P = AfterEscape+NewLineSize;
1274 }
1275}
1276
1277std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1278 const SourceManager &SM,
1279 const LangOptions &LangOpts) {
1280 if (Loc.isMacroID()) {
1281 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1282 return std::nullopt;
1283 }
1284 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1285
1286 // Break down the source location.
1287 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1288
1289 // Try to load the file buffer.
1290 bool InvalidTemp = false;
1291 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1292 if (InvalidTemp)
1293 return std::nullopt;
1294
1295 const char *TokenBegin = File.data() + LocInfo.second;
1296
1297 // Lex from the start of the given location.
1298 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1299 TokenBegin, File.end());
1300 // Find the token.
1301 Token Tok;
1302 lexer.LexFromRawLexer(Tok);
1303 return Tok;
1304}
1305
1306/// Checks that the given token is the first token that occurs after the
1307/// given location (this excludes comments and whitespace). Returns the location
1308/// immediately after the specified token. If the token is not found or the
1309/// location is inside a macro, the returned source location will be invalid.
1311 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1312 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1313 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1314 if (!Tok || Tok->isNot(TKind))
1315 return {};
1316 SourceLocation TokenLoc = Tok->getLocation();
1317
1318 // Calculate how much whitespace needs to be skipped if any.
1319 unsigned NumWhitespaceChars = 0;
1320 if (SkipTrailingWhitespaceAndNewLine) {
1321 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1322 unsigned char C = *TokenEnd;
1323 while (isHorizontalWhitespace(C)) {
1324 C = *(++TokenEnd);
1325 NumWhitespaceChars++;
1326 }
1327
1328 // Skip \r, \n, \r\n, or \n\r
1329 if (C == '\n' || C == '\r') {
1330 char PrevC = C;
1331 C = *(++TokenEnd);
1332 NumWhitespaceChars++;
1333 if ((C == '\n' || C == '\r') && C != PrevC)
1334 NumWhitespaceChars++;
1335 }
1336 }
1337
1338 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1339}
1340
1341/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1342/// get its size, and return it. This is tricky in several cases:
1343/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1344/// then either return the trigraph (skipping 3 chars) or the '?',
1345/// depending on whether trigraphs are enabled or not.
1346/// 2. If this is an escaped newline (potentially with whitespace between
1347/// the backslash and newline), implicitly skip the newline and return
1348/// the char after it.
1349///
1350/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1351/// know that we can accumulate into Size, and that we have already incremented
1352/// Ptr by Size bytes.
1353///
1354/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1355/// be updated to match.
1356char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1357 Token *Tok) {
1358 // If we have a slash, look for an escaped newline.
1359 if (Ptr[0] == '\\') {
1360 ++Size;
1361 ++Ptr;
1362Slash:
1363 // Common case, backslash-char where the char is not whitespace.
1364 if (!isWhitespace(Ptr[0])) return '\\';
1365
1366 // See if we have optional whitespace characters between the slash and
1367 // newline.
1368 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1369 // Remember that this token needs to be cleaned.
1370 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1371
1372 // Warn if there was whitespace between the backslash and newline.
1373 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1374 Diag(Ptr, diag::backslash_newline_space);
1375
1376 // Found backslash<whitespace><newline>. Parse the char after it.
1377 Size += EscapedNewLineSize;
1378 Ptr += EscapedNewLineSize;
1379
1380 // Use slow version to accumulate a correct size field.
1381 return getCharAndSizeSlow(Ptr, Size, Tok);
1382 }
1383
1384 // Otherwise, this is not an escaped newline, just return the slash.
1385 return '\\';
1386 }
1387
1388 // If this is a trigraph, process it.
1389 if (Ptr[0] == '?' && Ptr[1] == '?') {
1390 // If this is actually a legal trigraph (not something like "??x"), emit
1391 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1392 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1393 LangOpts.Trigraphs)) {
1394 // Remember that this token needs to be cleaned.
1395 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1396
1397 Ptr += 3;
1398 Size += 3;
1399 if (C == '\\') goto Slash;
1400 return C;
1401 }
1402 }
1403
1404 // If this is neither, return a single character.
1405 ++Size;
1406 return *Ptr;
1407}
1408
1409/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1410/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1411/// and that we have already incremented Ptr by Size bytes.
1412///
1413/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1414/// be updated to match.
1415char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1416 const LangOptions &LangOpts) {
1417 // If we have a slash, look for an escaped newline.
1418 if (Ptr[0] == '\\') {
1419 ++Size;
1420 ++Ptr;
1421Slash:
1422 // Common case, backslash-char where the char is not whitespace.
1423 if (!isWhitespace(Ptr[0])) return '\\';
1424
1425 // See if we have optional whitespace characters followed by a newline.
1426 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1427 // Found backslash<whitespace><newline>. Parse the char after it.
1428 Size += EscapedNewLineSize;
1429 Ptr += EscapedNewLineSize;
1430
1431 // Use slow version to accumulate a correct size field.
1432 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1433 }
1434
1435 // Otherwise, this is not an escaped newline, just return the slash.
1436 return '\\';
1437 }
1438
1439 // If this is a trigraph, process it.
1440 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1441 // If this is actually a legal trigraph (not something like "??x"), return
1442 // it.
1443 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1444 Ptr += 3;
1445 Size += 3;
1446 if (C == '\\') goto Slash;
1447 return C;
1448 }
1449 }
1450
1451 // If this is neither, return a single character.
1452 ++Size;
1453 return *Ptr;
1454}
1455
1456//===----------------------------------------------------------------------===//
1457// Helper methods for lexing.
1458//===----------------------------------------------------------------------===//
1459
1460/// Routine that indiscriminately sets the offset into the source file.
1461void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1462 BufferPtr = BufferStart + Offset;
1463 if (BufferPtr > BufferEnd)
1464 BufferPtr = BufferEnd;
1465 // FIXME: What exactly does the StartOfLine bit mean? There are two
1466 // possible meanings for the "start" of the line: the first token on the
1467 // unexpanded line, or the first token on the expanded line.
1468 IsAtStartOfLine = StartOfLine;
1469 IsAtPhysicalStartOfLine = StartOfLine;
1470}
1471
1472static bool isUnicodeWhitespace(uint32_t Codepoint) {
1473 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1475 return UnicodeWhitespaceChars.contains(Codepoint);
1476}
1477
1479 llvm::SmallString<5> CharBuf;
1480 llvm::raw_svector_ostream CharOS(CharBuf);
1481 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1482 return CharBuf;
1483}
1484
1485// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1486// we allow "Mathematical Notation Characters" in identifiers.
1487// This is a proposed profile that extends the XID_Start/XID_Continue
1488// with mathematical symbols, superscript and subscript digits
1489// found in some production software.
1490// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1491static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1492 bool IsStart, bool &IsExtension) {
1493 static const llvm::sys::UnicodeCharSet MathStartChars(
1495 static const llvm::sys::UnicodeCharSet MathContinueChars(
1497 if (MathStartChars.contains(C) ||
1498 (!IsStart && MathContinueChars.contains(C))) {
1499 IsExtension = true;
1500 return true;
1501 }
1502 return false;
1503}
1504
1505static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1506 bool &IsExtension) {
1507 if (LangOpts.AsmPreprocessor) {
1508 return false;
1509 } else if (LangOpts.DollarIdents && '$' == C) {
1510 return true;
1511 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1512 // A non-leading codepoint must have the XID_Continue property.
1513 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1514 // so we need to check both tables.
1515 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1516 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1517 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1518 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1519 return true;
1520 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1521 IsExtension);
1522 } else if (LangOpts.C11) {
1523 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1525 return C11AllowedIDChars.contains(C);
1526 } else {
1527 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1529 return C99AllowedIDChars.contains(C);
1530 }
1531}
1532
1533static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1534 bool &IsExtension) {
1535 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1536 IsExtension = false;
1537 if (LangOpts.AsmPreprocessor) {
1538 return false;
1539 }
1540 if (LangOpts.CPlusPlus || LangOpts.C23) {
1541 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1542 if (XIDStartChars.contains(C))
1543 return true;
1544 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1545 IsExtension);
1546 }
1547 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1548 return false;
1549 if (LangOpts.C11) {
1550 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1552 return !C11DisallowedInitialIDChars.contains(C);
1553 }
1554 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1556 return !C99DisallowedInitialIDChars.contains(C);
1557}
1558
1560 CharSourceRange Range) {
1561
1562 static const llvm::sys::UnicodeCharSet MathStartChars(
1564 static const llvm::sys::UnicodeCharSet MathContinueChars(
1566
1567 (void)MathStartChars;
1568 (void)MathContinueChars;
1569 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1570 "Unexpected mathematical notation codepoint");
1571 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1572 << codepointAsHexString(C) << Range;
1573}
1574
1575static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1576 const char *End) {
1578 L.getSourceLocation(End));
1579}
1580
1581static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1582 CharSourceRange Range, bool IsFirst) {
1583 // Check C99 compatibility.
1584 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1585 enum {
1586 CannotAppearInIdentifier = 0,
1587 CannotStartIdentifier
1588 };
1589
1590 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1592 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1594 if (!C99AllowedIDChars.contains(C)) {
1595 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1596 << Range
1597 << CannotAppearInIdentifier;
1598 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1599 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1600 << Range
1601 << CannotStartIdentifier;
1602 }
1603 }
1604}
1605
1606/// After encountering UTF-8 character C and interpreting it as an identifier
1607/// character, check whether it's a homoglyph for a common non-identifier
1608/// source character that is unlikely to be an intentional identifier
1609/// character and warn if so.
1611 CharSourceRange Range) {
1612 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1613 struct HomoglyphPair {
1614 uint32_t Character;
1615 char LooksLike;
1616 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1617 };
1618 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1619 {U'\u00ad', 0}, // SOFT HYPHEN
1620 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1621 {U'\u037e', ';'}, // GREEK QUESTION MARK
1622 {U'\u200b', 0}, // ZERO WIDTH SPACE
1623 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1624 {U'\u200d', 0}, // ZERO WIDTH JOINER
1625 {U'\u2060', 0}, // WORD JOINER
1626 {U'\u2061', 0}, // FUNCTION APPLICATION
1627 {U'\u2062', 0}, // INVISIBLE TIMES
1628 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1629 {U'\u2064', 0}, // INVISIBLE PLUS
1630 {U'\u2212', '-'}, // MINUS SIGN
1631 {U'\u2215', '/'}, // DIVISION SLASH
1632 {U'\u2216', '\\'}, // SET MINUS
1633 {U'\u2217', '*'}, // ASTERISK OPERATOR
1634 {U'\u2223', '|'}, // DIVIDES
1635 {U'\u2227', '^'}, // LOGICAL AND
1636 {U'\u2236', ':'}, // RATIO
1637 {U'\u223c', '~'}, // TILDE OPERATOR
1638 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1639 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1640 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1641 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1642 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1643 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1644 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1645 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1646 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1647 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1648 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
1649 {U'\uff0c', ','}, // FULLWIDTH COMMA
1650 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1651 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1652 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1653 {U'\uff1a', ':'}, // FULLWIDTH COLON
1654 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1655 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1656 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1657 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1658 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1659 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1660 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1661 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1662 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1663 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1664 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1665 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1666 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1667 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1668 {0, 0}
1669 };
1670 auto Homoglyph =
1671 std::lower_bound(std::begin(SortedHomoglyphs),
1672 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1673 if (Homoglyph->Character == C) {
1674 if (Homoglyph->LooksLike) {
1675 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1676 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1677 << Range << codepointAsHexString(C) << LooksLikeStr;
1678 } else {
1679 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1680 << Range << codepointAsHexString(C);
1681 }
1682 }
1683}
1684
1686 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1687 CharSourceRange Range, bool IsFirst) {
1688 if (isASCII(CodePoint))
1689 return;
1690
1691 bool IsExtension;
1692 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1693 bool IsIDContinue =
1694 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1695
1696 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1697 return;
1698
1699 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1700
1701 if (!IsFirst || InvalidOnlyAtStart) {
1702 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1703 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1704 << FixItHint::CreateRemoval(Range);
1705 } else {
1706 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1707 << Range << codepointAsHexString(CodePoint)
1708 << FixItHint::CreateRemoval(Range);
1709 }
1710}
1711
1712bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1713 Token &Result) {
1714 const char *UCNPtr = CurPtr + Size;
1715 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1716 if (CodePoint == 0) {
1717 return false;
1718 }
1719 bool IsExtension = false;
1720 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1721 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1722 return false;
1726 PP->getDiagnostics(), LangOpts, CodePoint,
1727 makeCharRange(*this, CurPtr, UCNPtr),
1728 /*IsFirst=*/false);
1729
1730 // We got a unicode codepoint that is neither a space nor
1731 // a valid identifier part.
1732 // Carry on as if the codepoint was valid for recovery purposes.
1733 } else if (!isLexingRawMode()) {
1734 if (IsExtension)
1736 makeCharRange(*this, CurPtr, UCNPtr));
1737
1739 makeCharRange(*this, CurPtr, UCNPtr),
1740 /*IsFirst=*/false);
1741 }
1742
1743 Result.setFlag(Token::HasUCN);
1744 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1745 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1746 CurPtr = UCNPtr;
1747 else
1748 while (CurPtr != UCNPtr)
1749 (void)getAndAdvanceChar(CurPtr, Result);
1750 return true;
1751}
1752
1753bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1754 llvm::UTF32 CodePoint;
1755
1756 // If a UTF-8 codepoint appears immediately after an escaped new line,
1757 // CurPtr may point to the splicing \ on the preceding line,
1758 // so we need to skip it.
1759 unsigned FirstCodeUnitSize;
1760 getCharAndSize(CurPtr, FirstCodeUnitSize);
1761 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1762 const char *UnicodePtr = CharStart;
1763
1764 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1765 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1766 &CodePoint, llvm::strictConversion);
1767 if (ConvResult != llvm::conversionOK)
1768 return false;
1769
1770 bool IsExtension = false;
1771 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1772 IsExtension)) {
1773 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1774 return false;
1775
1779 PP->getDiagnostics(), LangOpts, CodePoint,
1780 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1781 // We got a unicode codepoint that is neither a space nor
1782 // a valid identifier part. Carry on as if the codepoint was
1783 // valid for recovery purposes.
1784 } else if (!isLexingRawMode()) {
1785 if (IsExtension)
1787 PP->getDiagnostics(), CodePoint,
1788 makeCharRange(*this, CharStart, UnicodePtr));
1790 makeCharRange(*this, CharStart, UnicodePtr),
1791 /*IsFirst=*/false);
1793 makeCharRange(*this, CharStart, UnicodePtr));
1794 }
1795
1796 // Once we successfully parsed some UTF-8,
1797 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1798 // being lexed, and that warnings about trailing spaces are emitted.
1799 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1800 CurPtr = UnicodePtr;
1801 return true;
1802}
1803
/// Lex a token whose first character is the code point C, which occupies the
/// buffer range [BufferPtr, CurPtr).
///
/// \param Result Token to fill in.
/// \param C Code point that starts the token.
/// \param CurPtr Position just past the spelling of C.
/// \returns the result of LexIdentifierContinue when C may start an
///          identifier; false after diagnosing and skipping the character
///          (BufferPtr is moved to CurPtr and no token is formed); true after
///          forming a tok::unknown token otherwise.
1804bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1805 const char *CurPtr) {
1806 bool IsExtension = false;
1807 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
   // NOTE(review): the embedded listing numbering skips 1808-1809, 1811, 1813
   // and 1816 in this branch; the enclosing condition and the callee names of
   // the diagnostic calls below are not visible in this copy.
1810 if (IsExtension)
1812 makeCharRange(*this, BufferPtr, CurPtr));
1814 makeCharRange(*this, BufferPtr, CurPtr),
1815 /*IsFirst=*/true);
1817 makeCharRange(*this, BufferPtr, CurPtr));
1818 }
1819
1820 MIOpt.ReadToken();
1821 return LexIdentifierContinue(Result, CurPtr);
1822 }
1823
   // NOTE(review): the embedded listing numbering skips 1824, 1826 and 1836
   // below; the start and end of this condition and the name of the
   // diagnostic helper being called are not visible in this copy.
1825 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1827 // Non-ASCII characters tend to creep into source code unintentionally.
1828 // Instead of letting the parser complain about the unknown token,
1829 // just drop the character.
1830 // Note that we can /only/ do this when the non-ASCII character is actually
1831 // spelled as Unicode, not written as a UCN. The standard requires that
1832 // we not throw away any possible preprocessor tokens, but there's a
1833 // loophole in the mapping of Unicode characters to basic character set
1834 // characters that allows us to map these particular characters to, say,
1835 // whitespace.
1837 PP->getDiagnostics(), LangOpts, C,
1838 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
   // Drop the character: advance past it without forming a token.
1839 BufferPtr = CurPtr;
1840 return false;
1841 }
1842
1843 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1844 // up by accident.
1845 MIOpt.ReadToken();
1846 FormTokenWithChars(Result, CurPtr, tok::unknown);
1847 return true;
1848}
1849
/// Lex the remainder of an identifier whose first character has already been
/// matched, and form the resulting token in Result.
///
/// The token is first formed as tok::raw_identifier. In raw mode that token is
/// returned as is. Otherwise, if a code-completion point directly follows the
/// identifier, the token kind becomes tok::code_completion and the completion
/// character plus any following ASCII identifier characters are skipped.
/// Finally the identifier is passed to PP->HandleIdentifier when
/// II->isHandleIdentifierCase() is true.
///
/// \param Result Token to fill in.
/// \param CurPtr Position just past the identifier characters lexed so far.
1850bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1851 // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1852 while (true) {
1853 unsigned char C = *CurPtr;
1854 // Fast path.
   // NOTE(review): the embedded listing numbering skips 1855 here; the
   // condition guarding this fast-path block is not visible in this copy.
1856 ++CurPtr;
1857 continue;
1858 }
1859
1860 unsigned Size;
1861 // Slow path: handle trigraph, unicode codepoints, UCNs.
1862 C = getCharAndSize(CurPtr, Size);
   // NOTE(review): the embedded listing numbering skips 1863 here; the
   // condition guarding the block below is not visible in this copy.
1864 CurPtr = ConsumeChar(CurPtr, Size, Result);
1865 continue;
1866 }
1867 if (C == '$') {
1868 // If we hit a $ and they are not supported in identifiers, we are done.
1869 if (!LangOpts.DollarIdents)
1870 break;
1871 // Otherwise, emit a diagnostic and continue.
1872 if (!isLexingRawMode())
1873 Diag(CurPtr, diag::ext_dollar_in_identifier);
1874 CurPtr = ConsumeChar(CurPtr, Size, Result);
1875 continue;
1876 }
   // Both helpers take CurPtr by reference in this call and are relied on to
   // have advanced it when they return true.
1877 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1878 continue;
1879 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1880 continue;
1881 // Neither an expected Unicode codepoint nor a UCN.
1882 break;
1883 }
1884
   // Remember the token start first: it is read from BufferPtr before
   // FormTokenWithChars is called.
1885 const char *IdStart = BufferPtr;
1886 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1887 Result.setRawIdentifierData(IdStart);
1888
1889 // If we are in raw mode, return this identifier raw. There is no need to
1890 // look up identifier information or attempt to macro expand it.
1891 if (LexingRawMode)
1892 return true;
1893
1894 // Fill in Result.IdentifierInfo and update the token kind,
1895 // looking up the identifier in the identifier table.
   // NOTE(review): the embedded listing numbering skips 1896 here; the
   // declaration of II (used at the end of this function) is not visible in
   // this copy.
1897 // Note that we have to call PP->LookUpIdentifierInfo() even for code
1898 // completion, it writes IdentifierInfo into Result, and callers rely on it.
1899
1900 // If the completion point is at the end of an identifier, we want to treat
1901 // the identifier as incomplete even if it resolves to a macro or a keyword.
1902 // This allows e.g. 'class^' to complete to 'classifier'.
1903 if (isCodeCompletionPoint(CurPtr)) {
1904 // Return the code-completion token.
1905 Result.setKind(tok::code_completion);
1906 // Skip the code-completion char and all immediate identifier characters.
1907 // This ensures we get consistent behavior when completing at any point in
1908 // an identifier (i.e. at the start, in the middle, at the end). Note that
1909 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1910 // simpler.
1911 assert(*CurPtr == 0 && "Completion character must be 0");
1912 ++CurPtr;
1913 // Note that code completion token is not added as a separate character
1914 // when the completion point is at the end of the buffer. Therefore, we need
1915 // to check if the buffer has ended.
1916 if (CurPtr < BufferEnd) {
1917 while (isAsciiIdentifierContinue(*CurPtr))
1918 ++CurPtr;
1919 }
1920 BufferPtr = CurPtr;
1921 return true;
1922 }
1923
1924 // Finally, now that we know we have an identifier, pass this off to the
1925 // preprocessor, which may macro expand it or something.
1926 if (II->isHandleIdentifierCase())
1927 return PP->HandleIdentifier(Result);
1928
1929 return true;
1930}
1931
1932/// isHexaLiteral - Return true if Start points to a hex constant.
1933/// in microsoft mode (where this is supposed to be several different tokens).
1934bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1935 unsigned Size;
1936 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1937 if (C1 != '0')
1938 return false;
1939 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1940 return (C2 == 'x' || C2 == 'X');
1941}
1942
1943/// LexNumericConstant - Lex the remainder of a integer or floating point
1944/// constant. From[-1] is the first character lexed. Return the end of the
1945/// constant.
1946bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1947 unsigned Size;
1948 char C = getCharAndSize(CurPtr, Size);
1949 char PrevCh = 0;
1950 while (isPreprocessingNumberBody(C)) {
1951 CurPtr = ConsumeChar(CurPtr, Size, Result);
1952 PrevCh = C;
1953 C = getCharAndSize(CurPtr, Size);
1954 }
1955
1956 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
1957 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1958 // If we are in Microsoft mode, don't continue if the constant is hex.
1959 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1960 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
1961 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1962 }
1963
1964 // If we have a hex FP constant, continue.
1965 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
1966 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1967 // not-quite-conforming extension. Only do so if this looks like it's
1968 // actually meant to be a hexfloat, and not if it has a ud-suffix.
1969 bool IsHexFloat = true;
1970 if (!LangOpts.C99) {
1971 if (!isHexaLiteral(BufferPtr, LangOpts))
1972 IsHexFloat = false;
1973 else if (!LangOpts.CPlusPlus17 &&
1974 std::find(BufferPtr, CurPtr, '_') != CurPtr)
1975 IsHexFloat = false;
1976 }
1977 if (IsHexFloat)
1978 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1979 }
1980
1981 // If we have a digit separator, continue.
1982 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
1983 unsigned NextSize;
1984 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
1985 if (isAsciiIdentifierContinue(Next)) {
1986 if (!isLexingRawMode())
1987 Diag(CurPtr, LangOpts.CPlusPlus
1988 ? diag::warn_cxx11_compat_digit_separator
1989 : diag::warn_c23_compat_digit_separator);
1990 CurPtr = ConsumeChar(CurPtr, Size, Result);
1991 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1992 return LexNumericConstant(Result, CurPtr);
1993 }
1994 }
1995
1996 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1997 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1998 return LexNumericConstant(Result, CurPtr);
1999 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2000 return LexNumericConstant(Result, CurPtr);
2001
2002 // Update the location of token as well as BufferPtr.
2003 const char *TokStart = BufferPtr;
2004 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2005 Result.setLiteralData(TokStart);
2006 return true;
2007}
2008
2009/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2010/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result Token being lexed; Token::HasUDSuffix is set on it when a
///        suffix is accepted.
/// \param CurPtr Position just past the closing quote of the literal.
/// \param IsStringLiteral True when the literal is a string literal; enables
///        the C++14 look-ahead for standard suffixes below.
/// \returns the position after the suffix, or the incoming CurPtr when no
///          suffix is lexed.
2011const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2012 bool IsStringLiteral) {
2013 assert(LangOpts.CPlusPlus);
2014
2015 // Maximally munch an identifier.
2016 unsigned Size;
2017 char C = getCharAndSize(CurPtr, Size);
   // Set when the first suffix character was a UCN or UTF-8 character that
   // the helpers below already consumed.
2018 bool Consumed = false;
2019
2020 if (!isAsciiIdentifierStart(C)) {
2021 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2022 Consumed = true;
2023 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2024 Consumed = true;
2025 else
2026 return CurPtr;
2027 }
2028
2029 if (!LangOpts.CPlusPlus11) {
2030 if (!isLexingRawMode())
2031 Diag(CurPtr,
2032 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2033 : diag::warn_cxx11_compat_reserved_user_defined_literal)
   // NOTE(review): the embedded listing numbering skips 2034 here; the end of
   // the Diag statement above is not visible in this copy.
2035 return CurPtr;
2036 }
2037
2038 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2039 // that does not start with an underscore is ill-formed. As a conforming
2040 // extension, we treat all such suffixes as if they had whitespace before
2041 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2042 // likely to be a ud-suffix than a macro, however, and accept that.
2043 if (!Consumed) {
2044 bool IsUDSuffix = false;
2045 if (C == '_')
2046 IsUDSuffix = true;
2047 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2048 // In C++1y, we need to look ahead a few characters to see if this is a
2049 // valid suffix for a string literal or a numeric literal (this could be
2050 // the 'operator""if' defining a numeric literal operator).
2051 const unsigned MaxStandardSuffixLength = 3;
2052 char Buffer[MaxStandardSuffixLength] = { C };
   // Note: this local 'Consumed' is a byte offset from CurPtr and shadows the
   // outer bool of the same name for the rest of this block.
2053 unsigned Consumed = Size;
2054 unsigned Chars = 1;
2055 while (true) {
2056 unsigned NextSize;
2057 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
2058 if (!isAsciiIdentifierContinue(Next)) {
2059 // End of suffix. Check whether this is on the allowed list.
2060 const StringRef CompleteSuffix(Buffer, Chars);
2061 IsUDSuffix =
2062 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2063 break;
2064 }
2065
2066 if (Chars == MaxStandardSuffixLength)
2067 // Too long: can't be a standard suffix.
2068 break;
2069
2070 Buffer[Chars++] = Next;
2071 Consumed += NextSize;
2072 }
2073 }
2074
2075 if (!IsUDSuffix) {
2076 if (!isLexingRawMode())
2077 Diag(CurPtr, LangOpts.MSVCCompat
2078 ? diag::ext_ms_reserved_user_defined_literal
2079 : diag::ext_reserved_user_defined_literal)
   // NOTE(review): the embedded listing numbering skips 2080 here; the end of
   // the Diag statement above is not visible in this copy.
2081 return CurPtr;
2082 }
2083
   // Accept the first (ASCII) suffix character.
2084 CurPtr = ConsumeChar(CurPtr, Size, Result);
2085 }
2086
2087 Result.setFlag(Token::HasUDSuffix);
   // Consume the rest of the suffix.
2088 while (true) {
2089 C = getCharAndSize(CurPtr, Size);
   // NOTE(review): the embedded listing numbering skips 2090 here; the first
   // condition of this if/else chain is not visible in this copy.
2091 CurPtr = ConsumeChar(CurPtr, Size, Result);
2092 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2093 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2094 } else
2095 break;
2096 }
2097
2098 return CurPtr;
2099}
2100
2101/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2102/// either " or L" or u8" or u" or U".
///
/// \param Result Token to fill in.
/// \param CurPtr Position just past the opening quote.
/// \param Kind Token kind to form when the literal is properly terminated.
/// \returns true on every path visible here. An unterminated literal and a
///          code-completion point inside the literal both yield a
///          tok::unknown token.
2103bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2104 tok::TokenKind Kind) {
2105 const char *AfterQuote = CurPtr;
2106 // Does this string contain the \0 character?
2107 const char *NulCharacter = nullptr;
2108
2109 if (!isLexingRawMode() &&
2110 (Kind == tok::utf8_string_literal ||
2111 Kind == tok::utf16_string_literal ||
2112 Kind == tok::utf32_string_literal))
2113 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2114 : diag::warn_c99_compat_unicode_literal);
2115
2116 char C = getAndAdvanceChar(CurPtr, Result);
2117 while (C != '"') {
2118 // Skip escaped characters. Escaped newlines will already be processed by
2119 // getAndAdvanceChar.
2120 if (C == '\\')
2121 C = getAndAdvanceChar(CurPtr, Result);
2122
2123 if (C == '\n' || C == '\r' || // Newline.
2124 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2125 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2126 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
   // The token stops before the newline / end-of-file character.
2127 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2128 return true;
2129 }
2130
   // A nul that is not at BufferEnd is either the code-completion marker or
   // an embedded nul character.
2131 if (C == 0) {
2132 if (isCodeCompletionPoint(CurPtr-1)) {
2133 if (ParsingFilename)
2134 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2135 else
   // NOTE(review): the embedded listing numbering skips 2136 here; the
   // statement belonging to this else is not visible in this copy.
2137 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2138 cutOffLexing();
2139 return true;
2140 }
2141
2142 NulCharacter = CurPtr-1;
2143 }
2144 C = getAndAdvanceChar(CurPtr, Result);
2145 }
2146
2147 // If we are in C++11, lex the optional ud-suffix.
2148 if (LangOpts.CPlusPlus)
2149 CurPtr = LexUDSuffix(Result, CurPtr, true);
2150
2151 // If a nul character existed in the string, warn about it.
2152 if (NulCharacter && !isLexingRawMode())
2153 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2154
2155 // Update the location of the token as well as the BufferPtr instance var.
2156 const char *TokStart = BufferPtr;
2157 FormTokenWithChars(Result, CurPtr, Kind);
2158 Result.setLiteralData(TokStart);
2159 return true;
2160}
2161
2162/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2163/// having lexed R", LR", u8R", uR", or UR".
2164bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2165 tok::TokenKind Kind) {
2166 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2167 // Between the initial and final double quote characters of the raw string,
2168 // any transformations performed in phases 1 and 2 (trigraphs,
2169 // universal-character-names, and line splicing) are reverted.
2170
2171 if (!isLexingRawMode())
2172 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2173
2174 unsigned PrefixLen = 0;
2175
2176 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2177 ++PrefixLen;
2178
2179 // If the last character was not a '(', then we didn't lex a valid delimiter.
2180 if (CurPtr[PrefixLen] != '(') {
2181 if (!isLexingRawMode()) {
2182 const char *PrefixEnd = &CurPtr[PrefixLen];
2183 if (PrefixLen == 16) {
2184 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2185 } else {
2186 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2187 << StringRef(PrefixEnd, 1);
2188 }
2189 }
2190
2191 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2192 // it's possible the '"' was intended to be part of the raw string, but
2193 // there's not much we can do about that.
2194 while (true) {
2195 char C = *CurPtr++;
2196
2197 if (C == '"')
2198 break;
2199 if (C == 0 && CurPtr-1 == BufferEnd) {
2200 --CurPtr;
2201 break;
2202 }
2203 }
2204
2205 FormTokenWithChars(Result, CurPtr, tok::unknown);
2206 return true;
2207 }
2208
2209 // Save prefix and move CurPtr past it
2210 const char *Prefix = CurPtr;
2211 CurPtr += PrefixLen + 1; // skip over prefix and '('
2212
2213 while (true) {
2214 char C = *CurPtr++;
2215
2216 if (C == ')') {
2217 // Check for prefix match and closing quote.
2218 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2219 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2220 break;
2221 }
2222 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2223 if (!isLexingRawMode())
2224 Diag(BufferPtr, diag::err_unterminated_raw_string)
2225 << StringRef(Prefix, PrefixLen);
2226 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2227 return true;
2228 }
2229 }
2230
2231 // If we are in C++11, lex the optional ud-suffix.
2232 if (LangOpts.CPlusPlus)
2233 CurPtr = LexUDSuffix(Result, CurPtr, true);
2234
2235 // Update the location of token as well as BufferPtr.
2236 const char *TokStart = BufferPtr;
2237 FormTokenWithChars(Result, CurPtr, Kind);
2238 Result.setLiteralData(TokStart);
2239 return true;
2240}
2241
2242/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2243/// after having lexed the '<' character. This is used for #include filenames.
2244bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2245 // Does this string contain the \0 character?
2246 const char *NulCharacter = nullptr;
2247 const char *AfterLessPos = CurPtr;
2248 char C = getAndAdvanceChar(CurPtr, Result);
2249 while (C != '>') {
2250 // Skip escaped characters. Escaped newlines will already be processed by
2251 // getAndAdvanceChar.
2252 if (C == '\\')
2253 C = getAndAdvanceChar(CurPtr, Result);
2254
2255 if (isVerticalWhitespace(C) || // Newline.
2256 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2257 // If the filename is unterminated, then it must just be a lone <
2258 // character. Return this as such.
2259 FormTokenWithChars(Result, AfterLessPos, tok::less);
2260 return true;
2261 }
2262
2263 if (C == 0) {
2264 if (isCodeCompletionPoint(CurPtr - 1)) {
2265 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2266 cutOffLexing();
2267 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2268 return true;
2269 }
2270 NulCharacter = CurPtr-1;
2271 }
2272 C = getAndAdvanceChar(CurPtr, Result);
2273 }
2274
2275 // If a nul character existed in the string, warn about it.
2276 if (NulCharacter && !isLexingRawMode())
2277 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2278
2279 // Update the location of token as well as BufferPtr.
2280 const char *TokStart = BufferPtr;
2281 FormTokenWithChars(Result, CurPtr, tok::header_name);
2282 Result.setLiteralData(TokStart);
2283 return true;
2284}
2285
/// Handle a code-completion point found inside an #include filename.
///
/// Splits the text typed so far into a directory part (before the last slash)
/// and a partial filename, extends CompletionPoint forward over the characters
/// that completion should replace, and then calls
/// PP->CodeCompleteIncludedFile with the directory.
///
/// \param PathStart First character of the path, just after the opening
///        delimiter.
/// \param CompletionPoint Position of the code-completion marker.
/// \param IsAngled True for the <...> form, false for the "..." form; selects
///        which closing delimiter ends the replaced range.
2286void Lexer::codeCompleteIncludedFile(const char *PathStart,
2287 const char *CompletionPoint,
2288 bool IsAngled) {
2289 // Completion only applies to the filename, after the last slash.
2290 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
   // A backslash also counts as a path separator when MSVCCompat is set.
2291 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2292 auto Slash = PartialPath.find_last_of(SlashChars);
2293 StringRef Dir =
2294 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2295 const char *StartOfFilename =
2296 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2297 // Code completion filter range is the filename only, up to completion point.
   // NOTE(review): the embedded listing numbering skips 2298 here; the start
   // of the statement that ends on the next line is not visible in this copy.
2299 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2300 // We should replace the characters up to the closing quote or closest slash,
2301 // if any.
2302 while (CompletionPoint < BufferEnd) {
2303 char Next = *(CompletionPoint + 1);
2304 if (Next == 0 || Next == '\r' || Next == '\n')
2305 break;
   // The closing delimiter or slash is included in the range: the pointer
   // is advanced before the checks below.
2306 ++CompletionPoint;
2307 if (Next == (IsAngled ? '>' : '"'))
2308 break;
2309 if (SlashChars.contains(Next))
2310 break;
2311 }
2312
   // NOTE(review): the embedded listing numbering skips 2313 here; the callee
   // that receives the two locations below is not visible in this copy.
2314 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2315 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2316 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2317}
2318
2319/// LexCharConstant - Lex the remainder of a character constant, after having
2320/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result Token to fill in.
/// \param CurPtr Position just past the opening quote.
/// \param Kind Token kind to form when the constant is properly terminated.
/// \returns true on every path visible here. An empty constant, an
///          unterminated constant and a code-completion point inside the
///          constant all yield a tok::unknown token.
2321bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2322 tok::TokenKind Kind) {
2323 // Does this character contain the \0 character?
2324 const char *NulCharacter = nullptr;
2325
2326 if (!isLexingRawMode()) {
2327 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2328 Diag(BufferPtr, LangOpts.CPlusPlus
2329 ? diag::warn_cxx98_compat_unicode_literal
2330 : diag::warn_c99_compat_unicode_literal);
2331 else if (Kind == tok::utf8_char_constant)
2332 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2333 }
2334
2335 char C = getAndAdvanceChar(CurPtr, Result);
   // A quote immediately after the opening quote is an empty constant ('').
2336 if (C == '\'') {
2337 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2338 Diag(BufferPtr, diag::ext_empty_character);
2339 FormTokenWithChars(Result, CurPtr, tok::unknown);
2340 return true;
2341 }
2342
2343 while (C != '\'') {
2344 // Skip escaped characters.
2345 if (C == '\\')
2346 C = getAndAdvanceChar(CurPtr, Result);
2347
2348 if (C == '\n' || C == '\r' || // Newline.
2349 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2350 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2351 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
   // The token stops before the newline / end-of-file character.
2352 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2353 return true;
2354 }
2355
   // A nul that is not at BufferEnd is either the code-completion marker or
   // an embedded nul character.
2356 if (C == 0) {
2357 if (isCodeCompletionPoint(CurPtr-1)) {
   // NOTE(review): the embedded listing numbering skips 2358 here; a
   // statement appears to be missing from this copy.
2359 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2360 cutOffLexing();
2361 return true;
2362 }
2363
2364 NulCharacter = CurPtr-1;
2365 }
2366 C = getAndAdvanceChar(CurPtr, Result);
2367 }
2368
2369 // If we are in C++11, lex the optional ud-suffix.
2370 if (LangOpts.CPlusPlus)
2371 CurPtr = LexUDSuffix(Result, CurPtr, false);
2372
2373 // If a nul character existed in the character, warn about it.
2374 if (NulCharacter && !isLexingRawMode())
2375 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2376
2377 // Update the location of token as well as BufferPtr.
2378 const char *TokStart = BufferPtr;
2379 FormTokenWithChars(Result, CurPtr, Kind);
2380 Result.setLiteralData(TokStart);
2381 return true;
2382}
2383
2384/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2385/// Update BufferPtr to point to the next non-whitespace character and return.
2386///
2387/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// \param Result Token that receives the LeadingSpace / StartOfLine flags, or
///        the whitespace token itself in keep-whitespace mode.
/// \param CurPtr Position just past the first whitespace character; CurPtr[-1]
///        is inspected to see whether that character was a newline.
/// \param TokAtPhysicalStartOfLine Set to true when a newline was skipped and
///        no whitespace token is returned.
2388bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2389 bool &TokAtPhysicalStartOfLine) {
2390 // Whitespace - Skip it, then return the token after the whitespace.
2391 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2392
2393 unsigned char Char = *CurPtr;
2394
   // lastNewLine tracks the most recent newline recorded by this call. The
   // member NewLinePtr is only assigned here when it is still null.
2395 const char *lastNewLine = nullptr;
2396 auto setLastNewLine = [&](const char *Ptr) {
2397 lastNewLine = Ptr;
2398 if (!NewLinePtr)
2399 NewLinePtr = Ptr;
2400 };
2401 if (SawNewline)
2402 setLastNewLine(CurPtr - 1);
2403
2404 // Skip consecutive spaces efficiently.
2405 while (true) {
2406 // Skip horizontal whitespace very aggressively.
2407 while (isHorizontalWhitespace(Char))
2408 Char = *++CurPtr;
2409
2410 // Otherwise if we have something other than whitespace, we're done.
2411 if (!isVerticalWhitespace(Char))
2412 break;
2413
   // NOTE(review): the embedded listing numbering skips 2414 here; the
   // condition guarding the block below is not visible in this copy.
2415 // End of preprocessor directive line, let LexTokenInternal handle this.
2416 BufferPtr = CurPtr;
2417 return false;
2418 }
2419
2420 // OK, but handle newline.
   // Only '\n' is recorded as a newline position; a '\r' still sets
   // SawNewline below.
2421 if (*CurPtr == '\n')
2422 setLastNewLine(CurPtr);
2423 SawNewline = true;
2424 Char = *++CurPtr;
2425 }
2426
2427 // If the client wants us to return whitespace, return it now.
2428 if (isKeepWhitespaceMode()) {
2429 FormTokenWithChars(Result, CurPtr, tok::unknown);
2430 if (SawNewline) {
2431 IsAtStartOfLine = true;
2432 IsAtPhysicalStartOfLine = true;
2433 }
2434 // FIXME: The next token will not have LeadingSpace set.
2435 return true;
2436 }
2437
2438 // If this isn't immediately after a newline, there is leading space.
2439 char PrevChar = CurPtr[-1];
2440 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2441
2442 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2443 if (SawNewline) {
2444 Result.setFlag(Token::StartOfLine);
2445 TokAtPhysicalStartOfLine = true;
2446
   // Report the skipped range to the empty-line handler when the first
   // recorded newline and the last one seen here are different positions.
2447 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2448 if (auto *Handler = PP->getEmptylineHandler())
2449 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2450 getSourceLocation(lastNewLine)));
2451 }
2452 }
2453
2454 BufferPtr = CurPtr;
2455 return false;
2456}
2457
2458/// We have just read the // characters from input. Skip until we find the
2459/// newline character that terminates the comment. Then update BufferPtr and
2460/// return.
2461///
2462/// If we're in KeepCommentMode or any CommentHandler has inserted
2463/// some tokens, this will store the first token and return true.
///
/// \param Result Token that receives the comment token (when one is returned)
///        or the StartOfLine / LeadingSpace flags for the token that follows.
/// \param CurPtr Position just past the leading "//".
/// \param TokAtPhysicalStartOfLine Set to true when the terminating newline is
///        consumed here.
2464bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2465 bool &TokAtPhysicalStartOfLine) {
2466 // If Line comments aren't explicitly enabled for this language, emit an
2467 // extension warning.
2468 if (!LineComment) {
2469 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2470 Diag(BufferPtr, diag::ext_line_comment);
2471
2472 // Mark them enabled so we only emit one warning for this translation
2473 // unit.
2474 LineComment = true;
2475 }
2476
2477 // Scan over the body of the comment. The common case, when scanning, is that
2478 // the comment contains normal ascii characters with nothing interesting in
2479 // them. As such, optimize for this case with the inner loop.
2480 //
2481 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2482 // character that ends the line comment.
2483
2484 // C++23 [lex.phases] p1
2485 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2486 // diagnostic only once per entire ill-formed subsequence to avoid
2487 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2488 bool UnicodeDecodingAlreadyDiagnosed = false;
2489
2490 char C;
2491 while (true) {
2492 C = *CurPtr;
2493 // Skip over characters in the fast loop.
2494 while (isASCII(C) && C != 0 && // Potentially EOF.
2495 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2496 C = *++CurPtr;
2497 UnicodeDecodingAlreadyDiagnosed = false;
2498 }
2499
2500 if (!isASCII(C)) {
   // A length of 0 is treated as an ill-formed sequence: diagnose once per
   // run of bad bytes and step over a single byte.
2501 unsigned Length = llvm::getUTF8SequenceSize(
2502 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2503 if (Length == 0) {
2504 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2505 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2506 UnicodeDecodingAlreadyDiagnosed = true;
2507 ++CurPtr;
2508 } else {
2509 UnicodeDecodingAlreadyDiagnosed = false;
2510 CurPtr += Length;
2511 }
2512 continue;
2513 }
2514
   // Remember where the newline (or nul) is; CurPtr may be moved backwards
   // below to re-read an escaped newline.
2515 const char *NextLine = CurPtr;
2516 if (C != 0) {
2517 // We found a newline, see if it's escaped.
2518 const char *EscapePtr = CurPtr-1;
2519 bool HasSpace = false;
2520 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2521 --EscapePtr;
2522 HasSpace = true;
2523 }
2524
2525 if (*EscapePtr == '\\')
2526 // Escaped newline.
2527 CurPtr = EscapePtr;
2528 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2529 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2530 // Trigraph-escaped newline.
2531 CurPtr = EscapePtr-2;
2532 else
2533 break; // This is a newline, we're done.
2534
2535 // If there was space between the backslash and newline, warn about it.
2536 if (HasSpace && !isLexingRawMode())
2537 Diag(EscapePtr, diag::backslash_newline_space);
2538 }
2539
2540 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2541 // properly decode the character. Read it in raw mode to avoid emitting
2542 // diagnostics about things like trigraphs. If we see an escaped newline,
2543 // we'll handle it below.
2544 const char *OldPtr = CurPtr;
2545 bool OldRawMode = isLexingRawMode();
2546 LexingRawMode = true;
2547 C = getAndAdvanceChar(CurPtr, Result);
2548 LexingRawMode = OldRawMode;
2549
2550 // If we read only one character, then no special handling is needed.
2551 // We're done and can skip forward to the newline.
2552 if (C != 0 && CurPtr == OldPtr+1) {
2553 CurPtr = NextLine;
2554 break;
2555 }
2556
2557 // If we read multiple characters, and one of those characters was a \r or
2558 // \n, then we had an escaped newline within the comment. Emit diagnostic
2559 // unless the next line is also a // comment.
2560 if (CurPtr != OldPtr + 1 && C != '/' &&
2561 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2562 for (; OldPtr != CurPtr; ++OldPtr)
2563 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2564 // Okay, we found a // comment that ends in a newline, if the next
2565 // line is also a // comment, but has spaces, don't emit a diagnostic.
2566 if (isWhitespace(C)) {
2567 const char *ForwardPtr = CurPtr;
2568 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2569 ++ForwardPtr;
2570 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2571 break;
2572 }
2573
2574 if (!isLexingRawMode())
2575 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2576 break;
2577 }
2578 }
2579
   // Back up so that CurPtr points at the terminating newline or at the end
   // of the buffer rather than past it.
2580 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2581 --CurPtr;
2582 break;
2583 }
2584
2585 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
   // NOTE(review): the embedded listing numbering skips 2586 here; a
   // statement appears to be missing from this copy.
2587 cutOffLexing();
2588 return false;
2589 }
2590 }
2591
2592 // Found but did not consume the newline. Notify comment handlers about the
2593 // comment unless we're in a #if 0 block.
2594 if (PP && !isLexingRawMode() &&
   // NOTE(review): the embedded listing numbering skips 2595 here; the call
   // whose final argument appears on the next line is not visible in this
   // copy.
2596 getSourceLocation(CurPtr)))) {
2597 BufferPtr = CurPtr;
2598 return true; // A token has to be returned.
2599 }
2600
2601 // If we are returning comments as tokens, return this comment as a token.
2602 if (inKeepCommentMode())
2603 return SaveLineComment(Result, CurPtr);
2604
2605 // If we are inside a preprocessor directive and we see the end of line,
2606 // return immediately, so that the lexer can return this as an EOD token.
2607 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2608 BufferPtr = CurPtr;
2609 return false;
2610 }
2611
2612 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2613 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2614 // contribute to another token), it isn't needed for correctness. Note that
2615 // this is ok even in KeepWhitespaceMode, because we would have returned the
2616 // comment above in that mode.
2617 NewLinePtr = CurPtr++;
2618
2619 // The next returned token is at the start of the line.
2620 Result.setFlag(Token::StartOfLine);
2621 TokAtPhysicalStartOfLine = true;
2622 // No leading whitespace seen so far.
2623 Result.clearFlag(Token::LeadingSpace);
2624 BufferPtr = CurPtr;
2625 return false;
2626}
2627
2628/// If in save-comment mode, package up this Line comment in an appropriate
2629/// way and return it.
///
/// \param Result Token to fill in; formed as tok::comment.
/// \param CurPtr Position of the end of the comment.
/// \returns true on every path visible here.
2630bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2631 // If we're not in a preprocessor directive, just return the // comment
2632 // directly.
2633 FormTokenWithChars(Result, CurPtr, tok::comment);
2634
   // NOTE(review): the embedded listing numbering skips 2635 here; the
   // condition guarding the early return below is not visible in this copy.
2636 return true;
2637
2638 // If this Line-style comment is in a macro definition, transmogrify it into
2639 // a C-style block comment.
2640 bool Invalid = false;
2641 std::string Spelling = PP->getSpelling(Result, &Invalid);
   // Keep the plain comment token if its spelling could not be retrieved.
2642 if (Invalid)
2643 return true;
2644
2645 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2646 Spelling[1] = '*'; // Change prefix to "/*".
2647 Spelling += "*/"; // add suffix.
2648
2649 Result.setKind(tok::comment);
   // Re-create the token from the rewritten spelling, using the original
   // token's location for both location arguments.
2650 PP->CreateString(Spelling, Result,
2651 Result.getLocation(), Result.getLocation());
2652 return true;
2653}
2654
2655/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
2656/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2657/// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// The scan walks backwards from the newline over one or more escaped
/// newlines and succeeds only if a '*' precedes them.
///
/// \param CurPtr Points at the newline character to examine.
/// \param L Lexer used to query raw mode and to emit diagnostics.
/// \param Trigraphs Whether trigraphs are enabled; when false, a trigraph
///        spelling of the backslash makes this return false.
2658static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2659 bool Trigraphs) {
2660 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2661
2662 // Position of the first trigraph in the ending sequence.
2663 const char *TrigraphPos = nullptr;
2664 // Position of the first whitespace after a '\' in the ending sequence.
2665 const char *SpacePos = nullptr;
2666
   // Each iteration consumes one escaped newline, scanning backwards.
2667 while (true) {
2668 // Back up off the newline.
2669 --CurPtr;
2670
2671 // If this is a two-character newline sequence, skip the other character.
2672 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2673 // \n\n or \r\r -> not escaped newline.
2674 if (CurPtr[0] == CurPtr[1])
2675 return false;
2676 // \n\r or \r\n -> skip the newline.
2677 --CurPtr;
2678 }
2679
2680 // If we have horizontal whitespace, skip over it. We allow whitespace
2681 // between the slash and newline.
   // Nul characters are skipped here in the same way as whitespace.
2682 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2683 SpacePos = CurPtr;
2684 --CurPtr;
2685 }
2686
2687 // If we have a slash, this is an escaped newline.
2688 if (*CurPtr == '\\') {
2689 --CurPtr;
2690 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2691 // This is a trigraph encoding of a slash.
2692 TrigraphPos = CurPtr - 2;
2693 CurPtr -= 3;
2694 } else {
2695 return false;
2696 }
2697
2698 // If the character preceding the escaped newline is a '*', then after line
2699 // splicing we have a '*/' ending the comment.
2700 if (*CurPtr == '*')
2701 break;
2702
   // Only another newline may precede the escape: that is a further escaped
   // newline, so go around again. Anything else ends the search.
2703 if (*CurPtr != '\n' && *CurPtr != '\r')
2704 return false;
2705 }
2706
2707 if (TrigraphPos) {
2708 // If no trigraphs are enabled, warn that we ignored this trigraph and
2709 // ignore this * character.
2710 if (!Trigraphs) {
2711 if (!L->isLexingRawMode())
2712 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2713 return false;
2714 }
2715 if (!L->isLexingRawMode())
2716 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2717 }
2718
2719 // Warn about having an escaped newline between the */ characters.
2720 if (!L->isLexingRawMode())
2721 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2722
2723 // If there was space between the backslash and newline, warn about it.
2724 if (SpacePos && !L->isLexingRawMode())
2725 L->Diag(SpacePos, diag::backslash_newline_space);
2726
2727 return true;
2728}
2729
2730#ifdef __SSE2__
2731#include <emmintrin.h>
2732#elif __ALTIVEC__
2733#include <altivec.h>
2734#undef bool
2735#endif
2736
2737/// We have just read from input the / and * characters that started a comment.
2738/// Read until we find the * and / characters that terminate the comment.
2739/// Note that we don't bother decoding trigraphs or escaped newlines in block
2740/// comments, because they cannot cause the comment to end. The only thing
2741/// that can happen is the comment could end with an escaped newline between
2742/// the terminating * and /.
2743///
2744/// If we're in KeepCommentMode or any CommentHandler has inserted
2745/// some tokens, this will store the first token and return true.
/// Otherwise BufferPtr is moved past the comment and false is returned. An
/// unterminated comment is diagnosed (outside raw mode) and is returned as a
/// tok::unknown token only in KeepWhitespaceMode.
2746bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2747 bool &TokAtPhysicalStartOfLine) {
2748 // Scan one character past where we should, looking for a '/' character. Once
2749 // we find it, check to see if it was preceded by a *. This common
2750 // optimization helps people who like to put a lot of * characters in their
2751 // comments.
2752
2753 // The first character we get with newlines and trigraphs skipped to handle
2754 // the degenerate /*/ case below correctly if the * has an escaped newline
2755 // after it.
2756 unsigned CharSize;
2757 unsigned char C = getCharAndSize(CurPtr, CharSize);
2758 CurPtr += CharSize;
2759 if (C == 0 && CurPtr == BufferEnd+1) {
2760 if (!isLexingRawMode())
2761 Diag(BufferPtr, diag::err_unterminated_block_comment);
2762 --CurPtr;
2763
2764 // KeepWhitespaceMode should return this broken comment as a token. Since
2765 // it isn't a well formed comment, just return it as an 'unknown' token.
2766 if (isKeepWhitespaceMode()) {
2767 FormTokenWithChars(Result, CurPtr, tok::unknown);
2768 return true;
2769 }
2770
2771 BufferPtr = CurPtr;
2772 return false;
2773 }
2774
2775 // Check to see if the first character after the '/*' is another /. If so,
2776 // then this slash does not end the block comment, it is part of it.
2777 if (C == '/')
2778 C = *CurPtr++;
2779
2780 // C++23 [lex.phases] p1
2781 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2782 // diagnostic only once per entire ill-formed subsequence to avoid
2783 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2784 bool UnicodeDecodingAlreadyDiagnosed = false;
2785
2786 while (true) {
2787 // Skip over all non-interesting characters until we find end of buffer or a
2788 // (probably ending) '/' character.
2789 if (CurPtr + 24 < BufferEnd &&
2790 // If there is a code-completion point avoid the fast scan because it
2791 // doesn't check for '\0'.
2792 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2793 // While not aligned to a 16-byte boundary.
2794 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2795 if (!isASCII(C))
2796 goto MultiByteUTF8;
2797 C = *CurPtr++;
2798 }
2799 if (C == '/') goto FoundSlash;
2800
// Each variant below examines 16 bytes per step: it jumps to MultiByteUTF8 if
// any byte is non-ASCII and stops at the first chunk that contains a '/'.
2801#ifdef __SSE2__
2802 __m128i Slashes = _mm_set1_epi8('/');
2803 while (CurPtr + 16 < BufferEnd) {
// The byte-wise sign-bit mask is non-zero iff some byte has its top bit set.
2804 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2805 if (LLVM_UNLIKELY(Mask != 0)) {
2806 goto MultiByteUTF8;
2807 }
2808 // look for slashes
2809 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2810 Slashes));
2811 if (cmp != 0) {
2812 // Adjust the pointer to point directly after the first slash. It's
2813 // not necessary to set C here, it will be overwritten at the end of
2814 // the outer loop.
2815 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2816 goto FoundSlash;
2817 }
2818 CurPtr += 16;
2819 }
2820#elif __ALTIVEC__
2821 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2822 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2823 0x80, 0x80, 0x80, 0x80};
2824 __vector unsigned char Slashes = {
2825 '/', '/', '/', '/', '/', '/', '/', '/',
2826 '/', '/', '/', '/', '/', '/', '/', '/'
2827 };
2828 while (CurPtr + 16 < BufferEnd) {
2829 if (LLVM_UNLIKELY(
2830 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2831 goto MultiByteUTF8;
2832 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2833 break;
2834 }
2835 CurPtr += 16;
2836 }
2837
2838#else
2839 while (CurPtr + 16 < BufferEnd) {
2840 bool HasNonASCII = false;
2841 for (unsigned I = 0; I < 16; ++I)
2842 HasNonASCII |= !isASCII(CurPtr[I]);
2843
2844 if (LLVM_UNLIKELY(HasNonASCII))
2845 goto MultiByteUTF8;
2846
2847 bool HasSlash = false;
2848 for (unsigned I = 0; I < 16; ++I)
2849 HasSlash |= CurPtr[I] == '/';
2850 if (HasSlash)
2851 break;
2852 CurPtr += 16;
2853 }
2854#endif
2855
2856 // It has to be one of the bytes scanned, increment to it and read one.
2857 C = *CurPtr++;
2858 }
2859
2860 // Loop to scan the remainder, warning on invalid UTF-8
2861 // if the corresponding warning is enabled, emitting a diagnostic only once
2862 // per sequence that cannot be decoded.
2863 while (C != '/' && C != '\0') {
2864 if (isASCII(C)) {
2865 UnicodeDecodingAlreadyDiagnosed = false;
2866 C = *CurPtr++;
2867 continue;
2868 }
2869 MultiByteUTF8:
2870 // CurPtr is 1 code unit past C, so to decode
2871 // the codepoint, we need to read from the previous position.
2872 unsigned Length = llvm::getUTF8SequenceSize(
2873 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2874 if (Length == 0) {
2875 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2876 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2877 UnicodeDecodingAlreadyDiagnosed = true;
2878 } else {
2879 UnicodeDecodingAlreadyDiagnosed = false;
2880 CurPtr += Length - 1;
2881 }
2882 C = *CurPtr++;
2883 }
2884
2885 if (C == '/') {
2886 FoundSlash:
2887 if (CurPtr[-2] == '*') // We found the final */. We're done!
2888 break;
2889
2890 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2891 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2892 LangOpts.Trigraphs)) {
2893 // We found the final */, though it had an escaped newline between the
2894 // * and /. We're done!
2895 break;
2896 }
2897 }
2898 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2899 // If this is a /* inside of the comment, emit a warning. Don't do this
2900 // if this is a /*/, which will end the comment. This misses cases with
2901 // embedded escaped newlines, but oh well.
2902 if (!isLexingRawMode())
2903 Diag(CurPtr-1, diag::warn_nested_block_comment);
2904 }
2905 } else if (C == 0 && CurPtr == BufferEnd+1) {
2906 if (!isLexingRawMode())
2907 Diag(BufferPtr, diag::err_unterminated_block_comment);
2908 // Note: the user probably forgot a */. We could continue immediately
2909 // after the /*, but this would involve lexing a lot of what really is the
2910 // comment, which surely would confuse the parser.
2911 --CurPtr;
2912
2913 // KeepWhitespaceMode should return this broken comment as a token. Since
2914 // it isn't a well formed comment, just return it as an 'unknown' token.
2915 if (isKeepWhitespaceMode()) {
2916 FormTokenWithChars(Result, CurPtr, tok::unknown);
2917 return true;
2918 }
2919
2920 BufferPtr = CurPtr;
2921 return false;
2922 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): original line 2923 is absent from this listing (numbering
// jumps 2922 -> 2924); restore it from the upstream file.
2924 cutOffLexing();
2925 return false;
2926 }
2927
2928 C = *CurPtr++;
2929 }
2930
2931 // Notify comment handlers about the comment unless we're in a #if 0 block.
2932 if (PP && !isLexingRawMode() &&
// NOTE(review): original line 2933 is absent from this listing, so the
// condition above is syntactically incomplete here; restore it from upstream.
2934 getSourceLocation(CurPtr)))) {
2935 BufferPtr = CurPtr;
2936 return true; // A token has to be returned.
2937 }
2938
2939 // If we are returning comments as tokens, return this comment as a token.
2940 if (inKeepCommentMode()) {
2941 FormTokenWithChars(Result, CurPtr, tok::comment);
2942 return true;
2943 }
2944
2945 // It is common for the tokens immediately after a /**/ comment to be
2946 // whitespace. Instead of going through the big switch, handle it
2947 // efficiently now. This is safe even in KeepWhitespaceMode because we would
2948 // have already returned above with the comment as a token.
2949 if (isHorizontalWhitespace(*CurPtr)) {
2950 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2951 return false;
2952 }
2953
2954 // Otherwise, just return so that the next character will be lexed as a token.
2955 BufferPtr = CurPtr;
2956 Result.setFlag(Token::LeadingSpace);
2957 return false;
2958}
2959
2960//===----------------------------------------------------------------------===//
2961// Primary Lexing Entry Points
2962//===----------------------------------------------------------------------===//
2963
2964/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2965/// uninterpreted string. This switches the lexer out of directive mode.
/// Characters are appended to Result when it is non-null.
// NOTE(review): the signature line (original line 2966) is absent from this
// listing; restore it from the upstream file.
2967 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2968 "Must be in a preprocessing directive!");
2969 Token Tmp;
2970 Tmp.startToken();
2971
2972 // CurPtr - Cache BufferPtr in an automatic variable.
2973 const char *CurPtr = BufferPtr;
2974 while (true) {
2975 char Char = getAndAdvanceChar(CurPtr, Tmp);
2976 switch (Char) {
2977 default:
2978 if (Result)
2979 Result->push_back(Char);
2980 break;
2981 case 0: // Null.
2982 // Found end of file?
2983 if (CurPtr-1 != BufferEnd) {
2984 if (isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): original line 2985 is absent from this listing.
2986 cutOffLexing();
2987 return;
2988 }
2989
2990 // Nope, normal character, continue.
2991 if (Result)
2992 Result->push_back(Char);
2993 break;
2994 }
2995 // FALL THROUGH.
2996 [[fallthrough]];
2997 case '\r':
2998 case '\n':
2999 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3000 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3001 BufferPtr = CurPtr-1;
3002
3003 // Next, lex the character, which should handle the EOD transition.
3004 Lex(Tmp);
3005 if (Tmp.is(tok::code_completion)) {
3006 if (PP)
// NOTE(review): original line 3007 (the statement guarded by `if (PP)`) is
// absent from this listing; as shown, the `if` would guard the Lex call below.
3008 Lex(Tmp);
3009 }
3010 assert(Tmp.is(tok::eod) && "Unexpected token!");
3011
3012 // Finally, we're done;
3013 return;
3014 }
3015 }
3016}
3017
3018/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3019/// condition, reporting diagnostics and handling other edge cases as required.
3020/// This returns true if Result contains a token, false if PP.Lex should be
3021/// called again.
//
// NOTE(review): this listing is missing original lines 3026, 3028, 3034,
// 3047-3048, 3053, 3070 and 3094 (see the gaps in the embedded numbering), so
// several conditions and statements below are incomplete. Restore them from
// the upstream file.
3022bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3023 // If we hit the end of the file while parsing a preprocessor directive,
3024 // end the preprocessor directive first. The next token returned will
3025 // then be the end of file.
// NOTE(review): line 3026 (the opening `if` of this block) is missing.
3027 // Done parsing the "line".
// NOTE(review): line 3028 is missing.
3029 // Update the location of token as well as BufferPtr.
3030 FormTokenWithChars(Result, CurPtr, tok::eod);
3031
3032 // Restore comment saving mode, in case it was disabled for directive.
3033 if (PP)
// NOTE(review): line 3034 (the statement guarded by `if (PP)`) is missing.
3035 return true; // Have a token.
3036 }
3037
3038 // If we are in raw mode, return this event as an EOF token. Let the caller
3039 // that put us in raw mode handle the event.
3040 if (isLexingRawMode()) {
3041 Result.startToken();
3042 BufferPtr = BufferEnd;
3043 FormTokenWithChars(Result, BufferEnd, tok::eof);
3044 return true;
3045 }
3046
// NOTE(review): lines 3047-3048 (the opening of the block closed at 3055) are
// missing.
3049 // If the preamble cuts off the end of a header guard, consider it guarded.
3050 // The guard is valid for the preamble content itself, and for tools the
3051 // most useful answer is "yes, this file has a header guard".
3052 if (!ConditionalStack.empty())
// NOTE(review): line 3053 (the statement guarded by the `if` above) is missing.
3054 ConditionalStack.clear();
3055 }
3056
3057 // Issue diagnostics for unterminated #if and missing newline.
3058
3059 // If we are in a #if directive, emit an error.
3060 while (!ConditionalStack.empty()) {
3061 if (PP->getCodeCompletionFileLoc() != FileLoc)
3062 PP->Diag(ConditionalStack.back().IfLoc,
3063 diag::err_pp_unterminated_conditional);
3064 ConditionalStack.pop_back();
3065 }
3066
3067 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3068 // a pedwarn.
3069 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
// NOTE(review): line 3070 is missing; `Diags` used below is not declared in
// the visible text.
3071 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3072 unsigned DiagID;
3073
3074 if (LangOpts.CPlusPlus11) {
3075 // C++11 [lex.phases] 2.2 p2
3076 // Prefer the C++98 pedantic compatibility warning over the generic,
3077 // non-extension, user-requested "missing newline at EOF" warning.
3078 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3079 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3080 } else {
3081 DiagID = diag::warn_no_newline_eof;
3082 }
3083 } else {
3084 DiagID = diag::ext_no_newline_eof;
3085 }
3086
// The diagnostic carries a fix-it that inserts the missing newline at EndLoc.
3087 Diag(BufferEnd, DiagID)
3088 << FixItHint::CreateInsertion(EndLoc, "\n");
3089 }
3090
3091 BufferPtr = CurPtr;
3092
3093 // Finally, let the preprocessor handle this.
// NOTE(review): line 3094 (the final return statement) is missing.
3095}
3096
3097/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3098/// the specified lexer will return a tok::l_paren token, 0 if it is something
3099/// else and 2 if there are no more tokens in the buffer controlled by the
3100/// lexer.
3101unsigned Lexer::isNextPPTokenLParen() {
3102 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3103
3104 if (isDependencyDirectivesLexer()) {
3105 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3106 return 2;
3107 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3108 tok::l_paren);
3109 }
3110
3111 // Switch to 'skipping' mode. This will ensure that we can lex a token
3112 // without emitting diagnostics, disables macro expansion, and will cause EOF
3113 // to return an EOF token instead of popping the include stack.
3114 LexingRawMode = true;
3115
3116 // Save state that can be changed while lexing so that we can restore it.
3117 const char *TmpBufferPtr = BufferPtr;
3118 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3119 bool atStartOfLine = IsAtStartOfLine;
3120 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3121 bool leadingSpace = HasLeadingSpace;
3122
3123 Token Tok;
3124 Lex(Tok);
3125
3126 // Restore state that may have changed.
3127 BufferPtr = TmpBufferPtr;
3128 ParsingPreprocessorDirective = inPPDirectiveMode;
3129 HasLeadingSpace = leadingSpace;
3130 IsAtStartOfLine = atStartOfLine;
3131 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3132
3133 // Restore the lexer back to non-skipping mode.
3134 LexingRawMode = false;
3135
3136 if (Tok.is(tok::eof))
3137 return 2;
3138 return Tok.is(tok::l_paren);
3139}
3140
3141/// Find the end of a version control conflict marker.
3142static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3143 ConflictMarkerKind CMK) {
3144 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3145 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3146 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3147 size_t Pos = RestOfBuffer.find(Terminator);
3148 while (Pos != StringRef::npos) {
3149 // Must occur at start of line.
3150 if (Pos == 0 ||
3151 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3152 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3153 Pos = RestOfBuffer.find(Terminator);
3154 continue;
3155 }
3156 return RestOfBuffer.data()+Pos;
3157 }
3158 return nullptr;
3159}
3160
3161/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3162/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3163/// and recover nicely. This returns true if it is a conflict marker and false
3164/// if not.
3165bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3166 // Only a conflict marker if it starts at the beginning of a line.
3167 if (CurPtr != BufferStart &&
3168 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3169 return false;
3170
3171 // Check to see if we have <<<<<<< or >>>>.
3172 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3173 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3174 return false;
3175
3176 // If we have a situation where we don't care about conflict markers, ignore
3177 // it.
3178 if (CurrentConflictMarkerState || isLexingRawMode())
3179 return false;
3180
3181 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3182
3183 // Check to see if there is an ending marker somewhere in the buffer at the
3184 // start of a line to terminate this conflict marker.
3185 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3186 // We found a match. We are really in a conflict marker.
3187 // Diagnose this, and ignore to the end of line.
3188 Diag(CurPtr, diag::err_conflict_marker);
3189 CurrentConflictMarkerState = Kind;
3190
3191 // Skip ahead to the end of line. We know this exists because the
3192 // end-of-conflict marker starts with \r or \n.
3193 while (*CurPtr != '\r' && *CurPtr != '\n') {
3194 assert(CurPtr != BufferEnd && "Didn't find end of line");
3195 ++CurPtr;
3196 }
3197 BufferPtr = CurPtr;
3198 return true;
3199 }
3200
3201 // No end of conflict marker found.
3202 return false;
3203}
3204
3205/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3206/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3207/// is the end of a conflict marker. Handle it by ignoring up until the end of
3208/// the line. This returns true if it is a conflict marker and false if not.
3209bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3210 // Only a conflict marker if it starts at the beginning of a line.
3211 if (CurPtr != BufferStart &&
3212 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3213 return false;
3214
3215 // If we have a situation where we don't care about conflict markers, ignore
3216 // it.
3217 if (!CurrentConflictMarkerState || isLexingRawMode())
3218 return false;
3219
3220 // Check to see if we have the marker (4 characters in a row).
3221 for (unsigned i = 1; i != 4; ++i)
3222 if (CurPtr[i] != CurPtr[0])
3223 return false;
3224
3225 // If we do have it, search for the end of the conflict marker. This could
3226 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3227 // be the end of conflict marker.
3228 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3229 CurrentConflictMarkerState)) {
3230 CurPtr = End;
3231
3232 // Skip ahead to the end of line.
3233 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3234 ++CurPtr;
3235
3236 BufferPtr = CurPtr;
3237
3238 // No longer in the conflict marker.
3239 CurrentConflictMarkerState = CMK_None;
3240 return true;
3241 }
3242
3243 return false;
3244}
3245
/// Locate the "#>" that closes an editor placeholder.
///
/// \param CurPtr first character to examine.
/// \param BufferEnd one past the last character that may be examined.
/// \returns a pointer just past the first "#>" lying entirely inside
/// [CurPtr, BufferEnd), or nullptr when there is none (including when the
/// range is empty or inverted).
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  // Relational guard: an inverted range must not start an unbounded scan.
  if (CurPtr >= BufferEnd)
    return nullptr;

  static constexpr char Terminator[] = {'#', '>'};
  const char *const Hit =
      std::search(CurPtr, BufferEnd, Terminator, Terminator + 2);
  return Hit == BufferEnd ? nullptr : Hit + 2;
}
3257
// Try to lex an editor placeholder at CurPtr, where CurPtr[-1] is '<' and
// CurPtr[0] is '#'. On success Result is formed as a tok::raw_identifier that
// starts at the '<' and ends just past the closing "#>", BufferPtr is moved
// there, and true is returned. Returns false when no closing "#>" is found.
// An error is diagnosed unless LangOpts.AllowEditorPlaceholders is set.
3258bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3259 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
// NOTE(review): original line 3260 (the condition guarding the `return false`
// below) is absent from this listing; restore it from the upstream file.
3261 return false;
3262 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3263 if (!End)
3264 return false;
3265 const char *Start = CurPtr - 1;
3266 if (!LangOpts.AllowEditorPlaceholders)
3267 Diag(Start, diag::err_placeholder_in_source);
3268 Result.startToken();
3269 FormTokenWithChars(Result, End, tok::raw_identifier);
3270 Result.setRawIdentifierData(Start);
// NOTE(review): original lines 3271-3272 are absent from this listing.
3273 BufferPtr = End;
3274 return true;
3275}
3276
3277bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3278 if (PP && PP->isCodeCompletionEnabled()) {
3279 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3280 return Loc == PP->getCodeCompletionLoc();
3281 }
3282
3283 return false;
3284}
3285
3286std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3287 const char *SlashLoc,
3288 Token *Result) {
3289 unsigned CharSize;
3290 char Kind = getCharAndSize(StartPtr, CharSize);
3291 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3292
3293 unsigned NumHexDigits;
3294 if (Kind == 'u')
3295 NumHexDigits = 4;
3296 else if (Kind == 'U')
3297 NumHexDigits = 8;
3298
3299 bool Delimited = false;
3300 bool FoundEndDelimiter = false;
3301 unsigned Count = 0;
3302 bool Diagnose = Result && !isLexingRawMode();
3303
3304 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3305 if (Diagnose)
3306 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3307 return std::nullopt;
3308 }
3309
3310 const char *CurPtr = StartPtr + CharSize;
3311 const char *KindLoc = &CurPtr[-1];
3312
3313 uint32_t CodePoint = 0;
3314 while (Count != NumHexDigits || Delimited) {
3315 char C = getCharAndSize(CurPtr, CharSize);
3316 if (!Delimited && Count == 0 && C == '{') {
3317 Delimited = true;
3318 CurPtr += CharSize;
3319 continue;
3320 }
3321
3322 if (Delimited && C == '}') {
3323 CurPtr += CharSize;
3324 FoundEndDelimiter = true;
3325 break;
3326 }
3327
3328 unsigned Value = llvm::hexDigitValue(C);
3329 if (Value == -1U) {
3330 if (!Delimited)
3331 break;
3332 if (Diagnose)
3333 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3334 << StringRef(KindLoc, 1);
3335 return std::nullopt;
3336 }
3337
3338 if (CodePoint & 0xF000'0000) {
3339 if (Diagnose)
3340 Diag(KindLoc, diag::err_escape_too_large) << 0;
3341 return std::nullopt;
3342 }
3343
3344 CodePoint <<= 4;
3345 CodePoint |= Value;
3346 CurPtr += CharSize;
3347 Count++;
3348 }
3349
3350 if (Count == 0) {
3351 if (Diagnose)
3352 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3353 : diag::warn_ucn_escape_no_digits)
3354 << StringRef(KindLoc, 1);
3355 return std::nullopt;
3356 }
3357
3358 if (Delimited && Kind == 'U') {
3359 if (Diagnose)
3360 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3361 return std::nullopt;
3362 }
3363
3364 if (!Delimited && Count != NumHexDigits) {
3365 if (Diagnose) {
3366 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3367 // If the user wrote \U1234, suggest a fixit to \u.
3368 if (Count == 4 && NumHexDigits == 8) {
3369 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3370 Diag(KindLoc, diag::note_ucn_four_not_eight)
3371 << FixItHint::CreateReplacement(URange, "u");
3372 }
3373 }
3374 return std::nullopt;
3375 }
3376
3377 if (Delimited && PP) {
3378 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3379 ? diag::warn_cxx23_delimited_escape_sequence
3380 : diag::ext_delimited_escape_sequence)
3381 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3382 }
3383
3384 if (Result) {
3385 Result->setFlag(Token::HasUCN);
3386 // If the UCN contains either a trigraph or a line splicing,
3387 // we need to call getAndAdvanceChar again to set the appropriate flags
3388 // on Result.
3389 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3390 StartPtr = CurPtr;
3391 else
3392 while (StartPtr != CurPtr)
3393 (void)getAndAdvanceChar(StartPtr, *Result);
3394 } else {
3395 StartPtr = CurPtr;
3396 }
3397 return CodePoint;
3398}
3399
// Try to read a named universal character escape of the form \N{...}.
// StartPtr points at the 'N'; it is advanced past the closing '}' whenever the
// function gets beyond the early "incomplete"/"empty" returns. SlashLoc is the
// location of the backslash and is used for diagnostics. Returns the matched
// code point, or std::nullopt when no name was matched.
//
// NOTE(review): this listing is missing original lines 3421, 3430 and 3455
// (see the gaps in the embedded numbering); in particular the declaration of
// `Buffer` is not visible. Restore them from the upstream file.
3400std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3401 const char *SlashLoc,
3402 Token *Result) {
3403 unsigned CharSize;
3404 bool Diagnose = Result && !isLexingRawMode();
3405
3406 char C = getCharAndSize(StartPtr, CharSize);
3407 assert(C == 'N' && "expected \\N{...}");
3408
3409 const char *CurPtr = StartPtr + CharSize;
3410 const char *KindLoc = &CurPtr[-1];
3411
3412 C = getCharAndSize(CurPtr, CharSize);
3413 if (C != '{') {
3414 if (Diagnose)
3415 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3416 return std::nullopt;
3417 }
3418 CurPtr += CharSize;
3419 const char *StartName = CurPtr;
3420 bool FoundEndDelimiter = false;
// NOTE(review): line 3421 is missing (presumably the declaration of `Buffer`).
3422 while (C) {
3423 C = getCharAndSize(CurPtr, CharSize);
3424 CurPtr += CharSize;
3425 if (C == '}') {
3426 FoundEndDelimiter = true;
3427 break;
3428 }
3429
// NOTE(review): line 3430 (the condition guarding the `break` below) is missing.
3431 break;
3432 Buffer.push_back(C);
3433 }
3434
3435 if (!FoundEndDelimiter || Buffer.empty()) {
3436 if (Diagnose)
3437 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3438 : diag::warn_delimited_ucn_incomplete)
3439 << StringRef(KindLoc, 1);
3440 return std::nullopt;
3441 }
3442
// Try an exact name match first; fall back to loose matching only on failure.
3443 StringRef Name(Buffer.data(), Buffer.size());
3444 std::optional<char32_t> Match =
3445 llvm::sys::unicode::nameToCodepointStrict(Name);
3446 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3447 if (!Match) {
3448 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3449 if (Diagnose) {
3450 Diag(StartName, diag::err_invalid_ucn_name)
3451 << StringRef(Buffer.data(), Buffer.size())
3452 << makeCharRange(*this, StartName, CurPtr - CharSize);
3453 if (LooseMatch) {
3454 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
// NOTE(review): line 3455 is missing, so the argument list below is incomplete.
3456 makeCharRange(*this, StartName, CurPtr - CharSize),
3457 LooseMatch->Name);
3458 }
3459 }
3460 // We do not offer misspelled character names suggestions here
3461 // as the set of what would be a valid suggestion depends on context,
3462 // and we should not make invalid suggestions.
3463 }
3464
3465 if (Diagnose && Match)
3466 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3467 ? diag::warn_cxx23_delimited_escape_sequence
3468 : diag::ext_delimited_escape_sequence)
3469 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3470
3471 // If no diagnostic has been emitted yet, likely because we are doing a
3472 // tentative lexing, we do not want to recover here to make sure the token
3473 // will not be incorrectly considered valid. This function will be called
3474 // again and a diagnostic emitted then.
3475 if (LooseMatch && Diagnose)
3476 Match = LooseMatch->CodePoint;
3477
3478 if (Result) {
3479 Result->setFlag(Token::HasUCN);
3480 // If the UCN contains either a trigraph or a line splicing,
3481 // we need to call getAndAdvanceChar again to set the appropriate flags
3482 // on Result.
// Buffer.size() + 3 accounts for the 'N', the '{' and the '}'.
3483 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3484 StartPtr = CurPtr;
3485 else
3486 while (StartPtr != CurPtr)
3487 (void)getAndAdvanceChar(StartPtr, *Result);
3488 } else {
3489 StartPtr = CurPtr;
3490 }
3491 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3492}
3493
3494uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3495 Token *Result) {
3496
3497 unsigned CharSize;
3498 std::optional<uint32_t> CodePointOpt;
3499 char Kind = getCharAndSize(StartPtr, CharSize);
3500 if (Kind == 'u' || Kind == 'U')
3501 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3502 else if (Kind == 'N')
3503 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3504
3505 if (!CodePointOpt)
3506 return 0;
3507
3508 uint32_t CodePoint = *CodePointOpt;
3509
3510 // Don't apply C family restrictions to UCNs in assembly mode
3511 if (LangOpts.AsmPreprocessor)
3512 return CodePoint;
3513
3514 // C23 6.4.3p2: A universal character name shall not designate a code point
3515 // where the hexadecimal value is:
3516 // - in the range D800 through DFFF inclusive; or
3517 // - greater than 10FFFF.
3518 // A universal-character-name outside the c-char-sequence of a character
3519 // constant, or the s-char-sequence of a string-literal shall not designate
3520 // a control character or a character in the basic character set.
3521
3522 // C++11 [lex.charset]p2: If the hexadecimal value for a
3523 // universal-character-name corresponds to a surrogate code point (in the
3524 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3525 // if the hexadecimal value for a universal-character-name outside the
3526 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3527 // string literal corresponds to a control character (in either of the
3528 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3529 // basic source character set, the program is ill-formed.
3530 if (CodePoint < 0xA0) {
3531 // We don't use isLexingRawMode() here because we need to warn about bad
3532 // UCNs even when skipping preprocessing tokens in a #if block.
3533 if (Result && PP) {
3534 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3535 Diag(BufferPtr, diag::err_ucn_control_character);
3536 else {
3537 char C = static_cast<char>(CodePoint);
3538 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3539 }
3540 }
3541
3542 return 0;
3543 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3544 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3545 // We don't use isLexingRawMode() here because we need to diagnose bad
3546 // UCNs even when skipping preprocessing tokens in a #if block.
3547 if (Result && PP) {
3548 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3549 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3550 else
3551 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3552 }
3553 return 0;
3554 }
3555
3556 return CodePoint;
3557}
3558
// When the (partly missing) condition below holds, diagnose
// ext_unicode_whitespace over the range [BufferPtr, CurPtr), set
// Token::LeadingSpace on Result and return true; otherwise return false.
// Nothing is diagnosed in raw mode or when the preprocessor reports
// preprocessed output.
3559bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3560 const char *CurPtr) {
3561 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
// NOTE(review): original line 3562 (the rest of this condition) is absent
// from this listing; restore it from the upstream file.
3563 Diag(BufferPtr, diag::ext_unicode_whitespace)
3564 << makeCharRange(*this, BufferPtr, CurPtr);
3565
3566 Result.setFlag(Token::LeadingSpace);
3567 return true;
3568 }
3569 return false;
3570}
3571
3572void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3573 IsAtStartOfLine = Result.isAtStartOfLine();
3574 HasLeadingSpace = Result.hasLeadingSpace();
3575 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3576 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3577}
3578
// Lex one token into Result. The token is reset, the lexer's pending
// start-of-line / leading-space / leading-empty-macro state is consumed
// (each flag is cleared here), and the work is delegated to LexTokenInternal,
// whose return value is returned.
3579bool Lexer::Lex(Token &Result) {
3580 assert(!isDependencyDirectivesLexer());
3581
3582 // Start a new token.
3583 Result.startToken();
3584
3585 // Set up misc whitespace flags for LexTokenInternal.
3586 if (IsAtStartOfLine) {
3587 Result.setFlag(Token::StartOfLine);
3588 IsAtStartOfLine = false;
3589 }
3590
3591 if (HasLeadingSpace) {
3592 Result.setFlag(Token::LeadingSpace);
3593 HasLeadingSpace = false;
3594 }
3595
3596 if (HasLeadingEmptyMacro) {
// NOTE(review): original line 3597 is absent from this listing; restore it
// from the upstream file.
3598 HasLeadingEmptyMacro = false;
3599 }
3600
3601 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3602 IsAtPhysicalStartOfLine = false;
// Raw mode is sampled before the call because the lexer may be destroyed by
// it; the value is only used by the assert below.
3603 bool isRawLex = isLexingRawMode();
3604 (void) isRawLex;
3605 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3606 // (After the LexTokenInternal call, the lexer might be destroyed.)
3607 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3608 return returnedToken;
3609}
3610
3611/// LexTokenInternal - This implements a simple C family lexer. It is an
3612/// extremely performance critical piece of code. This assumes that the buffer
3613/// has a null character at the end of the file. This returns a preprocessing
3614/// token, not a normal token, as such, it is an internal interface. It assumes
3615/// that the Flags of result have been cleared before calling this.
3616bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3617LexStart:
3618 assert(!Result.needsCleaning() && "Result needs cleaning");
3619 assert(!Result.hasPtrData() && "Result has not been reset");
3620
3621 // CurPtr - Cache BufferPtr in an automatic variable.
3622 const char *CurPtr = BufferPtr;
3623
3624 // Small amounts of horizontal whitespace is very common between tokens.
3625 if (isHorizontalWhitespace(*CurPtr)) {
3626 do {
3627 ++CurPtr;
3628 } while (isHorizontalWhitespace(*CurPtr));
3629
3630 // If we are keeping whitespace and other tokens, just return what we just
3631 // skipped. The next lexer invocation will return the token after the
3632 // whitespace.
3633 if (isKeepWhitespaceMode()) {
3634 FormTokenWithChars(Result, CurPtr, tok::unknown);
3635 // FIXME: The next token will not have LeadingSpace set.
3636 return true;
3637 }
3638
3639 BufferPtr = CurPtr;
3640 Result.setFlag(Token::LeadingSpace);
3641 }
3642
3643 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3644
3645 // Read a character, advancing over it.
3646 char Char = getAndAdvanceChar(CurPtr, Result);
3648
3649 if (!isVerticalWhitespace(Char))
3650 NewLinePtr = nullptr;
3651
3652 switch (Char) {
3653 case 0: // Null.
3654 // Found end of file?
3655 if (CurPtr-1 == BufferEnd)
3656 return LexEndOfFile(Result, CurPtr-1);
3657
3658 // Check if we are performing code completion.
3659 if (isCodeCompletionPoint(CurPtr-1)) {
3660 // Return the code-completion token.
3661 Result.startToken();
3662 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3663 return true;
3664 }
3665
3666 if (!isLexingRawMode())
3667 Diag(CurPtr-1, diag::null_in_file);
3668 Result.setFlag(Token::LeadingSpace);
3669 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3670 return true; // KeepWhitespaceMode
3671
3672 // We know the lexer hasn't changed, so just try again with this lexer.
3673 // (We manually eliminate the tail call to avoid recursion.)
3674 goto LexNextToken;
3675
3676 case 26: // DOS & CP/M EOF: "^Z".
3677 // If we're in Microsoft extensions mode, treat this as end of file.
3678 if (LangOpts.MicrosoftExt) {
3679 if (!isLexingRawMode())
3680 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3681 return LexEndOfFile(Result, CurPtr-1);
3682 }
3683
3684 // If Microsoft extensions are disabled, this is just random garbage.
3685 Kind = tok::unknown;
3686 break;
3687
3688 case '\r':
3689 if (CurPtr[0] == '\n')
3690 (void)getAndAdvanceChar(CurPtr, Result);
3691 [[fallthrough]];
3692 case '\n':
3693 // If we are inside a preprocessor directive and we see the end of line,
3694 // we know we are done with the directive, so return an EOD token.
3696 // Done parsing the "line".
3698
3699 // Restore comment saving mode, in case it was disabled for directive.
3700 if (PP)
3702
3703 // Since we consumed a newline, we are back at the start of a line.
3704 IsAtStartOfLine = true;
3705 IsAtPhysicalStartOfLine = true;
3706 NewLinePtr = CurPtr - 1;
3707
3708 Kind = tok::eod;
3709 break;
3710 }
3711
3712 // No leading whitespace seen so far.
3713 Result.clearFlag(Token::LeadingSpace);
3714
3715 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3716 return true; // KeepWhitespaceMode
3717
3718 // We only saw whitespace, so just try again with this lexer.
3719 // (We manually eliminate the tail call to avoid recursion.)
3720 goto LexNextToken;
3721 case ' ':
3722 case '\t':
3723 case '\f':
3724 case '\v':
3725 SkipHorizontalWhitespace:
3726 Result.setFlag(Token::LeadingSpace);
3727 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3728 return true; // KeepWhitespaceMode
3729
3730 SkipIgnoredUnits:
3731 CurPtr = BufferPtr;
3732
3733 // If the next token is obviously a // or /* */ comment, skip it efficiently
3734 // too (without going through the big switch stmt).
3735 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3736 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3737 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3738 return true; // There is a token to return.
3739 goto SkipIgnoredUnits;
3740 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3741 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3742 return true; // There is a token to return.
3743 goto SkipIgnoredUnits;
3744 } else if (isHorizontalWhitespace(*CurPtr)) {
3745 goto SkipHorizontalWhitespace;
3746 }
3747 // We only saw whitespace, so just try again with this lexer.
3748 // (We manually eliminate the tail call to avoid recursion.)
3749 goto LexNextToken;
3750
3751 // C99 6.4.4.1: Integer Constants.
3752 // C99 6.4.4.2: Floating Constants.
3753 case '0': case '1': case '2': case '3': case '4':
3754 case '5': case '6': case '7': case '8': case '9':
3755 // Notify MIOpt that we read a non-whitespace/non-comment token.
3756 MIOpt.ReadToken();
3757 return LexNumericConstant(Result, CurPtr);
3758
3759 // Identifier (e.g., uber), or
3760 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3761 // UTF-8 or UTF-16 string literal (C11/C++11).
3762 case 'u':
3763 // Notify MIOpt that we read a non-whitespace/non-comment token.
3764 MIOpt.ReadToken();
3765
3766 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3767 Char = getCharAndSize(CurPtr, SizeTmp);
3768
3769 // UTF-16 string literal
3770 if (Char == '"')
3771 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3772 tok::utf16_string_literal);
3773
3774 // UTF-16 character constant
3775 if (Char == '\'')
3776 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3777 tok::utf16_char_constant);
3778
3779 // UTF-16 raw string literal
3780 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3781 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3782 return LexRawStringLiteral(Result,
3783 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3784 SizeTmp2, Result),
3785 tok::utf16_string_literal);
3786
3787 if (Char == '8') {
3788 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3789
3790 // UTF-8 string literal
3791 if (Char2 == '"')
3792 return LexStringLiteral(Result,
3793 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3794 SizeTmp2, Result),
3795 tok::utf8_string_literal);
3796 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3797 return LexCharConstant(
3798 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3799 SizeTmp2, Result),
3800 tok::utf8_char_constant);
3801
3802 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3803 unsigned SizeTmp3;
3804 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3805 // UTF-8 raw string literal
3806 if (Char3 == '"') {
3807 return LexRawStringLiteral(Result,
3808 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3809 SizeTmp2, Result),
3810 SizeTmp3, Result),
3811 tok::utf8_string_literal);
3812 }
3813 }
3814 }
3815 }
3816
3817 // treat u like the start of an identifier.
3818 return LexIdentifierContinue(Result, CurPtr);
3819
3820 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3821 // Notify MIOpt that we read a non-whitespace/non-comment token.
3822 MIOpt.ReadToken();
3823
3824 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3825 Char = getCharAndSize(CurPtr, SizeTmp);
3826
3827 // UTF-32 string literal
3828 if (Char == '"')
3829 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3830 tok::utf32_string_literal);
3831
3832 // UTF-32 character constant
3833 if (Char == '\'')
3834 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3835 tok::utf32_char_constant);
3836
3837 // UTF-32 raw string literal
3838 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3839 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3840 return LexRawStringLiteral(Result,
3841 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3842 SizeTmp2, Result),
3843 tok::utf32_string_literal);
3844 }
3845
3846 // treat U like the start of an identifier.
3847 return LexIdentifierContinue(Result, CurPtr);
3848
3849 case 'R': // Identifier or C++0x raw string literal
3850 // Notify MIOpt that we read a non-whitespace/non-comment token.
3851 MIOpt.ReadToken();
3852
3853 if (LangOpts.CPlusPlus11) {
3854 Char = getCharAndSize(CurPtr, SizeTmp);
3855
3856 if (Char == '"')
3857 return LexRawStringLiteral(Result,
3858 ConsumeChar(CurPtr, SizeTmp, Result),
3859 tok::string_literal);
3860 }
3861
3862 // treat R like the start of an identifier.
3863 return LexIdentifierContinue(Result, CurPtr);
3864
3865 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3866 // Notify MIOpt that we read a non-whitespace/non-comment token.
3867 MIOpt.ReadToken();
3868 Char = getCharAndSize(CurPtr, SizeTmp);
3869
3870 // Wide string literal.
3871 if (Char == '"')
3872 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3873 tok::wide_string_literal);
3874
3875 // Wide raw string literal.
3876 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3877 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3878 return LexRawStringLiteral(Result,
3879 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3880 SizeTmp2, Result),
3881 tok::wide_string_literal);
3882
3883 // Wide character constant.
3884 if (Char == '\'')
3885 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3886 tok::wide_char_constant);
3887 // FALL THROUGH, treating L like the start of an identifier.
3888 [[fallthrough]];
3889
3890 // C99 6.4.2: Identifiers.
3891 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3892 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3893 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3894 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3895 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3896 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3897 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3898 case 'v': case 'w': case 'x': case 'y': case 'z':
3899 case '_':
3900 // Notify MIOpt that we read a non-whitespace/non-comment token.
3901 MIOpt.ReadToken();
3902 return LexIdentifierContinue(Result, CurPtr);
3903
3904 case '$': // $ in identifiers.
3905 if (LangOpts.DollarIdents) {
3906 if (!isLexingRawMode())
3907 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3908 // Notify MIOpt that we read a non-whitespace/non-comment token.
3909 MIOpt.ReadToken();
3910 return LexIdentifierContinue(Result, CurPtr);
3911 }
3912
3913 Kind = tok::unknown;
3914 break;
3915
3916 // C99 6.4.4: Character Constants.
3917 case '\'':
3918 // Notify MIOpt that we read a non-whitespace/non-comment token.
3919 MIOpt.ReadToken();
3920 return LexCharConstant(Result, CurPtr, tok::char_constant);
3921
3922 // C99 6.4.5: String Literals.
3923 case '"':
3924 // Notify MIOpt that we read a non-whitespace/non-comment token.
3925 MIOpt.ReadToken();
3926 return LexStringLiteral(Result, CurPtr,
3927 ParsingFilename ? tok::header_name
3928 : tok::string_literal);
3929
3930 // C99 6.4.6: Punctuators.
3931 case '?':
3932 Kind = tok::question;
3933 break;
3934 case '[':
3935 Kind = tok::l_square;
3936 break;
3937 case ']':
3938 Kind = tok::r_square;
3939 break;
3940 case '(':
3941 Kind = tok::l_paren;
3942 break;
3943 case ')':
3944 Kind = tok::r_paren;
3945 break;
3946 case '{':
3947 Kind = tok::l_brace;
3948 break;
3949 case '}':
3950 Kind = tok::r_brace;
3951 break;
3952 case '.':
3953 Char = getCharAndSize(CurPtr, SizeTmp);
3954 if (Char >= '0' && Char <= '9') {
3955 // Notify MIOpt that we read a non-whitespace/non-comment token.
3956 MIOpt.ReadToken();
3957
3958 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3959 } else if (LangOpts.CPlusPlus && Char == '*') {
3960 Kind = tok::periodstar;
3961 CurPtr += SizeTmp;
3962 } else if (Char == '.' &&
3963 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3964 Kind = tok::ellipsis;
3965 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3966 SizeTmp2, Result);
3967 } else {
3968 Kind = tok::period;
3969 }
3970 break;
3971 case '&':
3972 Char = getCharAndSize(CurPtr, SizeTmp);
3973 if (Char == '&') {
3974 Kind = tok::ampamp;
3975 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3976 } else if (Char == '=') {
3977 Kind = tok::ampequal;
3978 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3979 } else {
3980 Kind = tok::amp;
3981 }
3982 break;
3983 case '*':
3984 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3985 Kind = tok::starequal;
3986 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3987 } else {
3988 Kind = tok::star;
3989 }
3990 break;
3991 case '+':
3992 Char = getCharAndSize(CurPtr, SizeTmp);
3993 if (Char == '+') {
3994 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3995 Kind = tok::plusplus;
3996 } else if (Char == '=') {
3997 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3998 Kind = tok::plusequal;
3999 } else {
4000 Kind = tok::plus;
4001 }
4002 break;
4003 case '-':
4004 Char = getCharAndSize(CurPtr, SizeTmp);
4005 if (Char == '-') { // --
4006 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4007 Kind = tok::minusminus;
4008 } else if (Char == '>' && LangOpts.CPlusPlus &&
4009 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4010 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4011 SizeTmp2, Result);
4012 Kind = tok::arrowstar;
4013 } else if (Char == '>') { // ->
4014 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4015 Kind = tok::arrow;
4016 } else if (Char == '=') { // -=
4017 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4018 Kind = tok::minusequal;
4019 } else {
4020 Kind = tok::minus;
4021 }
4022 break;
4023 case '~':
4024 Kind = tok::tilde;
4025 break;
4026 case '!':
4027 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4028 Kind = tok::exclaimequal;
4029 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4030 } else {
4031 Kind = tok::exclaim;
4032 }
4033 break;
4034 case '/':
4035 // 6.4.9: Comments
4036 Char = getCharAndSize(CurPtr, SizeTmp);
4037 if (Char == '/') { // Line comment.
4038 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4039 // want to lex this as a comment. There is one problem with this though,
4040 // that in one particular corner case, this can change the behavior of the
4041 // resultant program. For example, In "foo //**/ bar", C89 would lex
4042 // this as "foo / bar" and languages with Line comments would lex it as
4043 // "foo". Check to see if the character after the second slash is a '*'.
4044 // If so, we will lex that as a "/" instead of the start of a comment.
4045 // However, we never do this if we are just preprocessing.
4046 bool TreatAsComment =
4047 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4048 if (!TreatAsComment)
4049 if (!(PP && PP->isPreprocessedOutput()))
4050 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4051
4052 if (TreatAsComment) {
4053 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4054 TokAtPhysicalStartOfLine))
4055 return true; // There is a token to return.
4056
4057 // It is common for the tokens immediately after a // comment to be
4058 // whitespace (indentation for the next line). Instead of going through
4059 // the big switch, handle it efficiently now.
4060 goto SkipIgnoredUnits;
4061 }
4062 }
4063
4064 if (Char == '*') { // /**/ comment.
4065 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4066 TokAtPhysicalStartOfLine))
4067 return true; // There is a token to return.
4068
4069 // We only saw whitespace, so just try again with this lexer.
4070 // (We manually eliminate the tail call to avoid recursion.)
4071 goto LexNextToken;
4072 }
4073
4074 if (Char == '=') {
4075 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4076 Kind = tok::slashequal;
4077 } else {
4078 Kind = tok::slash;
4079 }
4080 break;
4081 case '%':
4082 Char = getCharAndSize(CurPtr, SizeTmp);
4083 if (Char == '=') {
4084 Kind = tok::percentequal;
4085 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4086 } else if (LangOpts.Digraphs && Char == '>') {
4087 Kind = tok::r_brace; // '%>' -> '}'
4088 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4089 } else if (LangOpts.Digraphs && Char == ':') {
4090 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4091 Char = getCharAndSize(CurPtr, SizeTmp);
4092 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4093 Kind = tok::hashhash; // '%:%:' -> '##'
4094 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4095 SizeTmp2, Result);
4096 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4097 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4098 if (!isLexingRawMode())
4099 Diag(BufferPtr, diag::ext_charize_microsoft);
4100 Kind = tok::hashat;
4101 } else { // '%:' -> '#'
4102 // We parsed a # character. If this occurs at the start of the line,
4103 // it's actually the start of a preprocessing directive. Callback to
4104 // the preprocessor to handle it.
4105 // TODO: -fpreprocessed mode??
4106 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4107 goto HandleDirective;
4108
4109 Kind = tok::hash;
4110 }
4111 } else {
4112 Kind = tok::percent;
4113 }
4114 break;
4115 case '<':
4116 Char = getCharAndSize(CurPtr, SizeTmp);
4117 if (ParsingFilename) {
4118 return LexAngledStringLiteral(Result, CurPtr);
4119 } else if (Char == '<') {
4120 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4121 if (After == '=') {
4122 Kind = tok::lesslessequal;
4123 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4124 SizeTmp2, Result);
4125 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4126 // If this is actually a '<<<<<<<' version control conflict marker,
4127 // recognize it as such and recover nicely.
4128 goto LexNextToken;
4129 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4130 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4131 // ignore it.
4132 goto LexNextToken;
4133 } else if (LangOpts.CUDA && After == '<') {
4134 Kind = tok::lesslessless;
4135 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4136 SizeTmp2, Result);
4137 } else {
4138 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4139 Kind = tok::lessless;
4140 }
4141 } else if (Char == '=') {
4142 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4143 if (After == '>') {
4144 if (LangOpts.CPlusPlus20) {
4145 if (!isLexingRawMode())
4146 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4147 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4148 SizeTmp2, Result);
4149 Kind = tok::spaceship;
4150 break;
4151 }
4152 // Suggest adding a space between the '<=' and the '>' to avoid a
4153 // change in semantics if this turns up in C++ <=17 mode.
4154 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4155 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4157 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4158 }
4159 }
4160 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4161 Kind = tok::lessequal;
4162 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4163 if (LangOpts.CPlusPlus11 &&
4164 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4165 // C++0x [lex.pptoken]p3:
4166 // Otherwise, if the next three characters are <:: and the subsequent
4167 // character is neither : nor >, the < is treated as a preprocessor
4168 // token by itself and not as the first character of the alternative
4169 // token <:.
4170 unsigned SizeTmp3;
4171 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4172 if (After != ':' && After != '>') {
4173 Kind = tok::less;
4174 if (!isLexingRawMode())
4175 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4176 break;
4177 }
4178 }
4179
4180 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4181 Kind = tok::l_square;
4182 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4183 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4184 Kind = tok::l_brace;
4185 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4186 lexEditorPlaceholder(Result, CurPtr)) {
4187 return true;
4188 } else {
4189 Kind = tok::less;
4190 }
4191 break;
4192 case '>':
4193 Char = getCharAndSize(CurPtr, SizeTmp);
4194 if (Char == '=') {
4195 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4196 Kind = tok::greaterequal;
4197 } else if (Char == '>') {
4198 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4199 if (After == '=') {
4200 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4201 SizeTmp2, Result);
4202 Kind = tok::greatergreaterequal;
4203 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4204 // If this is actually a '>>>>' conflict marker, recognize it as such
4205 // and recover nicely.
4206 goto LexNextToken;
4207 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4208 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4209 goto LexNextToken;
4210 } else if (LangOpts.CUDA && After == '>') {
4211 Kind = tok::greatergreatergreater;
4212 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4213 SizeTmp2, Result);
4214 } else {
4215 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4216 Kind = tok::greatergreater;
4217 }
4218 } else {
4219 Kind = tok::greater;
4220 }
4221 break;
4222 case '^':
4223 Char = getCharAndSize(CurPtr, SizeTmp);
4224 if (Char == '=') {
4225 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4226 Kind = tok::caretequal;
4227 } else if (LangOpts.OpenCL && Char == '^') {
4228 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4229 Kind = tok::caretcaret;
4230 } else {
4231 Kind = tok::caret;
4232 }
4233 break;
4234 case '|':
4235 Char = getCharAndSize(CurPtr, SizeTmp);
4236 if (Char == '=') {
4237 Kind = tok::pipeequal;
4238 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4239 } else if (Char == '|') {
4240 // If this is '|||||||' and we're in a conflict marker, ignore it.
4241 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4242 goto LexNextToken;
4243 Kind = tok::pipepipe;
4244 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4245 } else {
4246 Kind = tok::pipe;
4247 }
4248 break;
4249 case ':':
4250 Char = getCharAndSize(CurPtr, SizeTmp);
4251 if (LangOpts.Digraphs && Char == '>') {
4252 Kind = tok::r_square; // ':>' -> ']'
4253 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4254 } else if (Char == ':') {
4255 Kind = tok::coloncolon;
4256 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4257 } else {
4258 Kind = tok::colon;
4259 }
4260 break;
4261 case ';':
4262 Kind = tok::semi;
4263 break;
4264 case '=':
4265 Char = getCharAndSize(CurPtr, SizeTmp);
4266 if (Char == '=') {
4267 // If this is '====' and we're in a conflict marker, ignore it.
4268 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4269 goto LexNextToken;
4270
4271 Kind = tok::equalequal;
4272 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4273 } else {
4274 Kind = tok::equal;
4275 }
4276 break;
4277 case ',':
4278 Kind = tok::comma;
4279 break;
4280 case '#':
4281 Char = getCharAndSize(CurPtr, SizeTmp);
4282 if (Char == '#') {
4283 Kind = tok::hashhash;
4284 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4285 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4286 Kind = tok::hashat;
4287 if (!isLexingRawMode())
4288 Diag(BufferPtr, diag::ext_charize_microsoft);
4289 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4290 } else {
4291 // We parsed a # character. If this occurs at the start of the line,
4292 // it's actually the start of a preprocessing directive. Callback to
4293 // the preprocessor to handle it.
4294 // TODO: -fpreprocessed mode??
4295 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4296 goto HandleDirective;
4297
4298 Kind = tok::hash;
4299 }
4300 break;
4301
4302 case '@':
4303 // Objective C support.
4304 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4305 Kind = tok::at;
4306 else
4307 Kind = tok::unknown;
4308 break;
4309
4310 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4311 case '\\':
4312 if (!LangOpts.AsmPreprocessor) {
4313 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4314 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4315 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4316 return true; // KeepWhitespaceMode
4317
4318 // We only saw whitespace, so just try again with this lexer.
4319 // (We manually eliminate the tail call to avoid recursion.)
4320 goto LexNextToken;
4321 }
4322
4323 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4324 }
4325 }
4326
4327 Kind = tok::unknown;
4328 break;
4329
4330 default: {
4331 if (isASCII(Char)) {
4332 Kind = tok::unknown;
4333 break;
4334 }
4335
4336 llvm::UTF32 CodePoint;
4337
4338 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4339 // an escaped newline.
4340 --CurPtr;
4341 llvm::ConversionResult Status =
4342 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4343 (const llvm::UTF8 *)BufferEnd,
4344 &CodePoint,
4345 llvm::strictConversion);
4346 if (Status == llvm::conversionOK) {
4347 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4348 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4349 return true; // KeepWhitespaceMode
4350
4351 // We only saw whitespace, so just try again with this lexer.
4352 // (We manually eliminate the tail call to avoid recursion.)
4353 goto LexNextToken;
4354 }
4355 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4356 }
4357
4360 ++CurPtr;
4361 Kind = tok::unknown;
4362 break;
4363 }
4364
4365 // Non-ASCII characters tend to creep into source code unintentionally.
4366 // Instead of letting the parser complain about the unknown token,
4367 // just diagnose the invalid UTF-8, then drop the character.
4368 Diag(CurPtr, diag::err_invalid_utf8);
4369
4370 BufferPtr = CurPtr+1;
4371 // We're pretending the character didn't exist, so just try again with
4372 // this lexer.
4373 // (We manually eliminate the tail call to avoid recursion.)
4374 goto LexNextToken;
4375 }
4376 }
4377
4378 // Notify MIOpt that we read a non-whitespace/non-comment token.
4379 MIOpt.ReadToken();
4380
4381 // Update the location of token as well as BufferPtr.
4382 FormTokenWithChars(Result, CurPtr, Kind);
4383 return true;
4384
4385HandleDirective:
4386 // We parsed a # character and it's the start of a preprocessing directive.
4387
4388 FormTokenWithChars(Result, CurPtr, tok::hash);
4390
4392 // With a fatal failure in the module loader, we abort parsing.
4393 return true;
4394
4395 // We parsed the directive; lex a token with the new state.
4396 return false;
4397
4398LexNextToken:
4399 Result.clearFlag(Token::NeedsCleaning);
4400 goto LexStart;
4401}
4402
4403const char *Lexer::convertDependencyDirectiveToken(
4405 const char *TokPtr = BufferStart + DDTok.Offset;
4406 Result.startToken();
4407 Result.setLocation(getSourceLocation(TokPtr));
4408 Result.setKind(DDTok.Kind);
4409 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4410 Result.setLength(DDTok.Length);
4411 BufferPtr = TokPtr + DDTok.Length;
4412 return TokPtr;
4413}
4414
4415bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4416 assert(isDependencyDirectivesLexer());
4417
4418 using namespace dependency_directives_scan;
4419
4420 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4421 if (DepDirectives.front().Kind == pp_eof)
4422 return LexEndOfFile(Result, BufferEnd);
4423 if (DepDirectives.front().Kind == tokens_present_before_eof)
4424 MIOpt.ReadToken();
4425 NextDepDirectiveTokenIndex = 0;
4426 DepDirectives = DepDirectives.drop_front();
4427 }
4428
4430 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4431 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4432 // Read something other than a preprocessor directive hash.
4433 MIOpt.ReadToken();
4434 }
4435
4436 if (ParsingFilename && DDTok.is(tok::less)) {
4437 BufferPtr = BufferStart + DDTok.Offset;
4438 LexAngledStringLiteral(Result, BufferPtr + 1);
4439 if (Result.isNot(tok::header_name))
4440 return true;
4441 // Advance the index of lexed tokens.
4442 while (true) {
4443 const dependency_directives_scan::Token &NextTok =
4444 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4445 if (BufferStart + NextTok.Offset >= BufferPtr)
4446 break;
4447 ++NextDepDirectiveTokenIndex;
4448 }
4449 return true;
4450 }
4451
4452 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4453
4454 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4456 return false;
4457 }
4458 if (Result.is(tok::raw_identifier)) {
4459 Result.setRawIdentifierData(TokPtr);
4460 if (!isLexingRawMode()) {
4462 if (II->isHandleIdentifierCase())
4463 return PP->HandleIdentifier(Result);
4464 }
4465 return true;
4466 }
4467 if (Result.isLiteral()) {
4468 Result.setLiteralData(TokPtr);
4469 return true;
4470 }
4471 if (Result.is(tok::colon)) {
4472 // Convert consecutive colons to 'tok::coloncolon'.
4473 if (*BufferPtr == ':') {
4474 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4475 tok::colon));
4476 ++NextDepDirectiveTokenIndex;
4477 Result.setKind(tok::coloncolon);
4478 }
4479 return true;
4480 }
4481 if (Result.is(tok::eod))
4483
4484 return true;
4485}
4486
4487bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4488 assert(isDependencyDirectivesLexer());
4489
4490 using namespace dependency_directives_scan;
4491
4492 bool Stop = false;
4493 unsigned NestedIfs = 0;
4494 do {
4495 DepDirectives = DepDirectives.drop_front();
4496 switch (DepDirectives.front().Kind) {
4497 case pp_none:
4498 llvm_unreachable("unexpected 'pp_none'");
4499 case pp_include:
4501 case pp_define:
4502 case pp_undef:
4503 case pp_import:
4504 case pp_pragma_import:
4505 case pp_pragma_once:
4510 case pp_include_next:
4511 case decl_at_import:
4512 case cxx_module_decl:
4513 case cxx_import_decl:
4517 break;
4518 case pp_if:
4519 case pp_ifdef:
4520 case pp_ifndef:
4521 ++NestedIfs;
4522 break;
4523 case pp_elif:
4524 case pp_elifdef:
4525 case pp_elifndef:
4526 case pp_else:
4527 if (!NestedIfs) {
4528 Stop = true;
4529 }
4530 break;
4531 case pp_endif:
4532 if (!NestedIfs) {
4533 Stop = true;
4534 } else {
4535 --NestedIfs;
4536 }
4537 break;
4538 case pp_eof:
4539 NextDepDirectiveTokenIndex = 0;
4540 return LexEndOfFile(Result, BufferEnd);
4541 }
4542 } while (!Stop);
4543
4545 DepDirectives.front().Tokens.front();
4546 assert(DDTok.is(tok::hash));
4547 NextDepDirectiveTokenIndex = 1;
4548
4549 convertDependencyDirectiveToken(DDTok, Result);
4550 return false;
4551}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:80
Defines the Diagnostic-related interfaces.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:900
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1491
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1685
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1213
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:276
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3142
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1610
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:512
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:236
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1141
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1505
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1575
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1472
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1559
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3246
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1478
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:872
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2658
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1194
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1533
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1581
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:495
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1266
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1542
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:911
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:134
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:123
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:97
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:83
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:976
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1310
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:229
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1012
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:2966
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:824
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1184
static char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:580
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:742
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:135
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:846
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:907
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1090
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1165
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1110
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:403
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1086
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:450
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:560
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:171
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1059
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:194
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:588
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:461
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1277
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:802
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:261
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:128
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the token's IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:186
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:303
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:115
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:131
unsigned getLength() const
Definition: Token.h:134
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:66
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:98
tok::TokenKind getKind() const
Definition: Token.h:93
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:275
@ HasUCN
Definition: Token.h:82
@ IsEditorPlaceholder
Definition: Token.h:87
@ LeadingEmptyMacro
Definition: Token.h:80
@ LeadingSpace
Definition: Token.h:76
@ StartOfLine
Definition: Token.h:74
@ HasUDSuffix
Definition: Token.h:81
@ NeedsCleaning
Definition: Token.h:79
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:120
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:57
void startToken()
Reset all flags to cleared.
Definition: Token.h:176
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:292
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:212
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:224
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:243
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4204
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3023
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3652
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:42
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:84
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:48
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:76
@ C
Languages that the frontend can parse and compile.
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:160
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:93
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition: CharInfo.h:153
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c, bool AllowDollar=false)
Returns true if this is a body character of a C identifier, which is [a-zA-Z0-9_].
Definition: CharInfo.h:64
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:54
Definition: Format.h:5078
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.