clang 20.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include <algorithm>
40#include <cassert>
41#include <cstddef>
42#include <cstdint>
43#include <cstring>
44#include <optional>
45#include <string>
46#include <tuple>
47#include <utility>
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55//===----------------------------------------------------------------------===//
56// Token Class Implementation
57//===----------------------------------------------------------------------===//
58
59/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
61 if (isAnnotation())
62 return false;
63 if (const IdentifierInfo *II = getIdentifierInfo())
64 return II->getObjCKeywordID() == objcKey;
65 return false;
66}
67
68/// getObjCKeywordID - Return the ObjC keyword kind.
70 if (isAnnotation())
71 return tok::objc_not_keyword;
72 const IdentifierInfo *specId = getIdentifierInfo();
73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
74}
75
/// Determine whether the token kind starts a simple-type-specifier.
bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
  switch (getKind()) {
  // Annotation tokens that already denote a resolved type always qualify.
  case tok::annot_typename:
  case tok::annot_decltype:
  case tok::annot_pack_indexing_type:
    return true;

  case tok::kw_short:
  case tok::kw_long:
  case tok::kw___int64:
  case tok::kw___int128:
  case tok::kw_signed:
  case tok::kw_unsigned:
  case tok::kw_void:
  case tok::kw_char:
  case tok::kw_int:
  case tok::kw_half:
  case tok::kw_float:
  case tok::kw_double:
  case tok::kw___bf16:
  case tok::kw__Float16:
  case tok::kw___float128:
  case tok::kw___ibm128:
  case tok::kw_wchar_t:
  case tok::kw_bool:
  case tok::kw__Bool:
  case tok::kw__Accum:
  case tok::kw__Fract:
  case tok::kw__Sat:
#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
#include "clang/Basic/TransformTypeTraits.def"
  case tok::kw___auto_type:
  case tok::kw_char16_t:
  case tok::kw_char32_t:
  case tok::kw_typeof:
  case tok::kw_decltype:
  case tok::kw_char8_t:
    // A type keyword only counts if it is actually a keyword under the
    // current language options (e.g. 'char8_t' only in newer C++ modes).
    return getIdentifierInfo()->isKeyword(LangOpts);

  default:
    return false;
  }
}
120
121//===----------------------------------------------------------------------===//
122// Lexer Class Implementation
123//===----------------------------------------------------------------------===//
124
125void Lexer::anchor() {}
126
127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
128 const char *BufEnd) {
129 BufferStart = BufStart;
130 BufferPtr = BufPtr;
131 BufferEnd = BufEnd;
132
133 assert(BufEnd[0] == 0 &&
134 "We assume that the input buffer has a null character at the end"
135 " to simplify lexing!");
136
137 // Check whether we have a BOM in the beginning of the buffer. If yes - act
138 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
139 // skip the UTF-8 BOM if it's present.
140 if (BufferStart == BufferPtr) {
141 // Determine the size of the BOM.
142 StringRef Buf(BufferStart, BufferEnd - BufferStart);
143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
144 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
145 .Default(0);
146
147 // Skip the BOM.
148 BufferPtr += BOMLength;
149 }
150
151 Is_PragmaLexer = false;
152 CurrentConflictMarkerState = CMK_None;
153
154 // Start of the file is a start of line.
155 IsAtStartOfLine = true;
156 IsAtPhysicalStartOfLine = true;
157
158 HasLeadingSpace = false;
159 HasLeadingEmptyMacro = false;
160
161 // We are not after parsing a #.
163
164 // We are not after parsing #include.
165 ParsingFilename = false;
166
167 // We are not in raw mode. Raw mode disables diagnostics and interpretation
168 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
169 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
170 // or otherwise skipping over tokens.
171 LexingRawMode = false;
172
173 // Default to not keeping comments.
174 ExtendedTokenMode = 0;
175
176 NewLinePtr = nullptr;
177}
178
179/// Lexer constructor - Create a new lexer object for the specified buffer
180/// with the specified preprocessor managing the lexing process. This lexer
181/// assumes that the associated file buffer and Preprocessor objects will
182/// outlive it, so it doesn't take ownership of either of them.
183Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
184 Preprocessor &PP, bool IsFirstIncludeOfFile)
185 : PreprocessorLexer(&PP, FID),
186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
187 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
190 InputFile.getBufferEnd());
191
193}
194
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode: no preprocessor is attached, so diagnostics and
  // token interpretation (e.g. macro expansion) are disabled.
  LexingRawMode = true;
}
208
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
// Convenience overload: delegates to the raw-mode constructor above using the
// whole memory buffer of the given FileID.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
218
220 assert(PP && "Cannot reset token mode without a preprocessor");
221 if (LangOpts.TraditionalCPP)
223 else
225}
226
227/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
228/// _Pragma expansion. This has a variety of magic semantics that this method
229/// sets up. It returns a new'd Lexer that must be delete'd when done.
230///
231/// On entrance to this routine, TokStartLoc is a macro location which has a
232/// spelling loc that indicates the bytes to be lexed for the token and an
233/// expansion location that indicates where all lexed tokens should be
234/// "expanded from".
235///
236/// TODO: It would really be nice to make _Pragma just be a wrapper around a
237/// normal lexer that remaps tokens as they fly by. This would require making
238/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
239/// interface that could handle this stuff. This would pull GetMappedTokenLoc
240/// out of the critical path of the lexer!
241///
243 SourceLocation ExpansionLocStart,
244 SourceLocation ExpansionLocEnd,
245 unsigned TokLen, Preprocessor &PP) {
247
248 // Create the lexer as if we were going to lex the file normally.
249 FileID SpellingFID = SM.getFileID(SpellingLoc);
250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
252
253 // Now that the lexer is created, change the start/end locations so that we
254 // just lex the subsection of the file that we want. This is lexing from a
255 // scratch buffer.
256 const char *StrData = SM.getCharacterData(SpellingLoc);
257
258 L->BufferPtr = StrData;
259 L->BufferEnd = StrData+TokLen;
260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
261
262 // Set the SourceLocation with the remapping information. This ensures that
263 // GetMappedTokenLoc will remap the tokens as they are lexed.
264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
265 ExpansionLocStart,
266 ExpansionLocEnd, TokLen);
267
268 // Ensure that the lexer thinks it is inside a directive, so that end \n will
269 // return an EOD token.
271
272 // This lexer really is for _Pragma.
273 L->Is_PragmaLexer = true;
274 return L;
275}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
284template <typename T> static void StringifyImpl(T &Str, char Quote) {
285 typename T::size_type i = 0, e = Str.size();
286 while (i < e) {
287 if (Str[i] == '\\' || Str[i] == Quote) {
288 Str.insert(Str.begin() + i, '\\');
289 i += 2;
290 ++e;
291 } else if (Str[i] == '\n' || Str[i] == '\r') {
292 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
293 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
294 Str[i] != Str[i + 1]) {
295 Str[i] = '\\';
296 Str[i + 1] = 'n';
297 } else {
298 // Replace '\n' and '\r' to '\\' followed by 'n'.
299 Str[i] = '\\';
300 Str.insert(Str.begin() + i + 1, 'n');
301 ++e;
302 }
303 i += 2;
304 } else
305 ++i;
306 }
307}
308
309std::string Lexer::Stringify(StringRef Str, bool Charify) {
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
312 StringifyImpl(Result, Quote);
313 return Result;
314}
315
317
318//===----------------------------------------------------------------------===//
319// Token Spelling
320//===----------------------------------------------------------------------===//
321
/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remainder one cleaned character at a time; this folds escaped
  // newlines and expands trigraphs.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  // Cleaning must have shrunk the token; otherwise needsCleaning was wrong.
  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}
372
373/// getSpelling() - Return the 'spelling' of this token. The spelling of a
374/// token are the characters used to represent the token in the source file
375/// after trigraph expansion and escaped-newline folding. In particular, this
376/// wants to get the true, uncanonicalized, spelling of things like digraphs
377/// UCNs, etc.
379 SmallVectorImpl<char> &buffer,
380 const SourceManager &SM,
381 const LangOptions &options,
382 bool *invalid) {
383 // Break down the source location.
384 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
385
386 // Try to the load the file buffer.
387 bool invalidTemp = false;
388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
389 if (invalidTemp) {
390 if (invalid) *invalid = true;
391 return {};
392 }
393
394 const char *tokenBegin = file.data() + locInfo.second;
395
396 // Lex from the start of the given location.
397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
398 file.begin(), tokenBegin, file.end());
399 Token token;
400 lexer.LexFromRawLexer(token);
401
402 unsigned length = token.getLength();
403
404 // Common case: no need for cleaning.
405 if (!token.needsCleaning())
406 return StringRef(tokenBegin, length);
407
408 // Hard case, we need to relex the characters into the string.
409 buffer.resize(length);
410 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
411 return StringRef(buffer.data(), buffer.size());
412}
413
414/// getSpelling() - Return the 'spelling' of this token. The spelling of a
415/// token are the characters used to represent the token in the source file
416/// after trigraph expansion and escaped-newline folding. In particular, this
417/// wants to get the true, uncanonicalized, spelling of things like digraphs
418/// UCNs, etc.
419std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
420 const LangOptions &LangOpts, bool *Invalid) {
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
425 &CharDataInvalid);
426 if (Invalid)
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431 // If this token contains nothing interesting, return it directly.
432 if (!Tok.needsCleaning())
433 return std::string(TokStart, TokStart + Tok.getLength());
434
435 std::string Result;
436 Result.resize(Tok.getLength());
437 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
438 return Result;
439}
440
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      // Report failure with an empty but valid buffer.
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}
493
494/// MeasureTokenLength - Relex the token at the specified location and return
495/// its length in bytes in the input file. If the token needs cleaning (e.g.
496/// includes a trigraph or an escaped newline) then this count includes bytes
497/// that are part of that.
499 const SourceManager &SM,
500 const LangOptions &LangOpts) {
501 Token TheTok;
502 if (getRawToken(Loc, TheTok, SM, LangOpts))
503 return 0;
504 return TheTok.getLength();
505}
506
507/// Relex the token at the specified location.
508/// \returns true if there was a failure, false on success.
510 const SourceManager &SM,
511 const LangOptions &LangOpts,
512 bool IgnoreWhiteSpace) {
513 // TODO: this could be special cased for common tokens like identifiers, ')',
514 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
515 // all obviously single-char tokens. This could use
516 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
517 // something.
518
519 // If this comes from a macro expansion, we really do want the macro name, not
520 // the token this macro expanded to.
521 Loc = SM.getExpansionLoc(Loc);
522 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
523 bool Invalid = false;
524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
525 if (Invalid)
526 return true;
527
528 const char *StrData = Buffer.data()+LocInfo.second;
529
530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
531 return true;
532
533 // Create a lexer starting at the beginning of this token.
534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
535 Buffer.begin(), StrData, Buffer.end());
536 TheLexer.SetCommentRetentionState(true);
537 TheLexer.LexFromRawLexer(Result);
538 return false;
539}
540
541/// Returns the pointer that points to the beginning of line that contains
542/// the given offset, or null if the offset if invalid.
543static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
550 if (isVerticalWhitespace(LexStart[0]) &&
551 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
552 // LexStart should point at first character of logical line.
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
561 const SourceManager &SM,
562 const LangOptions &LangOpts) {
563 assert(Loc.isFileID());
564 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
565 if (LocInfo.first.isInvalid())
566 return Loc;
567
568 bool Invalid = false;
569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
570 if (Invalid)
571 return Loc;
572
573 // Back up from the current location until we hit the beginning of a line
574 // (or the buffer). We'll relex from that point.
575 const char *StrData = Buffer.data() + LocInfo.second;
576 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
577 if (!LexStart || LexStart == StrData)
578 return Loc;
579
580 // Create a lexer starting at the beginning of this token.
581 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
583 Buffer.end());
584 TheLexer.SetCommentRetentionState(true);
585
586 // Lex tokens until we find the token that contains the source location.
587 Token TheTok;
588 do {
589 TheLexer.LexFromRawLexer(TheTok);
590
591 if (TheLexer.getBufferLocation() > StrData) {
592 // Lexing this token has taken the lexer past the source location we're
593 // looking for. If the current token encompasses our source location,
594 // return the beginning of that token.
595 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
596 return TheTok.getLocation();
597
598 // We ended up skipping over the source location entirely, which means
599 // that it points into whitespace. We're done here.
600 break;
601 }
602 } while (TheTok.getKind() != tok::eof);
603
604 // We've passed our source location; just return the original source location.
605 return Loc;
606}
607
609 const SourceManager &SM,
610 const LangOptions &LangOpts) {
611 if (Loc.isFileID())
612 return getBeginningOfFileToken(Loc, SM, LangOpts);
613
614 if (!SM.isMacroArgExpansion(Loc))
615 return Loc;
616
617 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
618 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
619 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
620 std::pair<FileID, unsigned> BeginFileLocInfo =
621 SM.getDecomposedLoc(BeginFileLoc);
622 assert(FileLocInfo.first == BeginFileLocInfo.first &&
623 FileLocInfo.second >= BeginFileLocInfo.second);
624 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
625}
626
namespace {

/// Classification of a preprocessor directive encountered while computing a
/// preamble (see ComputePreamble below).
enum PreambleDirectiveKind {
  // Directive is benign in a preamble and was skipped.
  PDK_Skipped,
  // Unrecognized directive; preamble scanning must stop at it.
  PDK_Unknown
};

} // namespace
635
637 const LangOptions &LangOpts,
638 unsigned MaxLines) {
639 // Create a lexer starting at the beginning of the file. Note that we use a
640 // "fake" file source location at offset 1 so that the lexer will track our
641 // position within the file.
642 const SourceLocation::UIntTy StartOffset = 1;
644 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
645 Buffer.end());
646 TheLexer.SetCommentRetentionState(true);
647
648 bool InPreprocessorDirective = false;
649 Token TheTok;
650 SourceLocation ActiveCommentLoc;
651
652 unsigned MaxLineOffset = 0;
653 if (MaxLines) {
654 const char *CurPtr = Buffer.begin();
655 unsigned CurLine = 0;
656 while (CurPtr != Buffer.end()) {
657 char ch = *CurPtr++;
658 if (ch == '\n') {
659 ++CurLine;
660 if (CurLine == MaxLines)
661 break;
662 }
663 }
664 if (CurPtr != Buffer.end())
665 MaxLineOffset = CurPtr - Buffer.begin();
666 }
667
668 do {
669 TheLexer.LexFromRawLexer(TheTok);
670
671 if (InPreprocessorDirective) {
672 // If we've hit the end of the file, we're done.
673 if (TheTok.getKind() == tok::eof) {
674 break;
675 }
676
677 // If we haven't hit the end of the preprocessor directive, skip this
678 // token.
679 if (!TheTok.isAtStartOfLine())
680 continue;
681
682 // We've passed the end of the preprocessor directive, and will look
683 // at this token again below.
684 InPreprocessorDirective = false;
685 }
686
687 // Keep track of the # of lines in the preamble.
688 if (TheTok.isAtStartOfLine()) {
689 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
690
691 // If we were asked to limit the number of lines in the preamble,
692 // and we're about to exceed that limit, we're done.
693 if (MaxLineOffset && TokOffset >= MaxLineOffset)
694 break;
695 }
696
697 // Comments are okay; skip over them.
698 if (TheTok.getKind() == tok::comment) {
699 if (ActiveCommentLoc.isInvalid())
700 ActiveCommentLoc = TheTok.getLocation();
701 continue;
702 }
703
704 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
705 // This is the start of a preprocessor directive.
706 Token HashTok = TheTok;
707 InPreprocessorDirective = true;
708 ActiveCommentLoc = SourceLocation();
709
710 // Figure out which directive this is. Since we're lexing raw tokens,
711 // we don't have an identifier table available. Instead, just look at
712 // the raw identifier to recognize and categorize preprocessor directives.
713 TheLexer.LexFromRawLexer(TheTok);
714 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
715 StringRef Keyword = TheTok.getRawIdentifier();
716 PreambleDirectiveKind PDK
717 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
718 .Case("include", PDK_Skipped)
719 .Case("__include_macros", PDK_Skipped)
720 .Case("define", PDK_Skipped)
721 .Case("undef", PDK_Skipped)
722 .Case("line", PDK_Skipped)
723 .Case("error", PDK_Skipped)
724 .Case("pragma", PDK_Skipped)
725 .Case("import", PDK_Skipped)
726 .Case("include_next", PDK_Skipped)
727 .Case("warning", PDK_Skipped)
728 .Case("ident", PDK_Skipped)
729 .Case("sccs", PDK_Skipped)
730 .Case("assert", PDK_Skipped)
731 .Case("unassert", PDK_Skipped)
732 .Case("if", PDK_Skipped)
733 .Case("ifdef", PDK_Skipped)
734 .Case("ifndef", PDK_Skipped)
735 .Case("elif", PDK_Skipped)
736 .Case("elifdef", PDK_Skipped)
737 .Case("elifndef", PDK_Skipped)
738 .Case("else", PDK_Skipped)
739 .Case("endif", PDK_Skipped)
740 .Default(PDK_Unknown);
741
742 switch (PDK) {
743 case PDK_Skipped:
744 continue;
745
746 case PDK_Unknown:
747 // We don't know what this directive is; stop at the '#'.
748 break;
749 }
750 }
751
752 // We only end up here if we didn't recognize the preprocessor
753 // directive or it was one that can't occur in the preamble at this
754 // point. Roll back the current token to the location of the '#'.
755 TheTok = HashTok;
756 } else if (TheTok.isAtStartOfLine() &&
757 TheTok.getKind() == tok::raw_identifier &&
758 TheTok.getRawIdentifier() == "module" &&
759 LangOpts.CPlusPlusModules) {
760 // The initial global module fragment introducer "module;" is part of
761 // the preamble, which runs up to the module declaration "module foo;".
762 Token ModuleTok = TheTok;
763 do {
764 TheLexer.LexFromRawLexer(TheTok);
765 } while (TheTok.getKind() == tok::comment);
766 if (TheTok.getKind() != tok::semi) {
767 // Not global module fragment, roll back.
768 TheTok = ModuleTok;
769 break;
770 }
771 continue;
772 }
773
774 // We hit a token that we don't recognize as being in the
775 // "preprocessing only" part of the file, so we're no longer in
776 // the preamble.
777 break;
778 } while (true);
779
780 SourceLocation End;
781 if (ActiveCommentLoc.isValid())
782 End = ActiveCommentLoc; // don't truncate a decl comment.
783 else
784 End = TheTok.getLocation();
785
786 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
787 TheTok.isAtStartOfLine());
788}
789
/// Return the physical (byte) offset of the CharNo'th cleaned character of
/// the token starting at TokStart, accounting for trigraphs and escaped
/// newlines.
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}
833
834/// Computes the source location just past the end of the
835/// token at this source location.
836///
837/// This routine can be used to produce a source location that
838/// points just past the end of the token referenced by \p Loc, and
839/// is generally used when a diagnostic needs to point just after a
840/// token where it expected something different that it received. If
841/// the returned source location would not be meaningful (e.g., if
842/// it points into a macro), this routine returns an invalid
843/// source location.
844///
845/// \param Offset an offset from the end of the token, where the source
846/// location should refer to. The default offset (0) produces a source
847/// location pointing just past the end of the token; an offset of 1 produces
848/// a source location pointing to the last character in the token, etc.
850 const SourceManager &SM,
851 const LangOptions &LangOpts) {
852 if (Loc.isInvalid())
853 return {};
854
855 if (Loc.isMacroID()) {
856 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
857 return {}; // Points inside the macro expansion.
858 }
859
860 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
861 if (Len > Offset)
862 Len = Len - Offset;
863 else
864 return Loc;
865
866 return Loc.getLocWithOffset(Len);
867}
868
869/// Returns true if the given MacroID location points at the first
870/// token of the macro expansion.
872 const SourceManager &SM,
873 const LangOptions &LangOpts,
874 SourceLocation *MacroBegin) {
875 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
876
877 SourceLocation expansionLoc;
878 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
879 return false;
880
881 if (expansionLoc.isFileID()) {
882 // No other macro expansions, this is the first.
883 if (MacroBegin)
884 *MacroBegin = expansionLoc;
885 return true;
886 }
887
888 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
889}
890
891/// Returns true if the given MacroID location points at the last
892/// token of the macro expansion.
894 const SourceManager &SM,
895 const LangOptions &LangOpts,
896 SourceLocation *MacroEnd) {
897 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
898
899 SourceLocation spellLoc = SM.getSpellingLoc(loc);
900 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
901 if (tokLen == 0)
902 return false;
903
904 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
905 SourceLocation expansionLoc;
906 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
907 return false;
908
909 if (expansionLoc.isFileID()) {
910 // No other macro expansions.
911 if (MacroEnd)
912 *MacroEnd = expansionLoc;
913 return true;
914 }
915
916 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
917}
918
/// Given a range whose endpoints are both file locations (not macro IDs),
/// produce the corresponding character range, converting a token range into
/// a char range by extending the end past the final token. Returns an
/// invalid range on failure (e.g. endpoints in different files).
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  // Both endpoints must be in the same FileID and properly ordered.
  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  return CharSourceRange::getCharRange(Begin, End);
}
945
// Assumes that `Loc` is in an expansion. Returns true if the expansion that
// produced `Loc` covers a token range (as opposed to a character range).
static bool isInExpansionTokenRange(const SourceLocation Loc,
                                    const SourceManager &SM) {
  return SM.getSLocEntry(SM.getFileID(Loc))
      .getExpansion()
      .isExpansionTokenRange();
}
953
/// Accepts a token or character range and, if possible, rewrites it as a pure
/// file character range (no macro IDs on either end). Macro endpoints are
/// accepted only when they cover a complete macro expansion; otherwise an
/// invalid (empty) range is returned.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Easy case: both endpoints are already file locations.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // The begin endpoint is inside a macro: usable only if it is the very first
  // token of that expansion, in which case we substitute the expansion start.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Symmetric case for the end endpoint.
  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Last resort: both endpoints may come from the same macro *argument*
  // expansion of the same invocation; if so, retry with the argument's
  // spelling locations.
  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}
1022
/// Returns the source text that the given range covers, or an empty string on
/// failure. \p Invalid (if non-null) is set to report success/failure.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  // Normalize to a pure file character range first; this also rejects ranges
  // that do not cover complete macro expansions.
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}
1058
/// Returns the name of the macro whose expansion immediately produced \p Loc,
/// as spelled in the source buffer at the expansion point.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
1105
/// Like getImmediateMacroName, but suitable for diagnostics: walks past
/// argument expansions first, and returns an empty string when the "macro"
/// is really the result of token pasting/stringization (scratch space).
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
1132
/// Returns true if \p c may appear after the first character of an ASCII
/// identifier, honoring the -fdollars-in-identifiers setting.
bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}
1136
1137bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1138 assert(isVerticalWhitespace(Str[0]));
1139 if (Str - 1 < BufferStart)
1140 return false;
1141
1142 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1143 (Str[0] == '\r' && Str[-1] == '\n')) {
1144 if (Str - 2 < BufferStart)
1145 return false;
1146 --Str;
1147 }
1148 --Str;
1149
1150 // Rewind to first non-space character:
1151 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1152 --Str;
1153
1154 return *Str == '\\';
1155}
1156
/// Returns the leading whitespace (spaces and tabs) of the line containing
/// \p Loc, or an empty StringRef if it cannot be determined (macro location,
/// invalid file, or unlocatable line start).
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  // A line of pure whitespace (npos) has no indentation to report.
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}
1177
1178//===----------------------------------------------------------------------===//
1179// Diagnostics forwarding code.
1180//===----------------------------------------------------------------------===//
1181
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method. Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens". This is used for things like
  // _Pragma handling. Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
}
1209
/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}
1228
1229/// Diag - Forwarding function for diagnostics. This translate a source
1230/// position in the current buffer into a SourceLocation object for rendering.
1231DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1232 return PP->Diag(getSourceLocation(Loc), DiagID);
1233}
1234
1235//===----------------------------------------------------------------------===//
1236// Trigraph and Escaped Newline Handling Code.
1237//===----------------------------------------------------------------------===//
1238
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Letters[i] after "??" decodes to Replacements[i].
  static const char Letters[]      = "=)(!'>/<-";
  static const char Replacements[] = "#][|^}\\{~";
  if (Letter == '\0')
    return 0;
  const char *Pos = strchr(Letters, Letter);
  return Pos ? Replacements[Pos - Letters] : 0;
}
1255
1256/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1257/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1258/// return the result character. Finally, emit a warning about trigraph use
1259/// whether trigraphs are enabled or not.
1260static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1261 char Res = GetTrigraphCharForLetter(*CP);
1262 if (!Res)
1263 return Res;
1264
1265 if (!Trigraphs) {
1266 if (L && !L->isLexingRawMode())
1267 L->Diag(CP-2, diag::trigraph_ignored);
1268 return 0;
1269 }
1270
1271 if (L && !L->isLexingRawMode())
1272 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1273 return Res;
1274}
1275
1276/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1277/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1278/// trigraph equivalent on entry to this function.
1279unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1280 unsigned Size = 0;
1281 while (isWhitespace(Ptr[Size])) {
1282 ++Size;
1283
1284 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1285 continue;
1286
1287 // If this is a \r\n or \n\r, skip the other half.
1288 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1289 Ptr[Size-1] != Ptr[Size])
1290 ++Size;
1291
1292 return Size;
1293 }
1294
1295 // Not an escaped newline, must be a \t or something else.
1296 return 0;
1297}
1298
1299/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1300/// them), skip over them and return the first non-escaped-newline found,
1301/// otherwise return P.
1302const char *Lexer::SkipEscapedNewLines(const char *P) {
1303 while (true) {
1304 const char *AfterEscape;
1305 if (*P == '\\') {
1306 AfterEscape = P+1;
1307 } else if (*P == '?') {
1308 // If not a trigraph for escape, bail out.
1309 if (P[1] != '?' || P[2] != '/')
1310 return P;
1311 // FIXME: Take LangOpts into account; the language might not
1312 // support trigraphs.
1313 AfterEscape = P+3;
1314 } else {
1315 return P;
1316 }
1317
1318 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1319 if (NewLineSize == 0) return P;
1320 P = AfterEscape+NewLineSize;
1321 }
1322}
1323
/// Raw-lexes the token immediately after the token at \p Loc, returning
/// std::nullopt if the buffer cannot be loaded or \p Loc is inside a macro
/// whose expansion does not end at \p Loc.
std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  // A macro location is only usable when it is the last token of its
  // expansion; rewrite it as the corresponding file location.
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}
1352
/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      // A two-character newline pair counts as one newline.
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}
1387
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method. Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      // (Recursion handles runs of consecutive escaped newlines.)
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" decodes to '\\', which may itself begin an escaped newline;
      // jump back to the slash handling above to check.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}
1457
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// Unlike getCharAndSizeSlow (above), this variant emits no diagnostics and
/// needs no Lexer instance, so it can be used on arbitrary buffers.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      // (Recursion handles runs of consecutive escaped newlines.)
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // "??/" decodes to '\\', which may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}
1508
1509//===----------------------------------------------------------------------===//
1510// Helper methods for lexing.
1511//===----------------------------------------------------------------------===//
1512
1513/// Routine that indiscriminately sets the offset into the source file.
1514void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1515 BufferPtr = BufferStart + Offset;
1516 if (BufferPtr > BufferEnd)
1517 BufferPtr = BufferEnd;
1518 // FIXME: What exactly does the StartOfLine bit mean? There are two
1519 // possible meanings for the "start" of the line: the first token on the
1520 // unexpanded line, or the first token on the expanded line.
1521 IsAtStartOfLine = StartOfLine;
1522 IsAtPhysicalStartOfLine = StartOfLine;
1523}
1524
// Returns true if the codepoint is a Unicode whitespace character (beyond the
// ASCII whitespace handled by isWhitespace).
static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}
1530
// Renders a codepoint as an upper-case hexadecimal string (minimum four
// digits) for use in diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}
1537
// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends the XID_Start/XID_continue
// with mathematical symbols, superscipts and subscripts digits
// found in some production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
//
// Sets \p IsExtension to true when the character is only accepted because of
// this profile (so callers can emit an extension diagnostic).
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);
  if (MathStartChars.contains(C) ||
      (!IsStart && MathContinueChars.contains(C))) {
    IsExtension = true;
    return true;
  }
  return false;
}
1557
// Returns true if \p C may appear in a non-leading position of an identifier
// under the active language mode. \p IsExtension is set when acceptance comes
// from the mathematical-notation extension profile.
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
                            bool &IsExtension) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.DollarIdents && '$' == C) {
    return true;
  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A non-leading codepoint must have the XID_Continue property.
    // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
    // so we need to check both tables.
    // '_' doesn't have the XID_Continue property but is allowed in C and C++.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
                                     IsExtension);
  } else if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else {
    // Pre-C11 C falls back to the C99 Annex D set.
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}
1585
// Returns true if \p C may appear as the first character of an identifier
// under the active language mode; \p C must be non-ASCII.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
                                     bool &IsExtension) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  IsExtension = false;
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A leading codepoint must have the XID_Start property.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    if (XIDStartChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
                                     IsExtension);
  }
  if (!isAllowedIDChar(C, LangOpts, IsExtension))
    return false;
  // C99/C11 additionally exclude some continue-only characters from the
  // initial position.
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}
1611
1614
1615 static const llvm::sys::UnicodeCharSet MathStartChars(
1617 static const llvm::sys::UnicodeCharSet MathContinueChars(
1619
1620 (void)MathStartChars;
1621 (void)MathContinueChars;
1622 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1623 "Unexpected mathematical notation codepoint");
1624 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1626}
1627
// Builds a char range covering [Begin, End) in the lexer's buffer, mapping
// the raw pointers back to source locations.
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}
1633
// Warns (under -Wc99-compat) when an identifier codepoint accepted in the
// current mode would not have been valid in C99.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }
}
1658
/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by codepoint so lower_bound can binary-search; entries with
  // LooksLike == 0 are invisible (zero-width) characters.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}
1737
// Emits the appropriate error for a non-ASCII codepoint that is not valid in
// an identifier (or not valid in the leading position), with a fix-it that
// removes it.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // Valid for its position: nothing to report.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}
1764
/// Attempts to consume a universal character name (\uXXXX / \UXXXXXXXX) as an
/// identifier-continue character. Returns false (without consuming) when the
/// UCN is malformed, ASCII, or whitespace; otherwise advances CurPtr past it.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a
    // a valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // Fast path: a clean (no trigraphs / escaped newlines) \u or \U sequence
  // can be skipped wholesale; otherwise re-scan it char by char so cleaning
  // is accounted for.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1805
/// Attempts to consume a raw UTF-8 encoded codepoint as an identifier-continue
/// character. Returns false (without consuming) when the sequence is invalid
/// UTF-8, ASCII, or whitespace; otherwise advances CurPtr past it.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor a
    // a valid identifier part. Carry on as if the codepoint was
    // valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}
1856
/// Lex a token that begins with the non-ASCII codepoint \p C (already read;
/// CurPtr points just past it). Either continues lexing an identifier, drops
/// an accidental non-ASCII character, or forms a tok::unknown token.
1857bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1858                                      const char *CurPtr) {
1859  bool IsExtension = false;
  // Valid identifier-start codepoint: hand off to the identifier lexer.
1860  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1863    if (IsExtension)
1865            makeCharRange(*this, BufferPtr, CurPtr));
1867          makeCharRange(*this, BufferPtr, CurPtr),
1868          /*IsFirst=*/true);
1870          makeCharRange(*this, BufferPtr, CurPtr));
1871    }
1872
1873    MIOpt.ReadToken();
1874    return LexIdentifierContinue(Result, CurPtr);
1875  }
1876
1878      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1880    // Non-ASCII characters tend to creep into source code unintentionally.
1881    // Instead of letting the parser complain about the unknown token,
1882    // just drop the character.
1883    // Note that we can /only/ do this when the non-ASCII character is actually
1884    // spelled as Unicode, not written as a UCN. The standard requires that
1885    // we not throw away any possible preprocessor tokens, but there's a
1886    // loophole in the mapping of Unicode characters to basic character set
1887    // characters that allows us to map these particular characters to, say,
1888    // whitespace.
1890        PP->getDiagnostics(), LangOpts, C,
1891        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
    // Skip the bogus character entirely and report no token.
1892    BufferPtr = CurPtr;
1893    return false;
1894  }
1895
1896  // Otherwise, we have an explicit UCN or a character that's unlikely to show
1897  // up by accident.
1898  MIOpt.ReadToken();
1899  FormTokenWithChars(Result, CurPtr, tok::unknown);
1900  return true;
1901}
1902
/// Advance CurPtr past a run of plain ASCII identifier characters
/// ([_A-Za-z0-9]) and return the new position. Uses SSE4.2 string-range
/// compares to process 16 bytes per iteration when available, falling back to
/// a scalar loop for the tail (or entirely, without SSE4.2).
1903static const char *
1904fastParseASCIIIdentifier(const char *CurPtr,
1905                         [[maybe_unused]] const char *BufferEnd) {
1906#ifdef __SSE4_2__
  // Ranges for _mm_cmpistri: pairs of (lo, hi) bounds covering '_', A-Z,
  // a-z and 0-9; unused slots are zero.
1907  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1908      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1909  };
1910  constexpr ssize_t BytesPerRegister = 16;
1911
1912  __m128i AsciiIdentifierRangeV =
1913      _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1914
  // Process full 16-byte chunks while they fit before BufferEnd.
1915  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1916    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1917
    // Index of the first byte outside the identifier ranges (16 if none).
1918    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1921    CurPtr += Consumed;
1922    if (Consumed == BytesPerRegister)
1923      continue;
1924    return CurPtr;
1925  }
1926#endif
1927
  // Scalar fallback / tail loop.
1928  unsigned char C = *CurPtr;
1930    C = *++CurPtr;
1931  return CurPtr;
1932}
1933
/// Lex the remainder of an identifier whose first character has already been
/// consumed. Handles '$' (as an extension), UCNs and raw UTF-8 continuation
/// characters, then forms a raw_identifier token and, outside raw mode,
/// resolves it through the preprocessor's identifier table.
1934bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1935  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1936
1937  while (true) {
1938
    // Bulk-skip plain ASCII identifier characters.
1939    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1940
1941    unsigned Size;
1942    // Slow path: handle trigraph, unicode codepoints, UCNs.
1943    unsigned char C = getCharAndSize(CurPtr, Size);
1945      CurPtr = ConsumeChar(CurPtr, Size, Result);
1946      continue;
1947    }
1948    if (C == '$') {
1949      // If we hit a $ and they are not supported in identifiers, we are done.
1950      if (!LangOpts.DollarIdents)
1951        break;
1952      // Otherwise, emit a diagnostic and continue.
1953      if (!isLexingRawMode())
1954        Diag(CurPtr, diag::ext_dollar_in_identifier);
1955      CurPtr = ConsumeChar(CurPtr, Size, Result);
1956      continue;
1957    }
1958    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1959      continue;
1960    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1961      continue;
1962    // Neither an expected Unicode codepoint nor a UCN.
1963    break;
1964  }
1965
1966  const char *IdStart = BufferPtr;
1967  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1968  Result.setRawIdentifierData(IdStart);
1969
1970  // If we are in raw mode, return this identifier raw. There is no need to
1971  // look up identifier information or attempt to macro expand it.
1972  if (LexingRawMode)
1973    return true;
1974
1975  // Fill in Result.IdentifierInfo and update the token kind,
1976  // looking up the identifier in the identifier table.
1978  // Note that we have to call PP->LookUpIdentifierInfo() even for code
1979  // completion, it writes IdentifierInfo into Result, and callers rely on it.
1980
1981  // If the completion point is at the end of an identifier, we want to treat
1982  // the identifier as incomplete even if it resolves to a macro or a keyword.
1983  // This allows e.g. 'class^' to complete to 'classifier'.
1984  if (isCodeCompletionPoint(CurPtr)) {
1985    // Return the code-completion token.
1986    Result.setKind(tok::code_completion);
1987    // Skip the code-completion char and all immediate identifier characters.
1988    // This ensures we get consistent behavior when completing at any point in
1989    // an identifier (i.e. at the start, in the middle, at the end). Note that
1990    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1991    // simpler.
1992    assert(*CurPtr == 0 && "Completion character must be 0");
1993    ++CurPtr;
1994    // Note that code completion token is not added as a separate character
1995    // when the completion point is at the end of the buffer. Therefore, we need
1996    // to check if the buffer has ended.
1997    if (CurPtr < BufferEnd) {
1998      while (isAsciiIdentifierContinue(*CurPtr))
1999        ++CurPtr;
2000    }
2001    BufferPtr = CurPtr;
2002    return true;
2003  }
2004
2005  // Finally, now that we know we have an identifier, pass this off to the
2006  // preprocessor, which may macro expand it or something.
2007  if (II->isHandleIdentifierCase())
2008    return PP->HandleIdentifier(Result);
2009
2010  return true;
2011}
2012
2013/// isHexaLiteral - Return true if Start points to a hex constant,
2014/// in Microsoft mode (where this is supposed to be several different tokens).
2015bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2016 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2017 char C1 = CharAndSize1.Char;
2018 if (C1 != '0')
2019 return false;
2020
2021 auto CharAndSize2 =
2022 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2023 char C2 = CharAndSize2.Char;
2024 return (C2 == 'x' || C2 == 'X');
2025}
2026
2027/// LexNumericConstant - Lex the remainder of a integer or floating point
2028/// constant. From[-1] is the first character lexed. Return the end of the
2029/// constant.
2030bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2031 unsigned Size;
2032 char C = getCharAndSize(CurPtr, Size);
2033 char PrevCh = 0;
2034 while (isPreprocessingNumberBody(C)) {
2035 CurPtr = ConsumeChar(CurPtr, Size, Result);
2036 PrevCh = C;
2037 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2038 CurPtr -= Size;
2039 break;
2040 }
2041 C = getCharAndSize(CurPtr, Size);
2042 }
2043
2044 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2045 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2046 // If we are in Microsoft mode, don't continue if the constant is hex.
2047 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2048 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2049 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2050 }
2051
2052 // If we have a hex FP constant, continue.
2053 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2054 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2055 // not-quite-conforming extension. Only do so if this looks like it's
2056 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2057 bool IsHexFloat = true;
2058 if (!LangOpts.C99) {
2059 if (!isHexaLiteral(BufferPtr, LangOpts))
2060 IsHexFloat = false;
2061 else if (!LangOpts.CPlusPlus17 &&
2062 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2063 IsHexFloat = false;
2064 }
2065 if (IsHexFloat)
2066 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2067 }
2068
2069 // If we have a digit separator, continue.
2070 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2071 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2072 if (isAsciiIdentifierContinue(Next)) {
2073 if (!isLexingRawMode())
2074 Diag(CurPtr, LangOpts.CPlusPlus
2075 ? diag::warn_cxx11_compat_digit_separator
2076 : diag::warn_c23_compat_digit_separator);
2077 CurPtr = ConsumeChar(CurPtr, Size, Result);
2078 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2079 return LexNumericConstant(Result, CurPtr);
2080 }
2081 }
2082
2083 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2084 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2085 return LexNumericConstant(Result, CurPtr);
2086 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2087 return LexNumericConstant(Result, CurPtr);
2088
2089 // Update the location of token as well as BufferPtr.
2090 const char *TokStart = BufferPtr;
2091 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2092 Result.setLiteralData(TokStart);
2093 return true;
2094}
2095
2096/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2097/// in C++11, or warn on a ud-suffix in C++98.
/// Lex an optional ud-suffix following a string or character literal.
/// Returns the pointer past the suffix (or CurPtr unchanged if there is
/// none), consuming the suffix characters into Result and setting
/// Token::HasUDSuffix when a suffix is taken.
2098const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2099                               bool IsStringLiteral) {
2100  assert(LangOpts.CPlusPlus);
2101
2102  // Maximally munch an identifier.
2103  unsigned Size;
2104  char C = getCharAndSize(CurPtr, Size);
2105  bool Consumed = false;
2106
  // A suffix may also start with a UCN or a raw UTF-8 codepoint.
2107  if (!isAsciiIdentifierStart(C)) {
2108    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2109      Consumed = true;
2110    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2111      Consumed = true;
2112    else
2113      return CurPtr;
2114  }
2115
  // Before C++11 there are no ud-suffixes; warn about the incompatibility
  // and treat the identifier as a separate token.
2116  if (!LangOpts.CPlusPlus11) {
2117    if (!isLexingRawMode())
2118      Diag(CurPtr,
2119           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2120                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
2122    return CurPtr;
2123  }
2124
2125  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2126  // that does not start with an underscore is ill-formed. As a conforming
2127  // extension, we treat all such suffixes as if they had whitespace before
2128  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2129  // likely to be a ud-suffix than a macro, however, and accept that.
2130  if (!Consumed) {
2131    bool IsUDSuffix = false;
2132    if (C == '_')
2133      IsUDSuffix = true;
2134    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2135      // In C++1y, we need to look ahead a few characters to see if this is a
2136      // valid suffix for a string literal or a numeric literal (this could be
2137      // the 'operator""if' defining a numeric literal operator).
2138      const unsigned MaxStandardSuffixLength = 3;
2139      char Buffer[MaxStandardSuffixLength] = { C };
2140      unsigned Consumed = Size;
2141      unsigned Chars = 1;
2142      while (true) {
2143        auto [Next, NextSize] =
2144            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2145        if (!isAsciiIdentifierContinue(Next)) {
2146          // End of suffix. Check whether this is on the allowed list.
2147          const StringRef CompleteSuffix(Buffer, Chars);
2148          IsUDSuffix =
2149              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2150          break;
2151        }
2152
2153        if (Chars == MaxStandardSuffixLength)
2154          // Too long: can't be a standard suffix.
2155          break;
2156
2157        Buffer[Chars++] = Next;
2158        Consumed += NextSize;
2159      }
2160    }
2161
2162    if (!IsUDSuffix) {
2163      if (!isLexingRawMode())
2164        Diag(CurPtr, LangOpts.MSVCCompat
2165                         ? diag::ext_ms_reserved_user_defined_literal
2166                         : diag::ext_reserved_user_defined_literal)
2168      return CurPtr;
2169    }
2170
2171    CurPtr = ConsumeChar(CurPtr, Size, Result);
2172  }
2173
  // Consume the rest of the suffix (identifier continue chars, UCNs, UTF-8).
2174  Result.setFlag(Token::HasUDSuffix);
2175  while (true) {
2176    C = getCharAndSize(CurPtr, Size);
2178      CurPtr = ConsumeChar(CurPtr, Size, Result);
2179    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2180    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2181    } else
2182      break;
2183  }
2184
2185  return CurPtr;
2186}
2187
2188/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2189/// either " or L" or u8" or u" or U".
/// Lex the body of a string literal after the opening quote (and any encoding
/// prefix) has been consumed, producing a token of the given \p Kind.
2190bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2191                             tok::TokenKind Kind) {
2192  const char *AfterQuote = CurPtr;
2193  // Does this string contain the \0 character?
2194  const char *NulCharacter = nullptr;
2195
  // Warn about encoding-prefixed literals in compatibility modes.
2196  if (!isLexingRawMode() &&
2197      (Kind == tok::utf8_string_literal ||
2198       Kind == tok::utf16_string_literal ||
2199       Kind == tok::utf32_string_literal))
2200    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2201                                       : diag::warn_c99_compat_unicode_literal);
2202
2203  char C = getAndAdvanceChar(CurPtr, Result);
2204  while (C != '"') {
2205    // Skip escaped characters. Escaped newlines will already be processed by
2206    // getAndAdvanceChar.
2207    if (C == '\\')
2208      C = getAndAdvanceChar(CurPtr, Result);
2209
2210    if (C == '\n' || C == '\r' || // Newline.
2211        (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2212      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2213        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2214      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2215      return true;
2216    }
2217
2218    if (C == 0) {
      // A NUL may be the code-completion point rather than real content.
2219      if (isCodeCompletionPoint(CurPtr-1)) {
2220        if (ParsingFilename)
2221          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2222        else
2224        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2225        cutOffLexing();
2226        return true;
2227      }
2228
2229      NulCharacter = CurPtr-1;
2230    }
2231    C = getAndAdvanceChar(CurPtr, Result);
2232  }
2233
2234  // If we are in C++11, lex the optional ud-suffix.
2235  if (LangOpts.CPlusPlus)
2236    CurPtr = LexUDSuffix(Result, CurPtr, true);
2237
2238  // If a nul character existed in the string, warn about it.
2239  if (NulCharacter && !isLexingRawMode())
2240    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2241
2242  // Update the location of the token as well as the BufferPtr instance var.
2243  const char *TokStart = BufferPtr;
2244  FormTokenWithChars(Result, CurPtr, Kind);
2245  Result.setLiteralData(TokStart);
2246  return true;
2247}
2248
2249/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2250/// having lexed R", LR", u8R", uR", or UR".
2251bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2252 tok::TokenKind Kind) {
2253 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2254 // Between the initial and final double quote characters of the raw string,
2255 // any transformations performed in phases 1 and 2 (trigraphs,
2256 // universal-character-names, and line splicing) are reverted.
2257
2258 if (!isLexingRawMode())
2259 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2260
2261 unsigned PrefixLen = 0;
2262
2263 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2264 if (!isLexingRawMode() &&
2265 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2266 const char *Pos = &CurPtr[PrefixLen];
2267 Diag(Pos, LangOpts.CPlusPlus26
2268 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2269 : diag::ext_cxx26_raw_string_literal_character_set)
2270 << StringRef(Pos, 1);
2271 }
2272 ++PrefixLen;
2273 }
2274
2275 // If the last character was not a '(', then we didn't lex a valid delimiter.
2276 if (CurPtr[PrefixLen] != '(') {
2277 if (!isLexingRawMode()) {
2278 const char *PrefixEnd = &CurPtr[PrefixLen];
2279 if (PrefixLen == 16) {
2280 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2281 } else if (*PrefixEnd == '\n') {
2282 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2283 } else {
2284 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2285 << StringRef(PrefixEnd, 1);
2286 }
2287 }
2288
2289 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2290 // it's possible the '"' was intended to be part of the raw string, but
2291 // there's not much we can do about that.
2292 while (true) {
2293 char C = *CurPtr++;
2294
2295 if (C == '"')
2296 break;
2297 if (C == 0 && CurPtr-1 == BufferEnd) {
2298 --CurPtr;
2299 break;
2300 }
2301 }
2302
2303 FormTokenWithChars(Result, CurPtr, tok::unknown);
2304 return true;
2305 }
2306
2307 // Save prefix and move CurPtr past it
2308 const char *Prefix = CurPtr;
2309 CurPtr += PrefixLen + 1; // skip over prefix and '('
2310
2311 while (true) {
2312 char C = *CurPtr++;
2313
2314 if (C == ')') {
2315 // Check for prefix match and closing quote.
2316 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2317 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2318 break;
2319 }
2320 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2321 if (!isLexingRawMode())
2322 Diag(BufferPtr, diag::err_unterminated_raw_string)
2323 << StringRef(Prefix, PrefixLen);
2324 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2325 return true;
2326 }
2327 }
2328
2329 // If we are in C++11, lex the optional ud-suffix.
2330 if (LangOpts.CPlusPlus)
2331 CurPtr = LexUDSuffix(Result, CurPtr, true);
2332
2333 // Update the location of token as well as BufferPtr.
2334 const char *TokStart = BufferPtr;
2335 FormTokenWithChars(Result, CurPtr, Kind);
2336 Result.setLiteralData(TokStart);
2337 return true;
2338}
2339
2340/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2341/// after having lexed the '<' character. This is used for #include filenames.
2342bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2343 // Does this string contain the \0 character?
2344 const char *NulCharacter = nullptr;
2345 const char *AfterLessPos = CurPtr;
2346 char C = getAndAdvanceChar(CurPtr, Result);
2347 while (C != '>') {
2348 // Skip escaped characters. Escaped newlines will already be processed by
2349 // getAndAdvanceChar.
2350 if (C == '\\')
2351 C = getAndAdvanceChar(CurPtr, Result);
2352
2353 if (isVerticalWhitespace(C) || // Newline.
2354 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2355 // If the filename is unterminated, then it must just be a lone <
2356 // character. Return this as such.
2357 FormTokenWithChars(Result, AfterLessPos, tok::less);
2358 return true;
2359 }
2360
2361 if (C == 0) {
2362 if (isCodeCompletionPoint(CurPtr - 1)) {
2363 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2364 cutOffLexing();
2365 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2366 return true;
2367 }
2368 NulCharacter = CurPtr-1;
2369 }
2370 C = getAndAdvanceChar(CurPtr, Result);
2371 }
2372
2373 // If a nul character existed in the string, warn about it.
2374 if (NulCharacter && !isLexingRawMode())
2375 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2376
2377 // Update the location of token as well as BufferPtr.
2378 const char *TokStart = BufferPtr;
2379 FormTokenWithChars(Result, CurPtr, tok::header_name);
2380 Result.setLiteralData(TokStart);
2381 return true;
2382}
2383
/// Set up code completion for a partially-typed #include filename between
/// \p PathStart and \p CompletionPoint, then forward to the preprocessor.
2384void Lexer::codeCompleteIncludedFile(const char *PathStart,
2385                                     const char *CompletionPoint,
2386                                     bool IsAngled) {
2387  // Completion only applies to the filename, after the last slash.
2388  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // Backslash also separates path components in MSVC-compatible mode.
2389  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2390  auto Slash = PartialPath.find_last_of(SlashChars);
2391  StringRef Dir =
2392      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2393  const char *StartOfFilename =
2394      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2395  // Code completion filter range is the filename only, up to completion point.
2397      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2398  // We should replace the characters up to the closing quote or closest slash,
2399  // if any.
2400  while (CompletionPoint < BufferEnd) {
2401    char Next = *(CompletionPoint + 1);
2402    if (Next == 0 || Next == '\r' || Next == '\n')
2403      break;
2404    ++CompletionPoint;
2405    if (Next == (IsAngled ? '>' : '"'))
2406      break;
2407    if (SlashChars.contains(Next))
2408      break;
2409  }
2410
2412      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2413      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2414  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2415}
2416
2417/// LexCharConstant - Lex the remainder of a character constant, after having
2418/// lexed either ' or L' or u8' or u' or U'.
/// Lex the body of a character constant after the opening quote (and any
/// encoding prefix) has been consumed, producing a token of the given Kind.
2419bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2420                            tok::TokenKind Kind) {
2421  // Does this character contain the \0 character?
2422  const char *NulCharacter = nullptr;
2423
  // Warn about encoding-prefixed constants in compatibility modes.
2424  if (!isLexingRawMode()) {
2425    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2426      Diag(BufferPtr, LangOpts.CPlusPlus
2427                          ? diag::warn_cxx98_compat_unicode_literal
2428                          : diag::warn_c99_compat_unicode_literal);
2429    else if (Kind == tok::utf8_char_constant)
2430      Diag(BufferPtr, LangOpts.CPlusPlus
2431                          ? diag::warn_cxx14_compat_u8_character_literal
2432                          : diag::warn_c17_compat_u8_character_literal);
2433  }
2434
2435  char C = getAndAdvanceChar(CurPtr, Result);
  // An immediately closing quote is an empty character constant.
2436  if (C == '\'') {
2437    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2438      Diag(BufferPtr, diag::ext_empty_character);
2439    FormTokenWithChars(Result, CurPtr, tok::unknown);
2440    return true;
2441  }
2442
2443  while (C != '\'') {
2444    // Skip escaped characters.
2445    if (C == '\\')
2446      C = getAndAdvanceChar(CurPtr, Result);
2447
2448    if (C == '\n' || C == '\r' || // Newline.
2449        (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2450      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2451        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2452      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2453      return true;
2454    }
2455
2456    if (C == 0) {
      // A NUL may be the code-completion point rather than real content.
2457      if (isCodeCompletionPoint(CurPtr-1)) {
2459        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2460        cutOffLexing();
2461        return true;
2462      }
2463
2464      NulCharacter = CurPtr-1;
2465    }
2466    C = getAndAdvanceChar(CurPtr, Result);
2467  }
2468
2469  // If we are in C++11, lex the optional ud-suffix.
2470  if (LangOpts.CPlusPlus)
2471    CurPtr = LexUDSuffix(Result, CurPtr, false);
2472
2473  // If a nul character existed in the character, warn about it.
2474  if (NulCharacter && !isLexingRawMode())
2475    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2476
2477  // Update the location of token as well as BufferPtr.
2478  const char *TokStart = BufferPtr;
2479  FormTokenWithChars(Result, CurPtr, Kind);
2480  Result.setLiteralData(TokStart);
2481  return true;
2482}
2483
2484/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2485/// Update BufferPtr to point to the next non-whitespace character and return.
2486///
2487/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
/// Forms a token and returns true only when KeepWhitespaceMode is enabled.
2488bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2489                           bool &TokAtPhysicalStartOfLine) {
2490  // Whitespace - Skip it, then return the token after the whitespace.
2491  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2492
2493  unsigned char Char = *CurPtr;
2494
  // Track the last newline seen (and record the first one in NewLinePtr) so
  // empty-line ranges can be reported to the EmptylineHandler below.
2495  const char *lastNewLine = nullptr;
2496  auto setLastNewLine = [&](const char *Ptr) {
2497    lastNewLine = Ptr;
2498    if (!NewLinePtr)
2499      NewLinePtr = Ptr;
2500  };
2501  if (SawNewline)
2502    setLastNewLine(CurPtr - 1);
2503
2504  // Skip consecutive spaces efficiently.
2505  while (true) {
2506    // Skip horizontal whitespace very aggressively.
2507    while (isHorizontalWhitespace(Char))
2508      Char = *++CurPtr;
2509
2510    // Otherwise if we have something other than whitespace, we're done.
2511    if (!isVerticalWhitespace(Char))
2512      break;
2513
2515      // End of preprocessor directive line, let LexTokenInternal handle this.
2516      BufferPtr = CurPtr;
2517      return false;
2518    }
2519
2520    // OK, but handle newline.
2521    if (*CurPtr == '\n')
2522      setLastNewLine(CurPtr);
2523    SawNewline = true;
2524    Char = *++CurPtr;
2525  }
2526
2527  // If the client wants us to return whitespace, return it now.
2528  if (isKeepWhitespaceMode()) {
2529    FormTokenWithChars(Result, CurPtr, tok::unknown);
2530    if (SawNewline) {
2531      IsAtStartOfLine = true;
2532      IsAtPhysicalStartOfLine = true;
2533    }
2534    // FIXME: The next token will not have LeadingSpace set.
2535    return true;
2536  }
2537
2538  // If this isn't immediately after a newline, there is leading space.
2539  char PrevChar = CurPtr[-1];
2540  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2541
2542  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2543  if (SawNewline) {
2544    Result.setFlag(Token::StartOfLine);
2545    TokAtPhysicalStartOfLine = true;
2546
    // More than one newline in this run means at least one empty line.
2547    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2548      if (auto *Handler = PP->getEmptylineHandler())
2549        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2550                                             getSourceLocation(lastNewLine)));
2551    }
2552  }
2553
2554  BufferPtr = CurPtr;
2555  return false;
2556}
2557
2558/// We have just read the // characters from input. Skip until we find the
2559/// newline character that terminates the comment. Then update BufferPtr and
2560/// return.
2561///
2562/// If we're in KeepCommentMode or any CommentHandler has inserted
2563/// some tokens, this will store the first token and return true.
/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return. Returns true (with the token formed) if KeepCommentMode is on or a
/// CommentHandler produced a token.
2564bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2565                            bool &TokAtPhysicalStartOfLine) {
2566  // If Line comments aren't explicitly enabled for this language, emit an
2567  // extension warning.
2568  if (!LineComment) {
2569    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2570      Diag(BufferPtr, diag::ext_line_comment);
2571
2572    // Mark them enabled so we only emit one warning for this translation
2573    // unit.
2574    LineComment = true;
2575  }
2576
2577  // Scan over the body of the comment. The common case, when scanning, is that
2578  // the comment contains normal ascii characters with nothing interesting in
2579  // them. As such, optimize for this case with the inner loop.
2580  //
2581  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2582  // character that ends the line comment.
2583
2584  // C++23 [lex.phases] p1
2585  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2586  // diagnostic only once per entire ill-formed subsequence to avoid
2587  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2588  bool UnicodeDecodingAlreadyDiagnosed = false;
2589
2590  char C;
2591  while (true) {
2592    C = *CurPtr;
2593    // Skip over characters in the fast loop.
2594    while (isASCII(C) && C != 0 && // Potentially EOF.
2595           C != '\n' && C != '\r') { // Newline or DOS-style newline.
2596      C = *++CurPtr;
2597      UnicodeDecodingAlreadyDiagnosed = false;
2598    }
2599
    // Validate (but otherwise skip) UTF-8 sequences in the comment body.
2600    if (!isASCII(C)) {
2601      unsigned Length = llvm::getUTF8SequenceSize(
2602          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2603      if (Length == 0) {
2604        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2605          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2606        UnicodeDecodingAlreadyDiagnosed = true;
2607        ++CurPtr;
2608      } else {
2609        UnicodeDecodingAlreadyDiagnosed = false;
2610        CurPtr += Length;
2611      }
2612      continue;
2613    }
2614
2615    const char *NextLine = CurPtr;
2616    if (C != 0) {
2617      // We found a newline, see if it's escaped.
2618      const char *EscapePtr = CurPtr-1;
2619      bool HasSpace = false;
2620      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2621        --EscapePtr;
2622        HasSpace = true;
2623      }
2624
2625      if (*EscapePtr == '\\')
2626        // Escaped newline.
2627        CurPtr = EscapePtr;
2628      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2629               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2630        // Trigraph-escaped newline.
2631        CurPtr = EscapePtr-2;
2632      else
2633        break; // This is a newline, we're done.
2634
2635      // If there was space between the backslash and newline, warn about it.
2636      if (HasSpace && !isLexingRawMode())
2637        Diag(EscapePtr, diag::backslash_newline_space);
2638    }
2639
2640    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2641    // properly decode the character. Read it in raw mode to avoid emitting
2642    // diagnostics about things like trigraphs. If we see an escaped newline,
2643    // we'll handle it below.
2644    const char *OldPtr = CurPtr;
2645    bool OldRawMode = isLexingRawMode();
2646    LexingRawMode = true;
2647    C = getAndAdvanceChar(CurPtr, Result);
2648    LexingRawMode = OldRawMode;
2649
2650    // If we only read only one character, then no special handling is needed.
2651    // We're done and can skip forward to the newline.
2652    if (C != 0 && CurPtr == OldPtr+1) {
2653      CurPtr = NextLine;
2654      break;
2655    }
2656
2657    // If we read multiple characters, and one of those characters was a \r or
2658    // \n, then we had an escaped newline within the comment. Emit diagnostic
2659    // unless the next line is also a // comment.
2660    if (CurPtr != OldPtr + 1 && C != '/' &&
2661        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2662      for (; OldPtr != CurPtr; ++OldPtr)
2663        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2664          // Okay, we found a // comment that ends in a newline, if the next
2665          // line is also a // comment, but has spaces, don't emit a diagnostic.
2666          if (isWhitespace(C)) {
2667            const char *ForwardPtr = CurPtr;
2668            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2669              ++ForwardPtr;
2670            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2671              break;
2672          }
2673
2674          if (!isLexingRawMode())
2675            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2676          break;
2677        }
2678    }
2679
2680    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2681      --CurPtr;
2682      break;
2683    }
2684
2685    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2687      cutOffLexing();
2688      return false;
2689    }
2690  }
2691
2692  // Found but did not consume the newline. Notify comment handlers about the
2693  // comment unless we're in a #if 0 block.
2694  if (PP && !isLexingRawMode() &&
2696                                            getSourceLocation(CurPtr)))) {
2697    BufferPtr = CurPtr;
2698    return true; // A token has to be returned.
2699  }
2700
2701  // If we are returning comments as tokens, return this comment as a token.
2702  if (inKeepCommentMode())
2703    return SaveLineComment(Result, CurPtr);
2704
2705  // If we are inside a preprocessor directive and we see the end of line,
2706  // return immediately, so that the lexer can return this as an EOD token.
2707  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2708    BufferPtr = CurPtr;
2709    return false;
2710  }
2711
2712  // Otherwise, eat the \n character. We don't care if this is a \n\r or
2713  // \r\n sequence. This is an efficiency hack (because we know the \n can't
2714  // contribute to another token), it isn't needed for correctness. Note that
2715  // this is ok even in KeepWhitespaceMode, because we would have returned the
2716  // comment above in that mode.
2717  NewLinePtr = CurPtr++;
2718
2719  // The next returned token is at the start of the line.
2720  Result.setFlag(Token::StartOfLine);
2721  TokAtPhysicalStartOfLine = true;
2722  // No leading whitespace seen so far.
2723  Result.clearFlag(Token::LeadingSpace);
2724  BufferPtr = CurPtr;
2725  return false;
2726}
2727
2728/// If in save-comment mode, package up this Line comment in an appropriate
2729/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  // A line comment inside a macro would otherwise swallow the rest of the
  // macro definition, so rewrite "//..." as "/*...*/" before stashing it.
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  // Re-create the token with the block-comment spelling; the source location
  // range still refers to the original line comment.
  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}
2754
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so. We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards from the newline, consuming an arbitrarily long chain of
  // escaped newlines. The scan succeeds when a '*' immediately precedes the
  // chain (after splicing, the comment ends in "*/").
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline. (NUL bytes are tolerated here as well,
    // treated just like horizontal whitespace.)
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash ("??/").
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Only another newline may precede this escape; anything else means the
    // escaped-newline chain does not start right after a '*'.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}
2829
2830#ifdef __SSE2__
2831#include <emmintrin.h>
2832#elif __ALTIVEC__
2833#include <altivec.h>
2834#undef bool
2835#endif
2836
2837/// We have just read from input the / and * characters that started a comment.
2838/// Read until we find the * and / characters that terminate the comment.
2839/// Note that we don't bother decoding trigraphs or escaped newlines in block
2840/// comments, because they cannot cause the comment to end. The only thing
2841/// that can happen is the comment could end with an escaped newline between
2842/// the terminating * and /.
2843///
2844/// If we're in KeepCommentMode or any CommentHandler has inserted
2845/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    // Hit end-of-file immediately after the "/*".
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // movemask of the raw bytes reports any byte with the high bit set,
        // i.e. a non-ASCII byte: take the slow path so it can be validated.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: scan 16 bytes at a time for non-ASCII or slashes.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        // Undecodable byte: diagnose once per ill-formed subsequence.
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*') // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
3059
3060//===----------------------------------------------------------------------===//
3061// Primary Lexing Entry Points
3062//===----------------------------------------------------------------------===//
3063
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      // Ordinary character: append it if the caller asked for the text.
      if (Result)
        Result->push_back(Char);
      break;
    case 0: // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done.
      return;
    }
  }
}
3117
3118/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3119/// condition, reporting diagnostics and handling other edge cases as required.
3120/// This returns true if Result contains a token, false if PP.Lex should be
3121/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first. The next token returned will
  // then be the end of file.
    // Done parsing the "line".
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
    return true; // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token. Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    // Suppress the error at the code-completion file location, since the
    // conditional may simply not have been closed yet there.
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    // Offer a fix-it that appends the missing newline.
    Diag(BufferEnd, DiagID)
      << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
}
3196
3197/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3198/// the specified lexer will return a tok::l_paren token, 0 if it is something
3199/// else and 2 if there are no more tokens in the buffer controlled by the
3200/// lexer.
3201unsigned Lexer::isNextPPTokenLParen() {
3202 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3203
3204 if (isDependencyDirectivesLexer()) {
3205 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3206 return 2;
3207 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3208 tok::l_paren);
3209 }
3210
3211 // Switch to 'skipping' mode. This will ensure that we can lex a token
3212 // without emitting diagnostics, disables macro expansion, and will cause EOF
3213 // to return an EOF token instead of popping the include stack.
3214 LexingRawMode = true;
3215
3216 // Save state that can be changed while lexing so that we can restore it.
3217 const char *TmpBufferPtr = BufferPtr;
3218 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3219 bool atStartOfLine = IsAtStartOfLine;
3220 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3221 bool leadingSpace = HasLeadingSpace;
3222
3223 Token Tok;
3224 Lex(Tok);
3225
3226 // Restore state that may have changed.
3227 BufferPtr = TmpBufferPtr;
3228 ParsingPreprocessorDirective = inPPDirectiveMode;
3229 HasLeadingSpace = leadingSpace;
3230 IsAtStartOfLine = atStartOfLine;
3231 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3232
3233 // Restore the lexer back to non-skipping mode.
3234 LexingRawMode = false;
3235
3236 if (Tok.is(tok::eof))
3237 return 2;
3238 return Tok.is(tok::l_paren);
3239}
3240
3241/// Find the end of a version control conflict marker.
3242static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3243 ConflictMarkerKind CMK) {
3244 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3245 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3246 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3247 size_t Pos = RestOfBuffer.find(Terminator);
3248 while (Pos != StringRef::npos) {
3249 // Must occur at start of line.
3250 if (Pos == 0 ||
3251 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3252 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3253 Pos = RestOfBuffer.find(Terminator);
3254 continue;
3255 }
3256 return RestOfBuffer.data()+Pos;
3257 }
3258 return nullptr;
3259}
3260
3261/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3262/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3263/// and recover nicely. This returns true if it is a conflict marker and false
3264/// if not.
3265bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3266 // Only a conflict marker if it starts at the beginning of a line.
3267 if (CurPtr != BufferStart &&
3268 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3269 return false;
3270
3271 // Check to see if we have <<<<<<< or >>>>.
3272 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3273 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3274 return false;
3275
3276 // If we have a situation where we don't care about conflict markers, ignore
3277 // it.
3278 if (CurrentConflictMarkerState || isLexingRawMode())
3279 return false;
3280
3281 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3282
3283 // Check to see if there is an ending marker somewhere in the buffer at the
3284 // start of a line to terminate this conflict marker.
3285 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3286 // We found a match. We are really in a conflict marker.
3287 // Diagnose this, and ignore to the end of line.
3288 Diag(CurPtr, diag::err_conflict_marker);
3289 CurrentConflictMarkerState = Kind;
3290
3291 // Skip ahead to the end of line. We know this exists because the
3292 // end-of-conflict marker starts with \r or \n.
3293 while (*CurPtr != '\r' && *CurPtr != '\n') {
3294 assert(CurPtr != BufferEnd && "Didn't find end of line");
3295 ++CurPtr;
3296 }
3297 BufferPtr = CurPtr;
3298 return true;
3299 }
3300
3301 // No end of conflict marker found.
3302 return false;
3303}
3304
3305/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3306/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3307/// is the end of a conflict marker. Handle it by ignoring up until the end of
3308/// the line. This returns true if it is a conflict marker and false if not.
3309bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3310 // Only a conflict marker if it starts at the beginning of a line.
3311 if (CurPtr != BufferStart &&
3312 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3313 return false;
3314
3315 // If we have a situation where we don't care about conflict markers, ignore
3316 // it.
3317 if (!CurrentConflictMarkerState || isLexingRawMode())
3318 return false;
3319
3320 // Check to see if we have the marker (4 characters in a row).
3321 for (unsigned i = 1; i != 4; ++i)
3322 if (CurPtr[i] != CurPtr[0])
3323 return false;
3324
3325 // If we do have it, search for the end of the conflict marker. This could
3326 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3327 // be the end of conflict marker.
3328 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3329 CurrentConflictMarkerState)) {
3330 CurPtr = End;
3331
3332 // Skip ahead to the end of line.
3333 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3334 ++CurPtr;
3335
3336 BufferPtr = CurPtr;
3337
3338 // No longer in the conflict marker.
3339 CurrentConflictMarkerState = CMK_None;
3340 return true;
3341 }
3342
3343 return false;
3344}
3345
/// Scan forward from CurPtr for the "#>" that terminates an editor
/// placeholder. Returns a pointer one past the '>' on success, or nullptr if
/// no terminator exists before BufferEnd.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // We read two characters at a time, so stop one short of the buffer end.
  for (const char *Ptr = CurPtr, *Last = BufferEnd - 1; Ptr != Last; ++Ptr)
    if (Ptr[0] == '#' && Ptr[1] == '>')
      return Ptr + 2; // Point just past the terminator.
  return nullptr;
}
3357
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
    return false;
  // Find the closing "#>"; without one this is not a placeholder.
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  // Placeholders are an error in real source unless explicitly allowed
  // (e.g. when an IDE is driving the compiler).
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  // Return the whole <#...#> range as a raw identifier token.
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  BufferPtr = End;
  return true;
}
3376
3377bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3378 if (PP && PP->isCodeCompletionEnabled()) {
3379 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3380 return Loc == PP->getCodeCompletionLoc();
3381 }
3382
3383 return false;
3384}
3385
/// Try to lex a numeric universal character name (\uXXXX, \UXXXXXXXX, or the
/// delimited form \u{...}) starting at StartPtr (which points at the 'u'/'U').
/// On success, advances StartPtr past the escape and returns the code point;
/// otherwise returns std::nullopt, diagnosing when Result is non-null and we
/// are not lexing in raw mode.
std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \u takes 4 hex digits, \U takes 8.
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  // UCNs are a C99/C++ feature; warn and reject in C89 mode.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    // A '{' directly after \u switches to the delimited form \u{...}.
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      // Not a hex digit: fine for the fixed-length form (ends the escape),
      // an error inside a delimited escape.
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Appending another digit would overflow the 32-bit code point.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The delimited form is only valid with \u, not \U.
  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  // Delimited escape sequences are standard in C++23 and an extension before.
  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}
3499
/// Try to lex a named universal character escape (\N{CHARACTER NAME})
/// starting at StartPtr (which points at the 'N'). On success, advances
/// StartPtr past the escape and returns the code point; otherwise returns
/// std::nullopt, diagnosing when Result is non-null and not in raw mode.
std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  // The escape must be of the form \N{...}; anything else is incomplete.
  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  // Accumulate the character name until the closing '}'.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // Look the name up strictly first; on failure, try loose matching purely to
  // produce a better diagnostic.
  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
                makeCharRange(*this, StartName, CurPtr - CharSize),
                LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  // Named escapes are standard in C++23 and an extension in earlier modes.
  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid. This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}
3593
3594uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3595 Token *Result) {
3596
3597 unsigned CharSize;
3598 std::optional<uint32_t> CodePointOpt;
3599 char Kind = getCharAndSize(StartPtr, CharSize);
3600 if (Kind == 'u' || Kind == 'U')
3601 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3602 else if (Kind == 'N')
3603 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3604
3605 if (!CodePointOpt)
3606 return 0;
3607
3608 uint32_t CodePoint = *CodePointOpt;
3609
3610 // Don't apply C family restrictions to UCNs in assembly mode
3611 if (LangOpts.AsmPreprocessor)
3612 return CodePoint;
3613
3614 // C23 6.4.3p2: A universal character name shall not designate a code point
3615 // where the hexadecimal value is:
3616 // - in the range D800 through DFFF inclusive; or
3617 // - greater than 10FFFF.
3618 // A universal-character-name outside the c-char-sequence of a character
3619 // constant, or the s-char-sequence of a string-literal shall not designate
3620 // a control character or a character in the basic character set.
3621
3622 // C++11 [lex.charset]p2: If the hexadecimal value for a
3623 // universal-character-name corresponds to a surrogate code point (in the
3624 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3625 // if the hexadecimal value for a universal-character-name outside the
3626 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3627 // string literal corresponds to a control character (in either of the
3628 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3629 // basic source character set, the program is ill-formed.
3630 if (CodePoint < 0xA0) {
3631 // We don't use isLexingRawMode() here because we need to warn about bad
3632 // UCNs even when skipping preprocessing tokens in a #if block.
3633 if (Result && PP) {
3634 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3635 Diag(BufferPtr, diag::err_ucn_control_character);
3636 else {
3637 char C = static_cast<char>(CodePoint);
3638 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3639 }
3640 }
3641
3642 return 0;
3643 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3644 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3645 // We don't use isLexingRawMode() here because we need to diagnose bad
3646 // UCNs even when skipping preprocessing tokens in a #if block.
3647 if (Result && PP) {
3648 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3649 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3650 else
3651 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3652 }
3653 return 0;
3654 }
3655
3656 return CodePoint;
3657}
3658
/// Check whether the code point C should be treated as (non-ASCII) whitespace.
/// If so, diagnose the extension and mark the token as having leading space;
/// returns true exactly in that case.
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  // Don't diagnose in raw mode or when re-lexing preprocessed output.
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
    Diag(BufferPtr, diag::ext_unicode_whitespace)
        << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}
3671
3672void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3673 IsAtStartOfLine = Result.isAtStartOfLine();
3674 HasLeadingSpace = Result.hasLeadingSpace();
3675 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3676 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3677}
3678
  // Lex one token in normal (non-dependency-directives) mode: transfer the
  // lexer's pending start-of-line / leading-space / empty-macro state onto
  // the new token, then defer to LexTokenInternal.
  // NOTE(review): the function signature line (bool Lexer::Lex(Token &Result))
  // is not visible in this excerpt — confirm against the original file.
  assert(!isDependencyDirectivesLexer());

  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    // NOTE(review): the statement that flags the token (presumably
    // Result.setFlag(Token::LeadingEmptyMacro)) is on a dropped line — confirm.
    HasLeadingEmptyMacro = false;
  }

  // Latch-and-clear the physical start-of-line state: LexTokenInternal needs
  // the value as of this call, and the member must be reset for the next one.
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;  // Only consumed by the assert below.
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}
3710
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
///
/// \param Result the token to fill in.
/// \param TokAtPhysicalStartOfLine true if this token begins at the physical
///        start of a line; used to recognize '#' as a directive introducer.
/// \returns true if a token was produced in \p Result; false if the caller
///          should lex again (e.g. after a preprocessing directive was
///          handed off to the preprocessor).
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexStart:
  assert(!Result.needsCleaning() && "Result needs cleaning");
  assert(!Result.hasPtrData() && "Result has not been reset");

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if (isHorizontalWhitespace(*CurPtr)) {
    do {
      ++CurPtr;
    } while (isHorizontalWhitespace(*CurPtr));

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  // NOTE(review): the declaration of the local `Kind` (tok::TokenKind) used
  // throughout the switch is on a dropped line here — confirm upstream.

  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  // Dispatch on the first character of the token.
  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Fold a CRLF pair into a single newline before the '\n' handling below.
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
      // Done parsing the "line".

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.RawStringLiterals) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.RawStringLiterals) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.RawStringLiterals && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      // '.5' etc. is a floating constant.
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          // NOTE(review): the FixItHint insertion call that consumes this
          // location is on a dropped line here — confirm upstream.
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
                  getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      if (LangOpts.OpenCL && Char == '^')
        Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

      // Invalid UTF-8: consume one byte and emit it as tok::unknown.
      // NOTE(review): the guarding `if` condition for this branch is on
      // dropped lines here — confirm upstream.
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);

    // With a fatal failure in the module loader, we abort parsing.
    return true;

  // We parsed the directive; lex a token with the new state.
  return false;

LexNextToken:
  // Re-enter the lexer loop without recursing; NeedsCleaning must be reset
  // because the skipped text may have set it.
  Result.clearFlag(Token::NeedsCleaning);
  goto LexStart;
}
4501
// Materialize a pre-scanned dependency-directive token (DDTok) into a normal
// Token: point its location at its spelling in the buffer, copy the kind,
// flags, and length, and advance BufferPtr past it.  Returns a pointer to the
// token's first character in the buffer.
// NOTE(review): the parameter list line is not visible in this excerpt —
// confirm against the original file.
const char *Lexer::convertDependencyDirectiveToken(
  const char *TokPtr = BufferStart + DDTok.Offset;
  Result.startToken();
  Result.setLocation(getSourceLocation(TokPtr));
  Result.setKind(DDTok.Kind);
  Result.setFlag((Token::TokenFlags)DDTok.Flags);
  Result.setLength(DDTok.Length);
  // Subsequent lexing continues right after this token.
  BufferPtr = TokPtr + DDTok.Length;
  return TokPtr;
}
4513
// Lex the next token in dependency-directives-scanning mode: tokens come from
// the pre-scanned DepDirectives list rather than from character-level lexing.
// Returns true if a token was produced in Result, false if the caller should
// lex again (a directive hash was handed to the preprocessor).
bool Lexer::LexDependencyDirectiveToken(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  // Skip over directives whose token list is exhausted, stopping at EOF.
  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
    if (DepDirectives.front().Kind == pp_eof)
      return LexEndOfFile(Result, BufferEnd);
    if (DepDirectives.front().Kind == tokens_present_before_eof)
      MIOpt.ReadToken();
    NextDepDirectiveTokenIndex = 0;
    DepDirectives = DepDirectives.drop_front();
  }

  // NOTE(review): the declaration introducing DDTok (the current pre-scanned
  // token) is on a dropped line here — confirm upstream.
      DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
    // Read something other than a preprocessor directive hash.
    MIOpt.ReadToken();
  }

  if (ParsingFilename && DDTok.is(tok::less)) {
    // Re-lex a '<...>' header-name from the raw buffer, then fast-forward the
    // pre-scanned token index past everything the angled literal consumed.
    BufferPtr = BufferStart + DDTok.Offset;
    LexAngledStringLiteral(Result, BufferPtr + 1);
    if (Result.isNot(tok::header_name))
      return true;
    // Advance the index of lexed tokens.
    while (true) {
      const dependency_directives_scan::Token &NextTok =
          DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
      if (BufferStart + NextTok.Offset >= BufferPtr)
        break;
      ++NextDepDirectiveTokenIndex;
    }
    return true;
  }

  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
    // A directive-introducing '#': hand off to the preprocessor (the call is
    // on a dropped line here) and tell the caller to lex again.
    return false;
  }
  if (Result.is(tok::raw_identifier)) {
    Result.setRawIdentifierData(TokPtr);
    if (!isLexingRawMode()) {
      // NOTE(review): the lookup that binds II (the token's IdentifierInfo)
      // is on a dropped line here — confirm upstream.
      if (II->isHandleIdentifierCase())
        return PP->HandleIdentifier(Result);
    }
    return true;
  }
  if (Result.isLiteral()) {
    Result.setLiteralData(TokPtr);
    return true;
  }
  if (Result.is(tok::colon)) {
    // Convert consecutive colons to 'tok::coloncolon'.
    if (*BufferPtr == ':') {
      assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
          tok::colon));
      ++NextDepDirectiveTokenIndex;
      Result.setKind(tok::coloncolon);
    }
    return true;
  }
  if (Result.is(tok::eod))

  return true;
}
4585
// While skipping a disabled preprocessor region in dependency-directives mode,
// advance past pre-scanned directives until reaching one that can terminate
// the region (#elif/#elifdef/#elifndef/#else/#endif at the current nesting
// depth), tracking nested conditionals.  Produces the region-terminating '#'
// in Result and returns false so the caller re-lexes; returns the result of
// LexEndOfFile if the input ends first.
bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  bool Stop = false;
  unsigned NestedIfs = 0;
  do {
    DepDirectives = DepDirectives.drop_front();
    switch (DepDirectives.front().Kind) {
    case pp_none:
      llvm_unreachable("unexpected 'pp_none'");
    // Directives with no effect on conditional nesting are skipped outright.
    // NOTE(review): several additional case labels in this group are on
    // dropped lines here — confirm the full list upstream.
    case pp_include:
    case pp_define:
    case pp_undef:
    case pp_import:
    case pp_pragma_import:
    case pp_pragma_once:
    case pp_include_next:
    case decl_at_import:
    case cxx_module_decl:
    case cxx_import_decl:
      break;
    case pp_if:
    case pp_ifdef:
    case pp_ifndef:
      // Entering a nested conditional inside the skipped region.
      ++NestedIfs;
      break;
    case pp_elif:
    case pp_elifdef:
    case pp_elifndef:
    case pp_else:
      // Only an outermost-level alternative branch terminates the skip.
      if (!NestedIfs) {
        Stop = true;
      }
      break;
    case pp_endif:
      if (!NestedIfs) {
        Stop = true;
      } else {
        --NestedIfs;
      }
      break;
    case pp_eof:
      NextDepDirectiveTokenIndex = 0;
      return LexEndOfFile(Result, BufferEnd);
    }
  } while (!Stop);

  // NOTE(review): the declaration introducing DDTok (the directive's leading
  // token) is on a dropped line here — confirm upstream.
      DepDirectives.front().Tokens.front();
  assert(DDTok.is(tok::hash));
  NextDepDirectiveTokenIndex = 1;

  convertDependencyDirectiveToken(DDTok, Result);
  return false;
}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:84
Defines the Diagnostic-related interfaces.
Expr * E
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:947
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1544
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1738
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1260
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:324
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3242
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1663
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:560
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:284
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1188
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1558
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1628
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1525
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1612
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3346
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1531
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:919
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2758
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1904
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1241
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1586
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1634
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:543
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:758
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1220
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1493
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:939
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:138
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:127
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:101
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:499
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1023
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1357
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionMode - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:277
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1059
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3066
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:871
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1231
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3679
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:790
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:183
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:893
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:954
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1137
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1212
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1157
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:451
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1133
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:498
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:608
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:219
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1106
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:242
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:636
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:509
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1324
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:849
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:309
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:586
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:138
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:69
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:60
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:77
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3092
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3458
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3443
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3746
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:49
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:175
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:168
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
unsigned int uint32_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x2 + p.y 2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1532
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1664
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1550
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1545
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1539
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:579
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.