clang 20.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MathExtras.h"
36#include "llvm/Support/MemoryBufferRef.h"
37#include "llvm/Support/NativeFormatting.h"
38#include "llvm/Support/Unicode.h"
39#include "llvm/Support/UnicodeCharRanges.h"
40#include <algorithm>
41#include <cassert>
42#include <cstddef>
43#include <cstdint>
44#include <cstring>
45#include <optional>
46#include <string>
47#include <tuple>
48#include <utility>
49
50#ifdef __SSE4_2__
51#include <nmmintrin.h>
52#endif
53
54using namespace clang;
55
56//===----------------------------------------------------------------------===//
57// Token Class Implementation
58//===----------------------------------------------------------------------===//
59
60/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
62 if (isAnnotation())
63 return false;
64 if (const IdentifierInfo *II = getIdentifierInfo())
65 return II->getObjCKeywordID() == objcKey;
66 return false;
67}
68
69/// getObjCKeywordID - Return the ObjC keyword kind.
71 if (isAnnotation())
72 return tok::objc_not_keyword;
73 const IdentifierInfo *specId = getIdentifierInfo();
74 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
75}
76
77/// Determine whether the token kind starts a simple-type-specifier.
78bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
79 switch (getKind()) {
80 case tok::annot_typename:
81 case tok::annot_decltype:
82 case tok::annot_pack_indexing_type:
83 return true;
84
85 case tok::kw_short:
86 case tok::kw_long:
87 case tok::kw___int64:
88 case tok::kw___int128:
89 case tok::kw_signed:
90 case tok::kw_unsigned:
91 case tok::kw_void:
92 case tok::kw_char:
93 case tok::kw_int:
94 case tok::kw_half:
95 case tok::kw_float:
96 case tok::kw_double:
97 case tok::kw___bf16:
98 case tok::kw__Float16:
99 case tok::kw___float128:
100 case tok::kw___ibm128:
101 case tok::kw_wchar_t:
102 case tok::kw_bool:
103 case tok::kw__Bool:
104 case tok::kw__Accum:
105 case tok::kw__Fract:
106 case tok::kw__Sat:
107#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
108#include "clang/Basic/TransformTypeTraits.def"
109 case tok::kw___auto_type:
110 case tok::kw_char16_t:
111 case tok::kw_char32_t:
112 case tok::kw_typeof:
113 case tok::kw_decltype:
114 case tok::kw_char8_t:
115 return getIdentifierInfo()->isKeyword(LangOpts);
116
117 default:
118 return false;
119 }
120}
121
122//===----------------------------------------------------------------------===//
123// Lexer Class Implementation
124//===----------------------------------------------------------------------===//
125
126void Lexer::anchor() {}
127
128void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
129 const char *BufEnd) {
130 BufferStart = BufStart;
131 BufferPtr = BufPtr;
132 BufferEnd = BufEnd;
133
134 assert(BufEnd[0] == 0 &&
135 "We assume that the input buffer has a null character at the end"
136 " to simplify lexing!");
137
138 // Check whether we have a BOM in the beginning of the buffer. If yes - act
139 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
140 // skip the UTF-8 BOM if it's present.
141 if (BufferStart == BufferPtr) {
142 // Determine the size of the BOM.
143 StringRef Buf(BufferStart, BufferEnd - BufferStart);
144 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
145 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
146 .Default(0);
147
148 // Skip the BOM.
149 BufferPtr += BOMLength;
150 }
151
152 Is_PragmaLexer = false;
153 CurrentConflictMarkerState = CMK_None;
154
155 // Start of the file is a start of line.
156 IsAtStartOfLine = true;
157 IsAtPhysicalStartOfLine = true;
158
159 HasLeadingSpace = false;
160 HasLeadingEmptyMacro = false;
161
162 // We are not after parsing a #.
164
165 // We are not after parsing #include.
166 ParsingFilename = false;
167
168 // We are not in raw mode. Raw mode disables diagnostics and interpretation
169 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
170 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
171 // or otherwise skipping over tokens.
172 LexingRawMode = false;
173
174 // Default to not keeping comments.
175 ExtendedTokenMode = 0;
176
177 NewLinePtr = nullptr;
178}
179
180/// Lexer constructor - Create a new lexer object for the specified buffer
181/// with the specified preprocessor managing the lexing process. This lexer
182/// assumes that the associated file buffer and Preprocessor objects will
183/// outlive it, so it doesn't take ownership of either of them.
184Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
185 Preprocessor &PP, bool IsFirstIncludeOfFile)
186 : PreprocessorLexer(&PP, FID),
187 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
188 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
189 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
190 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
191 InputFile.getBufferEnd());
192
194}
195
196/// Lexer constructor - Create a new raw lexer object. This object is only
197/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
198/// range will outlive it, so it doesn't take ownership of it.
199Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
200 const char *BufStart, const char *BufPtr, const char *BufEnd,
201 bool IsFirstIncludeOfFile)
202 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
203 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
204 InitLexer(BufStart, BufPtr, BufEnd);
205
206 // We *are* in raw mode.
207 LexingRawMode = true;
208}
209
210/// Lexer constructor - Create a new raw lexer object. This object is only
211/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
212/// range will outlive it, so it doesn't take ownership of it.
213Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
214 const SourceManager &SM, const LangOptions &langOpts,
215 bool IsFirstIncludeOfFile)
216 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
217 FromFile.getBufferStart(), FromFile.getBufferEnd(),
218 IsFirstIncludeOfFile) {}
219
221 assert(PP && "Cannot reset token mode without a preprocessor");
222 if (LangOpts.TraditionalCPP)
224 else
226}
227
228/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
229/// _Pragma expansion. This has a variety of magic semantics that this method
230/// sets up. It returns a new'd Lexer that must be delete'd when done.
231///
232/// On entrance to this routine, TokStartLoc is a macro location which has a
233/// spelling loc that indicates the bytes to be lexed for the token and an
234/// expansion location that indicates where all lexed tokens should be
235/// "expanded from".
236///
237/// TODO: It would really be nice to make _Pragma just be a wrapper around a
238/// normal lexer that remaps tokens as they fly by. This would require making
239/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
240/// interface that could handle this stuff. This would pull GetMappedTokenLoc
241/// out of the critical path of the lexer!
242///
244 SourceLocation ExpansionLocStart,
245 SourceLocation ExpansionLocEnd,
246 unsigned TokLen, Preprocessor &PP) {
248
249 // Create the lexer as if we were going to lex the file normally.
250 FileID SpellingFID = SM.getFileID(SpellingLoc);
251 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
252 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
253
254 // Now that the lexer is created, change the start/end locations so that we
255 // just lex the subsection of the file that we want. This is lexing from a
256 // scratch buffer.
257 const char *StrData = SM.getCharacterData(SpellingLoc);
258
259 L->BufferPtr = StrData;
260 L->BufferEnd = StrData+TokLen;
261 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
262
263 // Set the SourceLocation with the remapping information. This ensures that
264 // GetMappedTokenLoc will remap the tokens as they are lexed.
265 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
266 ExpansionLocStart,
267 ExpansionLocEnd, TokLen);
268
269 // Ensure that the lexer thinks it is inside a directive, so that end \n will
270 // return an EOD token.
272
273 // This lexer really is for _Pragma.
274 L->Is_PragmaLexer = true;
275 return L;
276}
277
278void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
279 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
280 this->IsAtStartOfLine = IsAtStartOfLine;
281 assert((BufferStart + Offset) <= BufferEnd);
282 BufferPtr = BufferStart + Offset;
283}
284
/// Escape \p Str in place so that it can be placed between \p Quote
/// characters.
///
/// - Every backslash and every \p Quote character gets a backslash prefix.
/// - Each '\n' or '\r' becomes the two characters '\\' 'n'.  A two-character
///   newline made of both kinds ("\r\n" or "\n\r") collapses into a single
///   "\\n".
///
/// The escaped text is built in one forward pass and assigned back once, so
/// the cost is linear in the length of the input rather than paying for an
/// element shift on every inserted character.
///
/// \tparam T a character container providing size_type, size(), operator[]
///           and assign(first, last).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;
  const SizeTy Size = Str.size();

  std::string Escaped;
  Escaped.reserve(Size);

  for (SizeTy I = 0; I != Size; ++I) {
    const char C = Str[I];
    if (C == '\\' || C == Quote) {
      Escaped.push_back('\\');
      Escaped.push_back(C);
    } else if (C == '\n' || C == '\r') {
      Escaped.push_back('\\');
      Escaped.push_back('n');
      // Swallow the second half of a "\r\n" / "\n\r" pair.
      if (I + 1 != Size) {
        const char Next = Str[I + 1];
        if ((Next == '\n' || Next == '\r') && Next != C)
          ++I;
      }
    } else {
      Escaped.push_back(C);
    }
  }

  Str.assign(Escaped.begin(), Escaped.end());
}
309
310std::string Lexer::Stringify(StringRef Str, bool Charify) {
311 std::string Result = std::string(Str);
312 char Quote = Charify ? '\'' : '"';
313 StringifyImpl(Result, Quote);
314 return Result;
315}
316
318
319//===----------------------------------------------------------------------===//
320// Token Spelling
321//===----------------------------------------------------------------------===//
322
323/// Slow case of getSpelling. Extract the characters comprising the
324/// spelling of this token from the provided input buffer.
325static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
326 const LangOptions &LangOpts, char *Spelling) {
327 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
328
329 size_t Length = 0;
330 const char *BufEnd = BufPtr + Tok.getLength();
331
332 if (tok::isStringLiteral(Tok.getKind())) {
333 // Munch the encoding-prefix and opening double-quote.
334 while (BufPtr < BufEnd) {
335 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
336 Spelling[Length++] = CharAndSize.Char;
337 BufPtr += CharAndSize.Size;
338
339 if (Spelling[Length - 1] == '"')
340 break;
341 }
342
343 // Raw string literals need special handling; trigraph expansion and line
344 // splicing do not occur within their d-char-sequence nor within their
345 // r-char-sequence.
346 if (Length >= 2 &&
347 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
348 // Search backwards from the end of the token to find the matching closing
349 // quote.
350 const char *RawEnd = BufEnd;
351 do --RawEnd; while (*RawEnd != '"');
352 size_t RawLength = RawEnd - BufPtr + 1;
353
354 // Everything between the quotes is included verbatim in the spelling.
355 memcpy(Spelling + Length, BufPtr, RawLength);
356 Length += RawLength;
357 BufPtr += RawLength;
358
359 // The rest of the token is lexed normally.
360 }
361 }
362
363 while (BufPtr < BufEnd) {
364 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
365 Spelling[Length++] = CharAndSize.Char;
366 BufPtr += CharAndSize.Size;
367 }
368
369 assert(Length < Tok.getLength() &&
370 "NeedsCleaning flag set on token that didn't need cleaning!");
371 return Length;
372}
373
374/// getSpelling() - Return the 'spelling' of this token. The spelling of a
375/// token are the characters used to represent the token in the source file
376/// after trigraph expansion and escaped-newline folding. In particular, this
377/// wants to get the true, uncanonicalized, spelling of things like digraphs
378/// UCNs, etc.
380 SmallVectorImpl<char> &buffer,
381 const SourceManager &SM,
382 const LangOptions &options,
383 bool *invalid) {
384 // Break down the source location.
385 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
386
387 // Try to the load the file buffer.
388 bool invalidTemp = false;
389 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
390 if (invalidTemp) {
391 if (invalid) *invalid = true;
392 return {};
393 }
394
395 const char *tokenBegin = file.data() + locInfo.second;
396
397 // Lex from the start of the given location.
398 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
399 file.begin(), tokenBegin, file.end());
400 Token token;
401 lexer.LexFromRawLexer(token);
402
403 unsigned length = token.getLength();
404
405 // Common case: no need for cleaning.
406 if (!token.needsCleaning())
407 return StringRef(tokenBegin, length);
408
409 // Hard case, we need to relex the characters into the string.
410 buffer.resize(length);
411 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
412 return StringRef(buffer.data(), buffer.size());
413}
414
415/// getSpelling() - Return the 'spelling' of this token. The spelling of a
416/// token are the characters used to represent the token in the source file
417/// after trigraph expansion and escaped-newline folding. In particular, this
418/// wants to get the true, uncanonicalized, spelling of things like digraphs
419/// UCNs, etc.
420std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
421 const LangOptions &LangOpts, bool *Invalid) {
422 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
423
424 bool CharDataInvalid = false;
425 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
426 &CharDataInvalid);
427 if (Invalid)
428 *Invalid = CharDataInvalid;
429 if (CharDataInvalid)
430 return {};
431
432 // If this token contains nothing interesting, return it directly.
433 if (!Tok.needsCleaning())
434 return std::string(TokStart, TokStart + Tok.getLength());
435
436 std::string Result;
437 Result.resize(Tok.getLength());
438 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
439 return Result;
440}
441
442/// getSpelling - This method is used to get the spelling of a token into a
443/// preallocated buffer, instead of as an std::string. The caller is required
444/// to allocate enough space for the token, which is guaranteed to be at least
445/// Tok.getLength() bytes long. The actual length of the token is returned.
446///
447/// Note that this method may do two possible things: it may either fill in
448/// the buffer specified with characters, or it may *change the input pointer*
449/// to point to a constant buffer with the data already in it (avoiding a
450/// copy). The caller is not allowed to modify the returned buffer pointer
451/// if an internal buffer is returned.
452unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
453 const SourceManager &SourceMgr,
454 const LangOptions &LangOpts, bool *Invalid) {
455 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
456
457 const char *TokStart = nullptr;
458 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
459 if (Tok.is(tok::raw_identifier))
460 TokStart = Tok.getRawIdentifier().data();
461 else if (!Tok.hasUCN()) {
462 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
463 // Just return the string from the identifier table, which is very quick.
464 Buffer = II->getNameStart();
465 return II->getLength();
466 }
467 }
468
469 // NOTE: this can be checked even after testing for an IdentifierInfo.
470 if (Tok.isLiteral())
471 TokStart = Tok.getLiteralData();
472
473 if (!TokStart) {
474 // Compute the start of the token in the input lexer buffer.
475 bool CharDataInvalid = false;
476 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
477 if (Invalid)
478 *Invalid = CharDataInvalid;
479 if (CharDataInvalid) {
480 Buffer = "";
481 return 0;
482 }
483 }
484
485 // If this token contains nothing interesting, return it directly.
486 if (!Tok.needsCleaning()) {
487 Buffer = TokStart;
488 return Tok.getLength();
489 }
490
491 // Otherwise, hard case, relex the characters into the string.
492 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
493}
494
495/// MeasureTokenLength - Relex the token at the specified location and return
496/// its length in bytes in the input file. If the token needs cleaning (e.g.
497/// includes a trigraph or an escaped newline) then this count includes bytes
498/// that are part of that.
500 const SourceManager &SM,
501 const LangOptions &LangOpts) {
502 Token TheTok;
503 if (getRawToken(Loc, TheTok, SM, LangOpts))
504 return 0;
505 return TheTok.getLength();
506}
507
508/// Relex the token at the specified location.
509/// \returns true if there was a failure, false on success.
511 const SourceManager &SM,
512 const LangOptions &LangOpts,
513 bool IgnoreWhiteSpace) {
514 // TODO: this could be special cased for common tokens like identifiers, ')',
515 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
516 // all obviously single-char tokens. This could use
517 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
518 // something.
519
520 // If this comes from a macro expansion, we really do want the macro name, not
521 // the token this macro expanded to.
522 Loc = SM.getExpansionLoc(Loc);
523 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
524 bool Invalid = false;
525 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526 if (Invalid)
527 return true;
528
529 const char *StrData = Buffer.data()+LocInfo.second;
530
531 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
532 return true;
533
534 // Create a lexer starting at the beginning of this token.
535 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
536 Buffer.begin(), StrData, Buffer.end());
537 TheLexer.SetCommentRetentionState(true);
538 TheLexer.LexFromRawLexer(Result);
539 return false;
540}
541
542/// Returns the pointer that points to the beginning of line that contains
543/// the given offset, or null if the offset if invalid.
544static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
545 const char *BufStart = Buffer.data();
546 if (Offset >= Buffer.size())
547 return nullptr;
548
549 const char *LexStart = BufStart + Offset;
550 for (; LexStart != BufStart; --LexStart) {
551 if (isVerticalWhitespace(LexStart[0]) &&
552 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
553 // LexStart should point at first character of logical line.
554 ++LexStart;
555 break;
556 }
557 }
558 return LexStart;
559}
560
562 const SourceManager &SM,
563 const LangOptions &LangOpts) {
564 assert(Loc.isFileID());
565 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
566 if (LocInfo.first.isInvalid())
567 return Loc;
568
569 bool Invalid = false;
570 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
571 if (Invalid)
572 return Loc;
573
574 // Back up from the current location until we hit the beginning of a line
575 // (or the buffer). We'll relex from that point.
576 const char *StrData = Buffer.data() + LocInfo.second;
577 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
578 if (!LexStart || LexStart == StrData)
579 return Loc;
580
581 // Create a lexer starting at the beginning of this token.
582 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
583 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
584 Buffer.end());
585 TheLexer.SetCommentRetentionState(true);
586
587 // Lex tokens until we find the token that contains the source location.
588 Token TheTok;
589 do {
590 TheLexer.LexFromRawLexer(TheTok);
591
592 if (TheLexer.getBufferLocation() > StrData) {
593 // Lexing this token has taken the lexer past the source location we're
594 // looking for. If the current token encompasses our source location,
595 // return the beginning of that token.
596 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
597 return TheTok.getLocation();
598
599 // We ended up skipping over the source location entirely, which means
600 // that it points into whitespace. We're done here.
601 break;
602 }
603 } while (TheTok.getKind() != tok::eof);
604
605 // We've passed our source location; just return the original source location.
606 return Loc;
607}
608
610 const SourceManager &SM,
611 const LangOptions &LangOpts) {
612 if (Loc.isFileID())
613 return getBeginningOfFileToken(Loc, SM, LangOpts);
614
615 if (!SM.isMacroArgExpansion(Loc))
616 return Loc;
617
618 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
619 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
620 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
621 std::pair<FileID, unsigned> BeginFileLocInfo =
622 SM.getDecomposedLoc(BeginFileLoc);
623 assert(FileLocInfo.first == BeginFileLocInfo.first &&
624 FileLocInfo.second >= BeginFileLocInfo.second);
625 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
626}
627
namespace {

/// Classification of a preprocessor directive name seen while computing the
/// preamble.
enum PreambleDirectiveKind {
  /// A recognized directive; scanning continues past it.
  PDK_Skipped,
  /// An unrecognized directive; the preamble stops at its '#'.
  PDK_Unknown
};

} // namespace
636
638 const LangOptions &LangOpts,
639 unsigned MaxLines) {
640 // Create a lexer starting at the beginning of the file. Note that we use a
641 // "fake" file source location at offset 1 so that the lexer will track our
642 // position within the file.
643 const SourceLocation::UIntTy StartOffset = 1;
645 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
646 Buffer.end());
647 TheLexer.SetCommentRetentionState(true);
648
649 bool InPreprocessorDirective = false;
650 Token TheTok;
651 SourceLocation ActiveCommentLoc;
652
653 unsigned MaxLineOffset = 0;
654 if (MaxLines) {
655 const char *CurPtr = Buffer.begin();
656 unsigned CurLine = 0;
657 while (CurPtr != Buffer.end()) {
658 char ch = *CurPtr++;
659 if (ch == '\n') {
660 ++CurLine;
661 if (CurLine == MaxLines)
662 break;
663 }
664 }
665 if (CurPtr != Buffer.end())
666 MaxLineOffset = CurPtr - Buffer.begin();
667 }
668
669 do {
670 TheLexer.LexFromRawLexer(TheTok);
671
672 if (InPreprocessorDirective) {
673 // If we've hit the end of the file, we're done.
674 if (TheTok.getKind() == tok::eof) {
675 break;
676 }
677
678 // If we haven't hit the end of the preprocessor directive, skip this
679 // token.
680 if (!TheTok.isAtStartOfLine())
681 continue;
682
683 // We've passed the end of the preprocessor directive, and will look
684 // at this token again below.
685 InPreprocessorDirective = false;
686 }
687
688 // Keep track of the # of lines in the preamble.
689 if (TheTok.isAtStartOfLine()) {
690 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
691
692 // If we were asked to limit the number of lines in the preamble,
693 // and we're about to exceed that limit, we're done.
694 if (MaxLineOffset && TokOffset >= MaxLineOffset)
695 break;
696 }
697
698 // Comments are okay; skip over them.
699 if (TheTok.getKind() == tok::comment) {
700 if (ActiveCommentLoc.isInvalid())
701 ActiveCommentLoc = TheTok.getLocation();
702 continue;
703 }
704
705 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
706 // This is the start of a preprocessor directive.
707 Token HashTok = TheTok;
708 InPreprocessorDirective = true;
709 ActiveCommentLoc = SourceLocation();
710
711 // Figure out which directive this is. Since we're lexing raw tokens,
712 // we don't have an identifier table available. Instead, just look at
713 // the raw identifier to recognize and categorize preprocessor directives.
714 TheLexer.LexFromRawLexer(TheTok);
715 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
716 StringRef Keyword = TheTok.getRawIdentifier();
717 PreambleDirectiveKind PDK
718 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
719 .Case("include", PDK_Skipped)
720 .Case("__include_macros", PDK_Skipped)
721 .Case("define", PDK_Skipped)
722 .Case("undef", PDK_Skipped)
723 .Case("line", PDK_Skipped)
724 .Case("error", PDK_Skipped)
725 .Case("pragma", PDK_Skipped)
726 .Case("import", PDK_Skipped)
727 .Case("include_next", PDK_Skipped)
728 .Case("warning", PDK_Skipped)
729 .Case("ident", PDK_Skipped)
730 .Case("sccs", PDK_Skipped)
731 .Case("assert", PDK_Skipped)
732 .Case("unassert", PDK_Skipped)
733 .Case("if", PDK_Skipped)
734 .Case("ifdef", PDK_Skipped)
735 .Case("ifndef", PDK_Skipped)
736 .Case("elif", PDK_Skipped)
737 .Case("elifdef", PDK_Skipped)
738 .Case("elifndef", PDK_Skipped)
739 .Case("else", PDK_Skipped)
740 .Case("endif", PDK_Skipped)
741 .Default(PDK_Unknown);
742
743 switch (PDK) {
744 case PDK_Skipped:
745 continue;
746
747 case PDK_Unknown:
748 // We don't know what this directive is; stop at the '#'.
749 break;
750 }
751 }
752
753 // We only end up here if we didn't recognize the preprocessor
754 // directive or it was one that can't occur in the preamble at this
755 // point. Roll back the current token to the location of the '#'.
756 TheTok = HashTok;
757 } else if (TheTok.isAtStartOfLine() &&
758 TheTok.getKind() == tok::raw_identifier &&
759 TheTok.getRawIdentifier() == "module" &&
760 LangOpts.CPlusPlusModules) {
761 // The initial global module fragment introducer "module;" is part of
762 // the preamble, which runs up to the module declaration "module foo;".
763 Token ModuleTok = TheTok;
764 do {
765 TheLexer.LexFromRawLexer(TheTok);
766 } while (TheTok.getKind() == tok::comment);
767 if (TheTok.getKind() != tok::semi) {
768 // Not global module fragment, roll back.
769 TheTok = ModuleTok;
770 break;
771 }
772 continue;
773 }
774
775 // We hit a token that we don't recognize as being in the
776 // "preprocessing only" part of the file, so we're no longer in
777 // the preamble.
778 break;
779 } while (true);
780
781 SourceLocation End;
782 if (ActiveCommentLoc.isValid())
783 End = ActiveCommentLoc; // don't truncate a decl comment.
784 else
785 End = TheTok.getLocation();
786
787 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
788 TheTok.isAtStartOfLine());
789}
790
791unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
792 const SourceManager &SM,
793 const LangOptions &LangOpts) {
794 // Figure out how many physical characters away the specified expansion
795 // character is. This needs to take into consideration newlines and
796 // trigraphs.
797 bool Invalid = false;
798 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
799
800 // If they request the first char of the token, we're trivially done.
801 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
802 return 0;
803
804 unsigned PhysOffset = 0;
805
806 // The usual case is that tokens don't contain anything interesting. Skip
807 // over the uninteresting characters. If a token only consists of simple
808 // chars, this method is extremely fast.
809 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
810 if (CharNo == 0)
811 return PhysOffset;
812 ++TokPtr;
813 --CharNo;
814 ++PhysOffset;
815 }
816
817 // If we have a character that may be a trigraph or escaped newline, use a
818 // lexer to parse it correctly.
819 for (; CharNo; --CharNo) {
820 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
821 TokPtr += CharAndSize.Size;
822 PhysOffset += CharAndSize.Size;
823 }
824
825 // Final detail: if we end up on an escaped newline, we want to return the
826 // location of the actual byte of the token. For example foo<newline>bar
827 // advanced by 3 should return the location of b, not of \\. One compounding
828 // detail of this is that the escape may be made by a trigraph.
829 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
830 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
831
832 return PhysOffset;
833}
834
835/// Computes the source location just past the end of the
836/// token at this source location.
837///
838/// This routine can be used to produce a source location that
839/// points just past the end of the token referenced by \p Loc, and
840/// is generally used when a diagnostic needs to point just after a
841/// token where it expected something different that it received. If
842/// the returned source location would not be meaningful (e.g., if
843/// it points into a macro), this routine returns an invalid
844/// source location.
845///
846/// \param Offset an offset from the end of the token, where the source
847/// location should refer to. The default offset (0) produces a source
848/// location pointing just past the end of the token; an offset of 1 produces
849/// a source location pointing to the last character in the token, etc.
851 const SourceManager &SM,
852 const LangOptions &LangOpts) {
853 if (Loc.isInvalid())
854 return {};
855
856 if (Loc.isMacroID()) {
857 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
858 return {}; // Points inside the macro expansion.
859 }
860
861 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
862 if (Len > Offset)
863 Len = Len - Offset;
864 else
865 return Loc;
866
867 return Loc.getLocWithOffset(Len);
868}
869
870/// Returns true if the given MacroID location points at the first
871/// token of the macro expansion.
873 const SourceManager &SM,
874 const LangOptions &LangOpts,
875 SourceLocation *MacroBegin) {
876 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
877
878 SourceLocation expansionLoc;
879 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
880 return false;
881
882 if (expansionLoc.isFileID()) {
883 // No other macro expansions, this is the first.
884 if (MacroBegin)
885 *MacroBegin = expansionLoc;
886 return true;
887 }
888
889 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
890}
891
892/// Returns true if the given MacroID location points at the last
893/// token of the macro expansion.
895 const SourceManager &SM,
896 const LangOptions &LangOpts,
897 SourceLocation *MacroEnd) {
898 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
899
900 SourceLocation spellLoc = SM.getSpellingLoc(loc);
901 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
902 if (tokLen == 0)
903 return false;
904
905 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
906 SourceLocation expansionLoc;
907 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
908 return false;
909
910 if (expansionLoc.isFileID()) {
911 // No other macro expansions.
912 if (MacroEnd)
913 *MacroEnd = expansionLoc;
914 return true;
915 }
916
917 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
918}
919
921 const SourceManager &SM,
922 const LangOptions &LangOpts) {
925 assert(Begin.isFileID() && End.isFileID());
926 if (Range.isTokenRange()) {
927 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
928 if (End.isInvalid())
929 return {};
930 }
931
932 // Break down the source locations.
933 FileID FID;
934 unsigned BeginOffs;
935 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
936 if (FID.isInvalid())
937 return {};
938
939 unsigned EndOffs;
940 if (!SM.isInFileID(End, FID, &EndOffs) ||
941 BeginOffs > EndOffs)
942 return {};
943
945}
946
947// Assumes that `Loc` is in an expansion.
949 const SourceManager &SM) {
950 return SM.getSLocEntry(SM.getFileID(Loc))
951 .getExpansion()
952 .isExpansionTokenRange();
953}
954
956 const SourceManager &SM,
957 const LangOptions &LangOpts) {
960 if (Begin.isInvalid() || End.isInvalid())
961 return {};
962
963 if (Begin.isFileID() && End.isFileID())
964 return makeRangeFromFileLocs(Range, SM, LangOpts);
965
966 if (Begin.isMacroID() && End.isFileID()) {
967 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
968 return {};
970 return makeRangeFromFileLocs(Range, SM, LangOpts);
971 }
972
973 if (Begin.isFileID() && End.isMacroID()) {
974 if (Range.isTokenRange()) {
975 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
976 return {};
977 // Use the *original* end, not the expanded one in `End`.
978 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
979 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
980 return {};
981 Range.setEnd(End);
982 return makeRangeFromFileLocs(Range, SM, LangOpts);
983 }
984
985 assert(Begin.isMacroID() && End.isMacroID());
986 SourceLocation MacroBegin, MacroEnd;
987 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
988 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
989 &MacroEnd)) ||
990 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
991 &MacroEnd)))) {
992 Range.setBegin(MacroBegin);
993 Range.setEnd(MacroEnd);
994 // Use the *original* `End`, not the expanded one in `MacroEnd`.
995 if (Range.isTokenRange())
996 Range.setTokenRange(isInExpansionTokenRange(End, SM));
997 return makeRangeFromFileLocs(Range, SM, LangOpts);
998 }
999
1000 bool Invalid = false;
1001 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
1002 &Invalid);
1003 if (Invalid)
1004 return {};
1005
1006 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1007 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
1008 &Invalid);
1009 if (Invalid)
1010 return {};
1011
1012 if (EndEntry.getExpansion().isMacroArgExpansion() &&
1013 BeginEntry.getExpansion().getExpansionLocStart() ==
1014 EndEntry.getExpansion().getExpansionLocStart()) {
1015 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1016 Range.setEnd(SM.getImmediateSpellingLoc(End));
1017 return makeFileCharRange(Range, SM, LangOpts);
1018 }
1019 }
1020
1021 return {};
1022}
1023
1025 const SourceManager &SM,
1026 const LangOptions &LangOpts,
1027 bool *Invalid) {
1028 Range = makeFileCharRange(Range, SM, LangOpts);
1029 if (Range.isInvalid()) {
1030 if (Invalid) *Invalid = true;
1031 return {};
1032 }
1033
1034 // Break down the source location.
1035 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
1036 if (beginInfo.first.isInvalid()) {
1037 if (Invalid) *Invalid = true;
1038 return {};
1039 }
1040
1041 unsigned EndOffs;
1042 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1043 beginInfo.second > EndOffs) {
1044 if (Invalid) *Invalid = true;
1045 return {};
1046 }
1047
1048 // Try to load the file buffer.
1049 bool invalidTemp = false;
1050 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1051 if (invalidTemp) {
1052 if (Invalid) *Invalid = true;
1053 return {};
1054 }
1055
1056 if (Invalid) *Invalid = false;
1057 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1058}
1059
1061 const SourceManager &SM,
1062 const LangOptions &LangOpts) {
1063 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1064
1065 // Find the location of the immediate macro expansion.
1066 while (true) {
1067 FileID FID = SM.getFileID(Loc);
1068 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1069 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1070 Loc = Expansion.getExpansionLocStart();
1071 if (!Expansion.isMacroArgExpansion())
1072 break;
1073
1074 // For macro arguments we need to check that the argument did not come
1075 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1076
1077 // Loc points to the argument id of the macro definition, move to the
1078 // macro expansion.
1079 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1080 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1081 if (SpellLoc.isFileID())
1082 break; // No inner macro.
1083
1084 // If spelling location resides in the same FileID as macro expansion
1085 // location, it means there is no inner macro.
1086 FileID MacroFID = SM.getFileID(Loc);
1087 if (SM.isInFileID(SpellLoc, MacroFID))
1088 break;
1089
1090 // Argument came from inner macro.
1091 Loc = SpellLoc;
1092 }
1093
1094 // Find the spelling location of the start of the non-argument expansion
1095 // range. This is where the macro name was spelled in order to begin
1096 // expanding this macro.
1097 Loc = SM.getSpellingLoc(Loc);
1098
1099 // Dig out the buffer where the macro name was spelled and the extents of the
1100 // name so that we can render it into the expansion note.
1101 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1102 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1103 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1104 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1105}
1106
1108 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1109 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1110 // Walk past macro argument expansions.
1111 while (SM.isMacroArgExpansion(Loc))
1112 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1113
1114 // If the macro's spelling isn't FileID or from scratch space, then it's
1115 // actually a token paste or stringization (or similar) and not a macro at
1116 // all.
1117 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1118 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1119 return {};
1120
1121 // Find the spelling location of the start of the non-argument expansion
1122 // range. This is where the macro name was spelled in order to begin
1123 // expanding this macro.
1124 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1125
1126 // Dig out the buffer where the macro name was spelled and the extents of the
1127 // name so that we can render it into the expansion note.
1128 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1129 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1130 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1131 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1132}
1133
1135 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1136}
1137
1138bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1139 assert(isVerticalWhitespace(Str[0]));
1140 if (Str - 1 < BufferStart)
1141 return false;
1142
1143 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1144 (Str[0] == '\r' && Str[-1] == '\n')) {
1145 if (Str - 2 < BufferStart)
1146 return false;
1147 --Str;
1148 }
1149 --Str;
1150
1151 // Rewind to first non-space character:
1152 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1153 --Str;
1154
1155 return *Str == '\\';
1156}
1157
1159 const SourceManager &SM) {
1160 if (Loc.isInvalid() || Loc.isMacroID())
1161 return {};
1162 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1163 if (LocInfo.first.isInvalid())
1164 return {};
1165 bool Invalid = false;
1166 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1167 if (Invalid)
1168 return {};
1169 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1170 if (!Line)
1171 return {};
1172 StringRef Rest = Buffer.substr(Line - Buffer.data());
1173 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1174 return NumWhitespaceChars == StringRef::npos
1175 ? ""
1176 : Rest.take_front(NumWhitespaceChars);
1177}
1178
1179//===----------------------------------------------------------------------===//
1180// Diagnostics forwarding code.
1181//===----------------------------------------------------------------------===//
1182
1183/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1184/// lexer buffer was all expanded at a single point, perform the mapping.
1185/// This is currently only used for _Pragma implementation, so it is the slow
1186/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1187static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1188 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1190 SourceLocation FileLoc,
1191 unsigned CharNo, unsigned TokLen) {
1192 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1193
1194 // Otherwise, we're lexing "mapped tokens". This is used for things like
1195 // _Pragma handling. Combine the expansion location of FileLoc with the
1196 // spelling location.
1198
1199 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1200 // characters come from spelling(FileLoc)+Offset.
1201 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1202 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1203
1204 // Figure out the expansion loc range, which is the range covered by the
1205 // original _Pragma(...) sequence.
1206 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1207
1208 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1209}
1210
1211/// getSourceLocation - Return a source location identifier for the specified
1212/// offset in the current file.
1214 unsigned TokLen) const {
1215 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1216 "Location out of range for this buffer!");
1217
1218 // In the normal case, we're just lexing from a simple file buffer, return
1219 // the file id from FileLoc with the offset specified.
1220 unsigned CharNo = Loc-BufferStart;
1221 if (FileLoc.isFileID())
1222 return FileLoc.getLocWithOffset(CharNo);
1223
1224 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1225 // tokens are lexed from where the _Pragma was defined.
1226 assert(PP && "This doesn't work on raw lexers");
1227 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1228}
1229
1230/// Diag - Forwarding function for diagnostics. This translate a source
1231/// position in the current buffer into a SourceLocation object for rendering.
1232DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1233 return PP->Diag(getSourceLocation(Loc), DiagID);
1234}
1235
1236//===----------------------------------------------------------------------===//
1237// Trigraph and Escaped Newline Handling Code.
1238//===----------------------------------------------------------------------===//
1239
/// GetTrigraphCharForLetter - Map the character that follows a "??" pair to
/// the character the resulting trigraph stands for. Returns '\0' when
/// \p Letter does not complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':
    return '#';
  case '(':
    return '[';
  case '/':
    return '\\';
  case ')':
    return ']';
  case '\'':
    return '^';
  case '<':
    return '{';
  case '!':
    return '|';
  case '>':
    return '}';
  case '-':
    return '~';
  default:
    return 0; // Not a trigraph.
  }
}
1256
1257/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1258/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1259/// return the result character. Finally, emit a warning about trigraph use
1260/// whether trigraphs are enabled or not.
1261static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1262 char Res = GetTrigraphCharForLetter(*CP);
1263 if (!Res)
1264 return Res;
1265
1266 if (!Trigraphs) {
1267 if (L && !L->isLexingRawMode())
1268 L->Diag(CP-2, diag::trigraph_ignored);
1269 return 0;
1270 }
1271
1272 if (L && !L->isLexingRawMode())
1273 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1274 return Res;
1275}
1276
1277/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1278/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1279/// trigraph equivalent on entry to this function.
1280unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1281 unsigned Size = 0;
1282 while (isWhitespace(Ptr[Size])) {
1283 ++Size;
1284
1285 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1286 continue;
1287
1288 // If this is a \r\n or \n\r, skip the other half.
1289 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1290 Ptr[Size-1] != Ptr[Size])
1291 ++Size;
1292
1293 return Size;
1294 }
1295
1296 // Not an escaped newline, must be a \t or something else.
1297 return 0;
1298}
1299
1300/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1301/// them), skip over them and return the first non-escaped-newline found,
1302/// otherwise return P.
1303const char *Lexer::SkipEscapedNewLines(const char *P) {
1304 while (true) {
1305 const char *AfterEscape;
1306 if (*P == '\\') {
1307 AfterEscape = P+1;
1308 } else if (*P == '?') {
1309 // If not a trigraph for escape, bail out.
1310 if (P[1] != '?' || P[2] != '/')
1311 return P;
1312 // FIXME: Take LangOpts into account; the language might not
1313 // support trigraphs.
1314 AfterEscape = P+3;
1315 } else {
1316 return P;
1317 }
1318
1319 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1320 if (NewLineSize == 0) return P;
1321 P = AfterEscape+NewLineSize;
1322 }
1323}
1324
1326 const SourceManager &SM,
1327 const LangOptions &LangOpts) {
1328 if (Loc.isMacroID()) {
1329 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1330 return std::nullopt;
1331 }
1332 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1333
1334 // Break down the source location.
1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336
1337 // Try to load the file buffer.
1338 bool InvalidTemp = false;
1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1340 if (InvalidTemp)
1341 return std::nullopt;
1342
1343 const char *TokenBegin = File.data() + LocInfo.second;
1344
1345 // Lex from the start of the given location.
1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1347 TokenBegin, File.end());
1348 // Find the token.
1349 Token Tok;
1350 lexer.LexFromRawLexer(Tok);
1351 return Tok;
1352}
1353
1354/// Checks that the given token is the first token that occurs after the
1355/// given location (this excludes comments and whitespace). Returns the location
1356/// immediately after the specified token. If the token is not found or the
1357/// location is inside a macro, the returned source location will be invalid.
1360 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1361 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1362 if (!Tok || Tok->isNot(TKind))
1363 return {};
1364 SourceLocation TokenLoc = Tok->getLocation();
1365
1366 // Calculate how much whitespace needs to be skipped if any.
1367 unsigned NumWhitespaceChars = 0;
1368 if (SkipTrailingWhitespaceAndNewLine) {
1369 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1370 unsigned char C = *TokenEnd;
1371 while (isHorizontalWhitespace(C)) {
1372 C = *(++TokenEnd);
1373 NumWhitespaceChars++;
1374 }
1375
1376 // Skip \r, \n, \r\n, or \n\r
1377 if (C == '\n' || C == '\r') {
1378 char PrevC = C;
1379 C = *(++TokenEnd);
1380 NumWhitespaceChars++;
1381 if ((C == '\n' || C == '\r') && C != PrevC)
1382 NumWhitespaceChars++;
1383 }
1384 }
1385
1386 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1387}
1388
1389/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1390/// get its size, and return it. This is tricky in several cases:
1391/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1392/// then either return the trigraph (skipping 3 chars) or the '?',
1393/// depending on whether trigraphs are enabled or not.
1394/// 2. If this is an escaped newline (potentially with whitespace between
1395/// the backslash and newline), implicitly skip the newline and return
1396/// the char after it.
1397///
1398/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1399/// know that we can accumulate into Size, and that we have already incremented
1400/// Ptr by Size bytes.
1401///
1402/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1403/// be updated to match.
1404Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1405 unsigned Size = 0;
1406 // If we have a slash, look for an escaped newline.
1407 if (Ptr[0] == '\\') {
1408 ++Size;
1409 ++Ptr;
1410Slash:
1411 // Common case, backslash-char where the char is not whitespace.
1412 if (!isWhitespace(Ptr[0]))
1413 return {'\\', Size};
1414
1415 // See if we have optional whitespace characters between the slash and
1416 // newline.
1417 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1418 // Remember that this token needs to be cleaned.
1419 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1420
1421 // Warn if there was whitespace between the backslash and newline.
1422 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1423 Diag(Ptr, diag::backslash_newline_space);
1424
1425 // Found backslash<whitespace><newline>. Parse the char after it.
1426 Size += EscapedNewLineSize;
1427 Ptr += EscapedNewLineSize;
1428
1429 // Use slow version to accumulate a correct size field.
1430 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1431 CharAndSize.Size += Size;
1432 return CharAndSize;
1433 }
1434
1435 // Otherwise, this is not an escaped newline, just return the slash.
1436 return {'\\', Size};
1437 }
1438
1439 // If this is a trigraph, process it.
1440 if (Ptr[0] == '?' && Ptr[1] == '?') {
1441 // If this is actually a legal trigraph (not something like "??x"), emit
1442 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1443 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1444 LangOpts.Trigraphs)) {
1445 // Remember that this token needs to be cleaned.
1446 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1447
1448 Ptr += 3;
1449 Size += 3;
1450 if (C == '\\') goto Slash;
1451 return {C, Size};
1452 }
1453 }
1454
1455 // If this is neither, return a single character.
1456 return {*Ptr, Size + 1u};
1457}
1458
1459/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1460/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1461/// and that we have already incremented Ptr by Size bytes.
1462///
1463/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1464/// be updated to match.
1465Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1466 const LangOptions &LangOpts) {
1467
1468 unsigned Size = 0;
1469 // If we have a slash, look for an escaped newline.
1470 if (Ptr[0] == '\\') {
1471 ++Size;
1472 ++Ptr;
1473Slash:
1474 // Common case, backslash-char where the char is not whitespace.
1475 if (!isWhitespace(Ptr[0]))
1476 return {'\\', Size};
1477
1478 // See if we have optional whitespace characters followed by a newline.
1479 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1480 // Found backslash<whitespace><newline>. Parse the char after it.
1481 Size += EscapedNewLineSize;
1482 Ptr += EscapedNewLineSize;
1483
1484 // Use slow version to accumulate a correct size field.
1485 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1486 CharAndSize.Size += Size;
1487 return CharAndSize;
1488 }
1489
1490 // Otherwise, this is not an escaped newline, just return the slash.
1491 return {'\\', Size};
1492 }
1493
1494 // If this is a trigraph, process it.
1495 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1496 // If this is actually a legal trigraph (not something like "??x"), return
1497 // it.
1498 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1499 Ptr += 3;
1500 Size += 3;
1501 if (C == '\\') goto Slash;
1502 return {C, Size};
1503 }
1504 }
1505
1506 // If this is neither, return a single character.
1507 return {*Ptr, Size + 1u};
1508}
1509
1510//===----------------------------------------------------------------------===//
1511// Helper methods for lexing.
1512//===----------------------------------------------------------------------===//
1513
1514/// Routine that indiscriminately sets the offset into the source file.
1515void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1516 BufferPtr = BufferStart + Offset;
1517 if (BufferPtr > BufferEnd)
1518 BufferPtr = BufferEnd;
1519 // FIXME: What exactly does the StartOfLine bit mean? There are two
1520 // possible meanings for the "start" of the line: the first token on the
1521 // unexpanded line, or the first token on the expanded line.
1522 IsAtStartOfLine = StartOfLine;
1523 IsAtPhysicalStartOfLine = StartOfLine;
1524}
1525
1526static bool isUnicodeWhitespace(uint32_t Codepoint) {
1527 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1529 return UnicodeWhitespaceChars.contains(Codepoint);
1530}
1531
1533 llvm::SmallString<5> CharBuf;
1534 llvm::raw_svector_ostream CharOS(CharBuf);
1535 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1536 return CharBuf;
1537}
1538
1539// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1540// we allow "Mathematical Notation Characters" in identifiers.
1541// This is a proposed profile that extends the XID_Start/XID_continue
1542// with mathematical symbols, superscripts and subscript digits
1543// found in some production software.
1544// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1545static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1546 bool IsStart, bool &IsExtension) {
1547 static const llvm::sys::UnicodeCharSet MathStartChars(
1549 static const llvm::sys::UnicodeCharSet MathContinueChars(
1551 if (MathStartChars.contains(C) ||
1552 (!IsStart && MathContinueChars.contains(C))) {
1553 IsExtension = true;
1554 return true;
1555 }
1556 return false;
1557}
1558
1559static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1560 bool &IsExtension) {
1561 if (LangOpts.AsmPreprocessor) {
1562 return false;
1563 } else if (LangOpts.DollarIdents && '$' == C) {
1564 return true;
1565 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1566 // A non-leading codepoint must have the XID_Continue property.
1567 // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
1568 // so we need to check both tables.
1569 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1570 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1571 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1572 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1573 return true;
1574 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1575 IsExtension);
1576 } else if (LangOpts.C11) {
1577 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1579 return C11AllowedIDChars.contains(C);
1580 } else {
1581 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1583 return C99AllowedIDChars.contains(C);
1584 }
1585}
1586
1587static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1588 bool &IsExtension) {
1589 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1590 IsExtension = false;
1591 if (LangOpts.AsmPreprocessor) {
1592 return false;
1593 }
1594 if (LangOpts.CPlusPlus || LangOpts.C23) {
1595 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1596 if (XIDStartChars.contains(C))
1597 return true;
1598 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1599 IsExtension);
1600 }
1601 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1602 return false;
1603 if (LangOpts.C11) {
1604 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1606 return !C11DisallowedInitialIDChars.contains(C);
1607 }
1608 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1610 return !C99DisallowedInitialIDChars.contains(C);
1611}
1612
1615
1616 static const llvm::sys::UnicodeCharSet MathStartChars(
1618 static const llvm::sys::UnicodeCharSet MathContinueChars(
1620
1621 (void)MathStartChars;
1622 (void)MathContinueChars;
1623 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1624 "Unexpected mathematical notation codepoint");
1625 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1627}
1628
1629static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1630 const char *End) {
1632 L.getSourceLocation(End));
1633}
1634
1635static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1636 CharSourceRange Range, bool IsFirst) {
1637 // Check C99 compatibility.
1638 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1639 enum {
1640 CannotAppearInIdentifier = 0,
1641 CannotStartIdentifier
1642 };
1643
1644 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1646 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1648 if (!C99AllowedIDChars.contains(C)) {
1649 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1650 << Range
1651 << CannotAppearInIdentifier;
1652 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1653 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1654 << Range
1655 << CannotStartIdentifier;
1656 }
1657 }
1658}
1659
1660/// After encountering UTF-8 character C and interpreting it as an identifier
1661/// character, check whether it's a homoglyph for a common non-identifier
1662/// source character that is unlikely to be an intentional identifier
1663/// character and warn if so.
1666 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1667 struct HomoglyphPair {
1668 uint32_t Character;
1669 char LooksLike;
1670 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1671 };
1672 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1673 {U'\u00ad', 0}, // SOFT HYPHEN
1674 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1675 {U'\u037e', ';'}, // GREEK QUESTION MARK
1676 {U'\u200b', 0}, // ZERO WIDTH SPACE
1677 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1678 {U'\u200d', 0}, // ZERO WIDTH JOINER
1679 {U'\u2060', 0}, // WORD JOINER
1680 {U'\u2061', 0}, // FUNCTION APPLICATION
1681 {U'\u2062', 0}, // INVISIBLE TIMES
1682 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1683 {U'\u2064', 0}, // INVISIBLE PLUS
1684 {U'\u2212', '-'}, // MINUS SIGN
1685 {U'\u2215', '/'}, // DIVISION SLASH
1686 {U'\u2216', '\\'}, // SET MINUS
1687 {U'\u2217', '*'}, // ASTERISK OPERATOR
1688 {U'\u2223', '|'}, // DIVIDES
1689 {U'\u2227', '^'}, // LOGICAL AND
1690 {U'\u2236', ':'}, // RATIO
1691 {U'\u223c', '~'}, // TILDE OPERATOR
1692 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1693 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1694 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1695 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1696 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1697 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1698 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1699 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1700 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1701 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1702 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
1703 {U'\uff0c', ','}, // FULLWIDTH COMMA
1704 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1705 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1706 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1707 {U'\uff1a', ':'}, // FULLWIDTH COLON
1708 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1709 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1710 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1711 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1712 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1713 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1714 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1715 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1716 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1717 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1718 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1719 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1720 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1721 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1722 {0, 0}
1723 };
1724 auto Homoglyph =
1725 std::lower_bound(std::begin(SortedHomoglyphs),
1726 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1727 if (Homoglyph->Character == C) {
1728 if (Homoglyph->LooksLike) {
1729 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1730 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1731 << Range << codepointAsHexString(C) << LooksLikeStr;
1732 } else {
1733 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1735 }
1736 }
1737}
1738
1740 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1741 CharSourceRange Range, bool IsFirst) {
1742 if (isASCII(CodePoint))
1743 return;
1744
1745 bool IsExtension;
1746 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1747 bool IsIDContinue =
1748 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1749
1750 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1751 return;
1752
1753 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1754
1755 if (!IsFirst || InvalidOnlyAtStart) {
1756 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1757 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1759 } else {
1760 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1761 << Range << codepointAsHexString(CodePoint)
1763 }
1764}
1765
1766bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1767 Token &Result) {
1768 const char *UCNPtr = CurPtr + Size;
1769 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1770 if (CodePoint == 0) {
1771 return false;
1772 }
1773 bool IsExtension = false;
1774 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1775 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1776 return false;
1780 PP->getDiagnostics(), LangOpts, CodePoint,
1781 makeCharRange(*this, CurPtr, UCNPtr),
1782 /*IsFirst=*/false);
1783
1784 // We got a unicode codepoint that is neither a space nor
1785 // a valid identifier part.
1786 // Carry on as if the codepoint was valid for recovery purposes.
1787 } else if (!isLexingRawMode()) {
1788 if (IsExtension)
1790 makeCharRange(*this, CurPtr, UCNPtr));
1791
1793 makeCharRange(*this, CurPtr, UCNPtr),
1794 /*IsFirst=*/false);
1795 }
1796
1797 Result.setFlag(Token::HasUCN);
1798 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1799 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1800 CurPtr = UCNPtr;
1801 else
1802 while (CurPtr != UCNPtr)
1803 (void)getAndAdvanceChar(CurPtr, Result);
1804 return true;
1805}
1806
/// tryConsumeIdentifierUTF8Char - Try to consume one UTF-8 encoded code point
/// that continues an identifier.
///
/// Returns false if the bytes do not decode under strict conversion, or if
/// the code point is not an allowed identifier character and is ASCII or
/// Unicode whitespace. Otherwise sets \p CurPtr to the end of the decoded
/// sequence and returns true.
///
/// NOTE(review): this listing omits original source lines 1830-1832, 1840,
/// 1843 and 1846 (the embedded line numbers skip them), so the calls that the
/// orphaned argument lists below belong to are not visible here.
1807bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1808 llvm::UTF32 CodePoint;
1809
1810 // If a UTF-8 codepoint appears immediately after an escaped new line,
1811 // CurPtr may point to the splicing \ on the preceding line,
1812 // so we need to skip it.
1813 unsigned FirstCodeUnitSize;
1814 getCharAndSize(CurPtr, FirstCodeUnitSize);
  // CharStart is the last byte covered by the first logical character, i.e.
  // the first code unit of the UTF-8 sequence itself.
1815 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1816 const char *UnicodePtr = CharStart;
1817
  // Decode one sequence, bounded by BufferEnd; UnicodePtr is passed by
  // address and is used below as the end of the decoded sequence.
1818 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1819 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1820 &CodePoint, llvm::strictConversion);
1821 if (ConvResult != llvm::conversionOK)
1822 return false;
1823
1824 bool IsExtension = false;
1825 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1826 IsExtension)) {
1827 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1828 return false;
1829
  // NOTE(review): listing lines 1830-1832 are missing here.
1833 PP->getDiagnostics(), LangOpts, CodePoint,
1834 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1835 // We got a unicode codepoint that is neither a space nor
1836 // a valid identifier part. Carry on as if the codepoint was
1837 // valid for recovery purposes.
1838 } else if (!isLexingRawMode()) {
1839 if (IsExtension)
  // NOTE(review): listing line 1840 is missing here.
1841 PP->getDiagnostics(), CodePoint,
1842 makeCharRange(*this, CharStart, UnicodePtr));
  // NOTE(review): listing line 1843 is missing here.
1844 makeCharRange(*this, CharStart, UnicodePtr),
1845 /*IsFirst=*/false);
  // NOTE(review): listing line 1846 is missing here.
1847 makeCharRange(*this, CharStart, UnicodePtr));
1848 }
1849
1850 // Once we successfully parsed some UTF-8,
1851 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1852 // being lexed, and that warnings about trailing spaces are emitted.
1853 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  // The return value of ConsumeChar is ignored; CurPtr jumps straight to the
  // end of the decoded sequence.
1854 CurPtr = UnicodePtr;
1855 return true;
1856}
1857
/// LexUnicodeIdentifierStart - Handle a code point \p C, spanning
/// [BufferPtr, CurPtr), that may begin an identifier.
///
/// If \p C is allowed as an initial identifier character, lexing continues in
/// LexIdentifierContinue and its result is returned. Otherwise the function
/// either sets BufferPtr to \p CurPtr and returns false without forming a
/// token, or forms a tok::unknown token and returns true.
///
/// NOTE(review): this listing omits original source lines 1862-1863, 1865,
/// 1867, 1870, 1878, 1880 and 1890 (the embedded line numbers skip them).
/// Several conditions and call heads are therefore not visible, including
/// part of the condition guarding the "drop the character" branch.
1858bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1859 const char *CurPtr) {
1860 bool IsExtension = false;
1861 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
  // NOTE(review): listing lines 1862-1863 are missing here.
1864 if (IsExtension)
  // NOTE(review): listing line 1865 is missing here.
1866 makeCharRange(*this, BufferPtr, CurPtr));
  // NOTE(review): listing line 1867 is missing here.
1868 makeCharRange(*this, BufferPtr, CurPtr),
1869 /*IsFirst=*/true);
  // NOTE(review): listing line 1870 is missing here.
1871 makeCharRange(*this, BufferPtr, CurPtr));
1872 }
1873
1874 MIOpt.ReadToken();
1875 return LexIdentifierContinue(Result, CurPtr);
1876 }
1877
  // NOTE(review): listing lines 1878 and 1880 are missing; only the middle of
  // the condition is visible.
1879 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1881 // Non-ASCII characters tend to creep into source code unintentionally.
1882 // Instead of letting the parser complain about the unknown token,
1883 // just drop the character.
1884 // Note that we can /only/ do this when the non-ASCII character is actually
1885 // spelled as Unicode, not written as a UCN. The standard requires that
1886 // we not throw away any possible preprocessor tokens, but there's a
1887 // loophole in the mapping of Unicode characters to basic character set
1888 // characters that allows us to map these particular characters to, say,
1889 // whitespace.
  // NOTE(review): listing line 1890 is missing here.
1891 PP->getDiagnostics(), LangOpts, C,
1892 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
  // Skip the character: move BufferPtr past it and report that no token was
  // formed.
1893 BufferPtr = CurPtr;
1894 return false;
1895 }
1896
1897 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1898 // up by accident.
1899 MIOpt.ReadToken();
1900 FormTokenWithChars(Result, CurPtr, tok::unknown);
1901 return true;
1902}
1903
/// fastParseASCIIIdentifier - Advance \p CurPtr over a run of bytes and return
/// the new position. When built with __SSE4_2__, input is first examined 16
/// bytes at a time for as long as at least 16 bytes remain before
/// \p BufferEnd; a scalar loop handles whatever is left.
///
/// NOTE(review): this listing omits original source lines 1920-1921 (the
/// remaining arguments of _mm_cmpistri) and 1930 (the head of the scalar
/// loop); the embedded line numbers skip them. The exact character class
/// matched by the scalar loop is therefore not visible here.
1904static const char *
1905fastParseASCIIIdentifier(const char *CurPtr,
1906 [[maybe_unused]] const char *BufferEnd) {
1907#ifdef __SSE4_2__
  // Pairs of range bounds: '_'..'_', 'A'..'Z', 'a'..'z', '0'..'9'. The rest
  // of the 16-byte array is zero-initialized.
1908 alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1909 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1910 };
1911 constexpr ssize_t BytesPerRegister = 16;
1912
1913 __m128i AsciiIdentifierRangeV =
1914 _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1915
1916 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
  // Unaligned load of the next 16 input bytes.
1917 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1918
  // NOTE(review): the mode flags passed to _mm_cmpistri are on the missing
  // lines 1920-1921. Presumably Consumed is the index of the first byte
  // outside the ranges above (16 if there is none) - confirm upstream.
1919 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1922 CurPtr += Consumed;
  // A full register was consumed: keep scanning. Otherwise the run ended
  // inside this register.
1923 if (Consumed == BytesPerRegister)
1924 continue;
1925 return CurPtr;
1926 }
1927#endif
1928
1929 unsigned char C = *CurPtr;
  // NOTE(review): listing line 1930 (the loop head controlling the statement
  // below) is missing here.
1931 C = *++CurPtr;
1932 return CurPtr;
1933}
1934
/// LexIdentifierContinue - Lex the rest of an identifier whose start has
/// already been matched, form a tok::raw_identifier token in \p Result, and,
/// outside raw mode, continue with identifier lookup, code-completion
/// handling and a possible hand-off to Preprocessor::HandleIdentifier.
///
/// NOTE(review): this listing omits original source lines 1945 (the head of
/// the first `if` in the loop) and 1978 (where `II` is declared); the
/// embedded line numbers skip them.
1935bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1936 // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1937
1938 while (true) {
1939
  // Fast path first; the slow path below handles whatever stopped it.
1940 CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1941
1942 unsigned Size;
1943 // Slow path: handle trigraph, unicode codepoints, UCNs.
1944 unsigned char C = getCharAndSize(CurPtr, Size);
  // NOTE(review): listing line 1945 (the condition opening this block) is
  // missing here.
1946 CurPtr = ConsumeChar(CurPtr, Size, Result);
1947 continue;
1948 }
1949 if (C == '$') {
1950 // If we hit a $ and they are not supported in identifiers, we are done.
1951 if (!LangOpts.DollarIdents)
1952 break;
1953 // Otherwise, emit a diagnostic and continue.
1954 if (!isLexingRawMode())
1955 Diag(CurPtr, diag::ext_dollar_in_identifier);
1956 CurPtr = ConsumeChar(CurPtr, Size, Result);
1957 continue;
1958 }
  // Both helpers advance CurPtr themselves when they succeed.
1959 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1960 continue;
1961 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1962 continue;
1963 // Neither an expected Unicode codepoint nor a UCN.
1964 break;
1965 }
1966
  // Remember the token start before forming the token, then attach it as the
  // raw identifier data.
1967 const char *IdStart = BufferPtr;
1968 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1969 Result.setRawIdentifierData(IdStart);
1970
1971 // If we are in raw mode, return this identifier raw. There is no need to
1972 // look up identifier information or attempt to macro expand it.
1973 if (LexingRawMode)
1974 return true;
1975
1976 // Fill in Result.IdentifierInfo and update the token kind,
1977 // looking up the identifier in the identifier table.
  // NOTE(review): listing line 1978 is missing here; `II`, used near the end
  // of this function, is presumably declared on it.
1979 // Note that we have to call PP->LookUpIdentifierInfo() even for code
1980 // completion, it writes IdentifierInfo into Result, and callers rely on it.
1981
1982 // If the completion point is at the end of an identifier, we want to treat
1983 // the identifier as incomplete even if it resolves to a macro or a keyword.
1984 // This allows e.g. 'class^' to complete to 'classifier'.
1985 if (isCodeCompletionPoint(CurPtr)) {
1986 // Return the code-completion token.
1987 Result.setKind(tok::code_completion);
1988 // Skip the code-completion char and all immediate identifier characters.
1989 // This ensures we get consistent behavior when completing at any point in
1990 // an identifier (i.e. at the start, in the middle, at the end). Note that
1991 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1992 // simpler.
1993 assert(*CurPtr == 0 && "Completion character must be 0");
1994 ++CurPtr;
1995 // Note that code completion token is not added as a separate character
1996 // when the completion point is at the end of the buffer. Therefore, we need
1997 // to check if the buffer has ended.
1998 if (CurPtr < BufferEnd) {
1999 while (isAsciiIdentifierContinue(*CurPtr))
2000 ++CurPtr;
2001 }
2002 BufferPtr = CurPtr;
2003 return true;
2004 }
2005
2006 // Finally, now that we know we have an identifier, pass this off to the
2007 // preprocessor, which may macro expand it or something.
2008 if (II->isHandleIdentifierCase())
2009 return PP->HandleIdentifier(Result);
2010
2011 return true;
2012}
2013
2014/// isHexaLiteral - Return true if Start points to a hex constant.
2015/// in microsoft mode (where this is supposed to be several different tokens).
2016bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2017 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2018 char C1 = CharAndSize1.Char;
2019 if (C1 != '0')
2020 return false;
2021
2022 auto CharAndSize2 =
2023 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2024 char C2 = CharAndSize2.Char;
2025 return (C2 == 'x' || C2 == 'X');
2026}
2027
2028/// LexNumericConstant - Lex the remainder of a integer or floating point
2029/// constant. From[-1] is the first character lexed. Return the end of the
2030/// constant.
2031bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2032 unsigned Size;
2033 char C = getCharAndSize(CurPtr, Size);
2034 char PrevCh = 0;
2035 while (isPreprocessingNumberBody(C)) {
2036 CurPtr = ConsumeChar(CurPtr, Size, Result);
2037 PrevCh = C;
2038 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2039 CurPtr -= Size;
2040 break;
2041 }
2042 C = getCharAndSize(CurPtr, Size);
2043 }
2044
2045 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2046 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2047 // If we are in Microsoft mode, don't continue if the constant is hex.
2048 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2049 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2050 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2051 }
2052
2053 // If we have a hex FP constant, continue.
2054 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2055 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2056 // not-quite-conforming extension. Only do so if this looks like it's
2057 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2058 bool IsHexFloat = true;
2059 if (!LangOpts.C99) {
2060 if (!isHexaLiteral(BufferPtr, LangOpts))
2061 IsHexFloat = false;
2062 else if (!LangOpts.CPlusPlus17 &&
2063 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2064 IsHexFloat = false;
2065 }
2066 if (IsHexFloat)
2067 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2068 }
2069
2070 // If we have a digit separator, continue.
2071 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2072 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2073 if (isAsciiIdentifierContinue(Next)) {
2074 if (!isLexingRawMode())
2075 Diag(CurPtr, LangOpts.CPlusPlus
2076 ? diag::warn_cxx11_compat_digit_separator
2077 : diag::warn_c23_compat_digit_separator);
2078 CurPtr = ConsumeChar(CurPtr, Size, Result);
2079 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2080 return LexNumericConstant(Result, CurPtr);
2081 }
2082 }
2083
2084 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2085 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2086 return LexNumericConstant(Result, CurPtr);
2087 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2088 return LexNumericConstant(Result, CurPtr);
2089
2090 // Update the location of token as well as BufferPtr.
2091 const char *TokStart = BufferPtr;
2092 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2093 Result.setLiteralData(TokStart);
2094 return true;
2095}
2096
2097/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2098/// in C++11, or warn on a ud-suffix in C++98.
///
/// \p CurPtr points just past the literal. Returns the position after the
/// suffix, or \p CurPtr unchanged when nothing is accepted as a suffix.
/// \p IsStringLiteral enables the look-ahead for standard string suffixes
/// under C++14. Token::HasUDSuffix is set on \p Result when a suffix is lexed.
///
/// NOTE(review): this listing omits original source lines 2122, 2168 and 2178
/// (the embedded line numbers skip them): the tails of two Diag statements
/// and the head of the first `if` in the final loop.
2099const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2100 bool IsStringLiteral) {
2101 assert(LangOpts.CPlusPlus);
2102
2103 // Maximally munch an identifier.
2104 unsigned Size;
2105 char C = getCharAndSize(CurPtr, Size);
  // Set when the first suffix character was a UCN or UTF-8 character that has
  // already been consumed below.
2106 bool Consumed = false;
2107
2108 if (!isAsciiIdentifierStart(C)) {
2109 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2110 Consumed = true;
2111 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2112 Consumed = true;
2113 else
2114 return CurPtr;
2115 }
2116
  // Before C++11: diagnose (outside raw mode) and return without setting
  // HasUDSuffix.
2117 if (!LangOpts.CPlusPlus11) {
2118 if (!isLexingRawMode())
2119 Diag(CurPtr,
2120 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2121 : diag::warn_cxx11_compat_reserved_user_defined_literal)
  // NOTE(review): listing line 2122 (the rest of this statement) is missing
  // here.
2123 return CurPtr;
2124 }
2125
2126 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2127 // that does not start with an underscore is ill-formed. As a conforming
2128 // extension, we treat all such suffixes as if they had whitespace before
2129 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2130 // likely to be a ud-suffix than a macro, however, and accept that.
2131 if (!Consumed) {
2132 bool IsUDSuffix = false;
2133 if (C == '_')
2134 IsUDSuffix = true;
2135 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2136 // In C++1y, we need to look ahead a few characters to see if this is a
2137 // valid suffix for a string literal or a numeric literal (this could be
2138 // the 'operator""if' defining a numeric literal operator).
2139 const unsigned MaxStandardSuffixLength = 3;
2140 char Buffer[MaxStandardSuffixLength] = { C };
  // This `Consumed` shadows the outer bool of the same name; here it is the
  // number of bytes looked ahead so far. CurPtr itself is not advanced.
2141 unsigned Consumed = Size;
2142 unsigned Chars = 1;
2143 while (true) {
2144 auto [Next, NextSize] =
2145 getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2146 if (!isAsciiIdentifierContinue(Next)) {
2147 // End of suffix. Check whether this is on the allowed list.
2148 const StringRef CompleteSuffix(Buffer, Chars);
2149 IsUDSuffix =
2150 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2151 break;
2152 }
2153
2154 if (Chars == MaxStandardSuffixLength)
2155 // Too long: can't be a standard suffix.
2156 break;
2157
2158 Buffer[Chars++] = Next;
2159 Consumed += NextSize;
2160 }
2161 }
2162
2163 if (!IsUDSuffix) {
2164 if (!isLexingRawMode())
2165 Diag(CurPtr, LangOpts.MSVCCompat
2166 ? diag::ext_ms_reserved_user_defined_literal
2167 : diag::ext_reserved_user_defined_literal)
  // NOTE(review): listing line 2168 (the rest of this statement) is missing
  // here.
2169 return CurPtr;
2170 }
2171
  // Accept the first suffix character.
2172 CurPtr = ConsumeChar(CurPtr, Size, Result);
2173 }
2174
2175 Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix.
2176 while (true) {
2177 C = getCharAndSize(CurPtr, Size);
  // NOTE(review): listing line 2178 (the head of this if/else chain) is
  // missing here.
2179 CurPtr = ConsumeChar(CurPtr, Size, Result);
  // The two branches below are intentionally empty: the helpers advance
  // CurPtr when they succeed.
2180 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2181 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2182 } else
2183 break;
2184 }
2185
2186 return CurPtr;
2187}
2188
2189/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2190/// either " or L" or u8" or u" or U".
///
/// Returns true in every path shown. On success a token of kind \p Kind is
/// formed; an unterminated literal or a code-completion point inside the
/// literal produces a tok::unknown token instead.
///
/// NOTE(review): this listing omits original source line 2224 (the embedded
/// line numbers skip it): the statement belonging to the `else` in the
/// code-completion branch.
2191bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2192 tok::TokenKind Kind) {
2193 const char *AfterQuote = CurPtr;
2194 // Does this string contain the \0 character?
2195 const char *NulCharacter = nullptr;
2196
2197 if (!isLexingRawMode() &&
2198 (Kind == tok::utf8_string_literal ||
2199 Kind == tok::utf16_string_literal ||
2200 Kind == tok::utf32_string_literal))
2201 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2202 : diag::warn_c99_compat_unicode_literal);
2203
2204 char C = getAndAdvanceChar(CurPtr, Result);
2205 while (C != '"') {
2206 // Skip escaped characters. Escaped newlines will already be processed by
2207 // getAndAdvanceChar.
2208 if (C == '\\')
2209 C = getAndAdvanceChar(CurPtr, Result);
2210
2211 if (C == '\n' || C == '\r' || // Newline.
2212 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2213 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2214 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
  // The token ends before the newline / end-of-file character.
2215 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2216 return true;
2217 }
2218
  // A nul that is not at BufferEnd: either the code-completion marker or a
  // nul embedded in the literal.
2219 if (C == 0) {
2220 if (isCodeCompletionPoint(CurPtr-1)) {
2221 if (ParsingFilename)
2222 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2223 else
  // NOTE(review): listing line 2224 (the body of this else) is missing here.
2225 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2226 cutOffLexing();
2227 return true;
2228 }
2229
  // Remember the position so a warning can be issued after the closing quote.
2230 NulCharacter = CurPtr-1;
2231 }
2232 C = getAndAdvanceChar(CurPtr, Result);
2233 }
2234
2235 // If we are in C++ (any version, per the check below), lex the optional
2236 // ud-suffix.
2236 if (LangOpts.CPlusPlus)
2237 CurPtr = LexUDSuffix(Result, CurPtr, true);
2238
2239 // If a nul character existed in the string, warn about it.
2240 if (NulCharacter && !isLexingRawMode())
2241 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2242
2243 // Update the location of the token as well as the BufferPtr instance var.
2244 const char *TokStart = BufferPtr;
2245 FormTokenWithChars(Result, CurPtr, Kind);
2246 Result.setLiteralData(TokStart);
2247 return true;
2248}
2249
2250/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2251/// having lexed R", LR", u8R", uR", or UR".
2252bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2253 tok::TokenKind Kind) {
2254 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2255 // Between the initial and final double quote characters of the raw string,
2256 // any transformations performed in phases 1 and 2 (trigraphs,
2257 // universal-character-names, and line splicing) are reverted.
2258
2259 if (!isLexingRawMode())
2260 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2261
2262 unsigned PrefixLen = 0;
2263
2264 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2265 if (!isLexingRawMode() &&
2266 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2267 const char *Pos = &CurPtr[PrefixLen];
2268 Diag(Pos, LangOpts.CPlusPlus26
2269 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2270 : diag::ext_cxx26_raw_string_literal_character_set)
2271 << StringRef(Pos, 1);
2272 }
2273 ++PrefixLen;
2274 }
2275
2276 // If the last character was not a '(', then we didn't lex a valid delimiter.
2277 if (CurPtr[PrefixLen] != '(') {
2278 if (!isLexingRawMode()) {
2279 const char *PrefixEnd = &CurPtr[PrefixLen];
2280 if (PrefixLen == 16) {
2281 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2282 } else if (*PrefixEnd == '\n') {
2283 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2284 } else {
2285 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2286 << StringRef(PrefixEnd, 1);
2287 }
2288 }
2289
2290 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2291 // it's possible the '"' was intended to be part of the raw string, but
2292 // there's not much we can do about that.
2293 while (true) {
2294 char C = *CurPtr++;
2295
2296 if (C == '"')
2297 break;
2298 if (C == 0 && CurPtr-1 == BufferEnd) {
2299 --CurPtr;
2300 break;
2301 }
2302 }
2303
2304 FormTokenWithChars(Result, CurPtr, tok::unknown);
2305 return true;
2306 }
2307
2308 // Save prefix and move CurPtr past it
2309 const char *Prefix = CurPtr;
2310 CurPtr += PrefixLen + 1; // skip over prefix and '('
2311
2312 while (true) {
2313 char C = *CurPtr++;
2314
2315 if (C == ')') {
2316 // Check for prefix match and closing quote.
2317 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2318 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2319 break;
2320 }
2321 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2322 if (!isLexingRawMode())
2323 Diag(BufferPtr, diag::err_unterminated_raw_string)
2324 << StringRef(Prefix, PrefixLen);
2325 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2326 return true;
2327 }
2328 }
2329
2330 // If we are in C++11, lex the optional ud-suffix.
2331 if (LangOpts.CPlusPlus)
2332 CurPtr = LexUDSuffix(Result, CurPtr, true);
2333
2334 // Update the location of token as well as BufferPtr.
2335 const char *TokStart = BufferPtr;
2336 FormTokenWithChars(Result, CurPtr, Kind);
2337 Result.setLiteralData(TokStart);
2338 return true;
2339}
2340
2341/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2342/// after having lexed the '<' character. This is used for #include filenames.
2343bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2344 // Does this string contain the \0 character?
2345 const char *NulCharacter = nullptr;
2346 const char *AfterLessPos = CurPtr;
2347 char C = getAndAdvanceChar(CurPtr, Result);
2348 while (C != '>') {
2349 // Skip escaped characters. Escaped newlines will already be processed by
2350 // getAndAdvanceChar.
2351 if (C == '\\')
2352 C = getAndAdvanceChar(CurPtr, Result);
2353
2354 if (isVerticalWhitespace(C) || // Newline.
2355 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2356 // If the filename is unterminated, then it must just be a lone <
2357 // character. Return this as such.
2358 FormTokenWithChars(Result, AfterLessPos, tok::less);
2359 return true;
2360 }
2361
2362 if (C == 0) {
2363 if (isCodeCompletionPoint(CurPtr - 1)) {
2364 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2365 cutOffLexing();
2366 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2367 return true;
2368 }
2369 NulCharacter = CurPtr-1;
2370 }
2371 C = getAndAdvanceChar(CurPtr, Result);
2372 }
2373
2374 // If a nul character existed in the string, warn about it.
2375 if (NulCharacter && !isLexingRawMode())
2376 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2377
2378 // Update the location of token as well as BufferPtr.
2379 const char *TokStart = BufferPtr;
2380 FormTokenWithChars(Result, CurPtr, tok::header_name);
2381 Result.setLiteralData(TokStart);
2382 return true;
2383}
2384
/// codeCompleteIncludedFile - Handle code completion inside an #include
/// filename. \p PathStart is the first character of the path, and
/// \p CompletionPoint the position of the completion marker. \p IsAngled
/// selects '>' instead of '"' as the closing character.
///
/// Splits the partial path at its last slash into a directory part and a
/// filename part, extends the end of the range up to the closing character or
/// next slash, and finally calls Preprocessor::CodeCompleteIncludedFile with
/// the directory.
///
/// NOTE(review): this listing omits original source lines 2397 and 2412 (the
/// embedded line numbers skip them): the heads of the two calls whose
/// argument lists appear below.
2385void Lexer::codeCompleteIncludedFile(const char *PathStart,
2386 const char *CompletionPoint,
2387 bool IsAngled) {
2388 // Completion only applies to the filename, after the last slash.
2389 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // Under MSVCCompat a backslash also counts as a path separator.
2390 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2391 auto Slash = PartialPath.find_last_of(SlashChars);
  // Dir is everything before the last slash, or empty when there is none.
2392 StringRef Dir =
2393 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2394 const char *StartOfFilename =
2395 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2396 // Code completion filter range is the filename only, up to completion point.
  // NOTE(review): listing line 2397 (the head of this call) is missing here.
2398 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2399 // We should replace the characters up to the closing quote or closest slash,
2400 // if any.
2401 while (CompletionPoint < BufferEnd) {
  // Look at the character after the current position. Stop before a nul or
  // newline; stop after stepping onto a closing character or a slash.
2402 char Next = *(CompletionPoint + 1);
2403 if (Next == 0 || Next == '\r' || Next == '\n')
2404 break;
2405 ++CompletionPoint;
2406 if (Next == (IsAngled ? '>' : '"'))
2407 break;
2408 if (SlashChars.contains(Next))
2409 break;
2410 }
2411
  // NOTE(review): listing line 2412 (the head of this call) is missing here.
2413 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2414 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2415 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2416}
2417
2418/// LexCharConstant - Lex the remainder of a character constant, after having
2419/// lexed either ' or L' or u8' or u' or U'.
///
/// Returns true in every path shown. On success a token of kind \p Kind is
/// formed; an empty or unterminated constant, or a code-completion point
/// inside it, produces a tok::unknown token instead.
///
/// NOTE(review): this listing omits original source line 2457 (the embedded
/// line numbers skip it), inside the code-completion branch.
2420bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2421 tok::TokenKind Kind) {
2422 // Does this character contain the \0 character?
2423 const char *NulCharacter = nullptr;
2424
2425 if (!isLexingRawMode()) {
2426 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2427 Diag(BufferPtr, LangOpts.CPlusPlus
2428 ? diag::warn_cxx98_compat_unicode_literal
2429 : diag::warn_c99_compat_unicode_literal);
2430 else if (Kind == tok::utf8_char_constant)
2431 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2432 }
2433
2434 char C = getAndAdvanceChar(CurPtr, Result);
  // An immediately closing quote means an empty character constant.
2435 if (C == '\'') {
2436 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2437 Diag(BufferPtr, diag::ext_empty_character);
2438 FormTokenWithChars(Result, CurPtr, tok::unknown);
2439 return true;
2440 }
2441
2442 while (C != '\'') {
2443 // Skip escaped characters.
2444 if (C == '\\')
2445 C = getAndAdvanceChar(CurPtr, Result);
2446
2447 if (C == '\n' || C == '\r' || // Newline.
2448 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2449 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2450 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
  // The token ends before the newline / end-of-file character.
2451 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2452 return true;
2453 }
2454
  // A nul that is not at BufferEnd: either the code-completion marker or a
  // nul embedded in the constant.
2455 if (C == 0) {
2456 if (isCodeCompletionPoint(CurPtr-1)) {
  // NOTE(review): listing line 2457 is missing here.
2458 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2459 cutOffLexing();
2460 return true;
2461 }
2462
  // Remember the position so a warning can be issued after the closing quote.
2463 NulCharacter = CurPtr-1;
2464 }
2465 C = getAndAdvanceChar(CurPtr, Result);
2466 }
2467
2468 // If we are in C++ (any version, per the check below), lex the optional
2469 // ud-suffix.
2469 if (LangOpts.CPlusPlus)
2470 CurPtr = LexUDSuffix(Result, CurPtr, false);
2471
2472 // If a nul character existed in the character, warn about it.
2473 if (NulCharacter && !isLexingRawMode())
2474 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2475
2476 // Update the location of token as well as BufferPtr.
2477 const char *TokStart = BufferPtr;
2478 FormTokenWithChars(Result, CurPtr, Kind);
2479 Result.setLiteralData(TokStart);
2480 return true;
2481}
2482
2483/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2484/// Update BufferPtr to point to the next non-whitespace character and return.
2485///
2486/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// Otherwise it returns false after recording Token::LeadingSpace and, if a
/// newline was crossed, Token::StartOfLine on \p Result and setting
/// \p TokAtPhysicalStartOfLine. \p CurPtr points just past the first
/// whitespace character (CurPtr[-1] is inspected).
///
/// NOTE(review): this listing omits original source line 2513 (the embedded
/// line numbers skip it): the condition guarding the early return for the end
/// of a preprocessor directive line.
2487bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2488 bool &TokAtPhysicalStartOfLine) {
2489 // Whitespace - Skip it, then return the token after the whitespace.
2490 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2491
2492 unsigned char Char = *CurPtr;
2493
  // lastNewLine tracks the most recent newline recorded by this call;
  // NewLinePtr is only set if it is still null.
2494 const char *lastNewLine = nullptr;
2495 auto setLastNewLine = [&](const char *Ptr) {
2496 lastNewLine = Ptr;
2497 if (!NewLinePtr)
2498 NewLinePtr = Ptr;
2499 };
2500 if (SawNewline)
2501 setLastNewLine(CurPtr - 1);
2502
2503 // Skip consecutive spaces efficiently.
2504 while (true) {
2505 // Skip horizontal whitespace very aggressively.
2506 while (isHorizontalWhitespace(Char))
2507 Char = *++CurPtr;
2508
2509 // Otherwise if we have something other than whitespace, we're done.
2510 if (!isVerticalWhitespace(Char))
2511 break;
2512
  // NOTE(review): listing line 2513 (the condition opening this block) is
  // missing here.
2514 // End of preprocessor directive line, let LexTokenInternal handle this.
2515 BufferPtr = CurPtr;
2516 return false;
2517 }
2518
2519 // OK, but handle newline.
  // Only '\n' is recorded as a newline position here; any other vertical
  // whitespace just sets SawNewline.
2520 if (*CurPtr == '\n')
2521 setLastNewLine(CurPtr);
2522 SawNewline = true;
2523 Char = *++CurPtr;
2524 }
2525
2526 // If the client wants us to return whitespace, return it now.
2527 if (isKeepWhitespaceMode()) {
2528 FormTokenWithChars(Result, CurPtr, tok::unknown);
2529 if (SawNewline) {
2530 IsAtStartOfLine = true;
2531 IsAtPhysicalStartOfLine = true;
2532 }
2533 // FIXME: The next token will not have LeadingSpace set.
2534 return true;
2535 }
2536
2537 // If this isn't immediately after a newline, there is leading space.
2538 char PrevChar = CurPtr[-1];
2539 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2540
2541 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2542 if (SawNewline) {
2543 Result.setFlag(Token::StartOfLine);
2544 TokAtPhysicalStartOfLine = true;
2545
  // If the first and last recorded newlines differ, report the range between
  // them to the preprocessor's empty-line handler, when one is installed.
2546 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2547 if (auto *Handler = PP->getEmptylineHandler())
2548 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2549 getSourceLocation(lastNewLine)));
2550 }
2551 }
2552
2553 BufferPtr = CurPtr;
2554 return false;
2555}
2556
2557/// We have just read the // characters from input. Skip until we find the
2558/// newline character that terminates the comment. Then update BufferPtr and
2559/// return.
2560///
2561/// If we're in KeepCommentMode or any CommentHandler has inserted
2562/// some tokens, this will store the first token and return true.
///
/// Otherwise returns false. When the terminating newline is consumed,
/// Token::StartOfLine is set and Token::LeadingSpace cleared on \p Result, and
/// \p TokAtPhysicalStartOfLine is set.
///
/// NOTE(review): this listing omits original source lines 2685 (inside the
/// code-completion branch) and 2694 (part of the comment-handler condition);
/// the embedded line numbers skip them.
2563bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2564 bool &TokAtPhysicalStartOfLine) {
2565 // If Line comments aren't explicitly enabled for this language, emit an
2566 // extension warning.
2567 if (!LineComment) {
2568 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2569 Diag(BufferPtr, diag::ext_line_comment);
2570
2571 // Mark them enabled so we only emit one warning for this translation
2572 // unit.
2573 LineComment = true;
2574 }
2575
2576 // Scan over the body of the comment. The common case, when scanning, is that
2577 // the comment contains normal ascii characters with nothing interesting in
2578 // them. As such, optimize for this case with the inner loop.
2579 //
2580 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2581 // character that ends the line comment.
2582
2583 // C++23 [lex.phases] p1
2584 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2585 // diagnostic only once per entire ill-formed subsequence to avoid
2586 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2587 bool UnicodeDecodingAlreadyDiagnosed = false;
2588
2589 char C;
2590 while (true) {
2591 C = *CurPtr;
2592 // Skip over characters in the fast loop.
2593 while (isASCII(C) && C != 0 && // Potentially EOF.
2594 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2595 C = *++CurPtr;
2596 UnicodeDecodingAlreadyDiagnosed = false;
2597 }
2598
  // Non-ASCII byte: step over a whole UTF-8 sequence when its size can be
  // determined, otherwise over a single byte, warning once per ill-formed
  // run.
2599 if (!isASCII(C)) {
2600 unsigned Length = llvm::getUTF8SequenceSize(
2601 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2602 if (Length == 0) {
2603 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2604 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2605 UnicodeDecodingAlreadyDiagnosed = true;
2606 ++CurPtr;
2607 } else {
2608 UnicodeDecodingAlreadyDiagnosed = false;
2609 CurPtr += Length;
2610 }
2611 continue;
2612 }
2613
  // Here C is a nul, '\n' or '\r'. NextLine remembers where the scan stopped.
2614 const char *NextLine = CurPtr;
2615 if (C != 0) {
2616 // We found a newline, see if it's escaped.
2617 const char *EscapePtr = CurPtr-1;
2618 bool HasSpace = false;
2619 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2620 --EscapePtr;
2621 HasSpace = true;
2622 }
2623
2624 if (*EscapePtr == '\\')
2625 // Escaped newline.
2626 CurPtr = EscapePtr;
2627 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2628 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2629 // Trigraph-escaped newline.
2630 CurPtr = EscapePtr-2;
2631 else
2632 break; // This is a newline, we're done.
2633
2634 // If there was space between the backslash and newline, warn about it.
2635 if (HasSpace && !isLexingRawMode())
2636 Diag(EscapePtr, diag::backslash_newline_space);
2637 }
2638
2639 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2640 // properly decode the character. Read it in raw mode to avoid emitting
2641 // diagnostics about things like trigraphs. If we see an escaped newline,
2642 // we'll handle it below.
2643 const char *OldPtr = CurPtr;
  // Temporarily force raw mode for this one read, then restore it.
2644 bool OldRawMode = isLexingRawMode();
2645 LexingRawMode = true;
2646 C = getAndAdvanceChar(CurPtr, Result);
2647 LexingRawMode = OldRawMode;
2648
2649 // If we only read one character, then no special handling is needed.
2650 // We're done and can skip forward to the newline.
2651 if (C != 0 && CurPtr == OldPtr+1) {
2652 CurPtr = NextLine;
2653 break;
2654 }
2655
2656 // If we read multiple characters, and one of those characters was a \r or
2657 // \n, then we had an escaped newline within the comment. Emit diagnostic
2658 // unless the next line is also a // comment.
2659 if (CurPtr != OldPtr + 1 && C != '/' &&
2660 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2661 for (; OldPtr != CurPtr; ++OldPtr)
2662 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2663 // Okay, we found a // comment that ends in a newline, if the next
2664 // line is also a // comment, but has spaces, don't emit a diagnostic.
2665 if (isWhitespace(C)) {
2666 const char *ForwardPtr = CurPtr;
2667 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2668 ++ForwardPtr;
2669 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2670 break;
2671 }
2672
2673 if (!isLexingRawMode())
2674 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2675 break;
2676 }
2677 }
2678
  // The comment ends at a newline or at the end of the buffer: back up so
  // CurPtr points at the terminating character.
2679 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2680 --CurPtr;
2681 break;
2682 }
2683
2684 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
  // NOTE(review): listing line 2685 is missing here.
2686 cutOffLexing();
2687 return false;
2688 }
2689 }
2690
2691 // Found but did not consume the newline. Notify comment handlers about the
2692 // comment unless we're in a #if 0 block.
2693 if (PP && !isLexingRawMode() &&
  // NOTE(review): listing line 2694 (the start of the handler call in this
  // condition) is missing here.
2695 getSourceLocation(CurPtr)))) {
2696 BufferPtr = CurPtr;
2697 return true; // A token has to be returned.
2698 }
2699
2700 // If we are returning comments as tokens, return this comment as a token.
2701 if (inKeepCommentMode())
2702 return SaveLineComment(Result, CurPtr);
2703
2704 // If we are inside a preprocessor directive and we see the end of line,
2705 // return immediately, so that the lexer can return this as an EOD token.
2706 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2707 BufferPtr = CurPtr;
2708 return false;
2709 }
2710
2711 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2712 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2713 // contribute to another token), it isn't needed for correctness. Note that
2714 // this is ok even in KeepWhitespaceMode, because we would have returned the
2715 // comment above in that mode.
  // NewLinePtr records the newline's position before CurPtr steps past it.
2716 NewLinePtr = CurPtr++;
2717
2718 // The next returned token is at the start of the line.
2719 Result.setFlag(Token::StartOfLine);
2720 TokAtPhysicalStartOfLine = true;
2721 // No leading whitespace seen so far.
2722 Result.clearFlag(Token::LeadingSpace);
2723 BufferPtr = CurPtr;
2724 return false;
2725}
2726
2727/// If in save-comment mode, package up this Line comment in an appropriate
2728/// way and return it.
///
/// Forms a tok::comment token ending at \p CurPtr in \p Result and returns
/// true in every path shown. On the path that does not return early, the
/// spelling is rewritten from `//...` to `/*...*/` and stored through
/// Preprocessor::CreateString.
///
/// NOTE(review): this listing omits original source line 2734 (the embedded
/// line numbers skip it): the condition guarding the first `return true`.
2729bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2730 // If we're not in a preprocessor directive, just return the // comment
2731 // directly.
2732 FormTokenWithChars(Result, CurPtr, tok::comment);
2733
  // NOTE(review): listing line 2734 (the condition for this return) is
  // missing here.
2735 return true;
2736
2737 // If this Line-style comment is in a macro definition, transmogrify it into
2738 // a C-style block comment.
2739 bool Invalid = false;
2740 std::string Spelling = PP->getSpelling(Result, &Invalid);
  // If the spelling could not be obtained, return the token as formed above.
2741 if (Invalid)
2742 return true;
2743
2744 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2745 Spelling[1] = '*'; // Change prefix to "/*".
2746 Spelling += "*/"; // add suffix.
2747
2748 Result.setKind(tok::comment);
2749 PP->CreateString(Spelling, Result,
2750 Result.getLocation(), Result.getLocation());
2751 return true;
2752}
2753
2754/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2755/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2756/// a diagnostic if so. We know that the newline is inside of a block comment.
2757static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2758 bool Trigraphs) {
2759 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2760
2761 // Position of the first trigraph in the ending sequence.
2762 const char *TrigraphPos = nullptr;
2763 // Position of the first whitespace after a '\' in the ending sequence.
2764 const char *SpacePos = nullptr;
2765
2766 while (true) {
2767 // Back up off the newline.
2768 --CurPtr;
2769
2770 // If this is a two-character newline sequence, skip the other character.
2771 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2772 // \n\n or \r\r -> not escaped newline.
2773 if (CurPtr[0] == CurPtr[1])
2774 return false;
2775 // \n\r or \r\n -> skip the newline.
2776 --CurPtr;
2777 }
2778
2779 // If we have horizontal whitespace, skip over it. We allow whitespace
2780 // between the slash and newline.
2781 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2782 SpacePos = CurPtr;
2783 --CurPtr;
2784 }
2785
2786 // If we have a slash, this is an escaped newline.
2787 if (*CurPtr == '\\') {
2788 --CurPtr;
2789 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2790 // This is a trigraph encoding of a slash.
2791 TrigraphPos = CurPtr - 2;
2792 CurPtr -= 3;
2793 } else {
2794 return false;
2795 }
2796
2797 // If the character preceding the escaped newline is a '*', then after line
2798 // splicing we have a '*/' ending the comment.
2799 if (*CurPtr == '*')
2800 break;
2801
2802 if (*CurPtr != '\n' && *CurPtr != '\r')
2803 return false;
2804 }
2805
2806 if (TrigraphPos) {
2807 // If no trigraphs are enabled, warn that we ignored this trigraph and
2808 // ignore this * character.
2809 if (!Trigraphs) {
2810 if (!L->isLexingRawMode())
2811 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2812 return false;
2813 }
2814 if (!L->isLexingRawMode())
2815 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2816 }
2817
2818 // Warn about having an escaped newline between the */ characters.
2819 if (!L->isLexingRawMode())
2820 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2821
2822 // If there was space between the backslash and newline, warn about it.
2823 if (SpacePos && !L->isLexingRawMode())
2824 L->Diag(SpacePos, diag::backslash_newline_space);
2825
2826 return true;
2827}
2828
2829#ifdef __SSE2__
2830#include <emmintrin.h>
2831#elif __ALTIVEC__
2832#include <altivec.h>
2833#undef bool
2834#endif
2835
2836/// We have just read from input the / and * characters that started a comment.
2837/// Read until we find the * and / characters that terminate the comment.
2838/// Note that we don't bother decoding trigraphs or escaped newlines in block
2839/// comments, because they cannot cause the comment to end. The only thing
2840/// that can happen is the comment could end with an escaped newline between
2841/// the terminating * and /.
2842///
2843/// If we're in KeepCommentMode or any CommentHandler has inserted
2844/// some tokens, this will store the first token and return true.
///
/// \param Result  Token to fill in on the paths that return true.
/// \param CurPtr  Points just past the opening "/*".
/// \param TokAtPhysicalStartOfLine  Forwarded to SkipWhitespace when the
///        comment is directly followed by horizontal whitespace.
/// \returns true when a token was stored in \p Result; false when the comment
///          was skipped, was unterminated outside keep-whitespace mode, or
///          lexing was cut off at a code-completion point.
2845bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2846 bool &TokAtPhysicalStartOfLine) {
2847 // Scan one character past where we should, looking for a '/' character. Once
2848 // we find it, check to see if it was preceded by a *. This common
2849 // optimization helps people who like to put a lot of * characters in their
2850 // comments.
2851
2852 // The first character we get with newlines and trigraphs skipped to handle
2853 // the degenerate /*/ case below correctly if the * has an escaped newline
2854 // after it.
2855 unsigned CharSize;
2856 unsigned char C = getCharAndSize(CurPtr, CharSize);
2857 CurPtr += CharSize;
// A NUL read one past BufferEnd means the buffer ended right after "/*".
2858 if (C == 0 && CurPtr == BufferEnd+1) {
2859 if (!isLexingRawMode())
2860 Diag(BufferPtr, diag::err_unterminated_block_comment);
2861 --CurPtr;
2862
2863 // KeepWhitespaceMode should return this broken comment as a token. Since
2864 // it isn't a well formed comment, just return it as an 'unknown' token.
2865 if (isKeepWhitespaceMode()) {
2866 FormTokenWithChars(Result, CurPtr, tok::unknown);
2867 return true;
2868 }
2869
2870 BufferPtr = CurPtr;
2871 return false;
2872 }
2873
2874 // Check to see if the first character after the '/*' is another /. If so,
2875 // then this slash does not end the block comment, it is part of it.
2876 if (C == '/')
2877 C = *CurPtr++;
2878
2879 // C++23 [lex.phases] p1
2880 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2881 // diagnostic only once per entire ill-formed subsequence to avoid
2882 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2883 bool UnicodeDecodingAlreadyDiagnosed = false;
2884
2885 while (true) {
2886 // Skip over all non-interesting characters until we find end of buffer or a
2887 // (probably ending) '/' character.
2888 if (CurPtr + 24 < BufferEnd &&
2889 // If there is a code-completion point avoid the fast scan because it
2890 // doesn't check for '\0'.
2891 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2892 // While not aligned to a 16-byte boundary.
2893 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2894 if (!isASCII(C))
2895 goto MultiByteUTF8;
2896 C = *CurPtr++;
2897 }
2898 if (C == '/') goto FoundSlash;
2899
// Three interchangeable 16-bytes-at-a-time scans follow (SSE2, AltiVec,
// portable). Each bails out to MultiByteUTF8 as soon as a chunk contains a
// non-ASCII byte and stops once a chunk contains a '/'.
2900#ifdef __SSE2__
2901 __m128i Slashes = _mm_set1_epi8('/');
2902 while (CurPtr + 16 < BufferEnd) {
// Any set sign bit in the chunk means a non-ASCII byte is present.
2903 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2904 if (LLVM_UNLIKELY(Mask != 0)) {
2905 goto MultiByteUTF8;
2906 }
2907 // look for slashes
2908 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2909 Slashes));
2910 if (cmp != 0) {
2911 // Adjust the pointer to point directly after the first slash. It's
2912 // not necessary to set C here, it will be overwritten at the end of
2913 // the outer loop.
2914 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2915 goto FoundSlash;
2916 }
2917 CurPtr += 16;
2918 }
2919#elif __ALTIVEC__
2920 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2921 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2922 0x80, 0x80, 0x80, 0x80};
2923 __vector unsigned char Slashes = {
2924 '/', '/', '/', '/', '/', '/', '/', '/',
2925 '/', '/', '/', '/', '/', '/', '/', '/'
2926 };
2927 while (CurPtr + 16 < BufferEnd) {
2928 if (LLVM_UNLIKELY(
2929 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2930 goto MultiByteUTF8;
2931 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2932 break;
2933 }
2934 CurPtr += 16;
2935 }
2936
2937#else
2938 while (CurPtr + 16 < BufferEnd) {
2939 bool HasNonASCII = false;
2940 for (unsigned I = 0; I < 16; ++I)
2941 HasNonASCII |= !isASCII(CurPtr[I]);
2942
2943 if (LLVM_UNLIKELY(HasNonASCII))
2944 goto MultiByteUTF8;
2945
2946 bool HasSlash = false;
2947 for (unsigned I = 0; I < 16; ++I)
2948 HasSlash |= CurPtr[I] == '/';
2949 if (HasSlash)
2950 break;
2951 CurPtr += 16;
2952 }
2953#endif
2954
2955 // It has to be one of the bytes scanned, increment to it and read one.
2956 C = *CurPtr++;
2957 }
2958
2959 // Loop to scan the remainder, warning on invalid UTF-8
2960 // if the corresponding warning is enabled, emitting a diagnostic only once
2961 // per sequence that cannot be decoded.
2962 while (C != '/' && C != '\0') {
2963 if (isASCII(C)) {
2964 UnicodeDecodingAlreadyDiagnosed = false;
2965 C = *CurPtr++;
2966 continue;
2967 }
// Also entered by goto from the fast scans above.
2968 MultiByteUTF8:
2969 // CurPtr is 1 code unit past C, so to decode
2970 // the codepoint, we need to read from the previous position.
2971 unsigned Length = llvm::getUTF8SequenceSize(
2972 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
// A length of 0 is treated as "cannot be decoded"; warn once per bad run.
2973 if (Length == 0) {
2974 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2975 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2976 UnicodeDecodingAlreadyDiagnosed = true;
2977 } else {
2978 UnicodeDecodingAlreadyDiagnosed = false;
2979 CurPtr += Length - 1;
2980 }
2981 C = *CurPtr++;
2982 }
2983
2984 if (C == '/') {
// CurPtr is one past the '/', so CurPtr[-2] is the character before it.
2985 FoundSlash:
2986 if (CurPtr[-2] == '*') // We found the final */. We're done!
2987 break;
2988
2989 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2990 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2991 LangOpts.Trigraphs)) {
2992 // We found the final */, though it had an escaped newline between the
2993 // * and /. We're done!
2994 break;
2995 }
2996 }
2997 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2998 // If this is a /* inside of the comment, emit a warning. Don't do this
2999 // if this is a /*/, which will end the comment. This misses cases with
3000 // embedded escaped newlines, but oh well.
3001 if (!isLexingRawMode())
3002 Diag(CurPtr-1, diag::warn_nested_block_comment);
3003 }
3004 } else if (C == 0 && CurPtr == BufferEnd+1) {
3005 if (!isLexingRawMode())
3006 Diag(BufferPtr, diag::err_unterminated_block_comment);
3007 // Note: the user probably forgot a */. We could continue immediately
3008 // after the /*, but this would involve lexing a lot of what really is the
3009 // comment, which surely would confuse the parser.
3010 --CurPtr;
3011
3012 // KeepWhitespaceMode should return this broken comment as a token. Since
3013 // it isn't a well formed comment, just return it as an 'unknown' token.
3014 if (isKeepWhitespaceMode()) {
3015 FormTokenWithChars(Result, CurPtr, tok::unknown);
3016 return true;
3017 }
3018
3019 BufferPtr = CurPtr;
3020 return false;
3021 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): listing line 3022 is missing from this copy (numbers jump
// from 3021 to 3023); a statement preceding cutOffLexing() was dropped.
// Restore it from the upstream file.
3023 cutOffLexing();
3024 return false;
3025 }
3026
3027 C = *CurPtr++;
3028 }
3029
3030 // Notify comment handlers about the comment unless we're in a #if 0 block.
3031 if (PP && !isLexingRawMode() &&
// NOTE(review): listing line 3032 is missing from this copy, so the start of
// the call whose argument list closes on the next line is not visible.
// Presumably it is the comment-handler notification -- confirm upstream.
3033 getSourceLocation(CurPtr)))) {
3034 BufferPtr = CurPtr;
3035 return true; // A token has to be returned.
3036 }
3037
3038 // If we are returning comments as tokens, return this comment as a token.
3039 if (inKeepCommentMode()) {
3040 FormTokenWithChars(Result, CurPtr, tok::comment);
3041 return true;
3042 }
3043
3044 // It is common for the tokens immediately after a /**/ comment to be
3045 // whitespace. Instead of going through the big switch, handle it
3046 // efficiently now. This is safe even in KeepWhitespaceMode because we would
3047 // have already returned above with the comment as a token.
3048 if (isHorizontalWhitespace(*CurPtr)) {
3049 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3050 return false;
3051 }
3052
3053 // Otherwise, just return so that the next character will be lexed as a token.
3054 BufferPtr = CurPtr;
3055 Result.setFlag(Token::LeadingSpace);
3056 return false;
3057}
3058
3059//===----------------------------------------------------------------------===//
3060// Primary Lexing Entry Points
3061//===----------------------------------------------------------------------===//
3062
3063/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3064/// uninterpreted string. This switches the lexer out of directive mode.
///
/// Characters are appended to `Result` when it is non-null; the terminating
/// newline / end-of-file character is not appended.
// NOTE(review): listing line 3065 (the function signature) is missing from
// this copy. From its uses below, `Result` looks like a nullable pointer to a
// character container with push_back -- confirm against the upstream file.
3066 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3067 "Must be in a preprocessing directive!");
3068 Token Tmp;
3069 Tmp.startToken();
3070
3071 // CurPtr - Cache BufferPtr in an automatic variable.
3072 const char *CurPtr = BufferPtr;
3073 while (true) {
3074 char Char = getAndAdvanceChar(CurPtr, Tmp);
3075 switch (Char) {
3076 default:
3077 if (Result)
3078 Result->push_back(Char);
3079 break;
3080 case 0: // Null.
3081 // Found end of file?
3082 if (CurPtr-1 != BufferEnd) {
3083 if (isCodeCompletionPoint(CurPtr-1)) {
// NOTE(review): listing line 3084 is missing here; a statement preceding
// cutOffLexing() was dropped from this copy.
3085 cutOffLexing();
3086 return;
3087 }
3088
3089 // Nope, normal character, continue.
3090 if (Result)
3091 Result->push_back(Char);
3092 break;
3093 }
3094 // FALL THROUGH.
3095 [[fallthrough]];
3096 case '\r':
3097 case '\n':
3098 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3099 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3100 BufferPtr = CurPtr-1;
3101
3102 // Next, lex the character, which should handle the EOD transition.
3103 Lex(Tmp);
// A code-completion token may come first; lex once more to reach the EOD.
3104 if (Tmp.is(tok::code_completion)) {
3105 if (PP)
// NOTE(review): listing line 3106 is missing here. As shown, `if (PP)` would
// guard the Lex(Tmp) call below, which is almost certainly not the original
// structure -- the dropped line is presumably the guarded statement.
3107 Lex(Tmp);
3108 }
3109 assert(Tmp.is(tok::eod) && "Unexpected token!");
3110
3111 // Finally, we're done;
3112 return;
3113 }
3114 }
3115}
3116
3117/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3118/// condition, reporting diagnostics and handling other edge cases as required.
3119/// This returns true if Result contains a token, false if PP.Lex should be
3120/// called again.
///
/// NOTE(review): several lines of this function are missing from this copy
/// (the embedded listing numbers have gaps). Each gap is flagged inline; the
/// block is not compilable until they are restored from the upstream file.
3121bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3122 // If we hit the end of the file while parsing a preprocessor directive,
3123 // end the preprocessor directive first. The next token returned will
3124 // then be the end of file.
// NOTE(review): listing line 3125 is missing -- presumably the `if` that
// opens the scope closed at listing line 3135.
3126 // Done parsing the "line".
// NOTE(review): listing line 3127 is missing (the statement the comment above
// describes).
3128 // Update the location of token as well as BufferPtr.
3129 FormTokenWithChars(Result, CurPtr, tok::eod);
3130
3131 // Restore comment saving mode, in case it was disabled for directive.
3132 if (PP)
// NOTE(review): listing line 3133 is missing. As shown, `if (PP)` would guard
// the return below; the dropped line is presumably the guarded statement.
3134 return true; // Have a token.
3135 }
3136
3137 // If we are in raw mode, return this event as an EOF token. Let the caller
3138 // that put us in raw mode handle the event.
3139 if (isLexingRawMode()) {
3140 Result.startToken();
3141 BufferPtr = BufferEnd;
3142 FormTokenWithChars(Result, BufferEnd, tok::eof);
3143 return true;
3144 }
3145
// NOTE(review): listing lines 3146-3147 are missing -- presumably the
// condition opening the scope closed at listing line 3154, plus one statement.
3148 // If the preamble cuts off the end of a header guard, consider it guarded.
3149 // The guard is valid for the preamble content itself, and for tools the
3150 // most useful answer is "yes, this file has a header guard".
3151 if (!ConditionalStack.empty())
// NOTE(review): listing line 3152 is missing. As shown, the `if` above would
// guard ConditionalStack.clear(); the dropped line is presumably its body.
3153 ConditionalStack.clear();
3154 }
3155
3156 // Issue diagnostics for unterminated #if and missing newline.
3157
3158 // If we are in a #if directive, emit an error.
3159 while (!ConditionalStack.empty()) {
// The error is suppressed for the file that holds the code-completion point.
3160 if (PP->getCodeCompletionFileLoc() != FileLoc)
3161 PP->Diag(ConditionalStack.back().IfLoc,
3162 diag::err_pp_unterminated_conditional);
3163 ConditionalStack.pop_back();
3164 }
3165
3166 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3167 // a pedwarn.
3168 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
// NOTE(review): listing line 3169 is missing -- presumably the declaration of
// `Diags`, which is used below but not declared in the visible text.
3170 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3171 unsigned DiagID;
3172
3173 if (LangOpts.CPlusPlus11) {
3174 // C++11 [lex.phases] 2.2 p2
3175 // Prefer the C++98 pedantic compatibility warning over the generic,
3176 // non-extension, user-requested "missing newline at EOF" warning.
3177 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3178 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3179 } else {
3180 DiagID = diag::warn_no_newline_eof;
3181 }
3182 } else {
3183 DiagID = diag::ext_no_newline_eof;
3184 }
3185
// Attach a fix-it that inserts the missing newline at the end of the buffer.
3186 Diag(BufferEnd, DiagID)
3187 << FixItHint::CreateInsertion(EndLoc, "\n");
3188 }
3189
3190 BufferPtr = CurPtr;
3191
3192 // Finally, let the preprocessor handle this.
// NOTE(review): listing line 3193 is missing -- the final return statement
// (presumably delegating to the preprocessor, per the comment above).
3194}
3195
3196/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3197/// the specified lexer will return a tok::l_paren token, 0 if it is something
3198/// else and 2 if there are no more tokens in the buffer controlled by the
3199/// lexer.
3200unsigned Lexer::isNextPPTokenLParen() {
3201 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3202
3203 if (isDependencyDirectivesLexer()) {
3204 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3205 return 2;
3206 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3207 tok::l_paren);
3208 }
3209
3210 // Switch to 'skipping' mode. This will ensure that we can lex a token
3211 // without emitting diagnostics, disables macro expansion, and will cause EOF
3212 // to return an EOF token instead of popping the include stack.
3213 LexingRawMode = true;
3214
3215 // Save state that can be changed while lexing so that we can restore it.
3216 const char *TmpBufferPtr = BufferPtr;
3217 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3218 bool atStartOfLine = IsAtStartOfLine;
3219 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3220 bool leadingSpace = HasLeadingSpace;
3221
3222 Token Tok;
3223 Lex(Tok);
3224
3225 // Restore state that may have changed.
3226 BufferPtr = TmpBufferPtr;
3227 ParsingPreprocessorDirective = inPPDirectiveMode;
3228 HasLeadingSpace = leadingSpace;
3229 IsAtStartOfLine = atStartOfLine;
3230 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3231
3232 // Restore the lexer back to non-skipping mode.
3233 LexingRawMode = false;
3234
3235 if (Tok.is(tok::eof))
3236 return 2;
3237 return Tok.is(tok::l_paren);
3238}
3239
3240/// Find the end of a version control conflict marker.
3241static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3242 ConflictMarkerKind CMK) {
3243 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3244 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3245 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3246 size_t Pos = RestOfBuffer.find(Terminator);
3247 while (Pos != StringRef::npos) {
3248 // Must occur at start of line.
3249 if (Pos == 0 ||
3250 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3251 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3252 Pos = RestOfBuffer.find(Terminator);
3253 continue;
3254 }
3255 return RestOfBuffer.data()+Pos;
3256 }
3257 return nullptr;
3258}
3259
3260/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3261/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3262/// and recover nicely. This returns true if it is a conflict marker and false
3263/// if not.
3264bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3265 // Only a conflict marker if it starts at the beginning of a line.
3266 if (CurPtr != BufferStart &&
3267 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3268 return false;
3269
3270 // Check to see if we have <<<<<<< or >>>>.
3271 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3272 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3273 return false;
3274
3275 // If we have a situation where we don't care about conflict markers, ignore
3276 // it.
3277 if (CurrentConflictMarkerState || isLexingRawMode())
3278 return false;
3279
3280 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3281
3282 // Check to see if there is an ending marker somewhere in the buffer at the
3283 // start of a line to terminate this conflict marker.
3284 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3285 // We found a match. We are really in a conflict marker.
3286 // Diagnose this, and ignore to the end of line.
3287 Diag(CurPtr, diag::err_conflict_marker);
3288 CurrentConflictMarkerState = Kind;
3289
3290 // Skip ahead to the end of line. We know this exists because the
3291 // end-of-conflict marker starts with \r or \n.
3292 while (*CurPtr != '\r' && *CurPtr != '\n') {
3293 assert(CurPtr != BufferEnd && "Didn't find end of line");
3294 ++CurPtr;
3295 }
3296 BufferPtr = CurPtr;
3297 return true;
3298 }
3299
3300 // No end of conflict marker found.
3301 return false;
3302}
3303
3304/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3305/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3306/// is the end of a conflict marker. Handle it by ignoring up until the end of
3307/// the line. This returns true if it is a conflict marker and false if not.
3308bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3309 // Only a conflict marker if it starts at the beginning of a line.
3310 if (CurPtr != BufferStart &&
3311 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3312 return false;
3313
3314 // If we have a situation where we don't care about conflict markers, ignore
3315 // it.
3316 if (!CurrentConflictMarkerState || isLexingRawMode())
3317 return false;
3318
3319 // Check to see if we have the marker (4 characters in a row).
3320 for (unsigned i = 1; i != 4; ++i)
3321 if (CurPtr[i] != CurPtr[0])
3322 return false;
3323
3324 // If we do have it, search for the end of the conflict marker. This could
3325 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3326 // be the end of conflict marker.
3327 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3328 CurrentConflictMarkerState)) {
3329 CurPtr = End;
3330
3331 // Skip ahead to the end of line.
3332 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3333 ++CurPtr;
3334
3335 BufferPtr = CurPtr;
3336
3337 // No longer in the conflict marker.
3338 CurrentConflictMarkerState = CMK_None;
3339 return true;
3340 }
3341
3342 return false;
3343}
3344
/// Find the end of an editor placeholder.
///
/// Scans [CurPtr, BufferEnd) for the two-character terminator "#>".
///
/// \returns a pointer just past the '>' of the first terminator found, or
///          nullptr if the range is empty, inverted, or holds no terminator.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  // Two characters are inspected at a time, so stop while at least two remain.
  // The signed distance also rejects CurPtr >= BufferEnd without looping.
  for (; BufferEnd - CurPtr >= 2; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}
3356
// Try to lex an editor placeholder ("<#" ... "#>") as one raw_identifier
// token. CurPtr points at the '#' that follows the opening '<' (asserted
// below). Returns true and advances BufferPtr past the closing "#>" when a
// placeholder was formed; returns false when no terminator is found.
3357bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3358 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
// NOTE(review): listing line 3359 is missing from this copy -- presumably the
// condition guarding this early `return false`. Restore from upstream.
3360 return false;
3361 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3362 if (!End)
3363 return false;
// The token starts at the '<', one character before CurPtr.
3364 const char *Start = CurPtr - 1;
3365 if (!LangOpts.AllowEditorPlaceholders)
3366 Diag(Start, diag::err_placeholder_in_source);
3367 Result.startToken();
3368 FormTokenWithChars(Result, End, tok::raw_identifier);
3369 Result.setRawIdentifierData(Start);
// NOTE(review): listing lines 3370-3371 are missing from this copy; two
// statements between setRawIdentifierData and the BufferPtr update were
// dropped. Restore from upstream.
3372 BufferPtr = End;
3373 return true;
3374}
3375
3376bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3377 if (PP && PP->isCodeCompletionEnabled()) {
3378 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3379 return Loc == PP->getCodeCompletionLoc();
3380 }
3381
3382 return false;
3383}
3384
3385std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3386 const char *SlashLoc,
3387 Token *Result) {
3388 unsigned CharSize;
3389 char Kind = getCharAndSize(StartPtr, CharSize);
3390 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3391
3392 unsigned NumHexDigits;
3393 if (Kind == 'u')
3394 NumHexDigits = 4;
3395 else if (Kind == 'U')
3396 NumHexDigits = 8;
3397
3398 bool Delimited = false;
3399 bool FoundEndDelimiter = false;
3400 unsigned Count = 0;
3401 bool Diagnose = Result && !isLexingRawMode();
3402
3403 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3404 if (Diagnose)
3405 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3406 return std::nullopt;
3407 }
3408
3409 const char *CurPtr = StartPtr + CharSize;
3410 const char *KindLoc = &CurPtr[-1];
3411
3412 uint32_t CodePoint = 0;
3413 while (Count != NumHexDigits || Delimited) {
3414 char C = getCharAndSize(CurPtr, CharSize);
3415 if (!Delimited && Count == 0 && C == '{') {
3416 Delimited = true;
3417 CurPtr += CharSize;
3418 continue;
3419 }
3420
3421 if (Delimited && C == '}') {
3422 CurPtr += CharSize;
3423 FoundEndDelimiter = true;
3424 break;
3425 }
3426
3427 unsigned Value = llvm::hexDigitValue(C);
3428 if (Value == -1U) {
3429 if (!Delimited)
3430 break;
3431 if (Diagnose)
3432 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3433 << StringRef(KindLoc, 1);
3434 return std::nullopt;
3435 }
3436
3437 if (CodePoint & 0xF000'0000) {
3438 if (Diagnose)
3439 Diag(KindLoc, diag::err_escape_too_large) << 0;
3440 return std::nullopt;
3441 }
3442
3443 CodePoint <<= 4;
3444 CodePoint |= Value;
3445 CurPtr += CharSize;
3446 Count++;
3447 }
3448
3449 if (Count == 0) {
3450 if (Diagnose)
3451 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3452 : diag::warn_ucn_escape_no_digits)
3453 << StringRef(KindLoc, 1);
3454 return std::nullopt;
3455 }
3456
3457 if (Delimited && Kind == 'U') {
3458 if (Diagnose)
3459 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3460 return std::nullopt;
3461 }
3462
3463 if (!Delimited && Count != NumHexDigits) {
3464 if (Diagnose) {
3465 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3466 // If the user wrote \U1234, suggest a fixit to \u.
3467 if (Count == 4 && NumHexDigits == 8) {
3468 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3469 Diag(KindLoc, diag::note_ucn_four_not_eight)
3470 << FixItHint::CreateReplacement(URange, "u");
3471 }
3472 }
3473 return std::nullopt;
3474 }
3475
3476 if (Delimited && PP) {
3477 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3478 ? diag::warn_cxx23_delimited_escape_sequence
3479 : diag::ext_delimited_escape_sequence)
3480 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3481 }
3482
3483 if (Result) {
3484 Result->setFlag(Token::HasUCN);
3485 // If the UCN contains either a trigraph or a line splicing,
3486 // we need to call getAndAdvanceChar again to set the appropriate flags
3487 // on Result.
3488 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3489 StartPtr = CurPtr;
3490 else
3491 while (StartPtr != CurPtr)
3492 (void)getAndAdvanceChar(StartPtr, *Result);
3493 } else {
3494 StartPtr = CurPtr;
3495 }
3496 return CodePoint;
3497}
3498
// Try to read a named universal-character-name of the form \N{name}.
// StartPtr points at the 'N' (asserted below) and is advanced past the closing
// '}' once a non-empty, terminated name has been read. SlashLoc is the
// location used for most diagnostics. Result, if non-null, receives the HasUCN
// flag; diagnostics are only emitted when it is set and the lexer is not in
// raw mode. Returns the matched code point, or std::nullopt.
3499std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3500 const char *SlashLoc,
3501 Token *Result) {
3502 unsigned CharSize;
3503 bool Diagnose = Result && !isLexingRawMode();
3504
3505 char C = getCharAndSize(StartPtr, CharSize);
3506 assert(C == 'N' && "expected \\N{...}");
3507
3508 const char *CurPtr = StartPtr + CharSize;
3509 const char *KindLoc = &CurPtr[-1];
3510
3511 C = getCharAndSize(CurPtr, CharSize);
3512 if (C != '{') {
3513 if (Diagnose)
3514 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3515 return std::nullopt;
3516 }
3517 CurPtr += CharSize;
3518 const char *StartName = CurPtr;
3519 bool FoundEndDelimiter = false;
// NOTE(review): listing line 3520 is missing from this copy -- presumably the
// declaration of `Buffer`, which is used below but never declared in the
// visible text. Restore from upstream.
3521 while (C) {
3522 C = getCharAndSize(CurPtr, CharSize);
3523 CurPtr += CharSize;
3524 if (C == '}') {
3525 FoundEndDelimiter = true;
3526 break;
3527 }
3528
// NOTE(review): listing line 3529 is missing from this copy -- the condition
// guarding this `break` was dropped. As shown the loop would always stop after
// one character, which cannot be the intent.
3530 break;
3531 Buffer.push_back(C);
3532 }
3533
// Either no closing '}' was seen or the name between the braces is empty.
3534 if (!FoundEndDelimiter || Buffer.empty()) {
3535 if (Diagnose)
3536 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3537 : diag::warn_delimited_ucn_incomplete)
3538 << StringRef(KindLoc, 1);
3539 return std::nullopt;
3540 }
3541
// Try a strict name lookup first; fall back to loose matching on failure.
3542 StringRef Name(Buffer.data(), Buffer.size());
3543 std::optional<char32_t> Match =
3544 llvm::sys::unicode::nameToCodepointStrict(Name);
3545 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3546 if (!Match) {
3547 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3548 if (Diagnose) {
3549 Diag(StartName, diag::err_invalid_ucn_name)
3550 << StringRef(Buffer.data(), Buffer.size())
3551 << makeCharRange(*this, StartName, CurPtr - CharSize);
3552 if (LooseMatch) {
3553 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
// NOTE(review): listing line 3554 is missing from this copy -- the start of
// the streamed expression whose arguments follow (presumably a fix-it built
// from the range and LooseMatch->Name). Restore from upstream.
3555 makeCharRange(*this, StartName, CurPtr - CharSize),
3556 LooseMatch->Name);
3557 }
3558 }
3559 // We do not offer misspelled character names suggestions here
3560 // as the set of what would be a valid suggestion depends on context,
3561 // and we should not make invalid suggestions.
3562 }
3563
3564 if (Diagnose && Match)
3565 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3566 ? diag::warn_cxx23_delimited_escape_sequence
3567 : diag::ext_delimited_escape_sequence)
3568 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3569
3570 // If no diagnostic has been emitted yet, likely because we are doing a
3571 // tentative lexing, we do not want to recover here to make sure the token
3572 // will not be incorrectly considered valid. This function will be called
3573 // again and a diagnostic emitted then.
3574 if (LooseMatch && Diagnose)
3575 Match = LooseMatch->CodePoint;
3576
3577 if (Result) {
3578 Result->setFlag(Token::HasUCN);
3579 // If the UCN contains either a trigraph or a line splicing,
3580 // we need to call getAndAdvanceChar again to set the appropriate flags
3581 // on Result.
// The "+ 3" accounts for the 'N', '{' and '}' around the name.
3582 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3583 StartPtr = CurPtr;
3584 else
3585 while (StartPtr != CurPtr)
3586 (void)getAndAdvanceChar(StartPtr, *Result);
3587 } else {
3588 StartPtr = CurPtr;
3589 }
3590 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3591}
3592
3593uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3594 Token *Result) {
3595
3596 unsigned CharSize;
3597 std::optional<uint32_t> CodePointOpt;
3598 char Kind = getCharAndSize(StartPtr, CharSize);
3599 if (Kind == 'u' || Kind == 'U')
3600 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3601 else if (Kind == 'N')
3602 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3603
3604 if (!CodePointOpt)
3605 return 0;
3606
3607 uint32_t CodePoint = *CodePointOpt;
3608
3609 // Don't apply C family restrictions to UCNs in assembly mode
3610 if (LangOpts.AsmPreprocessor)
3611 return CodePoint;
3612
3613 // C23 6.4.3p2: A universal character name shall not designate a code point
3614 // where the hexadecimal value is:
3615 // - in the range D800 through DFFF inclusive; or
3616 // - greater than 10FFFF.
3617 // A universal-character-name outside the c-char-sequence of a character
3618 // constant, or the s-char-sequence of a string-literal shall not designate
3619 // a control character or a character in the basic character set.
3620
3621 // C++11 [lex.charset]p2: If the hexadecimal value for a
3622 // universal-character-name corresponds to a surrogate code point (in the
3623 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3624 // if the hexadecimal value for a universal-character-name outside the
3625 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3626 // string literal corresponds to a control character (in either of the
3627 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3628 // basic source character set, the program is ill-formed.
3629 if (CodePoint < 0xA0) {
3630 // We don't use isLexingRawMode() here because we need to warn about bad
3631 // UCNs even when skipping preprocessing tokens in a #if block.
3632 if (Result && PP) {
3633 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3634 Diag(BufferPtr, diag::err_ucn_control_character);
3635 else {
3636 char C = static_cast<char>(CodePoint);
3637 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3638 }
3639 }
3640
3641 return 0;
3642 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3643 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3644 // We don't use isLexingRawMode() here because we need to diagnose bad
3645 // UCNs even when skipping preprocessing tokens in a #if block.
3646 if (Result && PP) {
3647 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3648 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3649 else
3650 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3651 }
3652 return 0;
3653 }
3654
3655 return CodePoint;
3656}
3657
// Handle a code point C that may be Unicode whitespace. On the path visible
// here, a diagnostic covering [BufferPtr, CurPtr) is emitted, Result gets the
// LeadingSpace flag, and true is returned; otherwise false is returned.
3658bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3659 const char *CurPtr) {
3660 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
// NOTE(review): listing line 3661 is missing from this copy -- the last
// operand of this condition and the `{` opening the block were dropped.
// Presumably it tests whether C is Unicode whitespace; restore from upstream.
3662 Diag(BufferPtr, diag::ext_unicode_whitespace)
3663 << makeCharRange(*this, BufferPtr, CurPtr);
3664
3665 Result.setFlag(Token::LeadingSpace);
3666 return true;
3667 }
3668 return false;
3669}
3670
3671void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3672 IsAtStartOfLine = Result.isAtStartOfLine();
3673 HasLeadingSpace = Result.hasLeadingSpace();
3674 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3675 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3676}
3677
// NOTE(review): the signature line of this function (listing line 3678) is
// missing from this extract. From the body it takes a Token named `Result` and
// returns the bool produced by LexTokenInternal; presumably this is the
// lexer's main token entry point -- confirm against the real source.
//
// What the visible body does: resets Result, transfers the lexer's pending
// StartOfLine / LeadingSpace (/ leading-empty-macro) state onto the token
// while clearing that state on the lexer, then delegates to LexTokenInternal.
3679 assert(!isDependencyDirectivesLexer());
3680
3681 // Start a new token.
3682 Result.startToken();
3683
3684 // Set up misc whitespace flags for LexTokenInternal.
3685 if (IsAtStartOfLine) {
3686 Result.setFlag(Token::StartOfLine);
3687 IsAtStartOfLine = false;
3688 }
3689
3690 if (HasLeadingSpace) {
3691 Result.setFlag(Token::LeadingSpace);
3692 HasLeadingSpace = false;
3693 }
3694
3695 if (HasLeadingEmptyMacro) {
 // NOTE(review): listing line 3696 is missing here; by analogy with the two
 // blocks above it presumably sets the matching flag on Result -- confirm.
3697 HasLeadingEmptyMacro = false;
3698 }
3699
 // Snapshot the physical-start-of-line state and clear it before lexing.
3700 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3701 IsAtPhysicalStartOfLine = false;
 // Raw mode is sampled up front because, per the comment below, `this` must
 // not be touched after LexTokenInternal returns. The (void) cast keeps the
 // variable "used" when the assert is compiled out.
3702 bool isRawLex = isLexingRawMode();
3703 (void) isRawLex;
3704 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3705 // (After the LexTokenInternal call, the lexer might be destroyed.)
3706 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3707 return returnedToken;
3708}
3709
3710/// LexTokenInternal - This implements a simple C family lexer. It is an
3711/// extremely performance critical piece of code. This assumes that the buffer
3712/// has a null character at the end of the file. This returns a preprocessing
3713/// token, not a normal token, as such, it is an internal interface. It assumes
3714/// that the Flags of result have been cleared before calling this.
///
/// \param Result  Token to fill in. On entry it is asserted to need no
///                cleaning and to carry no pointer data.
/// \param TokAtPhysicalStartOfLine  Forwarded to the whitespace and comment
///                skippers, and used to decide whether a '#' (or the '%:'
///                digraph) begins a preprocessing directive.
/// \returns true when a token has been formed in \p Result (directly, or as
///          reported by one of the Lex*/Skip* helpers). The HandleDirective
///          path returns false after a directive has been processed, and true
///          on a fatal module-loader failure.
///
/// Control flow: the function is one big switch on the first character. Cases
/// either return through a specialised helper, set `Kind` and `break` to the
/// common FormTokenWithChars tail, or `goto LexNextToken` to restart at
/// LexStart after skipping whitespace/comments (a manual tail call).
// NOTE(review): several listing lines are missing from this extract (3746,
// 3794, 3796, 3800, 4255, 4457-4458, 4488, 4490); each gap is flagged below.
3715bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3716LexStart:
3717 assert(!Result.needsCleaning() && "Result needs cleaning");
3718 assert(!Result.hasPtrData() && "Result has not been reset");
3719
3720 // CurPtr - Cache BufferPtr in an automatic variable.
3721 const char *CurPtr = BufferPtr;
3722
3723 // Small amounts of horizontal whitespace is very common between tokens.
3724 if (isHorizontalWhitespace(*CurPtr)) {
3725 do {
3726 ++CurPtr;
3727 } while (isHorizontalWhitespace(*CurPtr));
3728
3729 // If we are keeping whitespace and other tokens, just return what we just
3730 // skipped. The next lexer invocation will return the token after the
3731 // whitespace.
3732 if (isKeepWhitespaceMode()) {
3733 FormTokenWithChars(Result, CurPtr, tok::unknown);
3734 // FIXME: The next token will not have LeadingSpace set.
3735 return true;
3736 }
3737
3738 BufferPtr = CurPtr;
3739 Result.setFlag(Token::LeadingSpace);
3740 }
3741
3742 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3743
3744 // Read a character, advancing over it.
3745 char Char = getAndAdvanceChar(CurPtr, Result);
 // NOTE(review): listing line 3746 is missing here. `Kind` is assigned in the
 // cases below but its declaration is not visible in this extract.
3747
3748 if (!isVerticalWhitespace(Char))
3749 NewLinePtr = nullptr;
3750
3751 switch (Char) {
3752 case 0: // Null.
3753 // Found end of file?
3754 if (CurPtr-1 == BufferEnd)
3755 return LexEndOfFile(Result, CurPtr-1);
3756
3757 // Check if we are performing code completion.
3758 if (isCodeCompletionPoint(CurPtr-1)) {
3759 // Return the code-completion token.
3760 Result.startToken();
3761 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3762 return true;
3763 }
3764
 // An embedded NUL that is neither EOF nor the completion point is diagnosed
 // (outside raw mode) and then skipped like whitespace.
3765 if (!isLexingRawMode())
3766 Diag(CurPtr-1, diag::null_in_file);
3767 Result.setFlag(Token::LeadingSpace);
3768 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3769 return true; // KeepWhitespaceMode
3770
3771 // We know the lexer hasn't changed, so just try again with this lexer.
3772 // (We manually eliminate the tail call to avoid recursion.)
3773 goto LexNextToken;
3774
3775 case 26: // DOS & CP/M EOF: "^Z".
3776 // If we're in Microsoft extensions mode, treat this as end of file.
3777 if (LangOpts.MicrosoftExt) {
3778 if (!isLexingRawMode())
3779 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3780 return LexEndOfFile(Result, CurPtr-1);
3781 }
3782
3783 // If Microsoft extensions are disabled, this is just random garbage.
3784 Kind = tok::unknown;
3785 break;
3786
3787 case '\r':
 // Treat "\r\n" as a single line ending by consuming the '\n' as well.
3788 if (CurPtr[0] == '\n')
3789 (void)getAndAdvanceChar(CurPtr, Result);
3790 [[fallthrough]];
3791 case '\n':
3792 // If we are inside a preprocessor directive and we see the end of line,
3793 // we know we are done with the directive, so return an EOD token.
 // NOTE(review): listing lines 3794, 3796 and 3800 are missing from this
 // extract: the `if` that opens the block closed at listing line 3809, the
 // statement under "Done parsing the line", and the statement governed by
 // `if (PP)`.
3795 // Done parsing the "line".
3797
3798 // Restore comment saving mode, in case it was disabled for directive.
3799 if (PP)
3801
3802 // Since we consumed a newline, we are back at the start of a line.
3803 IsAtStartOfLine = true;
3804 IsAtPhysicalStartOfLine = true;
3805 NewLinePtr = CurPtr - 1;
3806
3807 Kind = tok::eod;
3808 break;
3809 }
3810
3811 // No leading whitespace seen so far.
3812 Result.clearFlag(Token::LeadingSpace);
3813
3814 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3815 return true; // KeepWhitespaceMode
3816
3817 // We only saw whitespace, so just try again with this lexer.
3818 // (We manually eliminate the tail call to avoid recursion.)
3819 goto LexNextToken;
3820 case ' ':
3821 case '\t':
3822 case '\f':
3823 case '\v':
3824 SkipHorizontalWhitespace:
3825 Result.setFlag(Token::LeadingSpace);
3826 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3827 return true; // KeepWhitespaceMode
3828
 // Fast path shared with the '/' case: re-read BufferPtr and keep skipping
 // comments and horizontal whitespace without re-entering the switch.
3829 SkipIgnoredUnits:
3830 CurPtr = BufferPtr;
3831
3832 // If the next token is obviously a // or /* */ comment, skip it efficiently
3833 // too (without going through the big switch stmt).
3834 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3835 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3836 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3837 return true; // There is a token to return.
3838 goto SkipIgnoredUnits;
3839 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3840 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3841 return true; // There is a token to return.
3842 goto SkipIgnoredUnits;
3843 } else if (isHorizontalWhitespace(*CurPtr)) {
3844 goto SkipHorizontalWhitespace;
3845 }
3846 // We only saw whitespace, so just try again with this lexer.
3847 // (We manually eliminate the tail call to avoid recursion.)
3848 goto LexNextToken;
3849
3850 // C99 6.4.4.1: Integer Constants.
3851 // C99 6.4.4.2: Floating Constants.
3852 case '0': case '1': case '2': case '3': case '4':
3853 case '5': case '6': case '7': case '8': case '9':
3854 // Notify MIOpt that we read a non-whitespace/non-comment token.
3855 MIOpt.ReadToken();
3856 return LexNumericConstant(Result, CurPtr);
3857
3858 // Identifier (e.g., uber), or
3859 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3860 // UTF-8 or UTF-16 string literal (C11/C++11).
3861 case 'u':
3862 // Notify MIOpt that we read a non-whitespace/non-comment token.
3863 MIOpt.ReadToken();
3864
3865 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
 // Peek only; characters are consumed (ConsumeChar) once a literal prefix
 // has actually been recognised.
3866 Char = getCharAndSize(CurPtr, SizeTmp);
3867
3868 // UTF-16 string literal
3869 if (Char == '"')
3870 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3871 tok::utf16_string_literal);
3872
3873 // UTF-16 character constant
3874 if (Char == '\'')
3875 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3876 tok::utf16_char_constant);
3877
3878 // UTF-16 raw string literal
3879 if (Char == 'R' && LangOpts.RawStringLiterals &&
3880 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3881 return LexRawStringLiteral(Result,
3882 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3883 SizeTmp2, Result),
3884 tok::utf16_string_literal);
3885
3886 if (Char == '8') {
3887 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3888
3889 // UTF-8 string literal
3890 if (Char2 == '"')
3891 return LexStringLiteral(Result,
3892 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3893 SizeTmp2, Result),
3894 tok::utf8_string_literal);
3895 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3896 return LexCharConstant(
3897 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3898 SizeTmp2, Result),
3899 tok::utf8_char_constant);
3900
3901 if (Char2 == 'R' && LangOpts.RawStringLiterals) {
3902 unsigned SizeTmp3;
3903 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3904 // UTF-8 raw string literal
3905 if (Char3 == '"') {
3906 return LexRawStringLiteral(Result,
3907 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3908 SizeTmp2, Result),
3909 SizeTmp3, Result),
3910 tok::utf8_string_literal);
3911 }
3912 }
3913 }
3914 }
3915
3916 // treat u like the start of an identifier.
3917 return LexIdentifierContinue(Result, CurPtr);
3918
3919 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3920 // Notify MIOpt that we read a non-whitespace/non-comment token.
3921 MIOpt.ReadToken();
3922
3923 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3924 Char = getCharAndSize(CurPtr, SizeTmp);
3925
3926 // UTF-32 string literal
3927 if (Char == '"')
3928 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3929 tok::utf32_string_literal);
3930
3931 // UTF-32 character constant
3932 if (Char == '\'')
3933 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3934 tok::utf32_char_constant);
3935
3936 // UTF-32 raw string literal
3937 if (Char == 'R' && LangOpts.RawStringLiterals &&
3938 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3939 return LexRawStringLiteral(Result,
3940 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3941 SizeTmp2, Result),
3942 tok::utf32_string_literal);
3943 }
3944
3945 // treat U like the start of an identifier.
3946 return LexIdentifierContinue(Result, CurPtr);
3947
3948 case 'R': // Identifier or C++0x raw string literal
3949 // Notify MIOpt that we read a non-whitespace/non-comment token.
3950 MIOpt.ReadToken();
3951
3952 if (LangOpts.RawStringLiterals) {
3953 Char = getCharAndSize(CurPtr, SizeTmp);
3954
3955 if (Char == '"')
3956 return LexRawStringLiteral(Result,
3957 ConsumeChar(CurPtr, SizeTmp, Result),
3958 tok::string_literal);
3959 }
3960
3961 // treat R like the start of an identifier.
3962 return LexIdentifierContinue(Result, CurPtr);
3963
3964 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3965 // Notify MIOpt that we read a non-whitespace/non-comment token.
3966 MIOpt.ReadToken();
3967 Char = getCharAndSize(CurPtr, SizeTmp);
3968
3969 // Wide string literal.
3970 if (Char == '"')
3971 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3972 tok::wide_string_literal);
3973
3974 // Wide raw string literal.
3975 if (LangOpts.RawStringLiterals && Char == 'R' &&
3976 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3977 return LexRawStringLiteral(Result,
3978 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3979 SizeTmp2, Result),
3980 tok::wide_string_literal);
3981
3982 // Wide character constant.
3983 if (Char == '\'')
3984 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3985 tok::wide_char_constant);
3986 // FALL THROUGH, treating L like the start of an identifier.
3987 [[fallthrough]];
3988
3989 // C99 6.4.2: Identifiers.
 // 'L', 'R', 'U' and 'u' are commented out of the lists below because they
 // have dedicated cases above.
3990 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3991 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3992 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3993 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3994 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3995 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3996 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3997 case 'v': case 'w': case 'x': case 'y': case 'z':
3998 case '_':
3999 // Notify MIOpt that we read a non-whitespace/non-comment token.
4000 MIOpt.ReadToken();
4001 return LexIdentifierContinue(Result, CurPtr);
4002
4003 case '$': // $ in identifiers.
4004 if (LangOpts.DollarIdents) {
4005 if (!isLexingRawMode())
4006 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4007 // Notify MIOpt that we read a non-whitespace/non-comment token.
4008 MIOpt.ReadToken();
4009 return LexIdentifierContinue(Result, CurPtr);
4010 }
4011
4012 Kind = tok::unknown;
4013 break;
4014
4015 // C99 6.4.4: Character Constants.
4016 case '\'':
4017 // Notify MIOpt that we read a non-whitespace/non-comment token.
4018 MIOpt.ReadToken();
4019 return LexCharConstant(Result, CurPtr, tok::char_constant);
4020
4021 // C99 6.4.5: String Literals.
4022 case '"':
4023 // Notify MIOpt that we read a non-whitespace/non-comment token.
4024 MIOpt.ReadToken();
4025 return LexStringLiteral(Result, CurPtr,
4026 ParsingFilename ? tok::header_name
4027 : tok::string_literal);
4028
4029 // C99 6.4.6: Punctuators.
4030 case '?':
4031 Kind = tok::question;
4032 break;
4033 case '[':
4034 Kind = tok::l_square;
4035 break;
4036 case ']':
4037 Kind = tok::r_square;
4038 break;
4039 case '(':
4040 Kind = tok::l_paren;
4041 break;
4042 case ')':
4043 Kind = tok::r_paren;
4044 break;
4045 case '{':
4046 Kind = tok::l_brace;
4047 break;
4048 case '}':
4049 Kind = tok::r_brace;
4050 break;
4051 case '.':
4052 Char = getCharAndSize(CurPtr, SizeTmp);
4053 if (Char >= '0' && Char <= '9') {
4054 // Notify MIOpt that we read a non-whitespace/non-comment token.
4055 MIOpt.ReadToken();
4056
4057 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4058 } else if (LangOpts.CPlusPlus && Char == '*') {
4059 Kind = tok::periodstar;
4060 CurPtr += SizeTmp;
4061 } else if (Char == '.' &&
4062 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4063 Kind = tok::ellipsis;
4064 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4065 SizeTmp2, Result);
4066 } else {
4067 Kind = tok::period;
4068 }
4069 break;
4070 case '&':
4071 Char = getCharAndSize(CurPtr, SizeTmp);
4072 if (Char == '&') {
4073 Kind = tok::ampamp;
4074 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4075 } else if (Char == '=') {
4076 Kind = tok::ampequal;
4077 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4078 } else {
4079 Kind = tok::amp;
4080 }
4081 break;
4082 case '*':
4083 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4084 Kind = tok::starequal;
4085 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4086 } else {
4087 Kind = tok::star;
4088 }
4089 break;
4090 case '+':
4091 Char = getCharAndSize(CurPtr, SizeTmp);
4092 if (Char == '+') {
4093 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4094 Kind = tok::plusplus;
4095 } else if (Char == '=') {
4096 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4097 Kind = tok::plusequal;
4098 } else {
4099 Kind = tok::plus;
4100 }
4101 break;
4102 case '-':
4103 Char = getCharAndSize(CurPtr, SizeTmp);
4104 if (Char == '-') { // --
4105 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4106 Kind = tok::minusminus;
4107 } else if (Char == '>' && LangOpts.CPlusPlus &&
4108 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4109 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4110 SizeTmp2, Result);
4111 Kind = tok::arrowstar;
4112 } else if (Char == '>') { // ->
4113 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4114 Kind = tok::arrow;
4115 } else if (Char == '=') { // -=
4116 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4117 Kind = tok::minusequal;
4118 } else {
4119 Kind = tok::minus;
4120 }
4121 break;
4122 case '~':
4123 Kind = tok::tilde;
4124 break;
4125 case '!':
4126 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4127 Kind = tok::exclaimequal;
4128 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4129 } else {
4130 Kind = tok::exclaim;
4131 }
4132 break;
4133 case '/':
4134 // 6.4.9: Comments
4135 Char = getCharAndSize(CurPtr, SizeTmp);
4136 if (Char == '/') { // Line comment.
4137 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4138 // want to lex this as a comment. There is one problem with this though,
4139 // that in one particular corner case, this can change the behavior of the
4140 // resultant program. For example, In "foo //**/ bar", C89 would lex
4141 // this as "foo / bar" and languages with Line comments would lex it as
4142 // "foo". Check to see if the character after the second slash is a '*'.
4143 // If so, we will lex that as a "/" instead of the start of a comment.
4144 // However, we never do this if we are just preprocessing.
4145 bool TreatAsComment =
4146 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4147 if (!TreatAsComment)
4148 if (!(PP && PP->isPreprocessedOutput()))
4149 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4150
4151 if (TreatAsComment) {
4152 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4153 TokAtPhysicalStartOfLine))
4154 return true; // There is a token to return.
4155
4156 // It is common for the tokens immediately after a // comment to be
4157 // whitespace (indentation for the next line). Instead of going through
4158 // the big switch, handle it efficiently now.
4159 goto SkipIgnoredUnits;
4160 }
4161 }
4162
4163 if (Char == '*') { // /**/ comment.
4164 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4165 TokAtPhysicalStartOfLine))
4166 return true; // There is a token to return.
4167
4168 // We only saw whitespace, so just try again with this lexer.
4169 // (We manually eliminate the tail call to avoid recursion.)
4170 goto LexNextToken;
4171 }
4172
4173 if (Char == '=') {
4174 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4175 Kind = tok::slashequal;
4176 } else {
4177 Kind = tok::slash;
4178 }
4179 break;
4180 case '%':
4181 Char = getCharAndSize(CurPtr, SizeTmp);
4182 if (Char == '=') {
4183 Kind = tok::percentequal;
4184 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4185 } else if (LangOpts.Digraphs && Char == '>') {
4186 Kind = tok::r_brace; // '%>' -> '}'
4187 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4188 } else if (LangOpts.Digraphs && Char == ':') {
4189 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4190 Char = getCharAndSize(CurPtr, SizeTmp);
4191 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4192 Kind = tok::hashhash; // '%:%:' -> '##'
4193 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4194 SizeTmp2, Result);
4195 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4196 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4197 if (!isLexingRawMode())
4198 Diag(BufferPtr, diag::ext_charize_microsoft);
4199 Kind = tok::hashat;
4200 } else { // '%:' -> '#'
4201 // We parsed a # character. If this occurs at the start of the line,
4202 // it's actually the start of a preprocessing directive. Callback to
4203 // the preprocessor to handle it.
4204 // TODO: -fpreprocessed mode??
4205 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4206 goto HandleDirective;
4207
4208 Kind = tok::hash;
4209 }
4210 } else {
4211 Kind = tok::percent;
4212 }
4213 break;
4214 case '<':
4215 Char = getCharAndSize(CurPtr, SizeTmp);
4216 if (ParsingFilename) {
4217 return LexAngledStringLiteral(Result, CurPtr);
4218 } else if (Char == '<') {
4219 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4220 if (After == '=') {
4221 Kind = tok::lesslessequal;
4222 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4223 SizeTmp2, Result);
4224 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4225 // If this is actually a '<<<<<<<' version control conflict marker,
4226 // recognize it as such and recover nicely.
4227 goto LexNextToken;
4228 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4229 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4230 // ignore it.
4231 goto LexNextToken;
4232 } else if (LangOpts.CUDA && After == '<') {
4233 Kind = tok::lesslessless;
4234 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4235 SizeTmp2, Result);
4236 } else {
4237 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4238 Kind = tok::lessless;
4239 }
4240 } else if (Char == '=') {
4241 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4242 if (After == '>') {
4243 if (LangOpts.CPlusPlus20) {
4244 if (!isLexingRawMode())
4245 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4246 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4247 SizeTmp2, Result);
4248 Kind = tok::spaceship;
4249 break;
4250 }
4251 // Suggest adding a space between the '<=' and the '>' to avoid a
4252 // change in semantics if this turns up in C++ <=17 mode.
4253 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4254 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
 // NOTE(review): listing line 4255 is missing here; the line below is the
 // tail of an argument list whose opening is not visible in this extract.
4256 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4257 }
4258 }
4259 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4260 Kind = tok::lessequal;
4261 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4262 if (LangOpts.CPlusPlus11 &&
4263 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4264 // C++0x [lex.pptoken]p3:
4265 // Otherwise, if the next three characters are <:: and the subsequent
4266 // character is neither : nor >, the < is treated as a preprocessor
4267 // token by itself and not as the first character of the alternative
4268 // token <:.
4269 unsigned SizeTmp3;
4270 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4271 if (After != ':' && After != '>') {
4272 Kind = tok::less;
4273 if (!isLexingRawMode())
4274 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4275 break;
4276 }
4277 }
4278
4279 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4280 Kind = tok::l_square;
4281 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4282 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4283 Kind = tok::l_brace;
4284 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4285 lexEditorPlaceholder(Result, CurPtr)) {
4286 return true;
4287 } else {
4288 Kind = tok::less;
4289 }
4290 break;
4291 case '>':
4292 Char = getCharAndSize(CurPtr, SizeTmp);
4293 if (Char == '=') {
4294 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4295 Kind = tok::greaterequal;
4296 } else if (Char == '>') {
4297 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4298 if (After == '=') {
4299 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4300 SizeTmp2, Result);
4301 Kind = tok::greatergreaterequal;
4302 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4303 // If this is actually a '>>>>' conflict marker, recognize it as such
4304 // and recover nicely.
4305 goto LexNextToken;
4306 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4307 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4308 goto LexNextToken;
4309 } else if (LangOpts.CUDA && After == '>') {
4310 Kind = tok::greatergreatergreater;
4311 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4312 SizeTmp2, Result);
4313 } else {
4314 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4315 Kind = tok::greatergreater;
4316 }
4317 } else {
4318 Kind = tok::greater;
4319 }
4320 break;
4321 case '^':
4322 Char = getCharAndSize(CurPtr, SizeTmp);
4323 if (Char == '=') {
4324 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4325 Kind = tok::caretequal;
4326 } else if (LangOpts.OpenCL && Char == '^') {
4327 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4328 Kind = tok::caretcaret;
4329 } else {
4330 Kind = tok::caret;
4331 }
4332 break;
4333 case '|':
4334 Char = getCharAndSize(CurPtr, SizeTmp);
4335 if (Char == '=') {
4336 Kind = tok::pipeequal;
4337 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4338 } else if (Char == '|') {
4339 // If this is '|||||||' and we're in a conflict marker, ignore it.
4340 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4341 goto LexNextToken;
4342 Kind = tok::pipepipe;
4343 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4344 } else {
4345 Kind = tok::pipe;
4346 }
4347 break;
4348 case ':':
4349 Char = getCharAndSize(CurPtr, SizeTmp);
4350 if (LangOpts.Digraphs && Char == '>') {
4351 Kind = tok::r_square; // ':>' -> ']'
4352 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4353 } else if (Char == ':') {
4354 Kind = tok::coloncolon;
4355 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4356 } else {
4357 Kind = tok::colon;
4358 }
4359 break;
4360 case ';':
4361 Kind = tok::semi;
4362 break;
4363 case '=':
4364 Char = getCharAndSize(CurPtr, SizeTmp);
4365 if (Char == '=') {
4366 // If this is '====' and we're in a conflict marker, ignore it.
4367 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4368 goto LexNextToken;
4369
4370 Kind = tok::equalequal;
4371 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4372 } else {
4373 Kind = tok::equal;
4374 }
4375 break;
4376 case ',':
4377 Kind = tok::comma;
4378 break;
4379 case '#':
4380 Char = getCharAndSize(CurPtr, SizeTmp);
4381 if (Char == '#') {
4382 Kind = tok::hashhash;
4383 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4384 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4385 Kind = tok::hashat;
4386 if (!isLexingRawMode())
4387 Diag(BufferPtr, diag::ext_charize_microsoft);
4388 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4389 } else {
4390 // We parsed a # character. If this occurs at the start of the line,
4391 // it's actually the start of a preprocessing directive. Callback to
4392 // the preprocessor to handle it.
4393 // TODO: -fpreprocessed mode??
4394 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4395 goto HandleDirective;
4396
4397 Kind = tok::hash;
4398 }
4399 break;
4400
4401 case '@':
4402 // Objective C support.
 // CurPtr[-1] is the raw byte just before CurPtr; the check passes only when
 // that byte is a literal '@'.
4403 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4404 Kind = tok::at;
4405 else
4406 Kind = tok::unknown;
4407 break;
4408
4409 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4410 case '\\':
4411 if (!LangOpts.AsmPreprocessor) {
 // A zero result from tryReadUCN falls through to tok::unknown below.
4412 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4413 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4414 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4415 return true; // KeepWhitespaceMode
4416
4417 // We only saw whitespace, so just try again with this lexer.
4418 // (We manually eliminate the tail call to avoid recursion.)
4419 goto LexNextToken;
4420 }
4421
4422 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4423 }
4424 }
4425
4426 Kind = tok::unknown;
4427 break;
4428
4429 default: {
4430 if (isASCII(Char)) {
4431 Kind = tok::unknown;
4432 break;
4433 }
4434
4435 llvm::UTF32 CodePoint;
4436
4437 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4438 // an escaped newline.
4439 --CurPtr;
 // Strict UTF-8 decode starting at the byte just read; on success CurPtr is
 // advanced past the whole sequence by convertUTF8Sequence.
4440 llvm::ConversionResult Status =
4441 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4442 (const llvm::UTF8 *)BufferEnd,
4443 &CodePoint,
4444 llvm::strictConversion);
4445 if (Status == llvm::conversionOK) {
4446 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4447 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4448 return true; // KeepWhitespaceMode
4449
4450 // We only saw whitespace, so just try again with this lexer.
4451 // (We manually eliminate the tail call to avoid recursion.)
4452 goto LexNextToken;
4453 }
4454 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4455 }
4456
 // NOTE(review): listing lines 4457-4458 are missing here: the condition
 // that opens the block closed at listing line 4462. When it holds, the bad
 // byte is stepped over and returned as tok::unknown instead of diagnosed.
4459 ++CurPtr;
4460 Kind = tok::unknown;
4461 break;
4462 }
4463
4464 // Non-ASCII characters tend to creep into source code unintentionally.
4465 // Instead of letting the parser complain about the unknown token,
4466 // just diagnose the invalid UTF-8, then drop the character.
4467 Diag(CurPtr, diag::err_invalid_utf8);
4468
4469 BufferPtr = CurPtr+1;
4470 // We're pretending the character didn't exist, so just try again with
4471 // this lexer.
4472 // (We manually eliminate the tail call to avoid recursion.)
4473 goto LexNextToken;
4474 }
4475 }
4476
 // Common tail for every case that set Kind and used `break`.
4477 // Notify MIOpt that we read a non-whitespace/non-comment token.
4478 MIOpt.ReadToken();
4479
4480 // Update the location of token as well as BufferPtr.
4481 FormTokenWithChars(Result, CurPtr, Kind);
4482 return true;
4483
4484HandleDirective:
4485 // We parsed a # character and it's the start of a preprocessing directive.
4486
4487 FormTokenWithChars(Result, CurPtr, tok::hash);
 // NOTE(review): listing lines 4488 and 4490 are missing here: presumably the
 // call that hands the directive to the preprocessor and the condition
 // guarding the `return true` below -- confirm against the real source.
4489
4491 // With a fatal failure in the module loader, we abort parsing.
4492 return true;
4493
4494 // We parsed the directive; lex a token with the new state.
4495 return false;
4496
4497LexNextToken:
 // Restart from LexStart; NeedsCleaning is cleared first because LexStart
 // asserts that Result does not need cleaning.
4498 Result.clearFlag(Token::NeedsCleaning);
4499 goto LexStart;
4500}
4501
/// Fill \p Result from a pre-scanned dependency-directive token.
///
/// Resets Result, then copies DDTok's kind, flags and length into it and sets
/// its location from the token's offset into the buffer. As a side effect,
/// BufferPtr is moved to just past the token.
///
/// \returns pointer to the first character of the token's spelling
///          (BufferStart + DDTok.Offset).
// NOTE(review): listing line 4503 (the parameter list) is missing from this
// extract. From the body the parameters are a dependency-scan token named
// `DDTok` and a Token named `Result`.
4502const char *Lexer::convertDependencyDirectiveToken(
4504 const char *TokPtr = BufferStart + DDTok.Offset;
4505 Result.startToken();
4506 Result.setLocation(getSourceLocation(TokPtr));
4507 Result.setKind(DDTok.Kind);
4508 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4509 Result.setLength(DDTok.Length);
4510 BufferPtr = TokPtr + DDTok.Length;
4511 return TokPtr;
4512}
4513
/// Produce the next token from the pre-scanned dependency-directive stream
/// (DepDirectives) rather than by scanning the buffer character by character.
///
/// \param Result  Token to fill in.
/// \returns true when \p Result holds a token for the caller; false on the
///          start-of-line '#' path. For raw identifiers outside raw mode the
///          result of PP->HandleIdentifier may be returned instead.
// NOTE(review): listing lines 4528, 4554, 4560 and 4581 are missing from this
// extract; each gap is flagged below.
4514bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4515 assert(isDependencyDirectivesLexer());
4516
4517 using namespace dependency_directives_scan;
4518
 // Once every token of the front directive has been handed out, move on to
 // the next directive; pp_eof ends the stream.
4519 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4520 if (DepDirectives.front().Kind == pp_eof)
4521 return LexEndOfFile(Result, BufferEnd);
4522 if (DepDirectives.front().Kind == tokens_present_before_eof)
4523 MIOpt.ReadToken();
4524 NextDepDirectiveTokenIndex = 0;
4525 DepDirectives = DepDirectives.drop_front();
4526 }
4527
 // NOTE(review): listing line 4528 is missing here; the line below is the
 // initializer of `DDTok`, whose declaration is not visible.
4529 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4530 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4531 // Read something other than a preprocessor directive hash.
4532 MIOpt.ReadToken();
4533 }
4534
 // Header names are not taken from the pre-scanned tokens: the '<' is re-lexed
 // from the buffer as an angled string literal, then the token index is
 // advanced past every pre-scanned token that starts before the new BufferPtr.
4535 if (ParsingFilename && DDTok.is(tok::less)) {
4536 BufferPtr = BufferStart + DDTok.Offset;
4537 LexAngledStringLiteral(Result, BufferPtr + 1);
4538 if (Result.isNot(tok::header_name))
4539 return true;
4540 // Advance the index of lexed tokens.
4541 while (true) {
4542 const dependency_directives_scan::Token &NextTok =
4543 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4544 if (BufferStart + NextTok.Offset >= BufferPtr)
4545 break;
4546 ++NextDepDirectiveTokenIndex;
4547 }
4548 return true;
4549 }
4550
4551 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4552
4553 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
 // NOTE(review): listing line 4554 is missing here; presumably the directive
 // is handed to the preprocessor before returning false -- confirm.
4555 return false;
4556 }
4557 if (Result.is(tok::raw_identifier)) {
4558 Result.setRawIdentifierData(TokPtr);
4559 if (!isLexingRawMode()) {
 // NOTE(review): listing line 4560 is missing here; `II` is used below but
 // its declaration is not visible in this extract.
4561 if (II->isHandleIdentifierCase())
4562 return PP->HandleIdentifier(Result);
4563 }
4564 return true;
4565 }
4566 if (Result.isLiteral()) {
4567 Result.setLiteralData(TokPtr);
4568 return true;
4569 }
4570 if (Result.is(tok::colon)) {
4571 // Convert consecutive colons to 'tok::coloncolon'.
 // The token's recorded length is left unchanged here; only the kind and the
 // token index are updated.
4572 if (*BufferPtr == ':') {
4573 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4574 tok::colon));
4575 ++NextDepDirectiveTokenIndex;
4576 Result.setKind(tok::coloncolon);
4577 }
4578 return true;
4579 }
4580 if (Result.is(tok::eod))
 // NOTE(review): listing line 4581 (the statement governed by the `if` above)
 // is missing from this extract.
4582
4583 return true;
4584}
4585
/// Skip forward through the pre-scanned dependency directives until one that
/// ends the current conditional region is reached.
///
/// Each iteration first drops the front directive and then inspects the new
/// front. The loop stops at an elif/elifdef/elifndef/else/endif seen at
/// nesting depth zero; nested if/ifdef/ifndef ... endif pairs are counted in
/// NestedIfs and skipped. On stopping, the directive's leading '#' token is
/// converted into \p Result and NextDepDirectiveTokenIndex is set to 1.
///
/// \returns false after positioning on the terminating directive; on pp_eof,
///          whatever LexEndOfFile returns.
// NOTE(review): listing lines 4599, 4605-4608, 4613-4615 and 4643 are missing
// from this extract. The first three gaps sit inside the group of `case`
// labels that simply `break`; 4643 is the start of the `DDTok` declaration.
4586bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4587 assert(isDependencyDirectivesLexer());
4588
4589 using namespace dependency_directives_scan;
4590
4591 bool Stop = false;
4592 unsigned NestedIfs = 0;
4593 do {
4594 DepDirectives = DepDirectives.drop_front();
4595 switch (DepDirectives.front().Kind) {
4596 case pp_none:
4597 llvm_unreachable("unexpected 'pp_none'");
 // Directives that do not affect conditional nesting are skipped.
4598 case pp_include:
4600 case pp_define:
4601 case pp_undef:
4602 case pp_import:
4603 case pp_pragma_import:
4604 case pp_pragma_once:
4609 case pp_include_next:
4610 case decl_at_import:
4611 case cxx_module_decl:
4612 case cxx_import_decl:
4616 break;
4617 case pp_if:
4618 case pp_ifdef:
4619 case pp_ifndef:
4620 ++NestedIfs;
4621 break;
4622 case pp_elif:
4623 case pp_elifdef:
4624 case pp_elifndef:
4625 case pp_else:
 // Only an else-family directive at depth zero ends the skipped region.
4626 if (!NestedIfs) {
4627 Stop = true;
4628 }
4629 break;
4630 case pp_endif:
4631 if (!NestedIfs) {
4632 Stop = true;
4633 } else {
4634 --NestedIfs;
4635 }
4636 break;
4637 case pp_eof:
4638 NextDepDirectiveTokenIndex = 0;
4639 return LexEndOfFile(Result, BufferEnd);
4640 }
4641 } while (!Stop);
4642
4644 DepDirectives.front().Tokens.front();
4645 assert(DDTok.is(tok::hash));
 // Token 0 (the '#') is consumed here, so the next token handed out is 1.
4646 NextDepDirectiveTokenIndex = 1;
4647
4648 convertDependencyDirectiveToken(DDTok, Result);
4649 return false;
4650}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:83
Defines the Diagnostic-related interfaces.
Expr * E
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:948
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1545
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1739
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1261
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:325
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3241
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1664
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:561
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:285
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1189
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1559
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1629
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1526
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1613
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3345
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1532
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:920
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2757
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1905
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1242
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1587
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1635
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:544
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:758
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1271
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1547
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:916
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:134
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:123
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:97
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:476
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1024
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1358
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:278
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1060
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3065
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:872
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1232
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3678
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:791
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:184
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:894
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:955
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1138
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1213
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1158
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:452
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1134
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:499
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location any where in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:609
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:220
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1107
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:243
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:637
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:510
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1325
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:850
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:310
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:586
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:137
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:70
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:61
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:78
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4270
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3082
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3447
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3730
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3432
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:50
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:175
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition: CharInfo.h:168
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
unsigned int uint32_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1532
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1664
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1550
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1545
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1539
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:579
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.