clang 19.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MathExtras.h"
36#include "llvm/Support/MemoryBufferRef.h"
37#include "llvm/Support/NativeFormatting.h"
38#include "llvm/Support/Unicode.h"
39#include "llvm/Support/UnicodeCharRanges.h"
40#include <algorithm>
41#include <cassert>
42#include <cstddef>
43#include <cstdint>
44#include <cstring>
45#include <optional>
46#include <string>
47#include <tuple>
48#include <utility>
49
50#ifdef __SSE4_2__
51#include <nmmintrin.h>
52#endif
53
54using namespace clang;
55
56//===----------------------------------------------------------------------===//
57// Token Class Implementation
58//===----------------------------------------------------------------------===//
59
60/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
62 if (isAnnotation())
63 return false;
64 if (const IdentifierInfo *II = getIdentifierInfo())
65 return II->getObjCKeywordID() == objcKey;
66 return false;
67}
68
69/// getObjCKeywordID - Return the ObjC keyword kind.
71 if (isAnnotation())
72 return tok::objc_not_keyword;
73 const IdentifierInfo *specId = getIdentifierInfo();
74 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
75}
76
77/// Determine whether the token kind starts a simple-type-specifier.
78bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
79 switch (getKind()) {
80 case tok::annot_typename:
81 case tok::annot_decltype:
82 case tok::annot_pack_indexing_type:
83 return true;
84
85 case tok::kw_short:
86 case tok::kw_long:
87 case tok::kw___int64:
88 case tok::kw___int128:
89 case tok::kw_signed:
90 case tok::kw_unsigned:
91 case tok::kw_void:
92 case tok::kw_char:
93 case tok::kw_int:
94 case tok::kw_half:
95 case tok::kw_float:
96 case tok::kw_double:
97 case tok::kw___bf16:
98 case tok::kw__Float16:
99 case tok::kw___float128:
100 case tok::kw___ibm128:
101 case tok::kw_wchar_t:
102 case tok::kw_bool:
103 case tok::kw__Bool:
104 case tok::kw__Accum:
105 case tok::kw__Fract:
106 case tok::kw__Sat:
107#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
108#include "clang/Basic/TransformTypeTraits.def"
109 case tok::kw___auto_type:
110 case tok::kw_char16_t:
111 case tok::kw_char32_t:
112 case tok::kw_typeof:
113 case tok::kw_decltype:
114 case tok::kw_char8_t:
115 return getIdentifierInfo()->isKeyword(LangOpts);
116
117 default:
118 return false;
119 }
120}
121
122//===----------------------------------------------------------------------===//
123// Lexer Class Implementation
124//===----------------------------------------------------------------------===//
125
126void Lexer::anchor() {}
127
// Point this lexer at the buffer [BufStart, BufEnd) with lexing positioned at
// BufPtr, then put the per-buffer state flags back to their initial values.
//
// BufEnd must address a NUL byte; this is asserted. When lexing starts at the
// very beginning of the buffer, a leading UTF-8 byte-order mark is skipped.
128void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
129 const char *BufEnd) {
130 BufferStart = BufStart;
131 BufferPtr = BufPtr;
132 BufferEnd = BufEnd;
133
134 assert(BufEnd[0] == 0 &&
135 "We assume that the input buffer has a null character at the end"
136 " to simplify lexing!");
137
138 // Check whether we have a BOM in the beginning of the buffer. If yes - act
139 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
140 // skip the UTF-8 BOM if it's present.
141 if (BufferStart == BufferPtr) {
142 // Determine the size of the BOM.
143 StringRef Buf(BufferStart, BufferEnd - BufferStart);
144 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
145 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
146 .Default(0);
147
148 // Skip the BOM.
149 BufferPtr += BOMLength;
150 }
151
152 Is_PragmaLexer = false;
153 CurrentConflictMarkerState = CMK_None;
154
155 // Start of the file is a start of line.
156 IsAtStartOfLine = true;
157 IsAtPhysicalStartOfLine = true;
158
159 HasLeadingSpace = false;
160 HasLeadingEmptyMacro = false;
161
162 // We are not after parsing a #.
// NOTE(review): no statement follows the comment above in this listing (the
// embedded numbering jumps from 162 to 164). Presumably the flag that tracks
// "inside a preprocessor directive" is reset here -- confirm against the
// upstream source before relying on this copy.
164
165 // We are not after parsing #include.
166 ParsingFilename = false;
167
168 // We are not in raw mode. Raw mode disables diagnostics and interpretation
169 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
170 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
171 // or otherwise skipping over tokens.
172 LexingRawMode = false;
173
174 // Default to not keeping comments.
175 ExtendedTokenMode = 0;
176
// No newline position has been recorded yet.
177 NewLinePtr = nullptr;
178}
179
180/// Lexer constructor - Create a new lexer object for the specified buffer
181/// with the specified preprocessor managing the lexing process. This lexer
182/// assumes that the associated file buffer and Preprocessor objects will
183/// outlive it, so it doesn't take ownership of either of them.
184Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
185 Preprocessor &PP, bool IsFirstIncludeOfFile)
186 : PreprocessorLexer(&PP, FID),
187 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
188 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
189 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
190 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
191 InputFile.getBufferEnd());
192
194}
195
196/// Lexer constructor - Create a new raw lexer object. This object is only
197/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
198/// range will outlive it, so it doesn't take ownership of it.
199Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
200 const char *BufStart, const char *BufPtr, const char *BufEnd,
201 bool IsFirstIncludeOfFile)
202 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
203 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
204 InitLexer(BufStart, BufPtr, BufEnd);
205
206 // We *are* in raw mode.
207 LexingRawMode = true;
208}
209
210/// Lexer constructor - Create a new raw lexer object. This object is only
211/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
212/// range will outlive it, so it doesn't take ownership of it.
213Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
214 const SourceManager &SM, const LangOptions &langOpts,
215 bool IsFirstIncludeOfFile)
216 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
217 FromFile.getBufferStart(), FromFile.getBufferEnd(),
218 IsFirstIncludeOfFile) {}
219
221 assert(PP && "Cannot reset token mode without a preprocessor");
222 if (LangOpts.TraditionalCPP)
224 else
226}
227
228/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
229/// _Pragma expansion. This has a variety of magic semantics that this method
230/// sets up. It returns a new'd Lexer that must be delete'd when done.
231///
232/// On entrance to this routine, TokStartLoc is a macro location which has a
233/// spelling loc that indicates the bytes to be lexed for the token and an
234/// expansion location that indicates where all lexed tokens should be
235/// "expanded from".
236///
237/// TODO: It would really be nice to make _Pragma just be a wrapper around a
238/// normal lexer that remaps tokens as they fly by. This would require making
239/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
240/// interface that could handle this stuff. This would pull GetMappedTokenLoc
241/// out of the critical path of the lexer!
242///
244 SourceLocation ExpansionLocStart,
245 SourceLocation ExpansionLocEnd,
246 unsigned TokLen, Preprocessor &PP) {
248
249 // Create the lexer as if we were going to lex the file normally.
250 FileID SpellingFID = SM.getFileID(SpellingLoc);
251 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
252 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
253
254 // Now that the lexer is created, change the start/end locations so that we
255 // just lex the subsection of the file that we want. This is lexing from a
256 // scratch buffer.
257 const char *StrData = SM.getCharacterData(SpellingLoc);
258
259 L->BufferPtr = StrData;
260 L->BufferEnd = StrData+TokLen;
261 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
262
263 // Set the SourceLocation with the remapping information. This ensures that
264 // GetMappedTokenLoc will remap the tokens as they are lexed.
265 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
266 ExpansionLocStart,
267 ExpansionLocEnd, TokLen);
268
269 // Ensure that the lexer thinks it is inside a directive, so that end \n will
270 // return an EOD token.
272
273 // This lexer really is for _Pragma.
274 L->Is_PragmaLexer = true;
275 return L;
276}
277
278void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
279 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
280 this->IsAtStartOfLine = IsAtStartOfLine;
281 assert((BufferStart + Offset) <= BufferEnd);
282 BufferPtr = BufferStart + Offset;
283}
284
/// Escape \p Str in place so it can be placed inside a literal delimited by
/// \p Quote.
///
/// A backslash is inserted before every backslash and every \p Quote
/// character. Each line ending ('\n', '\r', or a mixed two-character pair
/// "\r\n" / "\n\r") becomes the two characters '\\' 'n'.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;

  SizeTy Pos = 0;
  SizeTy Len = Str.size();
  while (Pos < Len) {
    const auto Cur = Str[Pos];

    // Backslash or quote: prefix it with a backslash and step over both.
    if (Cur == '\\' || Cur == Quote) {
      Str.insert(Str.begin() + Pos, '\\');
      ++Len;
      Pos += 2;
      continue;
    }

    // Ordinary character: nothing to do.
    if (Cur != '\n' && Cur != '\r') {
      ++Pos;
      continue;
    }

    // A line ending. A pair of *different* line-ending characters collapses
    // into a single escaped newline; a lone one has to grow by one character.
    const bool IsMixedPair =
        Pos + 1 < Len && (Str[Pos + 1] == '\n' || Str[Pos + 1] == '\r') &&
        Str[Pos + 1] != Cur;

    Str[Pos] = '\\';
    if (IsMixedPair) {
      Str[Pos + 1] = 'n';
    } else {
      Str.insert(Str.begin() + Pos + 1, 'n');
      ++Len;
    }
    Pos += 2;
  }
}
309
310std::string Lexer::Stringify(StringRef Str, bool Charify) {
311 std::string Result = std::string(Str);
312 char Quote = Charify ? '\'' : '"';
313 StringifyImpl(Result, Quote);
314 return Result;
315}
316
318
319//===----------------------------------------------------------------------===//
320// Token Spelling
321//===----------------------------------------------------------------------===//
322
323/// Slow case of getSpelling. Extract the characters comprising the
324/// spelling of this token from the provided input buffer.
325static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
326 const LangOptions &LangOpts, char *Spelling) {
327 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
328
329 size_t Length = 0;
330 const char *BufEnd = BufPtr + Tok.getLength();
331
332 if (tok::isStringLiteral(Tok.getKind())) {
333 // Munch the encoding-prefix and opening double-quote.
334 while (BufPtr < BufEnd) {
335 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
336 Spelling[Length++] = CharAndSize.Char;
337 BufPtr += CharAndSize.Size;
338
339 if (Spelling[Length - 1] == '"')
340 break;
341 }
342
343 // Raw string literals need special handling; trigraph expansion and line
344 // splicing do not occur within their d-char-sequence nor within their
345 // r-char-sequence.
346 if (Length >= 2 &&
347 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
348 // Search backwards from the end of the token to find the matching closing
349 // quote.
350 const char *RawEnd = BufEnd;
351 do --RawEnd; while (*RawEnd != '"');
352 size_t RawLength = RawEnd - BufPtr + 1;
353
354 // Everything between the quotes is included verbatim in the spelling.
355 memcpy(Spelling + Length, BufPtr, RawLength);
356 Length += RawLength;
357 BufPtr += RawLength;
358
359 // The rest of the token is lexed normally.
360 }
361 }
362
363 while (BufPtr < BufEnd) {
364 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
365 Spelling[Length++] = CharAndSize.Char;
366 BufPtr += CharAndSize.Size;
367 }
368
369 assert(Length < Tok.getLength() &&
370 "NeedsCleaning flag set on token that didn't need cleaning!");
371 return Length;
372}
373
374/// getSpelling() - Return the 'spelling' of this token. The spelling of a
375/// token are the characters used to represent the token in the source file
376/// after trigraph expansion and escaped-newline folding. In particular, this
377/// wants to get the true, uncanonicalized, spelling of things like digraphs,
378/// UCNs, etc.
380 SmallVectorImpl<char> &buffer,
381 const SourceManager &SM,
382 const LangOptions &options,
383 bool *invalid) {
384 // Break down the source location.
385 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
386
387 // Try to load the file buffer.
388 bool invalidTemp = false;
389 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
390 if (invalidTemp) {
391 if (invalid) *invalid = true;
392 return {};
393 }
394
395 const char *tokenBegin = file.data() + locInfo.second;
396
397 // Lex from the start of the given location.
398 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
399 file.begin(), tokenBegin, file.end());
400 Token token;
401 lexer.LexFromRawLexer(token);
402
403 unsigned length = token.getLength();
404
405 // Common case: no need for cleaning.
406 if (!token.needsCleaning())
407 return StringRef(tokenBegin, length);
408
409 // Hard case, we need to relex the characters into the string.
410 buffer.resize(length);
411 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
412 return StringRef(buffer.data(), buffer.size());
413}
414
415/// getSpelling() - Return the 'spelling' of this token. The spelling of a
416/// token are the characters used to represent the token in the source file
417/// after trigraph expansion and escaped-newline folding. In particular, this
418/// wants to get the true, uncanonicalized, spelling of things like digraphs
419/// UCNs, etc.
420std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
421 const LangOptions &LangOpts, bool *Invalid) {
422 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
423
424 bool CharDataInvalid = false;
425 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
426 &CharDataInvalid);
427 if (Invalid)
428 *Invalid = CharDataInvalid;
429 if (CharDataInvalid)
430 return {};
431
432 // If this token contains nothing interesting, return it directly.
433 if (!Tok.needsCleaning())
434 return std::string(TokStart, TokStart + Tok.getLength());
435
436 std::string Result;
437 Result.resize(Tok.getLength());
438 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
439 return Result;
440}
441
442/// getSpelling - This method is used to get the spelling of a token into a
443/// preallocated buffer, instead of as an std::string. The caller is required
444/// to allocate enough space for the token, which is guaranteed to be at least
445/// Tok.getLength() bytes long. The actual length of the token is returned.
446///
447/// Note that this method may do two possible things: it may either fill in
448/// the buffer specified with characters, or it may *change the input pointer*
449/// to point to a constant buffer with the data already in it (avoiding a
450/// copy). The caller is not allowed to modify the returned buffer pointer
451/// if an internal buffer is returned.
452unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
453 const SourceManager &SourceMgr,
454 const LangOptions &LangOpts, bool *Invalid) {
455 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
456
457 const char *TokStart = nullptr;
458 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
459 if (Tok.is(tok::raw_identifier))
460 TokStart = Tok.getRawIdentifier().data();
461 else if (!Tok.hasUCN()) {
462 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
463 // Just return the string from the identifier table, which is very quick.
464 Buffer = II->getNameStart();
465 return II->getLength();
466 }
467 }
468
469 // NOTE: this can be checked even after testing for an IdentifierInfo.
470 if (Tok.isLiteral())
471 TokStart = Tok.getLiteralData();
472
473 if (!TokStart) {
474 // Compute the start of the token in the input lexer buffer.
475 bool CharDataInvalid = false;
476 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
477 if (Invalid)
478 *Invalid = CharDataInvalid;
479 if (CharDataInvalid) {
480 Buffer = "";
481 return 0;
482 }
483 }
484
485 // If this token contains nothing interesting, return it directly.
486 if (!Tok.needsCleaning()) {
487 Buffer = TokStart;
488 return Tok.getLength();
489 }
490
491 // Otherwise, hard case, relex the characters into the string.
492 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
493}
494
495/// MeasureTokenLength - Relex the token at the specified location and return
496/// its length in bytes in the input file. If the token needs cleaning (e.g.
497/// includes a trigraph or an escaped newline) then this count includes bytes
498/// that are part of that.
500 const SourceManager &SM,
501 const LangOptions &LangOpts) {
502 Token TheTok;
503 if (getRawToken(Loc, TheTok, SM, LangOpts))
504 return 0;
505 return TheTok.getLength();
506}
507
508/// Relex the token at the specified location.
509/// \returns true if there was a failure, false on success.
511 const SourceManager &SM,
512 const LangOptions &LangOpts,
513 bool IgnoreWhiteSpace) {
514 // TODO: this could be special cased for common tokens like identifiers, ')',
515 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
516 // all obviously single-char tokens. This could use
517 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
518 // something.
519
520 // If this comes from a macro expansion, we really do want the macro name, not
521 // the token this macro expanded to.
522 Loc = SM.getExpansionLoc(Loc);
523 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
524 bool Invalid = false;
525 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526 if (Invalid)
527 return true;
528
529 const char *StrData = Buffer.data()+LocInfo.second;
530
531 if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
532 return true;
533
534 // Create a lexer starting at the beginning of this token.
535 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
536 Buffer.begin(), StrData, Buffer.end());
537 TheLexer.SetCommentRetentionState(true);
538 TheLexer.LexFromRawLexer(Result);
539 return false;
540}
541
542/// Returns the pointer that points to the beginning of line that contains
543/// the given offset, or null if the offset if invalid.
544static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
545 const char *BufStart = Buffer.data();
546 if (Offset >= Buffer.size())
547 return nullptr;
548
549 const char *LexStart = BufStart + Offset;
550 for (; LexStart != BufStart; --LexStart) {
551 if (isVerticalWhitespace(LexStart[0]) &&
552 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
553 // LexStart should point at first character of logical line.
554 ++LexStart;
555 break;
556 }
557 }
558 return LexStart;
559}
560
562 const SourceManager &SM,
563 const LangOptions &LangOpts) {
564 assert(Loc.isFileID());
565 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
566 if (LocInfo.first.isInvalid())
567 return Loc;
568
569 bool Invalid = false;
570 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
571 if (Invalid)
572 return Loc;
573
574 // Back up from the current location until we hit the beginning of a line
575 // (or the buffer). We'll relex from that point.
576 const char *StrData = Buffer.data() + LocInfo.second;
577 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
578 if (!LexStart || LexStart == StrData)
579 return Loc;
580
581 // Create a lexer starting at the beginning of this token.
582 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
583 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
584 Buffer.end());
585 TheLexer.SetCommentRetentionState(true);
586
587 // Lex tokens until we find the token that contains the source location.
588 Token TheTok;
589 do {
590 TheLexer.LexFromRawLexer(TheTok);
591
592 if (TheLexer.getBufferLocation() > StrData) {
593 // Lexing this token has taken the lexer past the source location we're
594 // looking for. If the current token encompasses our source location,
595 // return the beginning of that token.
596 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
597 return TheTok.getLocation();
598
599 // We ended up skipping over the source location entirely, which means
600 // that it points into whitespace. We're done here.
601 break;
602 }
603 } while (TheTok.getKind() != tok::eof);
604
605 // We've passed our source location; just return the original source location.
606 return Loc;
607}
608
610 const SourceManager &SM,
611 const LangOptions &LangOpts) {
612 if (Loc.isFileID())
613 return getBeginningOfFileToken(Loc, SM, LangOpts);
614
615 if (!SM.isMacroArgExpansion(Loc))
616 return Loc;
617
618 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
619 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
620 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
621 std::pair<FileID, unsigned> BeginFileLocInfo =
622 SM.getDecomposedLoc(BeginFileLoc);
623 assert(FileLocInfo.first == BeginFileLocInfo.first &&
624 FileLocInfo.second >= BeginFileLocInfo.second);
625 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
626}
627
namespace {

/// Classification of a preprocessor directive name encountered while scanning
/// for the end of the preamble.
enum PreambleDirectiveKind {
  /// A recognized directive name; the scan carries on past it.
  PDK_Skipped,
  /// Any other directive name; the scan stops at its '#'.
  PDK_Unknown
};

} // namespace
636
638 const LangOptions &LangOpts,
639 unsigned MaxLines) {
640 // Create a lexer starting at the beginning of the file. Note that we use a
641 // "fake" file source location at offset 1 so that the lexer will track our
642 // position within the file.
643 const SourceLocation::UIntTy StartOffset = 1;
645 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
646 Buffer.end());
647 TheLexer.SetCommentRetentionState(true);
648
649 bool InPreprocessorDirective = false;
650 Token TheTok;
651 SourceLocation ActiveCommentLoc;
652
653 unsigned MaxLineOffset = 0;
654 if (MaxLines) {
655 const char *CurPtr = Buffer.begin();
656 unsigned CurLine = 0;
657 while (CurPtr != Buffer.end()) {
658 char ch = *CurPtr++;
659 if (ch == '\n') {
660 ++CurLine;
661 if (CurLine == MaxLines)
662 break;
663 }
664 }
665 if (CurPtr != Buffer.end())
666 MaxLineOffset = CurPtr - Buffer.begin();
667 }
668
669 do {
670 TheLexer.LexFromRawLexer(TheTok);
671
672 if (InPreprocessorDirective) {
673 // If we've hit the end of the file, we're done.
674 if (TheTok.getKind() == tok::eof) {
675 break;
676 }
677
678 // If we haven't hit the end of the preprocessor directive, skip this
679 // token.
680 if (!TheTok.isAtStartOfLine())
681 continue;
682
683 // We've passed the end of the preprocessor directive, and will look
684 // at this token again below.
685 InPreprocessorDirective = false;
686 }
687
688 // Keep track of the # of lines in the preamble.
689 if (TheTok.isAtStartOfLine()) {
690 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
691
692 // If we were asked to limit the number of lines in the preamble,
693 // and we're about to exceed that limit, we're done.
694 if (MaxLineOffset && TokOffset >= MaxLineOffset)
695 break;
696 }
697
698 // Comments are okay; skip over them.
699 if (TheTok.getKind() == tok::comment) {
700 if (ActiveCommentLoc.isInvalid())
701 ActiveCommentLoc = TheTok.getLocation();
702 continue;
703 }
704
705 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
706 // This is the start of a preprocessor directive.
707 Token HashTok = TheTok;
708 InPreprocessorDirective = true;
709 ActiveCommentLoc = SourceLocation();
710
711 // Figure out which directive this is. Since we're lexing raw tokens,
712 // we don't have an identifier table available. Instead, just look at
713 // the raw identifier to recognize and categorize preprocessor directives.
714 TheLexer.LexFromRawLexer(TheTok);
715 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
716 StringRef Keyword = TheTok.getRawIdentifier();
717 PreambleDirectiveKind PDK
718 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
719 .Case("include", PDK_Skipped)
720 .Case("__include_macros", PDK_Skipped)
721 .Case("define", PDK_Skipped)
722 .Case("undef", PDK_Skipped)
723 .Case("line", PDK_Skipped)
724 .Case("error", PDK_Skipped)
725 .Case("pragma", PDK_Skipped)
726 .Case("import", PDK_Skipped)
727 .Case("include_next", PDK_Skipped)
728 .Case("warning", PDK_Skipped)
729 .Case("ident", PDK_Skipped)
730 .Case("sccs", PDK_Skipped)
731 .Case("assert", PDK_Skipped)
732 .Case("unassert", PDK_Skipped)
733 .Case("if", PDK_Skipped)
734 .Case("ifdef", PDK_Skipped)
735 .Case("ifndef", PDK_Skipped)
736 .Case("elif", PDK_Skipped)
737 .Case("elifdef", PDK_Skipped)
738 .Case("elifndef", PDK_Skipped)
739 .Case("else", PDK_Skipped)
740 .Case("endif", PDK_Skipped)
741 .Default(PDK_Unknown);
742
743 switch (PDK) {
744 case PDK_Skipped:
745 continue;
746
747 case PDK_Unknown:
748 // We don't know what this directive is; stop at the '#'.
749 break;
750 }
751 }
752
753 // We only end up here if we didn't recognize the preprocessor
754 // directive or it was one that can't occur in the preamble at this
755 // point. Roll back the current token to the location of the '#'.
756 TheTok = HashTok;
757 } else if (TheTok.isAtStartOfLine() &&
758 TheTok.getKind() == tok::raw_identifier &&
759 TheTok.getRawIdentifier() == "module" &&
760 LangOpts.CPlusPlusModules) {
761 // The initial global module fragment introducer "module;" is part of
762 // the preamble, which runs up to the module declaration "module foo;".
763 Token ModuleTok = TheTok;
764 do {
765 TheLexer.LexFromRawLexer(TheTok);
766 } while (TheTok.getKind() == tok::comment);
767 if (TheTok.getKind() != tok::semi) {
768 // Not global module fragment, roll back.
769 TheTok = ModuleTok;
770 break;
771 }
772 continue;
773 }
774
775 // We hit a token that we don't recognize as being in the
776 // "preprocessing only" part of the file, so we're no longer in
777 // the preamble.
778 break;
779 } while (true);
780
781 SourceLocation End;
782 if (ActiveCommentLoc.isValid())
783 End = ActiveCommentLoc; // don't truncate a decl comment.
784 else
785 End = TheTok.getLocation();
786
787 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
788 TheTok.isAtStartOfLine());
789}
790
791unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
792 const SourceManager &SM,
793 const LangOptions &LangOpts) {
794 // Figure out how many physical characters away the specified expansion
795 // character is. This needs to take into consideration newlines and
796 // trigraphs.
797 bool Invalid = false;
798 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
799
800 // If they request the first char of the token, we're trivially done.
801 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
802 return 0;
803
804 unsigned PhysOffset = 0;
805
806 // The usual case is that tokens don't contain anything interesting. Skip
807 // over the uninteresting characters. If a token only consists of simple
808 // chars, this method is extremely fast.
809 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
810 if (CharNo == 0)
811 return PhysOffset;
812 ++TokPtr;
813 --CharNo;
814 ++PhysOffset;
815 }
816
817 // If we have a character that may be a trigraph or escaped newline, use a
818 // lexer to parse it correctly.
819 for (; CharNo; --CharNo) {
820 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
821 TokPtr += CharAndSize.Size;
822 PhysOffset += CharAndSize.Size;
823 }
824
825 // Final detail: if we end up on an escaped newline, we want to return the
826 // location of the actual byte of the token. For example foo<newline>bar
827 // advanced by 3 should return the location of b, not of \\. One compounding
828 // detail of this is that the escape may be made by a trigraph.
829 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
830 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
831
832 return PhysOffset;
833}
834
835/// Computes the source location just past the end of the
836/// token at this source location.
837///
838/// This routine can be used to produce a source location that
839/// points just past the end of the token referenced by \p Loc, and
840/// is generally used when a diagnostic needs to point just after a
841/// token where it expected something different than it received. If
842/// the returned source location would not be meaningful (e.g., if
843/// it points into a macro), this routine returns an invalid
844/// source location.
845///
846/// \param Offset an offset from the end of the token, where the source
847/// location should refer to. The default offset (0) produces a source
848/// location pointing just past the end of the token; an offset of 1 produces
849/// a source location pointing to the last character in the token, etc.
851 const SourceManager &SM,
852 const LangOptions &LangOpts) {
853 if (Loc.isInvalid())
854 return {};
855
856 if (Loc.isMacroID()) {
857 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
858 return {}; // Points inside the macro expansion.
859 }
860
861 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
862 if (Len > Offset)
863 Len = Len - Offset;
864 else
865 return Loc;
866
867 return Loc.getLocWithOffset(Len);
868}
869
870/// Returns true if the given MacroID location points at the first
871/// token of the macro expansion.
873 const SourceManager &SM,
874 const LangOptions &LangOpts,
875 SourceLocation *MacroBegin) {
876 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
877
878 SourceLocation expansionLoc;
879 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
880 return false;
881
882 if (expansionLoc.isFileID()) {
883 // No other macro expansions, this is the first.
884 if (MacroBegin)
885 *MacroBegin = expansionLoc;
886 return true;
887 }
888
889 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
890}
891
892/// Returns true if the given MacroID location points at the last
893/// token of the macro expansion.
895 const SourceManager &SM,
896 const LangOptions &LangOpts,
897 SourceLocation *MacroEnd) {
898 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
899
900 SourceLocation spellLoc = SM.getSpellingLoc(loc);
901 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
902 if (tokLen == 0)
903 return false;
904
905 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
906 SourceLocation expansionLoc;
907 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
908 return false;
909
910 if (expansionLoc.isFileID()) {
911 // No other macro expansions.
912 if (MacroEnd)
913 *MacroEnd = expansionLoc;
914 return true;
915 }
916
917 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
918}
919
921 const SourceManager &SM,
922 const LangOptions &LangOpts) {
923 SourceLocation Begin = Range.getBegin();
924 SourceLocation End = Range.getEnd();
925 assert(Begin.isFileID() && End.isFileID());
926 if (Range.isTokenRange()) {
927 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
928 if (End.isInvalid())
929 return {};
930 }
931
932 // Break down the source locations.
933 FileID FID;
934 unsigned BeginOffs;
935 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
936 if (FID.isInvalid())
937 return {};
938
939 unsigned EndOffs;
940 if (!SM.isInFileID(End, FID, &EndOffs) ||
941 BeginOffs > EndOffs)
942 return {};
943
945}
946
947// Assumes that `Loc` is in an expansion.
949 const SourceManager &SM) {
950 return SM.getSLocEntry(SM.getFileID(Loc))
951 .getExpansion()
952 .isExpansionTokenRange();
953}
954
956 const SourceManager &SM,
957 const LangOptions &LangOpts) {
958 SourceLocation Begin = Range.getBegin();
959 SourceLocation End = Range.getEnd();
960 if (Begin.isInvalid() || End.isInvalid())
961 return {};
962
963 if (Begin.isFileID() && End.isFileID())
964 return makeRangeFromFileLocs(Range, SM, LangOpts);
965
966 if (Begin.isMacroID() && End.isFileID()) {
967 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
968 return {};
969 Range.setBegin(Begin);
970 return makeRangeFromFileLocs(Range, SM, LangOpts);
971 }
972
973 if (Begin.isFileID() && End.isMacroID()) {
974 if (Range.isTokenRange()) {
975 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
976 return {};
977 // Use the *original* end, not the expanded one in `End`.
978 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
979 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
980 return {};
981 Range.setEnd(End);
982 return makeRangeFromFileLocs(Range, SM, LangOpts);
983 }
984
985 assert(Begin.isMacroID() && End.isMacroID());
986 SourceLocation MacroBegin, MacroEnd;
987 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
988 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
989 &MacroEnd)) ||
990 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
991 &MacroEnd)))) {
992 Range.setBegin(MacroBegin);
993 Range.setEnd(MacroEnd);
994 // Use the *original* `End`, not the expanded one in `MacroEnd`.
995 if (Range.isTokenRange())
996 Range.setTokenRange(isInExpansionTokenRange(End, SM));
997 return makeRangeFromFileLocs(Range, SM, LangOpts);
998 }
999
1000 bool Invalid = false;
1001 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
1002 &Invalid);
1003 if (Invalid)
1004 return {};
1005
1006 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1007 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
1008 &Invalid);
1009 if (Invalid)
1010 return {};
1011
1012 if (EndEntry.getExpansion().isMacroArgExpansion() &&
1013 BeginEntry.getExpansion().getExpansionLocStart() ==
1014 EndEntry.getExpansion().getExpansionLocStart()) {
1015 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1016 Range.setEnd(SM.getImmediateSpellingLoc(End));
1017 return makeFileCharRange(Range, SM, LangOpts);
1018 }
1019 }
1020
1021 return {};
1022}
1023
1025 const SourceManager &SM,
1026 const LangOptions &LangOpts,
1027 bool *Invalid) {
1028 Range = makeFileCharRange(Range, SM, LangOpts);
1029 if (Range.isInvalid()) {
1030 if (Invalid) *Invalid = true;
1031 return {};
1032 }
1033
1034 // Break down the source location.
1035 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
1036 if (beginInfo.first.isInvalid()) {
1037 if (Invalid) *Invalid = true;
1038 return {};
1039 }
1040
1041 unsigned EndOffs;
1042 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1043 beginInfo.second > EndOffs) {
1044 if (Invalid) *Invalid = true;
1045 return {};
1046 }
1047
1048 // Try to load the file buffer.
1049 bool invalidTemp = false;
1050 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1051 if (invalidTemp) {
1052 if (Invalid) *Invalid = true;
1053 return {};
1054 }
1055
1056 if (Invalid) *Invalid = false;
1057 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1058}
1059
1061 const SourceManager &SM,
1062 const LangOptions &LangOpts) {
1063 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1064
1065 // Find the location of the immediate macro expansion.
1066 while (true) {
1067 FileID FID = SM.getFileID(Loc);
1068 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1069 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1070 Loc = Expansion.getExpansionLocStart();
1071 if (!Expansion.isMacroArgExpansion())
1072 break;
1073
1074 // For macro arguments we need to check that the argument did not come
1075 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1076
1077 // Loc points to the argument id of the macro definition, move to the
1078 // macro expansion.
1079 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1080 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1081 if (SpellLoc.isFileID())
1082 break; // No inner macro.
1083
1084 // If spelling location resides in the same FileID as macro expansion
1085 // location, it means there is no inner macro.
1086 FileID MacroFID = SM.getFileID(Loc);
1087 if (SM.isInFileID(SpellLoc, MacroFID))
1088 break;
1089
1090 // Argument came from inner macro.
1091 Loc = SpellLoc;
1092 }
1093
1094 // Find the spelling location of the start of the non-argument expansion
1095 // range. This is where the macro name was spelled in order to begin
1096 // expanding this macro.
1097 Loc = SM.getSpellingLoc(Loc);
1098
1099 // Dig out the buffer where the macro name was spelled and the extents of the
1100 // name so that we can render it into the expansion note.
1101 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1102 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1103 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1104 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1105}
1106
1108 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1109 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1110 // Walk past macro argument expansions.
1111 while (SM.isMacroArgExpansion(Loc))
1112 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1113
1114 // If the macro's spelling isn't FileID or from scratch space, then it's
1115 // actually a token paste or stringization (or similar) and not a macro at
1116 // all.
1117 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1118 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1119 return {};
1120
1121 // Find the spelling location of the start of the non-argument expansion
1122 // range. This is where the macro name was spelled in order to begin
1123 // expanding this macro.
1124 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1125
1126 // Dig out the buffer where the macro name was spelled and the extents of the
1127 // name so that we can render it into the expansion note.
1128 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1129 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1130 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1131 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1132}
1133
1135 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1136}
1137
1138bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1139 assert(isVerticalWhitespace(Str[0]));
1140 if (Str - 1 < BufferStart)
1141 return false;
1142
1143 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1144 (Str[0] == '\r' && Str[-1] == '\n')) {
1145 if (Str - 2 < BufferStart)
1146 return false;
1147 --Str;
1148 }
1149 --Str;
1150
1151 // Rewind to first non-space character:
1152 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1153 --Str;
1154
1155 return *Str == '\\';
1156}
1157
1159 const SourceManager &SM) {
1160 if (Loc.isInvalid() || Loc.isMacroID())
1161 return {};
1162 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1163 if (LocInfo.first.isInvalid())
1164 return {};
1165 bool Invalid = false;
1166 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1167 if (Invalid)
1168 return {};
1169 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1170 if (!Line)
1171 return {};
1172 StringRef Rest = Buffer.substr(Line - Buffer.data());
1173 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1174 return NumWhitespaceChars == StringRef::npos
1175 ? ""
1176 : Rest.take_front(NumWhitespaceChars);
1177}
1178
1179//===----------------------------------------------------------------------===//
1180// Diagnostics forwarding code.
1181//===----------------------------------------------------------------------===//
1182
1183/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1184/// lexer buffer was all expanded at a single point, perform the mapping.
1185/// This is currently only used for _Pragma implementation, so it is the slow
1186/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1187static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1188 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1190 SourceLocation FileLoc,
1191 unsigned CharNo, unsigned TokLen) {
1192 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1193
1194 // Otherwise, we're lexing "mapped tokens". This is used for things like
1195 // _Pragma handling. Combine the expansion location of FileLoc with the
1196 // spelling location.
1198
1199 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1200 // characters come from spelling(FileLoc)+Offset.
1201 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1202 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1203
1204 // Figure out the expansion loc range, which is the range covered by the
1205 // original _Pragma(...) sequence.
1206 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1207
1208 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1209}
1210
1211/// getSourceLocation - Return a source location identifier for the specified
1212/// offset in the current file.
1214 unsigned TokLen) const {
1215 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1216 "Location out of range for this buffer!");
1217
1218 // In the normal case, we're just lexing from a simple file buffer, return
1219 // the file id from FileLoc with the offset specified.
1220 unsigned CharNo = Loc-BufferStart;
1221 if (FileLoc.isFileID())
1222 return FileLoc.getLocWithOffset(CharNo);
1223
1224 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1225 // tokens are lexed from where the _Pragma was defined.
1226 assert(PP && "This doesn't work on raw lexers");
1227 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1228}
1229
1230/// Diag - Forwarding function for diagnostics. This translate a source
1231/// position in the current buffer into a SourceLocation object for rendering.
1232DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1233 return PP->Diag(getSourceLocation(Loc), DiagID);
1234}
1235
1236//===----------------------------------------------------------------------===//
1237// Trigraph and Escaped Newline Handling Code.
1238//===----------------------------------------------------------------------===//
1239
/// GetTrigraphCharForLetter - Map the third character of a "??x" sequence to
/// the character the trigraph stands for, or return '\0' if \p Letter does not
/// complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Letters[I] completes the trigraph that means Decoded[I].
  static constexpr char Letters[] = "=)(!'>/<-";
  static constexpr char Decoded[] = "#][|^}\\{~";
  static_assert(sizeof(Letters) == sizeof(Decoded),
                "trigraph tables must have the same length");

  // The scan stops at the terminating NUL, so a '\0' input never matches.
  for (unsigned I = 0; Letters[I] != '\0'; ++I)
    if (Letters[I] == Letter)
      return Decoded[I];
  return 0;
}
1256
1257/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1258/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1259/// return the result character. Finally, emit a warning about trigraph use
1260/// whether trigraphs are enabled or not.
1261static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1262 char Res = GetTrigraphCharForLetter(*CP);
1263 if (!Res)
1264 return Res;
1265
1266 if (!Trigraphs) {
1267 if (L && !L->isLexingRawMode())
1268 L->Diag(CP-2, diag::trigraph_ignored);
1269 return 0;
1270 }
1271
1272 if (L && !L->isLexingRawMode())
1273 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1274 return Res;
1275}
1276
1277/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1278/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1279/// trigraph equivalent on entry to this function.
1280unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1281 unsigned Size = 0;
1282 while (isWhitespace(Ptr[Size])) {
1283 ++Size;
1284
1285 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1286 continue;
1287
1288 // If this is a \r\n or \n\r, skip the other half.
1289 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1290 Ptr[Size-1] != Ptr[Size])
1291 ++Size;
1292
1293 return Size;
1294 }
1295
1296 // Not an escaped newline, must be a \t or something else.
1297 return 0;
1298}
1299
1300/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1301/// them), skip over them and return the first non-escaped-newline found,
1302/// otherwise return P.
1303const char *Lexer::SkipEscapedNewLines(const char *P) {
1304 while (true) {
1305 const char *AfterEscape;
1306 if (*P == '\\') {
1307 AfterEscape = P+1;
1308 } else if (*P == '?') {
1309 // If not a trigraph for escape, bail out.
1310 if (P[1] != '?' || P[2] != '/')
1311 return P;
1312 // FIXME: Take LangOpts into account; the language might not
1313 // support trigraphs.
1314 AfterEscape = P+3;
1315 } else {
1316 return P;
1317 }
1318
1319 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1320 if (NewLineSize == 0) return P;
1321 P = AfterEscape+NewLineSize;
1322 }
1323}
1324
1325std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1326 const SourceManager &SM,
1327 const LangOptions &LangOpts) {
1328 if (Loc.isMacroID()) {
1329 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1330 return std::nullopt;
1331 }
1332 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1333
1334 // Break down the source location.
1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336
1337 // Try to load the file buffer.
1338 bool InvalidTemp = false;
1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1340 if (InvalidTemp)
1341 return std::nullopt;
1342
1343 const char *TokenBegin = File.data() + LocInfo.second;
1344
1345 // Lex from the start of the given location.
1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1347 TokenBegin, File.end());
1348 // Find the token.
1349 Token Tok;
1350 lexer.LexFromRawLexer(Tok);
1351 return Tok;
1352}
1353
1354/// Checks that the given token is the first token that occurs after the
1355/// given location (this excludes comments and whitespace). Returns the location
1356/// immediately after the specified token. If the token is not found or the
1357/// location is inside a macro, the returned source location will be invalid.
1359 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1360 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1361 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1362 if (!Tok || Tok->isNot(TKind))
1363 return {};
1364 SourceLocation TokenLoc = Tok->getLocation();
1365
1366 // Calculate how much whitespace needs to be skipped if any.
1367 unsigned NumWhitespaceChars = 0;
1368 if (SkipTrailingWhitespaceAndNewLine) {
1369 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1370 unsigned char C = *TokenEnd;
1371 while (isHorizontalWhitespace(C)) {
1372 C = *(++TokenEnd);
1373 NumWhitespaceChars++;
1374 }
1375
1376 // Skip \r, \n, \r\n, or \n\r
1377 if (C == '\n' || C == '\r') {
1378 char PrevC = C;
1379 C = *(++TokenEnd);
1380 NumWhitespaceChars++;
1381 if ((C == '\n' || C == '\r') && C != PrevC)
1382 NumWhitespaceChars++;
1383 }
1384 }
1385
1386 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1387}
1388
1389/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1390/// get its size, and return it. This is tricky in several cases:
1391/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1392/// then either return the trigraph (skipping 3 chars) or the '?',
1393/// depending on whether trigraphs are enabled or not.
1394/// 2. If this is an escaped newline (potentially with whitespace between
1395/// the backslash and newline), implicitly skip the newline and return
1396/// the char after it.
1397///
1398/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1399/// know that we can accumulate into Size, and that we have already incremented
1400/// Ptr by Size bytes.
1401///
1402/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1403/// be updated to match.
1404Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1405 unsigned Size = 0;
1406 // If we have a slash, look for an escaped newline.
1407 if (Ptr[0] == '\\') {
1408 ++Size;
1409 ++Ptr;
1410Slash:
1411 // Common case, backslash-char where the char is not whitespace.
1412 if (!isWhitespace(Ptr[0]))
1413 return {'\\', Size};
1414
1415 // See if we have optional whitespace characters between the slash and
1416 // newline.
1417 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1418 // Remember that this token needs to be cleaned.
1419 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1420
1421 // Warn if there was whitespace between the backslash and newline.
1422 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1423 Diag(Ptr, diag::backslash_newline_space);
1424
1425 // Found backslash<whitespace><newline>. Parse the char after it.
1426 Size += EscapedNewLineSize;
1427 Ptr += EscapedNewLineSize;
1428
1429 // Use slow version to accumulate a correct size field.
1430 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1431 CharAndSize.Size += Size;
1432 return CharAndSize;
1433 }
1434
1435 // Otherwise, this is not an escaped newline, just return the slash.
1436 return {'\\', Size};
1437 }
1438
1439 // If this is a trigraph, process it.
1440 if (Ptr[0] == '?' && Ptr[1] == '?') {
1441 // If this is actually a legal trigraph (not something like "??x"), emit
1442 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1443 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1444 LangOpts.Trigraphs)) {
1445 // Remember that this token needs to be cleaned.
1446 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1447
1448 Ptr += 3;
1449 Size += 3;
1450 if (C == '\\') goto Slash;
1451 return {C, Size};
1452 }
1453 }
1454
1455 // If this is neither, return a single character.
1456 return {*Ptr, Size + 1u};
1457}
1458
1459/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1460/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1461/// and that we have already incremented Ptr by Size bytes.
1462///
1463/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1464/// be updated to match.
1465Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1466 const LangOptions &LangOpts) {
1467
1468 unsigned Size = 0;
1469 // If we have a slash, look for an escaped newline.
1470 if (Ptr[0] == '\\') {
1471 ++Size;
1472 ++Ptr;
1473Slash:
1474 // Common case, backslash-char where the char is not whitespace.
1475 if (!isWhitespace(Ptr[0]))
1476 return {'\\', Size};
1477
1478 // See if we have optional whitespace characters followed by a newline.
1479 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1480 // Found backslash<whitespace><newline>. Parse the char after it.
1481 Size += EscapedNewLineSize;
1482 Ptr += EscapedNewLineSize;
1483
1484 // Use slow version to accumulate a correct size field.
1485 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1486 CharAndSize.Size += Size;
1487 return CharAndSize;
1488 }
1489
1490 // Otherwise, this is not an escaped newline, just return the slash.
1491 return {'\\', Size};
1492 }
1493
1494 // If this is a trigraph, process it.
1495 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1496 // If this is actually a legal trigraph (not something like "??x"), return
1497 // it.
1498 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1499 Ptr += 3;
1500 Size += 3;
1501 if (C == '\\') goto Slash;
1502 return {C, Size};
1503 }
1504 }
1505
1506 // If this is neither, return a single character.
1507 return {*Ptr, Size + 1u};
1508}
1509
1510//===----------------------------------------------------------------------===//
1511// Helper methods for lexing.
1512//===----------------------------------------------------------------------===//
1513
1514/// Routine that indiscriminately sets the offset into the source file.
1515void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1516 BufferPtr = BufferStart + Offset;
1517 if (BufferPtr > BufferEnd)
1518 BufferPtr = BufferEnd;
1519 // FIXME: What exactly does the StartOfLine bit mean? There are two
1520 // possible meanings for the "start" of the line: the first token on the
1521 // unexpanded line, or the first token on the expanded line.
1522 IsAtStartOfLine = StartOfLine;
1523 IsAtPhysicalStartOfLine = StartOfLine;
1524}
1525
1526static bool isUnicodeWhitespace(uint32_t Codepoint) {
1527 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1529 return UnicodeWhitespaceChars.contains(Codepoint);
1530}
1531
1533 llvm::SmallString<5> CharBuf;
1534 llvm::raw_svector_ostream CharOS(CharBuf);
1535 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1536 return CharBuf;
1537}
1538
1539// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1540// we allow "Mathematical Notation Characters" in identifiers.
1541// This is a proposed profile that extends the XID_Start/XID_continue
1542// with mathematical symbols, superscript and subscript digits
1543// found in some production software.
1544// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1545static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1546 bool IsStart, bool &IsExtension) {
1547 static const llvm::sys::UnicodeCharSet MathStartChars(
1549 static const llvm::sys::UnicodeCharSet MathContinueChars(
1551 if (MathStartChars.contains(C) ||
1552 (!IsStart && MathContinueChars.contains(C))) {
1553 IsExtension = true;
1554 return true;
1555 }
1556 return false;
1557}
1558
1559static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1560 bool &IsExtension) {
1561 if (LangOpts.AsmPreprocessor) {
1562 return false;
1563 } else if (LangOpts.DollarIdents && '$' == C) {
1564 return true;
1565 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1566 // A non-leading codepoint must have the XID_Continue property.
1567 // XIDContinueRanges doesn't contain characters that are also in XIDStartRanges,
1568 // so we need to check both tables.
1569 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1570 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1571 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1572 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1573 return true;
1574 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1575 IsExtension);
1576 } else if (LangOpts.C11) {
1577 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1579 return C11AllowedIDChars.contains(C);
1580 } else {
1581 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1583 return C99AllowedIDChars.contains(C);
1584 }
1585}
1586
1587static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1588 bool &IsExtension) {
1589 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1590 IsExtension = false;
1591 if (LangOpts.AsmPreprocessor) {
1592 return false;
1593 }
1594 if (LangOpts.CPlusPlus || LangOpts.C23) {
1595 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1596 if (XIDStartChars.contains(C))
1597 return true;
1598 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1599 IsExtension);
1600 }
1601 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1602 return false;
1603 if (LangOpts.C11) {
1604 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1606 return !C11DisallowedInitialIDChars.contains(C);
1607 }
1608 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1610 return !C99DisallowedInitialIDChars.contains(C);
1611}
1612
1614 CharSourceRange Range) {
1615
1616 static const llvm::sys::UnicodeCharSet MathStartChars(
1618 static const llvm::sys::UnicodeCharSet MathContinueChars(
1620
1621 (void)MathStartChars;
1622 (void)MathContinueChars;
1623 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1624 "Unexpected mathematical notation codepoint");
1625 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1626 << codepointAsHexString(C) << Range;
1627}
1628
1629static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1630 const char *End) {
1632 L.getSourceLocation(End));
1633}
1634
1635static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1636 CharSourceRange Range, bool IsFirst) {
1637 // Check C99 compatibility.
1638 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1639 enum {
1640 CannotAppearInIdentifier = 0,
1641 CannotStartIdentifier
1642 };
1643
1644 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1646 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1648 if (!C99AllowedIDChars.contains(C)) {
1649 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1650 << Range
1651 << CannotAppearInIdentifier;
1652 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1653 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1654 << Range
1655 << CannotStartIdentifier;
1656 }
1657 }
1658}
1659
1660/// After encountering UTF-8 character C and interpreting it as an identifier
1661/// character, check whether it's a homoglyph for a common non-identifier
1662/// source character that is unlikely to be an intentional identifier
1663/// character and warn if so.
1665 CharSourceRange Range) {
1666 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1667 struct HomoglyphPair {
1668 uint32_t Character;
1669 char LooksLike;
1670 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1671 };
1672 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1673 {U'\u00ad', 0}, // SOFT HYPHEN
1674 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1675 {U'\u037e', ';'}, // GREEK QUESTION MARK
1676 {U'\u200b', 0}, // ZERO WIDTH SPACE
1677 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1678 {U'\u200d', 0}, // ZERO WIDTH JOINER
1679 {U'\u2060', 0}, // WORD JOINER
1680 {U'\u2061', 0}, // FUNCTION APPLICATION
1681 {U'\u2062', 0}, // INVISIBLE TIMES
1682 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1683 {U'\u2064', 0}, // INVISIBLE PLUS
1684 {U'\u2212', '-'}, // MINUS SIGN
1685 {U'\u2215', '/'}, // DIVISION SLASH
1686 {U'\u2216', '\\'}, // SET MINUS
1687 {U'\u2217', '*'}, // ASTERISK OPERATOR
1688 {U'\u2223', '|'}, // DIVIDES
1689 {U'\u2227', '^'}, // LOGICAL AND
1690 {U'\u2236', ':'}, // RATIO
1691 {U'\u223c', '~'}, // TILDE OPERATOR
1692 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1693 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1694 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1695 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1696 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1697 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1698 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1699 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1700 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1701 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1702 {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
1703 {U'\uff0c', ','}, // FULLWIDTH COMMA
1704 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1705 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1706 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1707 {U'\uff1a', ':'}, // FULLWIDTH COLON
1708 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1709 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1710 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1711 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1712 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1713 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1714 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1715 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1716 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1717 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1718 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1719 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1720 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1721 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1722 {0, 0}
1723 };
1724 auto Homoglyph =
1725 std::lower_bound(std::begin(SortedHomoglyphs),
1726 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1727 if (Homoglyph->Character == C) {
1728 if (Homoglyph->LooksLike) {
1729 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1730 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1731 << Range << codepointAsHexString(C) << LooksLikeStr;
1732 } else {
1733 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1734 << Range << codepointAsHexString(C);
1735 }
1736 }
1737}
1738
1740 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1741 CharSourceRange Range, bool IsFirst) {
1742 if (isASCII(CodePoint))
1743 return;
1744
1745 bool IsExtension;
1746 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1747 bool IsIDContinue =
1748 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1749
1750 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1751 return;
1752
1753 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1754
1755 if (!IsFirst || InvalidOnlyAtStart) {
1756 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1757 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1758 << FixItHint::CreateRemoval(Range);
1759 } else {
1760 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1761 << Range << codepointAsHexString(CodePoint)
1762 << FixItHint::CreateRemoval(Range);
1763 }
1764}
1765
1766bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1767 Token &Result) {
1768 const char *UCNPtr = CurPtr + Size;
1769 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1770 if (CodePoint == 0) {
1771 return false;
1772 }
1773 bool IsExtension = false;
1774 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1775 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1776 return false;
1780 PP->getDiagnostics(), LangOpts, CodePoint,
1781 makeCharRange(*this, CurPtr, UCNPtr),
1782 /*IsFirst=*/false);
1783
1784 // We got a unicode codepoint that is neither a space nor a
1785 // valid identifier part.
1786 // Carry on as if the codepoint was valid for recovery purposes.
1787 } else if (!isLexingRawMode()) {
1788 if (IsExtension)
1790 makeCharRange(*this, CurPtr, UCNPtr));
1791
1793 makeCharRange(*this, CurPtr, UCNPtr),
1794 /*IsFirst=*/false);
1795 }
1796
1797 Result.setFlag(Token::HasUCN);
1798 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1799 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1800 CurPtr = UCNPtr;
1801 else
1802 while (CurPtr != UCNPtr)
1803 (void)getAndAdvanceChar(CurPtr, Result);
1804 return true;
1805}
1806
1807bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1808 llvm::UTF32 CodePoint;
1809
1810 // If a UTF-8 codepoint appears immediately after an escaped new line,
1811 // CurPtr may point to the splicing \ on the preceding line,
1812 // so we need to skip it.
1813 unsigned FirstCodeUnitSize;
1814 getCharAndSize(CurPtr, FirstCodeUnitSize);
1815 const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1816 const char *UnicodePtr = CharStart;
1817
1818 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1819 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1820 &CodePoint, llvm::strictConversion);
1821 if (ConvResult != llvm::conversionOK)
1822 return false;
1823
1824 bool IsExtension = false;
1825 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1826 IsExtension)) {
1827 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1828 return false;
1829
1833 PP->getDiagnostics(), LangOpts, CodePoint,
1834 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1835 // We got a unicode codepoint that is neither a space nor a
1836 // a valid identifier part. Carry on as if the codepoint was
1837 // valid for recovery purposes.
1838 } else if (!isLexingRawMode()) {
1839 if (IsExtension)
1841 PP->getDiagnostics(), CodePoint,
1842 makeCharRange(*this, CharStart, UnicodePtr));
1844 makeCharRange(*this, CharStart, UnicodePtr),
1845 /*IsFirst=*/false);
1847 makeCharRange(*this, CharStart, UnicodePtr));
1848 }
1849
1850 // Once we sucessfully parsed some UTF-8,
1851 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1852 // being lexed, and that warnings about trailing spaces are emitted.
1853 ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1854 CurPtr = UnicodePtr;
1855 return true;
1856}
1857
1858bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1859 const char *CurPtr) {
1860 bool IsExtension = false;
1861 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1864 if (IsExtension)
1866 makeCharRange(*this, BufferPtr, CurPtr));
1868 makeCharRange(*this, BufferPtr, CurPtr),
1869 /*IsFirst=*/true);
1871 makeCharRange(*this, BufferPtr, CurPtr));
1872 }
1873
1874 MIOpt.ReadToken();
1875 return LexIdentifierContinue(Result, CurPtr);
1876 }
1877
1879 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1881 // Non-ASCII characters tend to creep into source code unintentionally.
1882 // Instead of letting the parser complain about the unknown token,
1883 // just drop the character.
1884 // Note that we can /only/ do this when the non-ASCII character is actually
1885 // spelled as Unicode, not written as a UCN. The standard requires that
1886 // we not throw away any possible preprocessor tokens, but there's a
1887 // loophole in the mapping of Unicode characters to basic character set
1888 // characters that allows us to map these particular characters to, say,
1889 // whitespace.
1891 PP->getDiagnostics(), LangOpts, C,
1892 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1893 BufferPtr = CurPtr;
1894 return false;
1895 }
1896
1897 // Otherwise, we have an explicit UCN or a character that's unlikely to show
1898 // up by accident.
1899 MIOpt.ReadToken();
1900 FormTokenWithChars(Result, CurPtr, tok::unknown);
1901 return true;
1902}
1903
1904static const char *
1905fastParseASCIIIdentifier(const char *CurPtr,
1906 [[maybe_unused]] const char *BufferEnd) {
1907#ifdef __SSE4_2__
1908 alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1909 '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1910 };
1911 constexpr ssize_t BytesPerRegister = 16;
1912
1913 __m128i AsciiIdentifierRangeV =
1914 _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1915
1916 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1917 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1918
1919 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1922 CurPtr += Consumed;
1923 if (Consumed == BytesPerRegister)
1924 continue;
1925 return CurPtr;
1926 }
1927#endif
1928
1929 unsigned char C = *CurPtr;
1931 C = *++CurPtr;
1932 return CurPtr;
1933}
1934
1935bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1936 // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1937
1938 while (true) {
1939
1940 CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1941
1942 unsigned Size;
1943 // Slow path: handle trigraph, unicode codepoints, UCNs.
1944 unsigned char C = getCharAndSize(CurPtr, Size);
1946 CurPtr = ConsumeChar(CurPtr, Size, Result);
1947 continue;
1948 }
1949 if (C == '$') {
1950 // If we hit a $ and they are not supported in identifiers, we are done.
1951 if (!LangOpts.DollarIdents)
1952 break;
1953 // Otherwise, emit a diagnostic and continue.
1954 if (!isLexingRawMode())
1955 Diag(CurPtr, diag::ext_dollar_in_identifier);
1956 CurPtr = ConsumeChar(CurPtr, Size, Result);
1957 continue;
1958 }
1959 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1960 continue;
1961 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1962 continue;
1963 // Neither an expected Unicode codepoint nor a UCN.
1964 break;
1965 }
1966
1967 const char *IdStart = BufferPtr;
1968 FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1969 Result.setRawIdentifierData(IdStart);
1970
1971 // If we are in raw mode, return this identifier raw. There is no need to
1972 // look up identifier information or attempt to macro expand it.
1973 if (LexingRawMode)
1974 return true;
1975
1976 // Fill in Result.IdentifierInfo and update the token kind,
1977 // looking up the identifier in the identifier table.
1979 // Note that we have to call PP->LookUpIdentifierInfo() even for code
1980 // completion, it writes IdentifierInfo into Result, and callers rely on it.
1981
1982 // If the completion point is at the end of an identifier, we want to treat
1983 // the identifier as incomplete even if it resolves to a macro or a keyword.
1984 // This allows e.g. 'class^' to complete to 'classifier'.
1985 if (isCodeCompletionPoint(CurPtr)) {
1986 // Return the code-completion token.
1987 Result.setKind(tok::code_completion);
1988 // Skip the code-completion char and all immediate identifier characters.
1989 // This ensures we get consistent behavior when completing at any point in
1990 // an identifier (i.e. at the start, in the middle, at the end). Note that
1991 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1992 // simpler.
1993 assert(*CurPtr == 0 && "Completion character must be 0");
1994 ++CurPtr;
1995 // Note that code completion token is not added as a separate character
1996 // when the completion point is at the end of the buffer. Therefore, we need
1997 // to check if the buffer has ended.
1998 if (CurPtr < BufferEnd) {
1999 while (isAsciiIdentifierContinue(*CurPtr))
2000 ++CurPtr;
2001 }
2002 BufferPtr = CurPtr;
2003 return true;
2004 }
2005
2006 // Finally, now that we know we have an identifier, pass this off to the
2007 // preprocessor, which may macro expand it or something.
2008 if (II->isHandleIdentifierCase())
2009 return PP->HandleIdentifier(Result);
2010
2011 return true;
2012}
2013
2014/// isHexaLiteral - Return true if Start points to a hex constant.
2015/// in microsoft mode (where this is supposed to be several different tokens).
2016bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2017 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2018 char C1 = CharAndSize1.Char;
2019 if (C1 != '0')
2020 return false;
2021
2022 auto CharAndSize2 =
2023 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2024 char C2 = CharAndSize2.Char;
2025 return (C2 == 'x' || C2 == 'X');
2026}
2027
2028/// LexNumericConstant - Lex the remainder of a integer or floating point
2029/// constant. From[-1] is the first character lexed. Return the end of the
2030/// constant.
2031bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2032 unsigned Size;
2033 char C = getCharAndSize(CurPtr, Size);
2034 char PrevCh = 0;
2035 while (isPreprocessingNumberBody(C)) {
2036 CurPtr = ConsumeChar(CurPtr, Size, Result);
2037 PrevCh = C;
2038 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2039 CurPtr -= Size;
2040 break;
2041 }
2042 C = getCharAndSize(CurPtr, Size);
2043 }
2044
2045 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2046 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2047 // If we are in Microsoft mode, don't continue if the constant is hex.
2048 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2049 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2050 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2051 }
2052
2053 // If we have a hex FP constant, continue.
2054 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2055 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2056 // not-quite-conforming extension. Only do so if this looks like it's
2057 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2058 bool IsHexFloat = true;
2059 if (!LangOpts.C99) {
2060 if (!isHexaLiteral(BufferPtr, LangOpts))
2061 IsHexFloat = false;
2062 else if (!LangOpts.CPlusPlus17 &&
2063 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2064 IsHexFloat = false;
2065 }
2066 if (IsHexFloat)
2067 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2068 }
2069
2070 // If we have a digit separator, continue.
2071 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2072 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2073 if (isAsciiIdentifierContinue(Next)) {
2074 if (!isLexingRawMode())
2075 Diag(CurPtr, LangOpts.CPlusPlus
2076 ? diag::warn_cxx11_compat_digit_separator
2077 : diag::warn_c23_compat_digit_separator);
2078 CurPtr = ConsumeChar(CurPtr, Size, Result);
2079 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2080 return LexNumericConstant(Result, CurPtr);
2081 }
2082 }
2083
2084 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2085 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2086 return LexNumericConstant(Result, CurPtr);
2087 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2088 return LexNumericConstant(Result, CurPtr);
2089
2090 // Update the location of token as well as BufferPtr.
2091 const char *TokStart = BufferPtr;
2092 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2093 Result.setLiteralData(TokStart);
2094 return true;
2095}
2096
2097/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2098/// in C++11, or warn on a ud-suffix in C++98.
2099const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2100 bool IsStringLiteral) {
2101 assert(LangOpts.CPlusPlus);
2102
2103 // Maximally munch an identifier.
2104 unsigned Size;
2105 char C = getCharAndSize(CurPtr, Size);
2106 bool Consumed = false;
2107
2108 if (!isAsciiIdentifierStart(C)) {
2109 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2110 Consumed = true;
2111 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2112 Consumed = true;
2113 else
2114 return CurPtr;
2115 }
2116
2117 if (!LangOpts.CPlusPlus11) {
2118 if (!isLexingRawMode())
2119 Diag(CurPtr,
2120 C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2121 : diag::warn_cxx11_compat_reserved_user_defined_literal)
2123 return CurPtr;
2124 }
2125
2126 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2127 // that does not start with an underscore is ill-formed. As a conforming
2128 // extension, we treat all such suffixes as if they had whitespace before
2129 // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2130 // likely to be a ud-suffix than a macro, however, and accept that.
2131 if (!Consumed) {
2132 bool IsUDSuffix = false;
2133 if (C == '_')
2134 IsUDSuffix = true;
2135 else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2136 // In C++1y, we need to look ahead a few characters to see if this is a
2137 // valid suffix for a string literal or a numeric literal (this could be
2138 // the 'operator""if' defining a numeric literal operator).
2139 const unsigned MaxStandardSuffixLength = 3;
2140 char Buffer[MaxStandardSuffixLength] = { C };
2141 unsigned Consumed = Size;
2142 unsigned Chars = 1;
2143 while (true) {
2144 auto [Next, NextSize] =
2145 getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2146 if (!isAsciiIdentifierContinue(Next)) {
2147 // End of suffix. Check whether this is on the allowed list.
2148 const StringRef CompleteSuffix(Buffer, Chars);
2149 IsUDSuffix =
2150 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2151 break;
2152 }
2153
2154 if (Chars == MaxStandardSuffixLength)
2155 // Too long: can't be a standard suffix.
2156 break;
2157
2158 Buffer[Chars++] = Next;
2159 Consumed += NextSize;
2160 }
2161 }
2162
2163 if (!IsUDSuffix) {
2164 if (!isLexingRawMode())
2165 Diag(CurPtr, LangOpts.MSVCCompat
2166 ? diag::ext_ms_reserved_user_defined_literal
2167 : diag::ext_reserved_user_defined_literal)
2169 return CurPtr;
2170 }
2171
2172 CurPtr = ConsumeChar(CurPtr, Size, Result);
2173 }
2174
2175 Result.setFlag(Token::HasUDSuffix);
2176 while (true) {
2177 C = getCharAndSize(CurPtr, Size);
2179 CurPtr = ConsumeChar(CurPtr, Size, Result);
2180 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2181 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2182 } else
2183 break;
2184 }
2185
2186 return CurPtr;
2187}
2188
2189/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2190/// either " or L" or u8" or u" or U".
2191bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2192 tok::TokenKind Kind) {
2193 const char *AfterQuote = CurPtr;
2194 // Does this string contain the \0 character?
2195 const char *NulCharacter = nullptr;
2196
2197 if (!isLexingRawMode() &&
2198 (Kind == tok::utf8_string_literal ||
2199 Kind == tok::utf16_string_literal ||
2200 Kind == tok::utf32_string_literal))
2201 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2202 : diag::warn_c99_compat_unicode_literal);
2203
2204 char C = getAndAdvanceChar(CurPtr, Result);
2205 while (C != '"') {
2206 // Skip escaped characters. Escaped newlines will already be processed by
2207 // getAndAdvanceChar.
2208 if (C == '\\')
2209 C = getAndAdvanceChar(CurPtr, Result);
2210
2211 if (C == '\n' || C == '\r' || // Newline.
2212 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2213 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2214 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2215 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2216 return true;
2217 }
2218
2219 if (C == 0) {
2220 if (isCodeCompletionPoint(CurPtr-1)) {
2221 if (ParsingFilename)
2222 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2223 else
2225 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2226 cutOffLexing();
2227 return true;
2228 }
2229
2230 NulCharacter = CurPtr-1;
2231 }
2232 C = getAndAdvanceChar(CurPtr, Result);
2233 }
2234
2235 // If we are in C++11, lex the optional ud-suffix.
2236 if (LangOpts.CPlusPlus)
2237 CurPtr = LexUDSuffix(Result, CurPtr, true);
2238
2239 // If a nul character existed in the string, warn about it.
2240 if (NulCharacter && !isLexingRawMode())
2241 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2242
2243 // Update the location of the token as well as the BufferPtr instance var.
2244 const char *TokStart = BufferPtr;
2245 FormTokenWithChars(Result, CurPtr, Kind);
2246 Result.setLiteralData(TokStart);
2247 return true;
2248}
2249
2250/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2251/// having lexed R", LR", u8R", uR", or UR".
2252bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2253 tok::TokenKind Kind) {
2254 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2255 // Between the initial and final double quote characters of the raw string,
2256 // any transformations performed in phases 1 and 2 (trigraphs,
2257 // universal-character-names, and line splicing) are reverted.
2258
2259 if (!isLexingRawMode())
2260 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2261
2262 unsigned PrefixLen = 0;
2263
2264 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2265 ++PrefixLen;
2266
2267 // If the last character was not a '(', then we didn't lex a valid delimiter.
2268 if (CurPtr[PrefixLen] != '(') {
2269 if (!isLexingRawMode()) {
2270 const char *PrefixEnd = &CurPtr[PrefixLen];
2271 if (PrefixLen == 16) {
2272 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2273 } else if (*PrefixEnd == '\n') {
2274 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2275 } else {
2276 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2277 << StringRef(PrefixEnd, 1);
2278 }
2279 }
2280
2281 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2282 // it's possible the '"' was intended to be part of the raw string, but
2283 // there's not much we can do about that.
2284 while (true) {
2285 char C = *CurPtr++;
2286
2287 if (C == '"')
2288 break;
2289 if (C == 0 && CurPtr-1 == BufferEnd) {
2290 --CurPtr;
2291 break;
2292 }
2293 }
2294
2295 FormTokenWithChars(Result, CurPtr, tok::unknown);
2296 return true;
2297 }
2298
2299 // Save prefix and move CurPtr past it
2300 const char *Prefix = CurPtr;
2301 CurPtr += PrefixLen + 1; // skip over prefix and '('
2302
2303 while (true) {
2304 char C = *CurPtr++;
2305
2306 if (C == ')') {
2307 // Check for prefix match and closing quote.
2308 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2309 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2310 break;
2311 }
2312 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2313 if (!isLexingRawMode())
2314 Diag(BufferPtr, diag::err_unterminated_raw_string)
2315 << StringRef(Prefix, PrefixLen);
2316 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2317 return true;
2318 }
2319 }
2320
2321 // If we are in C++11, lex the optional ud-suffix.
2322 if (LangOpts.CPlusPlus)
2323 CurPtr = LexUDSuffix(Result, CurPtr, true);
2324
2325 // Update the location of token as well as BufferPtr.
2326 const char *TokStart = BufferPtr;
2327 FormTokenWithChars(Result, CurPtr, Kind);
2328 Result.setLiteralData(TokStart);
2329 return true;
2330}
2331
2332/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2333/// after having lexed the '<' character. This is used for #include filenames.
2334bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2335 // Does this string contain the \0 character?
2336 const char *NulCharacter = nullptr;
2337 const char *AfterLessPos = CurPtr;
2338 char C = getAndAdvanceChar(CurPtr, Result);
2339 while (C != '>') {
2340 // Skip escaped characters. Escaped newlines will already be processed by
2341 // getAndAdvanceChar.
2342 if (C == '\\')
2343 C = getAndAdvanceChar(CurPtr, Result);
2344
2345 if (isVerticalWhitespace(C) || // Newline.
2346 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2347 // If the filename is unterminated, then it must just be a lone <
2348 // character. Return this as such.
2349 FormTokenWithChars(Result, AfterLessPos, tok::less);
2350 return true;
2351 }
2352
2353 if (C == 0) {
2354 if (isCodeCompletionPoint(CurPtr - 1)) {
2355 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2356 cutOffLexing();
2357 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2358 return true;
2359 }
2360 NulCharacter = CurPtr-1;
2361 }
2362 C = getAndAdvanceChar(CurPtr, Result);
2363 }
2364
2365 // If a nul character existed in the string, warn about it.
2366 if (NulCharacter && !isLexingRawMode())
2367 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2368
2369 // Update the location of token as well as BufferPtr.
2370 const char *TokStart = BufferPtr;
2371 FormTokenWithChars(Result, CurPtr, tok::header_name);
2372 Result.setLiteralData(TokStart);
2373 return true;
2374}
2375
2376void Lexer::codeCompleteIncludedFile(const char *PathStart,
2377 const char *CompletionPoint,
2378 bool IsAngled) {
2379 // Completion only applies to the filename, after the last slash.
2380 StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2381 llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2382 auto Slash = PartialPath.find_last_of(SlashChars);
2383 StringRef Dir =
2384 (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2385 const char *StartOfFilename =
2386 (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2387 // Code completion filter range is the filename only, up to completion point.
2389 StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2390 // We should replace the characters up to the closing quote or closest slash,
2391 // if any.
2392 while (CompletionPoint < BufferEnd) {
2393 char Next = *(CompletionPoint + 1);
2394 if (Next == 0 || Next == '\r' || Next == '\n')
2395 break;
2396 ++CompletionPoint;
2397 if (Next == (IsAngled ? '>' : '"'))
2398 break;
2399 if (SlashChars.contains(Next))
2400 break;
2401 }
2402
2404 FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2405 FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2406 PP->CodeCompleteIncludedFile(Dir, IsAngled);
2407}
2408
2409/// LexCharConstant - Lex the remainder of a character constant, after having
2410/// lexed either ' or L' or u8' or u' or U'.
2411bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2412 tok::TokenKind Kind) {
2413 // Does this character contain the \0 character?
2414 const char *NulCharacter = nullptr;
2415
2416 if (!isLexingRawMode()) {
2417 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2418 Diag(BufferPtr, LangOpts.CPlusPlus
2419 ? diag::warn_cxx98_compat_unicode_literal
2420 : diag::warn_c99_compat_unicode_literal);
2421 else if (Kind == tok::utf8_char_constant)
2422 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2423 }
2424
2425 char C = getAndAdvanceChar(CurPtr, Result);
2426 if (C == '\'') {
2427 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2428 Diag(BufferPtr, diag::ext_empty_character);
2429 FormTokenWithChars(Result, CurPtr, tok::unknown);
2430 return true;
2431 }
2432
2433 while (C != '\'') {
2434 // Skip escaped characters.
2435 if (C == '\\')
2436 C = getAndAdvanceChar(CurPtr, Result);
2437
2438 if (C == '\n' || C == '\r' || // Newline.
2439 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2440 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2441 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2442 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2443 return true;
2444 }
2445
2446 if (C == 0) {
2447 if (isCodeCompletionPoint(CurPtr-1)) {
2449 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2450 cutOffLexing();
2451 return true;
2452 }
2453
2454 NulCharacter = CurPtr-1;
2455 }
2456 C = getAndAdvanceChar(CurPtr, Result);
2457 }
2458
2459 // If we are in C++11, lex the optional ud-suffix.
2460 if (LangOpts.CPlusPlus)
2461 CurPtr = LexUDSuffix(Result, CurPtr, false);
2462
2463 // If a nul character existed in the character, warn about it.
2464 if (NulCharacter && !isLexingRawMode())
2465 Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2466
2467 // Update the location of token as well as BufferPtr.
2468 const char *TokStart = BufferPtr;
2469 FormTokenWithChars(Result, CurPtr, Kind);
2470 Result.setLiteralData(TokStart);
2471 return true;
2472}
2473
2474/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2475/// Update BufferPtr to point to the next non-whitespace character and return.
2476///
2477/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2478bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2479 bool &TokAtPhysicalStartOfLine) {
2480 // Whitespace - Skip it, then return the token after the whitespace.
2481 bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2482
2483 unsigned char Char = *CurPtr;
2484
2485 const char *lastNewLine = nullptr;
2486 auto setLastNewLine = [&](const char *Ptr) {
2487 lastNewLine = Ptr;
2488 if (!NewLinePtr)
2489 NewLinePtr = Ptr;
2490 };
2491 if (SawNewline)
2492 setLastNewLine(CurPtr - 1);
2493
2494 // Skip consecutive spaces efficiently.
2495 while (true) {
2496 // Skip horizontal whitespace very aggressively.
2497 while (isHorizontalWhitespace(Char))
2498 Char = *++CurPtr;
2499
2500 // Otherwise if we have something other than whitespace, we're done.
2501 if (!isVerticalWhitespace(Char))
2502 break;
2503
2505 // End of preprocessor directive line, let LexTokenInternal handle this.
2506 BufferPtr = CurPtr;
2507 return false;
2508 }
2509
2510 // OK, but handle newline.
2511 if (*CurPtr == '\n')
2512 setLastNewLine(CurPtr);
2513 SawNewline = true;
2514 Char = *++CurPtr;
2515 }
2516
2517 // If the client wants us to return whitespace, return it now.
2518 if (isKeepWhitespaceMode()) {
2519 FormTokenWithChars(Result, CurPtr, tok::unknown);
2520 if (SawNewline) {
2521 IsAtStartOfLine = true;
2522 IsAtPhysicalStartOfLine = true;
2523 }
2524 // FIXME: The next token will not have LeadingSpace set.
2525 return true;
2526 }
2527
2528 // If this isn't immediately after a newline, there is leading space.
2529 char PrevChar = CurPtr[-1];
2530 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2531
2532 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2533 if (SawNewline) {
2534 Result.setFlag(Token::StartOfLine);
2535 TokAtPhysicalStartOfLine = true;
2536
2537 if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2538 if (auto *Handler = PP->getEmptylineHandler())
2539 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2540 getSourceLocation(lastNewLine)));
2541 }
2542 }
2543
2544 BufferPtr = CurPtr;
2545 return false;
2546}
2547
2548/// We have just read the // characters from input. Skip until we find the
2549/// newline character that terminates the comment. Then update BufferPtr and
2550/// return.
2551///
2552/// If we're in KeepCommentMode or any CommentHandler has inserted
2553/// some tokens, this will store the first token and return true.
2554bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2555 bool &TokAtPhysicalStartOfLine) {
2556 // If Line comments aren't explicitly enabled for this language, emit an
2557 // extension warning.
2558 if (!LineComment) {
2559 if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2560 Diag(BufferPtr, diag::ext_line_comment);
2561
2562 // Mark them enabled so we only emit one warning for this translation
2563 // unit.
2564 LineComment = true;
2565 }
2566
2567 // Scan over the body of the comment. The common case, when scanning, is that
2568 // the comment contains normal ascii characters with nothing interesting in
2569 // them. As such, optimize for this case with the inner loop.
2570 //
2571 // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2572 // character that ends the line comment.
2573
2574 // C++23 [lex.phases] p1
2575 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2576 // diagnostic only once per entire ill-formed subsequence to avoid
2577 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2578 bool UnicodeDecodingAlreadyDiagnosed = false;
2579
2580 char C;
2581 while (true) {
2582 C = *CurPtr;
2583 // Skip over characters in the fast loop.
2584 while (isASCII(C) && C != 0 && // Potentially EOF.
2585 C != '\n' && C != '\r') { // Newline or DOS-style newline.
2586 C = *++CurPtr;
2587 UnicodeDecodingAlreadyDiagnosed = false;
2588 }
2589
2590 if (!isASCII(C)) {
2591 unsigned Length = llvm::getUTF8SequenceSize(
2592 (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2593 if (Length == 0) {
2594 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2595 Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2596 UnicodeDecodingAlreadyDiagnosed = true;
2597 ++CurPtr;
2598 } else {
2599 UnicodeDecodingAlreadyDiagnosed = false;
2600 CurPtr += Length;
2601 }
2602 continue;
2603 }
2604
2605 const char *NextLine = CurPtr;
2606 if (C != 0) {
2607 // We found a newline, see if it's escaped.
2608 const char *EscapePtr = CurPtr-1;
2609 bool HasSpace = false;
2610 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2611 --EscapePtr;
2612 HasSpace = true;
2613 }
2614
2615 if (*EscapePtr == '\\')
2616 // Escaped newline.
2617 CurPtr = EscapePtr;
2618 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2619 EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2620 // Trigraph-escaped newline.
2621 CurPtr = EscapePtr-2;
2622 else
2623 break; // This is a newline, we're done.
2624
2625 // If there was space between the backslash and newline, warn about it.
2626 if (HasSpace && !isLexingRawMode())
2627 Diag(EscapePtr, diag::backslash_newline_space);
2628 }
2629
2630 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2631 // properly decode the character. Read it in raw mode to avoid emitting
2632 // diagnostics about things like trigraphs. If we see an escaped newline,
2633 // we'll handle it below.
2634 const char *OldPtr = CurPtr;
2635 bool OldRawMode = isLexingRawMode();
2636 LexingRawMode = true;
2637 C = getAndAdvanceChar(CurPtr, Result);
2638 LexingRawMode = OldRawMode;
2639
2640 // If we only read only one character, then no special handling is needed.
2641 // We're done and can skip forward to the newline.
2642 if (C != 0 && CurPtr == OldPtr+1) {
2643 CurPtr = NextLine;
2644 break;
2645 }
2646
2647 // If we read multiple characters, and one of those characters was a \r or
2648 // \n, then we had an escaped newline within the comment. Emit diagnostic
2649 // unless the next line is also a // comment.
2650 if (CurPtr != OldPtr + 1 && C != '/' &&
2651 (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2652 for (; OldPtr != CurPtr; ++OldPtr)
2653 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2654 // Okay, we found a // comment that ends in a newline, if the next
2655 // line is also a // comment, but has spaces, don't emit a diagnostic.
2656 if (isWhitespace(C)) {
2657 const char *ForwardPtr = CurPtr;
2658 while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2659 ++ForwardPtr;
2660 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2661 break;
2662 }
2663
2664 if (!isLexingRawMode())
2665 Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2666 break;
2667 }
2668 }
2669
2670 if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2671 --CurPtr;
2672 break;
2673 }
2674
2675 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2677 cutOffLexing();
2678 return false;
2679 }
2680 }
2681
2682 // Found but did not consume the newline. Notify comment handlers about the
2683 // comment unless we're in a #if 0 block.
2684 if (PP && !isLexingRawMode() &&
2686 getSourceLocation(CurPtr)))) {
2687 BufferPtr = CurPtr;
2688 return true; // A token has to be returned.
2689 }
2690
2691 // If we are returning comments as tokens, return this comment as a token.
2692 if (inKeepCommentMode())
2693 return SaveLineComment(Result, CurPtr);
2694
2695 // If we are inside a preprocessor directive and we see the end of line,
2696 // return immediately, so that the lexer can return this as an EOD token.
2697 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2698 BufferPtr = CurPtr;
2699 return false;
2700 }
2701
2702 // Otherwise, eat the \n character. We don't care if this is a \n\r or
2703 // \r\n sequence. This is an efficiency hack (because we know the \n can't
2704 // contribute to another token), it isn't needed for correctness. Note that
2705 // this is ok even in KeepWhitespaceMode, because we would have returned the
2706 // comment above in that mode.
2707 NewLinePtr = CurPtr++;
2708
2709 // The next returned token is at the start of the line.
2710 Result.setFlag(Token::StartOfLine);
2711 TokAtPhysicalStartOfLine = true;
2712 // No leading whitespace seen so far.
2713 Result.clearFlag(Token::LeadingSpace);
2714 BufferPtr = CurPtr;
2715 return false;
2716}
2717
2718/// If in save-comment mode, package up this Line comment in an appropriate
2719/// way and return it.
2720bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2721 // If we're not in a preprocessor directive, just return the // comment
2722 // directly.
2723 FormTokenWithChars(Result, CurPtr, tok::comment);
2724
2726 return true;
2727
2728 // If this Line-style comment is in a macro definition, transmogrify it into
2729 // a C-style block comment.
2730 bool Invalid = false;
2731 std::string Spelling = PP->getSpelling(Result, &Invalid);
2732 if (Invalid)
2733 return true;
2734
2735 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2736 Spelling[1] = '*'; // Change prefix to "/*".
2737 Spelling += "*/"; // add suffix.
2738
2739 Result.setKind(tok::comment);
2740 PP->CreateString(Spelling, Result,
2741 Result.getLocation(), Result.getLocation());
2742 return true;
2743}
2744
2745/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2746/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2747/// a diagnostic if so. We know that the newline is inside of a block comment.
2748static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2749 bool Trigraphs) {
2750 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2751
2752 // Position of the first trigraph in the ending sequence.
2753 const char *TrigraphPos = nullptr;
2754 // Position of the first whitespace after a '\' in the ending sequence.
2755 const char *SpacePos = nullptr;
2756
2757 while (true) {
2758 // Back up off the newline.
2759 --CurPtr;
2760
2761 // If this is a two-character newline sequence, skip the other character.
2762 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2763 // \n\n or \r\r -> not escaped newline.
2764 if (CurPtr[0] == CurPtr[1])
2765 return false;
2766 // \n\r or \r\n -> skip the newline.
2767 --CurPtr;
2768 }
2769
2770 // If we have horizontal whitespace, skip over it. We allow whitespace
2771 // between the slash and newline.
2772 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2773 SpacePos = CurPtr;
2774 --CurPtr;
2775 }
2776
2777 // If we have a slash, this is an escaped newline.
2778 if (*CurPtr == '\\') {
2779 --CurPtr;
2780 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2781 // This is a trigraph encoding of a slash.
2782 TrigraphPos = CurPtr - 2;
2783 CurPtr -= 3;
2784 } else {
2785 return false;
2786 }
2787
2788 // If the character preceding the escaped newline is a '*', then after line
2789 // splicing we have a '*/' ending the comment.
2790 if (*CurPtr == '*')
2791 break;
2792
2793 if (*CurPtr != '\n' && *CurPtr != '\r')
2794 return false;
2795 }
2796
2797 if (TrigraphPos) {
2798 // If no trigraphs are enabled, warn that we ignored this trigraph and
2799 // ignore this * character.
2800 if (!Trigraphs) {
2801 if (!L->isLexingRawMode())
2802 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2803 return false;
2804 }
2805 if (!L->isLexingRawMode())
2806 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2807 }
2808
2809 // Warn about having an escaped newline between the */ characters.
2810 if (!L->isLexingRawMode())
2811 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2812
2813 // If there was space between the backslash and newline, warn about it.
2814 if (SpacePos && !L->isLexingRawMode())
2815 L->Diag(SpacePos, diag::backslash_newline_space);
2816
2817 return true;
2818}
2819
2820#ifdef __SSE2__
2821#include <emmintrin.h>
2822#elif __ALTIVEC__
2823#include <altivec.h>
2824#undef bool
2825#endif
2826
2827/// We have just read from input the / and * characters that started a comment.
2828/// Read until we find the * and / characters that terminate the comment.
2829/// Note that we don't bother decoding trigraphs or escaped newlines in block
2830/// comments, because they cannot cause the comment to end. The only thing
2831/// that can happen is the comment could end with an escaped newline between
2832/// the terminating * and /.
2833///
2834/// If we're in KeepCommentMode or any CommentHandler has inserted
2835/// some tokens, this will store the first token and return true.
2836bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2837 bool &TokAtPhysicalStartOfLine) {
2838 // Scan one character past where we should, looking for a '/' character. Once
2839 // we find it, check to see if it was preceded by a *. This common
2840 // optimization helps people who like to put a lot of * characters in their
2841 // comments.
2842
2843 // The first character we get with newlines and trigraphs skipped to handle
2844 // the degenerate /*/ case below correctly if the * has an escaped newline
2845 // after it.
2846 unsigned CharSize;
2847 unsigned char C = getCharAndSize(CurPtr, CharSize);
2848 CurPtr += CharSize;
2849 if (C == 0 && CurPtr == BufferEnd+1) {
2850 if (!isLexingRawMode())
2851 Diag(BufferPtr, diag::err_unterminated_block_comment);
2852 --CurPtr;
2853
2854 // KeepWhitespaceMode should return this broken comment as a token. Since
2855 // it isn't a well formed comment, just return it as an 'unknown' token.
2856 if (isKeepWhitespaceMode()) {
2857 FormTokenWithChars(Result, CurPtr, tok::unknown);
2858 return true;
2859 }
2860
2861 BufferPtr = CurPtr;
2862 return false;
2863 }
2864
2865 // Check to see if the first character after the '/*' is another /. If so,
2866 // then this slash does not end the block comment, it is part of it.
2867 if (C == '/')
2868 C = *CurPtr++;
2869
2870 // C++23 [lex.phases] p1
2871 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2872 // diagnostic only once per entire ill-formed subsequence to avoid
2873 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2874 bool UnicodeDecodingAlreadyDiagnosed = false;
2875
2876 while (true) {
2877 // Skip over all non-interesting characters until we find end of buffer or a
2878 // (probably ending) '/' character.
2879 if (CurPtr + 24 < BufferEnd &&
2880 // If there is a code-completion point avoid the fast scan because it
2881 // doesn't check for '\0'.
2882 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2883 // While not aligned to a 16-byte boundary.
2884 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2885 if (!isASCII(C))
2886 goto MultiByteUTF8;
2887 C = *CurPtr++;
2888 }
2889 if (C == '/') goto FoundSlash;
2890
2891#ifdef __SSE2__
2892 __m128i Slashes = _mm_set1_epi8('/');
2893 while (CurPtr + 16 < BufferEnd) {
2894 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2895 if (LLVM_UNLIKELY(Mask != 0)) {
2896 goto MultiByteUTF8;
2897 }
2898 // look for slashes
2899 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2900 Slashes));
2901 if (cmp != 0) {
2902 // Adjust the pointer to point directly after the first slash. It's
2903 // not necessary to set C here, it will be overwritten at the end of
2904 // the outer loop.
2905 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2906 goto FoundSlash;
2907 }
2908 CurPtr += 16;
2909 }
2910#elif __ALTIVEC__
2911 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2912 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2913 0x80, 0x80, 0x80, 0x80};
2914 __vector unsigned char Slashes = {
2915 '/', '/', '/', '/', '/', '/', '/', '/',
2916 '/', '/', '/', '/', '/', '/', '/', '/'
2917 };
2918 while (CurPtr + 16 < BufferEnd) {
2919 if (LLVM_UNLIKELY(
2920 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2921 goto MultiByteUTF8;
2922 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2923 break;
2924 }
2925 CurPtr += 16;
2926 }
2927
2928#else
2929 while (CurPtr + 16 < BufferEnd) {
2930 bool HasNonASCII = false;
2931 for (unsigned I = 0; I < 16; ++I)
2932 HasNonASCII |= !isASCII(CurPtr[I]);
2933
2934 if (LLVM_UNLIKELY(HasNonASCII))
2935 goto MultiByteUTF8;
2936
2937 bool HasSlash = false;
2938 for (unsigned I = 0; I < 16; ++I)
2939 HasSlash |= CurPtr[I] == '/';
2940 if (HasSlash)
2941 break;
2942 CurPtr += 16;
2943 }
2944#endif
2945
2946 // It has to be one of the bytes scanned, increment to it and read one.
2947 C = *CurPtr++;
2948 }
2949
2950 // Loop to scan the remainder, warning on invalid UTF-8
2951 // if the corresponding warning is enabled, emitting a diagnostic only once
2952 // per sequence that cannot be decoded.
2953 while (C != '/' && C != '\0') {
2954 if (isASCII(C)) {
2955 UnicodeDecodingAlreadyDiagnosed = false;
2956 C = *CurPtr++;
2957 continue;
2958 }
2959 MultiByteUTF8:
2960 // CurPtr is 1 code unit past C, so to decode
2961 // the codepoint, we need to read from the previous position.
2962 unsigned Length = llvm::getUTF8SequenceSize(
2963 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2964 if (Length == 0) {
2965 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2966 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2967 UnicodeDecodingAlreadyDiagnosed = true;
2968 } else {
2969 UnicodeDecodingAlreadyDiagnosed = false;
2970 CurPtr += Length - 1;
2971 }
2972 C = *CurPtr++;
2973 }
2974
2975 if (C == '/') {
2976 FoundSlash:
2977 if (CurPtr[-2] == '*') // We found the final */. We're done!
2978 break;
2979
2980 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2981 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2982 LangOpts.Trigraphs)) {
2983 // We found the final */, though it had an escaped newline between the
2984 // * and /. We're done!
2985 break;
2986 }
2987 }
2988 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2989 // If this is a /* inside of the comment, emit a warning. Don't do this
2990 // if this is a /*/, which will end the comment. This misses cases with
2991 // embedded escaped newlines, but oh well.
2992 if (!isLexingRawMode())
2993 Diag(CurPtr-1, diag::warn_nested_block_comment);
2994 }
2995 } else if (C == 0 && CurPtr == BufferEnd+1) {
2996 if (!isLexingRawMode())
2997 Diag(BufferPtr, diag::err_unterminated_block_comment);
2998 // Note: the user probably forgot a */. We could continue immediately
2999 // after the /*, but this would involve lexing a lot of what really is the
3000 // comment, which surely would confuse the parser.
3001 --CurPtr;
3002
3003 // KeepWhitespaceMode should return this broken comment as a token. Since
3004 // it isn't a well formed comment, just return it as an 'unknown' token.
3005 if (isKeepWhitespaceMode()) {
3006 FormTokenWithChars(Result, CurPtr, tok::unknown);
3007 return true;
3008 }
3009
3010 BufferPtr = CurPtr;
3011 return false;
3012 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
3014 cutOffLexing();
3015 return false;
3016 }
3017
3018 C = *CurPtr++;
3019 }
3020
3021 // Notify comment handlers about the comment unless we're in a #if 0 block.
3022 if (PP && !isLexingRawMode() &&
3024 getSourceLocation(CurPtr)))) {
3025 BufferPtr = CurPtr;
3026 return true; // A token has to be returned.
3027 }
3028
3029 // If we are returning comments as tokens, return this comment as a token.
3030 if (inKeepCommentMode()) {
3031 FormTokenWithChars(Result, CurPtr, tok::comment);
3032 return true;
3033 }
3034
3035 // It is common for the tokens immediately after a /**/ comment to be
3036 // whitespace. Instead of going through the big switch, handle it
3037 // efficiently now. This is safe even in KeepWhitespaceMode because we would
3038 // have already returned above with the comment as a token.
3039 if (isHorizontalWhitespace(*CurPtr)) {
3040 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3041 return false;
3042 }
3043
3044 // Otherwise, just return so that the next character will be lexed as a token.
3045 BufferPtr = CurPtr;
3046 Result.setFlag(Token::LeadingSpace);
3047 return false;
3048}
3049
3050//===----------------------------------------------------------------------===//
3051// Primary Lexing Entry Points
3052//===----------------------------------------------------------------------===//
3053
3054/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3055/// uninterpreted string. This switches the lexer out of directive mode.
3057 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3058 "Must be in a preprocessing directive!");
3059 Token Tmp;
3060 Tmp.startToken();
3061
3062 // CurPtr - Cache BufferPtr in an automatic variable.
3063 const char *CurPtr = BufferPtr;
3064 while (true) {
3065 char Char = getAndAdvanceChar(CurPtr, Tmp);
3066 switch (Char) {
3067 default:
3068 if (Result)
3069 Result->push_back(Char);
3070 break;
3071 case 0: // Null.
3072 // Found end of file?
3073 if (CurPtr-1 != BufferEnd) {
3074 if (isCodeCompletionPoint(CurPtr-1)) {
3076 cutOffLexing();
3077 return;
3078 }
3079
3080 // Nope, normal character, continue.
3081 if (Result)
3082 Result->push_back(Char);
3083 break;
3084 }
3085 // FALL THROUGH.
3086 [[fallthrough]];
3087 case '\r':
3088 case '\n':
3089 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3090 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3091 BufferPtr = CurPtr-1;
3092
3093 // Next, lex the character, which should handle the EOD transition.
3094 Lex(Tmp);
3095 if (Tmp.is(tok::code_completion)) {
3096 if (PP)
3098 Lex(Tmp);
3099 }
3100 assert(Tmp.is(tok::eod) && "Unexpected token!");
3101
3102 // Finally, we're done;
3103 return;
3104 }
3105 }
3106}
3107
3108/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3109/// condition, reporting diagnostics and handling other edge cases as required.
3110/// This returns true if Result contains a token, false if PP.Lex should be
3111/// called again.
3112bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3113 // If we hit the end of the file while parsing a preprocessor directive,
3114 // end the preprocessor directive first. The next token returned will
3115 // then be the end of file.
3117 // Done parsing the "line".
3119 // Update the location of token as well as BufferPtr.
3120 FormTokenWithChars(Result, CurPtr, tok::eod);
3121
3122 // Restore comment saving mode, in case it was disabled for directive.
3123 if (PP)
3125 return true; // Have a token.
3126 }
3127
3128 // If we are in raw mode, return this event as an EOF token. Let the caller
3129 // that put us in raw mode handle the event.
3130 if (isLexingRawMode()) {
3131 Result.startToken();
3132 BufferPtr = BufferEnd;
3133 FormTokenWithChars(Result, BufferEnd, tok::eof);
3134 return true;
3135 }
3136
3139 // If the preamble cuts off the end of a header guard, consider it guarded.
3140 // The guard is valid for the preamble content itself, and for tools the
3141 // most useful answer is "yes, this file has a header guard".
3142 if (!ConditionalStack.empty())
3144 ConditionalStack.clear();
3145 }
3146
3147 // Issue diagnostics for unterminated #if and missing newline.
3148
3149 // If we are in a #if directive, emit an error.
3150 while (!ConditionalStack.empty()) {
3151 if (PP->getCodeCompletionFileLoc() != FileLoc)
3152 PP->Diag(ConditionalStack.back().IfLoc,
3153 diag::err_pp_unterminated_conditional);
3154 ConditionalStack.pop_back();
3155 }
3156
3157 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3158 // a pedwarn.
3159 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3161 SourceLocation EndLoc = getSourceLocation(BufferEnd);
3162 unsigned DiagID;
3163
3164 if (LangOpts.CPlusPlus11) {
3165 // C++11 [lex.phases] 2.2 p2
3166 // Prefer the C++98 pedantic compatibility warning over the generic,
3167 // non-extension, user-requested "missing newline at EOF" warning.
3168 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3169 DiagID = diag::warn_cxx98_compat_no_newline_eof;
3170 } else {
3171 DiagID = diag::warn_no_newline_eof;
3172 }
3173 } else {
3174 DiagID = diag::ext_no_newline_eof;
3175 }
3176
3177 Diag(BufferEnd, DiagID)
3178 << FixItHint::CreateInsertion(EndLoc, "\n");
3179 }
3180
3181 BufferPtr = CurPtr;
3182
3183 // Finally, let the preprocessor handle this.
3185}
3186
3187/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3188/// the specified lexer will return a tok::l_paren token, 0 if it is something
3189/// else and 2 if there are no more tokens in the buffer controlled by the
3190/// lexer.
3191unsigned Lexer::isNextPPTokenLParen() {
3192 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3193
3194 if (isDependencyDirectivesLexer()) {
3195 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3196 return 2;
3197 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3198 tok::l_paren);
3199 }
3200
3201 // Switch to 'skipping' mode. This will ensure that we can lex a token
3202 // without emitting diagnostics, disables macro expansion, and will cause EOF
3203 // to return an EOF token instead of popping the include stack.
3204 LexingRawMode = true;
3205
3206 // Save state that can be changed while lexing so that we can restore it.
3207 const char *TmpBufferPtr = BufferPtr;
3208 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3209 bool atStartOfLine = IsAtStartOfLine;
3210 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3211 bool leadingSpace = HasLeadingSpace;
3212
3213 Token Tok;
3214 Lex(Tok);
3215
3216 // Restore state that may have changed.
3217 BufferPtr = TmpBufferPtr;
3218 ParsingPreprocessorDirective = inPPDirectiveMode;
3219 HasLeadingSpace = leadingSpace;
3220 IsAtStartOfLine = atStartOfLine;
3221 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3222
3223 // Restore the lexer back to non-skipping mode.
3224 LexingRawMode = false;
3225
3226 if (Tok.is(tok::eof))
3227 return 2;
3228 return Tok.is(tok::l_paren);
3229}
3230
3231/// Find the end of a version control conflict marker.
3232static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3233 ConflictMarkerKind CMK) {
3234 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3235 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3236 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3237 size_t Pos = RestOfBuffer.find(Terminator);
3238 while (Pos != StringRef::npos) {
3239 // Must occur at start of line.
3240 if (Pos == 0 ||
3241 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3242 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3243 Pos = RestOfBuffer.find(Terminator);
3244 continue;
3245 }
3246 return RestOfBuffer.data()+Pos;
3247 }
3248 return nullptr;
3249}
3250
3251/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3252/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3253/// and recover nicely. This returns true if it is a conflict marker and false
3254/// if not.
3255bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3256 // Only a conflict marker if it starts at the beginning of a line.
3257 if (CurPtr != BufferStart &&
3258 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3259 return false;
3260
3261 // Check to see if we have <<<<<<< or >>>>.
3262 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3263 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3264 return false;
3265
3266 // If we have a situation where we don't care about conflict markers, ignore
3267 // it.
3268 if (CurrentConflictMarkerState || isLexingRawMode())
3269 return false;
3270
3271 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3272
3273 // Check to see if there is an ending marker somewhere in the buffer at the
3274 // start of a line to terminate this conflict marker.
3275 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3276 // We found a match. We are really in a conflict marker.
3277 // Diagnose this, and ignore to the end of line.
3278 Diag(CurPtr, diag::err_conflict_marker);
3279 CurrentConflictMarkerState = Kind;
3280
3281 // Skip ahead to the end of line. We know this exists because the
3282 // end-of-conflict marker starts with \r or \n.
3283 while (*CurPtr != '\r' && *CurPtr != '\n') {
3284 assert(CurPtr != BufferEnd && "Didn't find end of line");
3285 ++CurPtr;
3286 }
3287 BufferPtr = CurPtr;
3288 return true;
3289 }
3290
3291 // No end of conflict marker found.
3292 return false;
3293}
3294
3295/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3296/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3297/// is the end of a conflict marker. Handle it by ignoring up until the end of
3298/// the line. This returns true if it is a conflict marker and false if not.
3299bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3300 // Only a conflict marker if it starts at the beginning of a line.
3301 if (CurPtr != BufferStart &&
3302 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3303 return false;
3304
3305 // If we have a situation where we don't care about conflict markers, ignore
3306 // it.
3307 if (!CurrentConflictMarkerState || isLexingRawMode())
3308 return false;
3309
3310 // Check to see if we have the marker (4 characters in a row).
3311 for (unsigned i = 1; i != 4; ++i)
3312 if (CurPtr[i] != CurPtr[0])
3313 return false;
3314
3315 // If we do have it, search for the end of the conflict marker. This could
3316 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3317 // be the end of conflict marker.
3318 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3319 CurrentConflictMarkerState)) {
3320 CurPtr = End;
3321
3322 // Skip ahead to the end of line.
3323 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3324 ++CurPtr;
3325
3326 BufferPtr = CurPtr;
3327
3328 // No longer in the conflict marker.
3329 CurrentConflictMarkerState = CMK_None;
3330 return true;
3331 }
3332
3333 return false;
3334}
3335
/// Search [CurPtr, BufferEnd) for the "#>" that closes an editor placeholder.
///
/// \returns A pointer just past the "#>", or nullptr if the range is empty or
///          contains no "#>".
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // Stop at the second-to-last character so that reading CurPtr[1] never
  // goes past BufferEnd.
  const char *const Last = BufferEnd - 1;
  while (CurPtr != Last) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
    ++CurPtr;
  }
  return nullptr;
}
3347
3348bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3349 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3351 return false;
3352 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3353 if (!End)
3354 return false;
3355 const char *Start = CurPtr - 1;
3356 if (!LangOpts.AllowEditorPlaceholders)
3357 Diag(Start, diag::err_placeholder_in_source);
3358 Result.startToken();
3359 FormTokenWithChars(Result, End, tok::raw_identifier);
3360 Result.setRawIdentifierData(Start);
3363 BufferPtr = End;
3364 return true;
3365}
3366
3367bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3368 if (PP && PP->isCodeCompletionEnabled()) {
3369 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3370 return Loc == PP->getCodeCompletionLoc();
3371 }
3372
3373 return false;
3374}
3375
3376std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3377 const char *SlashLoc,
3378 Token *Result) {
3379 unsigned CharSize;
3380 char Kind = getCharAndSize(StartPtr, CharSize);
3381 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3382
3383 unsigned NumHexDigits;
3384 if (Kind == 'u')
3385 NumHexDigits = 4;
3386 else if (Kind == 'U')
3387 NumHexDigits = 8;
3388
3389 bool Delimited = false;
3390 bool FoundEndDelimiter = false;
3391 unsigned Count = 0;
3392 bool Diagnose = Result && !isLexingRawMode();
3393
3394 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3395 if (Diagnose)
3396 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3397 return std::nullopt;
3398 }
3399
3400 const char *CurPtr = StartPtr + CharSize;
3401 const char *KindLoc = &CurPtr[-1];
3402
3403 uint32_t CodePoint = 0;
3404 while (Count != NumHexDigits || Delimited) {
3405 char C = getCharAndSize(CurPtr, CharSize);
3406 if (!Delimited && Count == 0 && C == '{') {
3407 Delimited = true;
3408 CurPtr += CharSize;
3409 continue;
3410 }
3411
3412 if (Delimited && C == '}') {
3413 CurPtr += CharSize;
3414 FoundEndDelimiter = true;
3415 break;
3416 }
3417
3418 unsigned Value = llvm::hexDigitValue(C);
3419 if (Value == -1U) {
3420 if (!Delimited)
3421 break;
3422 if (Diagnose)
3423 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3424 << StringRef(KindLoc, 1);
3425 return std::nullopt;
3426 }
3427
3428 if (CodePoint & 0xF000'0000) {
3429 if (Diagnose)
3430 Diag(KindLoc, diag::err_escape_too_large) << 0;
3431 return std::nullopt;
3432 }
3433
3434 CodePoint <<= 4;
3435 CodePoint |= Value;
3436 CurPtr += CharSize;
3437 Count++;
3438 }
3439
3440 if (Count == 0) {
3441 if (Diagnose)
3442 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3443 : diag::warn_ucn_escape_no_digits)
3444 << StringRef(KindLoc, 1);
3445 return std::nullopt;
3446 }
3447
3448 if (Delimited && Kind == 'U') {
3449 if (Diagnose)
3450 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3451 return std::nullopt;
3452 }
3453
3454 if (!Delimited && Count != NumHexDigits) {
3455 if (Diagnose) {
3456 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3457 // If the user wrote \U1234, suggest a fixit to \u.
3458 if (Count == 4 && NumHexDigits == 8) {
3459 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3460 Diag(KindLoc, diag::note_ucn_four_not_eight)
3461 << FixItHint::CreateReplacement(URange, "u");
3462 }
3463 }
3464 return std::nullopt;
3465 }
3466
3467 if (Delimited && PP) {
3468 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3469 ? diag::warn_cxx23_delimited_escape_sequence
3470 : diag::ext_delimited_escape_sequence)
3471 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3472 }
3473
3474 if (Result) {
3475 Result->setFlag(Token::HasUCN);
3476 // If the UCN contains either a trigraph or a line splicing,
3477 // we need to call getAndAdvanceChar again to set the appropriate flags
3478 // on Result.
3479 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3480 StartPtr = CurPtr;
3481 else
3482 while (StartPtr != CurPtr)
3483 (void)getAndAdvanceChar(StartPtr, *Result);
3484 } else {
3485 StartPtr = CurPtr;
3486 }
3487 return CodePoint;
3488}
3489
3490std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3491 const char *SlashLoc,
3492 Token *Result) {
3493 unsigned CharSize;
3494 bool Diagnose = Result && !isLexingRawMode();
3495
3496 char C = getCharAndSize(StartPtr, CharSize);
3497 assert(C == 'N' && "expected \\N{...}");
3498
3499 const char *CurPtr = StartPtr + CharSize;
3500 const char *KindLoc = &CurPtr[-1];
3501
3502 C = getCharAndSize(CurPtr, CharSize);
3503 if (C != '{') {
3504 if (Diagnose)
3505 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3506 return std::nullopt;
3507 }
3508 CurPtr += CharSize;
3509 const char *StartName = CurPtr;
3510 bool FoundEndDelimiter = false;
3512 while (C) {
3513 C = getCharAndSize(CurPtr, CharSize);
3514 CurPtr += CharSize;
3515 if (C == '}') {
3516 FoundEndDelimiter = true;
3517 break;
3518 }
3519
3521 break;
3522 Buffer.push_back(C);
3523 }
3524
3525 if (!FoundEndDelimiter || Buffer.empty()) {
3526 if (Diagnose)
3527 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3528 : diag::warn_delimited_ucn_incomplete)
3529 << StringRef(KindLoc, 1);
3530 return std::nullopt;
3531 }
3532
3533 StringRef Name(Buffer.data(), Buffer.size());
3534 std::optional<char32_t> Match =
3535 llvm::sys::unicode::nameToCodepointStrict(Name);
3536 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3537 if (!Match) {
3538 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3539 if (Diagnose) {
3540 Diag(StartName, diag::err_invalid_ucn_name)
3541 << StringRef(Buffer.data(), Buffer.size())
3542 << makeCharRange(*this, StartName, CurPtr - CharSize);
3543 if (LooseMatch) {
3544 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3546 makeCharRange(*this, StartName, CurPtr - CharSize),
3547 LooseMatch->Name);
3548 }
3549 }
3550 // We do not offer misspelled character names suggestions here
3551 // as the set of what would be a valid suggestion depends on context,
3552 // and we should not make invalid suggestions.
3553 }
3554
3555 if (Diagnose && Match)
3556 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3557 ? diag::warn_cxx23_delimited_escape_sequence
3558 : diag::ext_delimited_escape_sequence)
3559 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3560
3561 // If no diagnostic has been emitted yet, likely because we are doing a
3562 // tentative lexing, we do not want to recover here to make sure the token
3563 // will not be incorrectly considered valid. This function will be called
3564 // again and a diagnostic emitted then.
3565 if (LooseMatch && Diagnose)
3566 Match = LooseMatch->CodePoint;
3567
3568 if (Result) {
3569 Result->setFlag(Token::HasUCN);
3570 // If the UCN contains either a trigraph or a line splicing,
3571 // we need to call getAndAdvanceChar again to set the appropriate flags
3572 // on Result.
3573 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3574 StartPtr = CurPtr;
3575 else
3576 while (StartPtr != CurPtr)
3577 (void)getAndAdvanceChar(StartPtr, *Result);
3578 } else {
3579 StartPtr = CurPtr;
3580 }
3581 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3582}
3583
3584uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3585 Token *Result) {
3586
3587 unsigned CharSize;
3588 std::optional<uint32_t> CodePointOpt;
3589 char Kind = getCharAndSize(StartPtr, CharSize);
3590 if (Kind == 'u' || Kind == 'U')
3591 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3592 else if (Kind == 'N')
3593 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3594
3595 if (!CodePointOpt)
3596 return 0;
3597
3598 uint32_t CodePoint = *CodePointOpt;
3599
3600 // Don't apply C family restrictions to UCNs in assembly mode
3601 if (LangOpts.AsmPreprocessor)
3602 return CodePoint;
3603
3604 // C23 6.4.3p2: A universal character name shall not designate a code point
3605 // where the hexadecimal value is:
3606 // - in the range D800 through DFFF inclusive; or
3607 // - greater than 10FFFF.
3608 // A universal-character-name outside the c-char-sequence of a character
3609 // constant, or the s-char-sequence of a string-literal shall not designate
3610 // a control character or a character in the basic character set.
3611
3612 // C++11 [lex.charset]p2: If the hexadecimal value for a
3613 // universal-character-name corresponds to a surrogate code point (in the
3614 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3615 // if the hexadecimal value for a universal-character-name outside the
3616 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3617 // string literal corresponds to a control character (in either of the
3618 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3619 // basic source character set, the program is ill-formed.
3620 if (CodePoint < 0xA0) {
3621 // We don't use isLexingRawMode() here because we need to warn about bad
3622 // UCNs even when skipping preprocessing tokens in a #if block.
3623 if (Result && PP) {
3624 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3625 Diag(BufferPtr, diag::err_ucn_control_character);
3626 else {
3627 char C = static_cast<char>(CodePoint);
3628 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3629 }
3630 }
3631
3632 return 0;
3633 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3634 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3635 // We don't use isLexingRawMode() here because we need to diagnose bad
3636 // UCNs even when skipping preprocessing tokens in a #if block.
3637 if (Result && PP) {
3638 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3639 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3640 else
3641 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3642 }
3643 return 0;
3644 }
3645
3646 return CodePoint;
3647}
3648
/// CheckUnicodeWhitespace - On the guarded path, emit
/// diag::ext_unicode_whitespace covering the characters from BufferPtr to
/// \p CurPtr, set Token::LeadingSpace on \p Result and return true.
/// Otherwise return false and leave \p Result untouched.
3649bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3650 const char *CurPtr) {
// The guarded path is only taken when not lexing in raw mode and the
// preprocessor is not producing preprocessed output.
// NOTE(review): listing line 3652 is absent, so the condition is cut off
// after the trailing '&&' and the brace closed on listing line 3658 is never
// opened in this view. Presumably the dropped line tests C (otherwise unused
// here) for being Unicode whitespace -- confirm against upstream.
3651 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3653 Diag(BufferPtr, diag::ext_unicode_whitespace)
3654 << makeCharRange(*this, BufferPtr, CurPtr);
3655
3656 Result.setFlag(Token::LeadingSpace);
3657 return true;
3658 }
3659 return false;
3660}
3661
3662void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3663 IsAtStartOfLine = Result.isAtStartOfLine();
3664 HasLeadingSpace = Result.hasLeadingSpace();
3665 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3666 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3667}
3668
// NOTE(review): the signature line of this function (listing line 3669) is
// absent from this listing. The body below operates on a Token named Result
// and returns the bool produced by LexTokenInternal.
//
// Purpose (from the visible body): reset Result, transfer the lexer's pending
// start-of-line / leading-space / leading-empty-macro state onto it (clearing
// the lexer-side flags), then delegate to LexTokenInternal.
3670 assert(!isDependencyDirectivesLexer());
3671
3672 // Start a new token.
3673 Result.startToken();
3674
3675 // Set up misc whitespace flags for LexTokenInternal.
3676 if (IsAtStartOfLine) {
3677 Result.setFlag(Token::StartOfLine);
3678 IsAtStartOfLine = false;
3679 }
3680
3681 if (HasLeadingSpace) {
3682 Result.setFlag(Token::LeadingSpace);
3683 HasLeadingSpace = false;
3684 }
3685
// NOTE(review): listing line 3687 is absent; by analogy with the two blocks
// above it presumably sets the corresponding flag on Result -- confirm.
3686 if (HasLeadingEmptyMacro) {
3688 HasLeadingEmptyMacro = false;
3689 }
3690
// Snapshot and clear the physical-start-of-line state before lexing; the
// snapshot is what LexTokenInternal receives.
3691 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3692 IsAtPhysicalStartOfLine = false;
// Raw-mode state is captured up front because, per the comment below, the
// lexer may no longer exist once LexTokenInternal returns. The (void) cast
// presumably silences an unused-variable warning when asserts are disabled.
3693 bool isRawLex = isLexingRawMode();
3694 (void) isRawLex;
3695 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3696 // (After the LexTokenInternal call, the lexer might be destroyed.)
3697 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3698 return returnedToken;
3699}
3700
3701/// LexTokenInternal - This implements a simple C family lexer. It is an
3702/// extremely performance critical piece of code. This assumes that the buffer
3703/// has a null character at the end of the file. This returns a preprocessing
3704/// token, not a normal token, as such, it is an internal interface. It assumes
3705/// that the Flags of result have been cleared before calling this.
///
/// \param Result the token being formed; per the asserts at LexStart it must
///        not need cleaning or carry pointer data on entry.
/// \param TokAtPhysicalStartOfLine forwarded to the whitespace and comment
///        skippers, and consulted when a '#' (or '%:') might begin a
///        preprocessing directive.
/// \returns true when a token has been formed in \p Result, or whatever the
///          specialised Lex* helper that took over returns; the
///          HandleDirective tail can return false.
///
/// NOTE(review): this listing has dropped several source lines (listing lines
/// 3737, 3785, 3787, 3791, 4246, 4448-4449, 4479 and 4481). Each gap is
/// flagged in place below; consult upstream before relying on those spots.
3706bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
// Re-entry point: the LexNextToken label at the bottom jumps back here after
// clearing Token::NeedsCleaning on Result.
3707LexStart:
3708 assert(!Result.needsCleaning() && "Result needs cleaning");
3709 assert(!Result.hasPtrData() && "Result has not been reset");
3710
3711 // CurPtr - Cache BufferPtr in an automatic variable.
3712 const char *CurPtr = BufferPtr;
3713
3714 // Small amounts of horizontal whitespace is very common between tokens.
3715 if (isHorizontalWhitespace(*CurPtr)) {
3716 do {
3717 ++CurPtr;
3718 } while (isHorizontalWhitespace(*CurPtr));
3719
3720 // If we are keeping whitespace and other tokens, just return what we just
3721 // skipped. The next lexer invocation will return the token after the
3722 // whitespace.
3723 if (isKeepWhitespaceMode()) {
3724 FormTokenWithChars(Result, CurPtr, tok::unknown);
3725 // FIXME: The next token will not have LeadingSpace set.
3726 return true;
3727 }
3728
3729 BufferPtr = CurPtr;
3730 Result.setFlag(Token::LeadingSpace);
3731 }
3732
3733 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3734
3735 // Read a character, advancing over it.
3736 char Char = getAndAdvanceChar(CurPtr, Result);
// NOTE(review): listing line 3737 is absent. `Kind` is assigned throughout
// the switch below and passed to FormTokenWithChars at the end, but its
// declaration is not visible in this listing.
3738
3739 if (!isVerticalWhitespace(Char))
3740 NewLinePtr = nullptr;
3741
3742 switch (Char) {
3743 case 0: // Null.
3744 // Found end of file?
3745 if (CurPtr-1 == BufferEnd)
3746 return LexEndOfFile(Result, CurPtr-1);
3747
3748 // Check if we are performing code completion.
3749 if (isCodeCompletionPoint(CurPtr-1)) {
3750 // Return the code-completion token.
3751 Result.startToken();
3752 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3753 return true;
3754 }
3755
// An embedded NUL that is neither EOF nor the completion point is diagnosed
// (outside raw mode) and then handled like whitespace.
3756 if (!isLexingRawMode())
3757 Diag(CurPtr-1, diag::null_in_file);
3758 Result.setFlag(Token::LeadingSpace);
3759 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3760 return true; // KeepWhitespaceMode
3761
3762 // We know the lexer hasn't changed, so just try again with this lexer.
3763 // (We manually eliminate the tail call to avoid recursion.)
3764 goto LexNextToken;
3765
3766 case 26: // DOS & CP/M EOF: "^Z".
3767 // If we're in Microsoft extensions mode, treat this as end of file.
3768 if (LangOpts.MicrosoftExt) {
3769 if (!isLexingRawMode())
3770 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3771 return LexEndOfFile(Result, CurPtr-1);
3772 }
3773
3774 // If Microsoft extensions are disabled, this is just random garbage.
3775 Kind = tok::unknown;
3776 break;
3777
3778 case '\r':
3779 if (CurPtr[0] == '\n')
3780 (void)getAndAdvanceChar(CurPtr, Result);
3781 [[fallthrough]];
3782 case '\n':
// NOTE(review): listing lines 3785, 3787 and 3791 are absent. The `if` that
// opens the block closed on listing line 3800, and the statements that belong
// under the "Done parsing" comment and under `if (PP)`, are not visible here.
3783 // If we are inside a preprocessor directive and we see the end of line,
3784 // we know we are done with the directive, so return an EOD token.
3786 // Done parsing the "line".
3788
3789 // Restore comment saving mode, in case it was disabled for directive.
3790 if (PP)
3792
3793 // Since we consumed a newline, we are back at the start of a line.
3794 IsAtStartOfLine = true;
3795 IsAtPhysicalStartOfLine = true;
3796 NewLinePtr = CurPtr - 1;
3797
3798 Kind = tok::eod;
3799 break;
3800 }
3801
3802 // No leading whitespace seen so far.
3803 Result.clearFlag(Token::LeadingSpace);
3804
3805 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3806 return true; // KeepWhitespaceMode
3807
3808 // We only saw whitespace, so just try again with this lexer.
3809 // (We manually eliminate the tail call to avoid recursion.)
3810 goto LexNextToken;
3811 case ' ':
3812 case '\t':
3813 case '\f':
3814 case '\v':
3815 SkipHorizontalWhitespace:
3816 Result.setFlag(Token::LeadingSpace);
3817 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3818 return true; // KeepWhitespaceMode
3819
// Fast path shared with the '/' case: repeatedly skip comments and horizontal
// whitespace starting from BufferPtr without re-entering the big switch.
3820 SkipIgnoredUnits:
3821 CurPtr = BufferPtr;
3822
3823 // If the next token is obviously a // or /* */ comment, skip it efficiently
3824 // too (without going through the big switch stmt).
3825 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3826 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3827 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3828 return true; // There is a token to return.
3829 goto SkipIgnoredUnits;
3830 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3831 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3832 return true; // There is a token to return.
3833 goto SkipIgnoredUnits;
3834 } else if (isHorizontalWhitespace(*CurPtr)) {
3835 goto SkipHorizontalWhitespace;
3836 }
3837 // We only saw whitespace, so just try again with this lexer.
3838 // (We manually eliminate the tail call to avoid recursion.)
3839 goto LexNextToken;
3840
3841 // C99 6.4.4.1: Integer Constants.
3842 // C99 6.4.4.2: Floating Constants.
3843 case '0': case '1': case '2': case '3': case '4':
3844 case '5': case '6': case '7': case '8': case '9':
3845 // Notify MIOpt that we read a non-whitespace/non-comment token.
3846 MIOpt.ReadToken();
3847 return LexNumericConstant(Result, CurPtr);
3848
3849 // Identifier (e.g., uber), or
3850 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3851 // UTF-8 or UTF-16 string literal (C11/C++11).
3852 case 'u':
3853 // Notify MIOpt that we read a non-whitespace/non-comment token.
3854 MIOpt.ReadToken();
3855
3856 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3857 Char = getCharAndSize(CurPtr, SizeTmp);
3858
3859 // UTF-16 string literal
3860 if (Char == '"')
3861 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3862 tok::utf16_string_literal);
3863
3864 // UTF-16 character constant
3865 if (Char == '\'')
3866 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3867 tok::utf16_char_constant);
3868
3869 // UTF-16 raw string literal
3870 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3871 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3872 return LexRawStringLiteral(Result,
3873 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3874 SizeTmp2, Result),
3875 tok::utf16_string_literal);
3876
3877 if (Char == '8') {
3878 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3879
3880 // UTF-8 string literal
3881 if (Char2 == '"')
3882 return LexStringLiteral(Result,
3883 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3884 SizeTmp2, Result),
3885 tok::utf8_string_literal);
// UTF-8 character constant (only with C++17 or C23 enabled).
3886 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3887 return LexCharConstant(
3888 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3889 SizeTmp2, Result),
3890 tok::utf8_char_constant);
3891
3892 if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3893 unsigned SizeTmp3;
3894 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3895 // UTF-8 raw string literal
3896 if (Char3 == '"') {
3897 return LexRawStringLiteral(Result,
3898 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3899 SizeTmp2, Result),
3900 SizeTmp3, Result),
3901 tok::utf8_string_literal);
3902 }
3903 }
3904 }
3905 }
3906
3907 // treat u like the start of an identifier.
3908 return LexIdentifierContinue(Result, CurPtr);
3909
3910 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3911 // Notify MIOpt that we read a non-whitespace/non-comment token.
3912 MIOpt.ReadToken();
3913
3914 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3915 Char = getCharAndSize(CurPtr, SizeTmp);
3916
3917 // UTF-32 string literal
3918 if (Char == '"')
3919 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3920 tok::utf32_string_literal);
3921
3922 // UTF-32 character constant
3923 if (Char == '\'')
3924 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3925 tok::utf32_char_constant);
3926
3927 // UTF-32 raw string literal
3928 if (Char == 'R' && LangOpts.CPlusPlus11 &&
3929 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3930 return LexRawStringLiteral(Result,
3931 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3932 SizeTmp2, Result),
3933 tok::utf32_string_literal);
3934 }
3935
3936 // treat U like the start of an identifier.
3937 return LexIdentifierContinue(Result, CurPtr);
3938
3939 case 'R': // Identifier or C++0x raw string literal
3940 // Notify MIOpt that we read a non-whitespace/non-comment token.
3941 MIOpt.ReadToken();
3942
3943 if (LangOpts.CPlusPlus11) {
3944 Char = getCharAndSize(CurPtr, SizeTmp);
3945
3946 if (Char == '"')
3947 return LexRawStringLiteral(Result,
3948 ConsumeChar(CurPtr, SizeTmp, Result),
3949 tok::string_literal);
3950 }
3951
3952 // treat R like the start of an identifier.
3953 return LexIdentifierContinue(Result, CurPtr);
3954
3955 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3956 // Notify MIOpt that we read a non-whitespace/non-comment token.
3957 MIOpt.ReadToken();
3958 Char = getCharAndSize(CurPtr, SizeTmp);
3959
3960 // Wide string literal.
3961 if (Char == '"')
3962 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3963 tok::wide_string_literal);
3964
3965 // Wide raw string literal.
3966 if (LangOpts.CPlusPlus11 && Char == 'R' &&
3967 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3968 return LexRawStringLiteral(Result,
3969 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3970 SizeTmp2, Result),
3971 tok::wide_string_literal);
3972
3973 // Wide character constant.
3974 if (Char == '\'')
3975 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3976 tok::wide_char_constant);
3977 // FALL THROUGH, treating L like the start of an identifier.
3978 [[fallthrough]];
3979
3980 // C99 6.4.2: Identifiers.
3981 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3982 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3983 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3984 case 'V': case 'W': case 'X': case 'Y': case 'Z':
3985 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3986 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3987 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3988 case 'v': case 'w': case 'x': case 'y': case 'z':
3989 case '_':
3990 // Notify MIOpt that we read a non-whitespace/non-comment token.
3991 MIOpt.ReadToken();
3992 return LexIdentifierContinue(Result, CurPtr);
3993
3994 case '$': // $ in identifiers.
3995 if (LangOpts.DollarIdents) {
3996 if (!isLexingRawMode())
3997 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3998 // Notify MIOpt that we read a non-whitespace/non-comment token.
3999 MIOpt.ReadToken();
4000 return LexIdentifierContinue(Result, CurPtr);
4001 }
4002
4003 Kind = tok::unknown;
4004 break;
4005
4006 // C99 6.4.4: Character Constants.
4007 case '\'':
4008 // Notify MIOpt that we read a non-whitespace/non-comment token.
4009 MIOpt.ReadToken();
4010 return LexCharConstant(Result, CurPtr, tok::char_constant);
4011
4012 // C99 6.4.5: String Literals.
4013 case '"':
4014 // Notify MIOpt that we read a non-whitespace/non-comment token.
4015 MIOpt.ReadToken();
4016 return LexStringLiteral(Result, CurPtr,
4017 ParsingFilename ? tok::header_name
4018 : tok::string_literal);
4019
4020 // C99 6.4.6: Punctuators.
4021 case '?':
4022 Kind = tok::question;
4023 break;
4024 case '[':
4025 Kind = tok::l_square;
4026 break;
4027 case ']':
4028 Kind = tok::r_square;
4029 break;
4030 case '(':
4031 Kind = tok::l_paren;
4032 break;
4033 case ')':
4034 Kind = tok::r_paren;
4035 break;
4036 case '{':
4037 Kind = tok::l_brace;
4038 break;
4039 case '}':
4040 Kind = tok::r_brace;
4041 break;
4042 case '.':
4043 Char = getCharAndSize(CurPtr, SizeTmp);
4044 if (Char >= '0' && Char <= '9') {
4045 // Notify MIOpt that we read a non-whitespace/non-comment token.
4046 MIOpt.ReadToken();
4047
4048 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4049 } else if (LangOpts.CPlusPlus && Char == '*') {
4050 Kind = tok::periodstar;
4051 CurPtr += SizeTmp;
4052 } else if (Char == '.' &&
4053 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4054 Kind = tok::ellipsis;
4055 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4056 SizeTmp2, Result);
4057 } else {
4058 Kind = tok::period;
4059 }
4060 break;
4061 case '&':
4062 Char = getCharAndSize(CurPtr, SizeTmp);
4063 if (Char == '&') {
4064 Kind = tok::ampamp;
4065 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4066 } else if (Char == '=') {
4067 Kind = tok::ampequal;
4068 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4069 } else {
4070 Kind = tok::amp;
4071 }
4072 break;
4073 case '*':
4074 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4075 Kind = tok::starequal;
4076 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4077 } else {
4078 Kind = tok::star;
4079 }
4080 break;
4081 case '+':
4082 Char = getCharAndSize(CurPtr, SizeTmp);
4083 if (Char == '+') {
4084 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4085 Kind = tok::plusplus;
4086 } else if (Char == '=') {
4087 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4088 Kind = tok::plusequal;
4089 } else {
4090 Kind = tok::plus;
4091 }
4092 break;
4093 case '-':
4094 Char = getCharAndSize(CurPtr, SizeTmp);
4095 if (Char == '-') { // --
4096 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4097 Kind = tok::minusminus;
4098 } else if (Char == '>' && LangOpts.CPlusPlus &&
4099 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4100 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4101 SizeTmp2, Result);
4102 Kind = tok::arrowstar;
4103 } else if (Char == '>') { // ->
4104 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4105 Kind = tok::arrow;
4106 } else if (Char == '=') { // -=
4107 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4108 Kind = tok::minusequal;
4109 } else {
4110 Kind = tok::minus;
4111 }
4112 break;
4113 case '~':
4114 Kind = tok::tilde;
4115 break;
4116 case '!':
4117 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4118 Kind = tok::exclaimequal;
4119 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4120 } else {
4121 Kind = tok::exclaim;
4122 }
4123 break;
4124 case '/':
4125 // 6.4.9: Comments
4126 Char = getCharAndSize(CurPtr, SizeTmp);
4127 if (Char == '/') { // Line comment.
4128 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4129 // want to lex this as a comment. There is one problem with this though,
4130 // that in one particular corner case, this can change the behavior of the
4131 // resultant program. For example, In "foo //**/ bar", C89 would lex
4132 // this as "foo / bar" and languages with Line comments would lex it as
4133 // "foo". Check to see if the character after the second slash is a '*'.
4134 // If so, we will lex that as a "/" instead of the start of a comment.
4135 // However, we never do this if we are just preprocessing.
4136 bool TreatAsComment =
4137 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4138 if (!TreatAsComment)
4139 if (!(PP && PP->isPreprocessedOutput()))
4140 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4141
4142 if (TreatAsComment) {
4143 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4144 TokAtPhysicalStartOfLine))
4145 return true; // There is a token to return.
4146
4147 // It is common for the tokens immediately after a // comment to be
4148 // whitespace (indentation for the next line). Instead of going through
4149 // the big switch, handle it efficiently now.
4150 goto SkipIgnoredUnits;
4151 }
4152 }
4153
4154 if (Char == '*') { // /**/ comment.
4155 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4156 TokAtPhysicalStartOfLine))
4157 return true; // There is a token to return.
4158
4159 // We only saw whitespace, so just try again with this lexer.
4160 // (We manually eliminate the tail call to avoid recursion.)
4161 goto LexNextToken;
4162 }
4163
4164 if (Char == '=') {
4165 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4166 Kind = tok::slashequal;
4167 } else {
4168 Kind = tok::slash;
4169 }
4170 break;
4171 case '%':
4172 Char = getCharAndSize(CurPtr, SizeTmp);
4173 if (Char == '=') {
4174 Kind = tok::percentequal;
4175 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4176 } else if (LangOpts.Digraphs && Char == '>') {
4177 Kind = tok::r_brace; // '%>' -> '}'
4178 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4179 } else if (LangOpts.Digraphs && Char == ':') {
4180 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4181 Char = getCharAndSize(CurPtr, SizeTmp);
4182 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4183 Kind = tok::hashhash; // '%:%:' -> '##'
4184 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4185 SizeTmp2, Result);
4186 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4187 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4188 if (!isLexingRawMode())
4189 Diag(BufferPtr, diag::ext_charize_microsoft);
4190 Kind = tok::hashat;
4191 } else { // '%:' -> '#'
4192 // We parsed a # character. If this occurs at the start of the line,
4193 // it's actually the start of a preprocessing directive. Callback to
4194 // the preprocessor to handle it.
4195 // TODO: -fpreprocessed mode??
4196 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4197 goto HandleDirective;
4198
4199 Kind = tok::hash;
4200 }
4201 } else {
4202 Kind = tok::percent;
4203 }
4204 break;
4205 case '<':
4206 Char = getCharAndSize(CurPtr, SizeTmp);
4207 if (ParsingFilename) {
4208 return LexAngledStringLiteral(Result, CurPtr);
4209 } else if (Char == '<') {
4210 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4211 if (After == '=') {
4212 Kind = tok::lesslessequal;
4213 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4214 SizeTmp2, Result);
4215 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4216 // If this is actually a '<<<<<<<' version control conflict marker,
4217 // recognize it as such and recover nicely.
4218 goto LexNextToken;
4219 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4220 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4221 // ignore it.
4222 goto LexNextToken;
4223 } else if (LangOpts.CUDA && After == '<') {
4224 Kind = tok::lesslessless;
4225 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4226 SizeTmp2, Result);
4227 } else {
4228 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4229 Kind = tok::lessless;
4230 }
4231 } else if (Char == '=') {
4232 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4233 if (After == '>') {
4234 if (LangOpts.CPlusPlus20) {
4235 if (!isLexingRawMode())
4236 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4237 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4238 SizeTmp2, Result);
4239 Kind = tok::spaceship;
4240 break;
4241 }
4242 // Suggest adding a space between the '<=' and the '>' to avoid a
4243 // change in semantics if this turns up in C++ <=17 mode.
4244 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
// NOTE(review): listing line 4246 is absent; the streamed operand that
// receives the source location and " " arguments below is not visible.
4245 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4247 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4248 }
4249 }
4250 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4251 Kind = tok::lessequal;
4252 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4253 if (LangOpts.CPlusPlus11 &&
4254 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4255 // C++0x [lex.pptoken]p3:
4256 // Otherwise, if the next three characters are <:: and the subsequent
4257 // character is neither : nor >, the < is treated as a preprocessor
4258 // token by itself and not as the first character of the alternative
4259 // token <:.
4260 unsigned SizeTmp3;
4261 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4262 if (After != ':' && After != '>') {
4263 Kind = tok::less;
4264 if (!isLexingRawMode())
4265 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4266 break;
4267 }
4268 }
4269
4270 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4271 Kind = tok::l_square;
4272 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4273 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4274 Kind = tok::l_brace;
4275 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4276 lexEditorPlaceholder(Result, CurPtr)) {
4277 return true;
4278 } else {
4279 Kind = tok::less;
4280 }
4281 break;
4282 case '>':
4283 Char = getCharAndSize(CurPtr, SizeTmp);
4284 if (Char == '=') {
4285 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4286 Kind = tok::greaterequal;
4287 } else if (Char == '>') {
4288 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4289 if (After == '=') {
4290 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4291 SizeTmp2, Result);
4292 Kind = tok::greatergreaterequal;
4293 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4294 // If this is actually a '>>>>' conflict marker, recognize it as such
4295 // and recover nicely.
4296 goto LexNextToken;
4297 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4298 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4299 goto LexNextToken;
4300 } else if (LangOpts.CUDA && After == '>') {
4301 Kind = tok::greatergreatergreater;
4302 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4303 SizeTmp2, Result);
4304 } else {
4305 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4306 Kind = tok::greatergreater;
4307 }
4308 } else {
4309 Kind = tok::greater;
4310 }
4311 break;
4312 case '^':
4313 Char = getCharAndSize(CurPtr, SizeTmp);
4314 if (Char == '=') {
4315 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4316 Kind = tok::caretequal;
4317 } else if (LangOpts.OpenCL && Char == '^') {
4318 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4319 Kind = tok::caretcaret;
4320 } else {
4321 Kind = tok::caret;
4322 }
4323 break;
4324 case '|':
4325 Char = getCharAndSize(CurPtr, SizeTmp);
4326 if (Char == '=') {
4327 Kind = tok::pipeequal;
4328 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4329 } else if (Char == '|') {
4330 // If this is '|||||||' and we're in a conflict marker, ignore it.
4331 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4332 goto LexNextToken;
4333 Kind = tok::pipepipe;
4334 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4335 } else {
4336 Kind = tok::pipe;
4337 }
4338 break;
4339 case ':':
4340 Char = getCharAndSize(CurPtr, SizeTmp);
4341 if (LangOpts.Digraphs && Char == '>') {
4342 Kind = tok::r_square; // ':>' -> ']'
4343 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4344 } else if (Char == ':') {
4345 Kind = tok::coloncolon;
4346 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4347 } else {
4348 Kind = tok::colon;
4349 }
4350 break;
4351 case ';':
4352 Kind = tok::semi;
4353 break;
4354 case '=':
4355 Char = getCharAndSize(CurPtr, SizeTmp);
4356 if (Char == '=') {
4357 // If this is '====' and we're in a conflict marker, ignore it.
4358 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4359 goto LexNextToken;
4360
4361 Kind = tok::equalequal;
4362 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4363 } else {
4364 Kind = tok::equal;
4365 }
4366 break;
4367 case ',':
4368 Kind = tok::comma;
4369 break;
4370 case '#':
4371 Char = getCharAndSize(CurPtr, SizeTmp);
4372 if (Char == '#') {
4373 Kind = tok::hashhash;
4374 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4375 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4376 Kind = tok::hashat;
4377 if (!isLexingRawMode())
4378 Diag(BufferPtr, diag::ext_charize_microsoft);
4379 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4380 } else {
4381 // We parsed a # character. If this occurs at the start of the line,
4382 // it's actually the start of a preprocessing directive. Callback to
4383 // the preprocessor to handle it.
4384 // TODO: -fpreprocessed mode??
4385 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4386 goto HandleDirective;
4387
4388 Kind = tok::hash;
4389 }
4390 break;
4391
4392 case '@':
4393 // Objective C support.
4394 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4395 Kind = tok::at;
4396 else
4397 Kind = tok::unknown;
4398 break;
4399
4400 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4401 case '\\':
4402 if (!LangOpts.AsmPreprocessor) {
4403 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4404 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4405 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4406 return true; // KeepWhitespaceMode
4407
4408 // We only saw whitespace, so just try again with this lexer.
4409 // (We manually eliminate the tail call to avoid recursion.)
4410 goto LexNextToken;
4411 }
4412
4413 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4414 }
4415 }
4416
4417 Kind = tok::unknown;
4418 break;
4419
4420 default: {
4421 if (isASCII(Char)) {
4422 Kind = tok::unknown;
4423 break;
4424 }
4425
4426 llvm::UTF32 CodePoint;
4427
4428 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4429 // an escaped newline.
4430 --CurPtr;
4431 llvm::ConversionResult Status =
4432 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4433 (const llvm::UTF8 *)BufferEnd,
4434 &CodePoint,
4435 llvm::strictConversion);
4436 if (Status == llvm::conversionOK) {
4437 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4438 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4439 return true; // KeepWhitespaceMode
4440
4441 // We only saw whitespace, so just try again with this lexer.
4442 // (We manually eliminate the tail call to avoid recursion.)
4443 goto LexNextToken;
4444 }
4445 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4446 }
4447
// NOTE(review): listing lines 4448-4449 are absent; the condition opening the
// block that is closed on listing line 4453 is not visible. Inside that block
// the undecodable byte is stepped over and returned as tok::unknown.
4450 ++CurPtr;
4451 Kind = tok::unknown;
4452 break;
4453 }
4454
4455 // Non-ASCII characters tend to creep into source code unintentionally.
4456 // Instead of letting the parser complain about the unknown token,
4457 // just diagnose the invalid UTF-8, then drop the character.
4458 Diag(CurPtr, diag::err_invalid_utf8);
4459
4460 BufferPtr = CurPtr+1;
4461 // We're pretending the character didn't exist, so just try again with
4462 // this lexer.
4463 // (We manually eliminate the tail call to avoid recursion.)
4464 goto LexNextToken;
4465 }
4466 }
4467
// Common exit for every case that set Kind and used `break`.
4468 // Notify MIOpt that we read a non-whitespace/non-comment token.
4469 MIOpt.ReadToken();
4470
4471 // Update the location of token as well as BufferPtr.
4472 FormTokenWithChars(Result, CurPtr, Kind);
4473 return true;
4474
4475HandleDirective:
4476 // We parsed a # character and it's the start of a preprocessing directive.
4477
4478 FormTokenWithChars(Result, CurPtr, tok::hash);
// NOTE(review): listing lines 4479 and 4481 are absent. Presumably they hold
// the hand-off to the preprocessor and the condition guarding the
// `return true` below (otherwise the `return false` would be unreachable) --
// confirm against upstream.
4480
4482 // With a fatal failure in the module loader, we abort parsing.
4483 return true;
4484
4485 // We parsed the directive; lex a token with the new state.
4486 return false;
4487
// Restart lexing from LexStart with a clean NeedsCleaning flag, which the
// assert at LexStart requires.
4488LexNextToken:
4489 Result.clearFlag(Token::NeedsCleaning);
4490 goto LexStart;
4491}
4492
/// convertDependencyDirectiveToken - Populate \p Result from the scanned
/// token DDTok: reset it, then copy over location (derived from
/// BufferStart + DDTok.Offset), kind, flags and length. BufferPtr is moved to
/// just past the token.
///
/// \returns a pointer to the first character of the token in the buffer.
///
/// NOTE(review): the second line of the signature (listing line 4494) is
/// absent from this listing, so the declarations of DDTok and Result are not
/// visible here.
4493const char *Lexer::convertDependencyDirectiveToken(
4495 const char *TokPtr = BufferStart + DDTok.Offset;
4496 Result.startToken();
4497 Result.setLocation(getSourceLocation(TokPtr));
4498 Result.setKind(DDTok.Kind);
4499 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4500 Result.setLength(DDTok.Length);
4501 BufferPtr = TokPtr + DDTok.Length;
4502 return TokPtr;
4503}
4504
/// LexDependencyDirectiveToken - Produce the next token from the
/// DepDirectives token lists rather than by scanning characters. Only valid
/// when isDependencyDirectivesLexer() holds.
///
/// \returns false after a start-of-line '#' token has been converted;
///          otherwise true, or the result of LexEndOfFile /
///          PP->HandleIdentifier where those are called.
///
/// NOTE(review): listing lines 4519, 4545, 4551 and 4572 are absent from this
/// listing; each gap is flagged in place below.
4505bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4506 assert(isDependencyDirectivesLexer());
4507
4508 using namespace dependency_directives_scan;
4509
// Step past directives whose tokens have all been consumed. Reaching pp_eof
// ends the file; tokens_present_before_eof only notifies MIOpt.
4510 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4511 if (DepDirectives.front().Kind == pp_eof)
4512 return LexEndOfFile(Result, BufferEnd);
4513 if (DepDirectives.front().Kind == tokens_present_before_eof)
4514 MIOpt.ReadToken();
4515 NextDepDirectiveTokenIndex = 0;
4516 DepDirectives = DepDirectives.drop_front();
4517 }
4518
// NOTE(review): listing line 4519 is absent; it presumably declares DDTok,
// which the expression below initialises and the rest of the function reads.
4520 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4521 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4522 // Read something other than a preprocessor directive hash.
4523 MIOpt.ReadToken();
4524 }
4525
// In filename context a '<' is re-lexed from the buffer as an angled string.
// If that yields a header_name, the token index is advanced past every
// scanned token that starts before the new BufferPtr.
4526 if (ParsingFilename && DDTok.is(tok::less)) {
4527 BufferPtr = BufferStart + DDTok.Offset;
4528 LexAngledStringLiteral(Result, BufferPtr + 1);
4529 if (Result.isNot(tok::header_name))
4530 return true;
4531 // Advance the index of lexed tokens.
4532 while (true) {
4533 const dependency_directives_scan::Token &NextTok =
4534 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4535 if (BufferStart + NextTok.Offset >= BufferPtr)
4536 break;
4537 ++NextDepDirectiveTokenIndex;
4538 }
4539 return true;
4540 }
4541
4542 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4543
// NOTE(review): listing line 4545 is absent; presumably the directive is
// handed to the preprocessor here before returning false -- confirm.
4544 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4546 return false;
4547 }
4548 if (Result.is(tok::raw_identifier)) {
4549 Result.setRawIdentifierData(TokPtr);
4550 if (!isLexingRawMode()) {
// NOTE(review): listing line 4551 is absent; it presumably declares II, the
// identifier info consulted on the next line.
4552 if (II->isHandleIdentifierCase())
4553 return PP->HandleIdentifier(Result);
4554 }
4555 return true;
4556 }
4557 if (Result.isLiteral()) {
4558 Result.setLiteralData(TokPtr);
4559 return true;
4560 }
4561 if (Result.is(tok::colon)) {
4562 // Convert consecutive colons to 'tok::coloncolon'.
4563 if (*BufferPtr == ':') {
4564 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4565 tok::colon));
4566 ++NextDepDirectiveTokenIndex;
4567 Result.setKind(tok::coloncolon);
4568 }
4569 return true;
4570 }
// NOTE(review): listing line 4572 is absent; the statement controlled by this
// `if` is not visible, so as shown the `if` would guard the return below.
4571 if (Result.is(tok::eod))
4573
4574 return true;
4575}
4576
/// LexDependencyDirectiveTokenWhileSkipping - Advance through DepDirectives,
/// tracking the nesting depth of #if/#ifdef/#ifndef, until a directive of the
/// #elif/#elifdef/#elifndef/#else/#endif family is found at depth zero, or
/// pp_eof is reached.
///
/// \returns the result of LexEndOfFile at pp_eof; otherwise false, after the
///          leading '#' token of the stopping directive has been converted
///          into \p Result and NextDepDirectiveTokenIndex set to 1.
///
/// NOTE(review): listing lines 4590, 4596-4599, 4604-4606 and 4634 are absent
/// from this listing, so the case list below is incomplete and the
/// declaration of DDTok is not visible.
4577bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4578 assert(isDependencyDirectivesLexer());
4579
4580 using namespace dependency_directives_scan;
4581
4582 bool Stop = false;
4583 unsigned NestedIfs = 0;
4584 do {
// The current front directive is dropped before the next one is inspected.
4585 DepDirectives = DepDirectives.drop_front();
4586 switch (DepDirectives.front().Kind) {
4587 case pp_none:
4588 llvm_unreachable("unexpected 'pp_none'");
// Directives in this group do not affect conditional nesting and are skipped.
4589 case pp_include:
4591 case pp_define:
4592 case pp_undef:
4593 case pp_import:
4594 case pp_pragma_import:
4595 case pp_pragma_once:
4600 case pp_include_next:
4601 case decl_at_import:
4602 case cxx_module_decl:
4603 case cxx_import_decl:
4607 break;
4608 case pp_if:
4609 case pp_ifdef:
4610 case pp_ifndef:
4611 ++NestedIfs;
4612 break;
// An else-family directive only ends the skip at nesting depth zero.
4613 case pp_elif:
4614 case pp_elifdef:
4615 case pp_elifndef:
4616 case pp_else:
4617 if (!NestedIfs) {
4618 Stop = true;
4619 }
4620 break;
// #endif either ends the skip (depth zero) or closes one nested conditional.
4621 case pp_endif:
4622 if (!NestedIfs) {
4623 Stop = true;
4624 } else {
4625 --NestedIfs;
4626 }
4627 break;
4628 case pp_eof:
4629 NextDepDirectiveTokenIndex = 0;
4630 return LexEndOfFile(Result, BufferEnd);
4631 }
4632 } while (!Stop);
4633
// NOTE(review): listing line 4634 is absent; it presumably declares DDTok,
// bound to the first token of the stopping directive.
4635 DepDirectives.front().Tokens.front();
4636 assert(DDTok.is(tok::hash));
4637 NextDepDirectiveTokenIndex = 1;
4638
4639 convertDependencyDirectiveToken(DDTok, Result);
4640 return false;
4641}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:82
Defines the Diagnostic-related interfaces.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:948
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1545
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1739
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1261
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:325
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3232
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1664
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:561
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:285
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1189
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1559
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1629
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1526
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1613
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3336
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1532
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:920
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2748
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1905
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1242
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1587
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1635
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:544
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1271
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1547
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:916
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:134
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:123
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:97
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:418
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1024
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1358
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:278
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1060
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3056
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:872
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1232
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3669
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:791
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:184
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:894
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:955
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1138
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1213
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1158
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:452
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1134
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:499
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:609
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:220
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1107
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:243
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:637
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:510
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1325
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:850
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:310
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:586
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:128
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:70
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:61
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:78
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4201
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3029
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3378
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3661
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3363
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:42
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:100
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:49
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:62
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:92
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:176
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:109
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition: CharInfo.h:169
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:54
Definition: Format.h:5304
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., $\sqrt{p.x^2 + p.y^2 + \dots}$
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1526
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1658
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1544
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1539
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1533
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:579
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.