clang 23.0.0git
DependencyDirectivesScanner.cpp
Go to the documentation of this file.
1//===- DependencyDirectivesScanner.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This is the interface for scanning header and source files to get the
11/// minimum necessary preprocessor directives for evaluating includes. It
12/// reduces the source down to #define, #include, #import, @import, and any
13/// conditional preprocessor logic that contains one of those.
14///
15//===----------------------------------------------------------------------===//
16
#include "clang/Lex/DependencyDirectivesScanner.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Pragma.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
#include <optional>

using namespace clang;
using namespace clang::dependency_directives_scan;
using namespace llvm;
32
33namespace {
34
/// A scanned directive kind paired with the number of raw tokens that were
/// lexed for it. The tokens themselves live, in order, in the Scanner's
/// shared \p Tokens buffer.
struct DirectiveWithTokens {
  DirectiveKind Kind;
  unsigned NumTokens;

  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
      : Kind(Kind), NumTokens(NumTokens) {}
};
42
43/// Does an efficient "scan" of the sources to detect the presence of
44/// preprocessor (or module import) directives and collects the raw lexed tokens
45/// for those directives so that the \p Lexer can "replay" them when the file is
46/// included.
47///
48/// Note that the behavior of the raw lexer is affected by the language mode,
49/// while at this point we want to do a scan and collect tokens once,
50/// irrespective of the language mode that the file will get included in. To
51/// compensate for that the \p Lexer, while "replaying", will adjust a token
52/// where appropriate, when it could affect the preprocessor's state.
53/// For example in a directive like
54///
55/// \code
56/// #if __has_cpp_attribute(clang::fallthrough)
57/// \endcode
58///
59/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
60/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
61/// while in C++ mode.
struct Scanner {
  Scanner(StringRef Input,
          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
      : Input(Input), Tokens(Tokens), Diags(Diags),
        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
                 Input.end()) {}

  /// Fixed language options for the raw lex. These are deliberately
  /// independent of the language mode of the eventual compilation so that one
  /// scan can be reused across all modes (see the class comment).
  static LangOptions getLangOptsForDepScanning() {
    LangOptions LangOpts;
    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
    LangOpts.ObjC = true;
    LangOpts.LineComment = true;
    LangOpts.RawStringLiterals = true;
    LangOpts.AllowLiteralDigitSeparator = true;
    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"".
    return LangOpts;
  }

  /// Lex the provided source and emit the directive tokens.
  ///
  /// \returns True on error.
  bool scan(SmallVectorImpl<Directive> &Directives);

  friend bool clang::scanInputForCXX20ModulesUsage(StringRef Source);
  friend bool clang::isPreprocessedModuleFile(StringRef Source);

private:
  /// Lexes next token and advances \p First and the \p Lexer.
  [[nodiscard]] dependency_directives_scan::Token &
  lexToken(const char *&First, const char *const End);

  /// Lexes an include/import filename in header-name mode (so <...> is one
  /// token) and advances \p First and the \p Lexer.
  [[nodiscard]] dependency_directives_scan::Token &
  lexIncludeFilename(const char *&First, const char *const End);

  void skipLine(const char *&First, const char *const End);
  void skipDirective(StringRef Name, const char *&First, const char *const End);

  /// Returns the spelling of a string literal or identifier after performing
  /// any processing needed to handle \c clang::Token::NeedsCleaning.
  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);

  /// Lexes next token and if it is identifier returns its string, otherwise
  /// it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);

  /// Used when it is certain that next token is an identifier.
  [[nodiscard]] StringRef lexIdentifier(const char *&First,
                                        const char *const End);

  /// Lexes next token and returns true iff it is an identifier that matches \p
  /// Id, otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
                                                const char *&First,
                                                const char *const End);

  /// Lexes next token and returns true iff it matches the kind \p K.
  /// Otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
                                           const char *const End);

  /// Lexes next token and if it is string literal, returns its string.
  /// Otherwise, it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);

  [[nodiscard]] bool scanImpl(const char *First, const char *const End);
  [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
  [[nodiscard]] bool lexAt(const char *&First, const char *const End);
  [[nodiscard]] bool lexModule(const char *&First, const char *const End);
  [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
                               const char *const End);
  [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
  [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
  [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
                                const char *const End);
  [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
                                            const char *&First,
                                            const char *const End);
  void lexPPDirectiveBody(const char *&First, const char *const End);

  /// Commit the tokens of the directive currently being lexed: append them to
  /// \p Tokens and record a (Kind, token-count) entry.
  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
    Tokens.append(CurDirToks);
    DirsWithToks.emplace_back(Kind, CurDirToks.size());
    CurDirToks.clear();
    return DirsWithToks.back();
  }
  /// Undo the most recently pushed directive, dropping its tokens as well.
  void popDirective() {
    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
  }
  /// Kind of the most recently pushed directive, or \p pp_none if empty.
  DirectiveKind topDirective() const {
    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
  }

  unsigned getOffsetAt(const char *CurPtr) const {
    return CurPtr - Input.data();
  }

  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
  /// true at the end.
  bool reportError(const char *CurPtr, unsigned Err);

  bool ScanningPreprocessedModuleFile = false;
  /// Interned storage for token spellings that needed cleaning (keeps the
  /// returned StringRefs alive for the Scanner's lifetime).
  StringMap<char> SplitIds;
  StringRef Input;
  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
  DiagnosticsEngine *Diags;
  SourceLocation InputSourceLoc;

  const char *LastTokenPtr = nullptr;
  /// Keeps track of the tokens for the currently lexed directive. Once a
  /// directive is fully lexed and "committed" then the tokens get appended to
  /// \p Tokens and \p CurDirToks is cleared for the next directive.
  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
  /// The directives that were lexed along with the number of tokens that each
  /// directive contains. The tokens of all the directives are kept in \p Tokens
  /// vector, in the same order as the directives order in \p DirsWithToks.
  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
  LangOptions LangOpts;
  Lexer TheLexer;
};
198
199} // end anonymous namespace
200
201bool Scanner::reportError(const char *CurPtr, unsigned Err) {
202 if (!Diags)
203 return true;
204 assert(CurPtr >= Input.data() && "invalid buffer ptr");
205 Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
206 return true;
207}
208
209static void skipOverSpaces(const char *&First, const char *const End) {
210 while (First != End && isHorizontalWhitespace(*First))
211 ++First;
212}
213
// Move back by one character, skipping escaped newlines (backslash + \n).
// Returns the character landed on; \p Current is updated to point at it, so
// callers see the last character that is not part of a line splice.
static char previousChar(const char *First, const char *&Current) {
  assert(Current > First);
  --Current;
  // If we landed on a newline it may be the tail of a "\<newline>" splice;
  // keep backing up over whole escaped-newline sequences.
  while (Current > First && isVerticalWhitespace(*Current)) {
    // Check if the previous character is a backslash
    if (Current > First && *(Current - 1) == '\\') {
      // Use Lexer's getEscapedNewLineSize to get the size of the escaped
      // newline
      unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);
      if (EscapeSize > 0) {
        // Skip back over the entire escaped newline sequence (backslash +
        // newline)
        Current -= (1 + EscapeSize);
      } else {
        break;
      }
    } else {
      break;
    }
  }
  return *Current;
}
237
238[[nodiscard]] static bool isRawStringLiteral(const char *First,
239 const char *Current) {
240 assert(First <= Current);
241
242 // Check if we can even back up.
243 if (*Current != '"' || First == Current)
244 return false;
245
246 // Check for an "R".
247 if (previousChar(First, Current) != 'R')
248 return false;
249 if (First == Current ||
251 return true;
252
253 // Check for a prefix of "u", "U", or "L".
254 if (*Current == 'u' || *Current == 'U' || *Current == 'L')
255 return First == Current ||
257
258 // Check for a prefix of "u8".
259 if (*Current != '8' || First == Current ||
260 previousChar(First, Current) != 'u')
261 return false;
262 return First == Current ||
264}
265
/// Skip a raw string literal. On entry \p First points at the opening '"';
/// on exit it points just past the closing '"' (or at \p End when the
/// literal is unterminated). The delimiter between '"' and '(' must be
/// matched after a ')' for the literal to close.
static void skipRawString(const char *&First, const char *const End) {
  assert(First[0] == '"');

  // Collect the delimiter: the characters between '"' and '('.
  const char *DelimBegin = ++First;
  const char *DelimEnd = DelimBegin;
  while (DelimEnd != End && *DelimEnd != '(')
    ++DelimEnd;
  if (DelimEnd == End) {
    First = DelimEnd; // No '(' before EOF... just give up.
    return;
  }
  const size_t DelimLen = size_t(DelimEnd - DelimBegin);

  const char *Probe = DelimEnd;
  for (;;) {
    // Position First just past the next ')'.
    First = Probe;
    while (First != End && *First != ')')
      ++First;
    if (First == End)
      return;
    ++First;

    // Try to match the delimiter right after the ')'.
    Probe = First;
    while (Probe != End && size_t(Probe - First) < DelimLen &&
           DelimBegin[Probe - First] == *Probe)
      ++Probe;

    // Hit the end of the file while matching.
    if (Probe == End) {
      First = Probe;
      return;
    }
    // Partial delimiter match: keep looking for the next ')'.
    if (size_t(Probe - First) < DelimLen)
      continue;
    // Full delimiter but no closing quote: also keep looking.
    if (*Probe != '"')
      continue;
    First = Probe + 1;
    return;
  }
}
306
307// Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
308static unsigned isEOL(const char *First, const char *const End) {
309 if (First == End)
310 return 0;
311 if (End - First > 1 && isVerticalWhitespace(First[0]) &&
312 isVerticalWhitespace(First[1]) && First[0] != First[1])
313 return 2;
314 return !!isVerticalWhitespace(First[0]);
315}
316
317static void skipString(const char *&First, const char *const End) {
318 assert(*First == '\'' || *First == '"' || *First == '<');
319 const char Terminator = *First == '<' ? '>' : *First;
320 for (++First; First != End && *First != Terminator; ++First) {
321 // String and character literals don't extend past the end of the line.
323 return;
324 if (*First != '\\')
325 continue;
326 // Skip past backslash to the next character. This ensures that the
327 // character right after it is skipped as well, which matters if it's
328 // the terminator.
329 if (++First == End)
330 return;
331 if (!isWhitespace(*First))
332 continue;
333 // Whitespace after the backslash might indicate a line continuation.
334 const char *FirstAfterBackslashPastSpace = First;
335 skipOverSpaces(FirstAfterBackslashPastSpace, End);
336 if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
337 // Advance the character pointer to the next line for the next
338 // iteration.
339 First = FirstAfterBackslashPastSpace + NLSize - 1;
340 }
341 }
342 if (First != End)
343 ++First; // Finish off the string.
344}
345
346// Returns the length of the skipped newline
347static unsigned skipNewline(const char *&First, const char *End) {
348 if (First == End)
349 return 0;
350 assert(isVerticalWhitespace(*First));
351 unsigned Len = isEOL(First, End);
352 assert(Len && "expected newline");
353 First += Len;
354 return Len;
355}
356
357static void skipToNewlineRaw(const char *&First, const char *const End) {
358 for (;;) {
359 if (First == End)
360 return;
361
362 unsigned Len = isEOL(First, End);
363 if (Len)
364 return;
365
366 char LastNonWhitespace = ' ';
367 do {
369 LastNonWhitespace = *First;
370 if (++First == End)
371 return;
372 Len = isEOL(First, End);
373 } while (!Len);
374
375 if (LastNonWhitespace != '\\')
376 return;
377
378 First += Len;
379 // Keep skipping lines...
380 }
381}
382
383static void skipLineComment(const char *&First, const char *const End) {
384 assert(First[0] == '/' && First[1] == '/');
385 First += 2;
387}
388
/// Skip a "/*...*/" block comment, leaving \p First just past the closing
/// "*/" (or at \p End when the comment is unterminated).
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');
  // "/**/" is the shortest closable comment; anything shorter runs to End.
  if (End - First < 4) {
    First = End;
    return;
  }
  First += 3;
  while (First != End) {
    if (First[-1] == '*' && First[0] == '/') {
      ++First;
      return;
    }
    ++First;
  }
}
401
402/// \returns True if the current single quotation mark character is a C++14
403/// digit separator.
404static bool isQuoteCppDigitSeparator(const char *const Start,
405 const char *const Cur,
406 const char *const End) {
407 assert(*Cur == '\'' && "expected quotation character");
408 // skipLine called in places where we don't expect a valid number
409 // body before `start` on the same line, so always return false at the start.
410 if (Start == Cur)
411 return false;
412 // The previous character must be a valid PP number character.
413 // Make sure that the L, u, U, u8 prefixes don't get marked as a
414 // separator though.
415 char Prev = *(Cur - 1);
416 if (Prev == 'L' || Prev == 'U' || Prev == 'u')
417 return false;
418 if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
419 return false;
420 if (!isPreprocessingNumberBody(Prev))
421 return false;
422 // The next character should be a valid identifier body character.
423 return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
424}
425
426void Scanner::skipLine(const char *&First, const char *const End) {
427 for (;;) {
428 assert(First <= End);
429 if (First == End)
430 return;
431
433 skipNewline(First, End);
434 return;
435 }
436 const char *Start = First;
437 // Use `LastNonWhitespace`to track if a line-continuation has ever been seen
438 // before a new-line character:
439 char LastNonWhitespace = ' ';
440 while (First != End && !isVerticalWhitespace(*First)) {
441 // Iterate over strings correctly to avoid comments and newlines.
442 if (*First == '"' ||
443 (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
444 LastTokenPtr = First;
445 if (isRawStringLiteral(Start, First))
446 skipRawString(First, End);
447 else
448 skipString(First, End);
449 continue;
450 }
451
452 // Continue on the same line if an EOL is preceded with backslash
453 if (First + 1 < End && *First == '\\') {
454 if (unsigned Len = isEOL(First + 1, End)) {
455 First += 1 + Len;
456 continue;
457 }
458 }
459
460 // Iterate over comments correctly.
461 if (*First != '/' || End - First < 2) {
462 LastTokenPtr = First;
463 if (!isWhitespace(*First))
464 LastNonWhitespace = *First;
465 ++First;
466 continue;
467 }
468
469 if (First[1] == '/') {
470 // "//...".
472 continue;
473 }
474
475 if (First[1] != '*') {
476 LastTokenPtr = First;
477 if (!isWhitespace(*First))
478 LastNonWhitespace = *First;
479 ++First;
480 continue;
481 }
482
483 // "/*...*/".
485 }
486 if (First == End)
487 return;
488
489 // Skip over the newline.
490 skipNewline(First, End);
491
492 if (LastNonWhitespace != '\\')
493 break;
494 }
495}
496
497void Scanner::skipDirective(StringRef Name, const char *&First,
498 const char *const End) {
499 if (llvm::StringSwitch<bool>(Name)
500 .Case("warning", true)
501 .Case("error", true)
502 .Default(false))
503 // Do not process quotes or comments.
505 else
506 skipLine(First, End);
507}
508
509static void skipWhitespace(const char *&First, const char *const End) {
510 for (;;) {
511 assert(First <= End);
512 skipOverSpaces(First, End);
513
514 if (End - First < 2)
515 return;
516
517 if (*First == '\\') {
518 const char *Ptr = First + 1;
519 while (Ptr < End && isHorizontalWhitespace(*Ptr))
520 ++Ptr;
521 if (Ptr != End && isVerticalWhitespace(*Ptr)) {
522 skipNewline(Ptr, End);
523 First = Ptr;
524 continue;
525 }
526 return;
527 }
528
529 // Check for a non-comment character.
530 if (First[0] != '/')
531 return;
532
533 // "// ...".
534 if (First[1] == '/') {
536 return;
537 }
538
539 // Cannot be a comment.
540 if (First[1] != '*')
541 return;
542
543 // "/*...*/".
545 }
546}
547
548bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
549 const char *const End) {
550 assert(Kind == DirectiveKind::cxx_export_import_decl ||
551 Kind == DirectiveKind::cxx_export_module_decl ||
552 Kind == DirectiveKind::cxx_import_decl ||
553 Kind == DirectiveKind::cxx_module_decl ||
554 Kind == DirectiveKind::decl_at_import);
555
556 const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
557 for (;;) {
558 // Keep a copy of the First char incase it needs to be reset.
559 const char *Previous = First;
560 const dependency_directives_scan::Token &Tok = lexToken(First, End);
561 if ((Tok.is(tok::hash) || Tok.is(tok::at)) &&
563 CurDirToks.pop_back();
564 First = Previous;
565 return false;
566 }
567 if (Tok.isOneOf(tok::eof, tok::eod))
568 return reportError(
569 DirectiveLoc,
570 diag::err_dep_source_scanner_missing_semi_after_at_import);
571 if (Tok.is(tok::semi))
572 break;
573 }
574
575 bool IsCXXModules = Kind == DirectiveKind::cxx_export_import_decl ||
576 Kind == DirectiveKind::cxx_export_module_decl ||
577 Kind == DirectiveKind::cxx_import_decl ||
578 Kind == DirectiveKind::cxx_module_decl;
579 if (IsCXXModules) {
580 lexPPDirectiveBody(First, End);
581 pushDirective(Kind);
582 return false;
583 }
584
585 const auto &Tok = lexToken(First, End);
586 pushDirective(Kind);
587 if (Tok.is(tok::eof) || Tok.is(tok::eod))
588 return false;
589 return reportError(DirectiveLoc,
590 diag::err_dep_source_scanner_unexpected_tokens_at_import);
591}
592
/// Lex one raw token, append it to \p CurDirToks, and advance \p First to the
/// lexer's new buffer position. Returns a reference to the recorded token.
dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
                                                     const char *const End) {
  clang::Token Tok;
  TheLexer.LexFromRawLexer(Tok);
  First = Input.data() + TheLexer.getCurrentBufferOffset();
  assert(First <= End);

  // The token starts Tok.getLength() bytes before the lexer's current offset.
  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
                          Tok.getFlags());
  return CurDirToks.back();
}
605
/// Lex the filename of an include-like directive using the lexer's
/// header-name mode (so <...> lexes as a single token), recording the token
/// the same way \c lexToken does.
dependency_directives_scan::Token &
Scanner::lexIncludeFilename(const char *&First, const char *const End) {
  clang::Token Tok;
  TheLexer.LexIncludeFilename(Tok);
  First = Input.data() + TheLexer.getCurrentBufferOffset();
  assert(First <= End);

  // The token starts Tok.getLength() bytes before the lexer's current offset.
  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
                          Tok.getFlags());
  return CurDirToks.back();
}
618
619void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
620 while (true) {
621 const dependency_directives_scan::Token &Tok = lexToken(First, End);
622 if (Tok.is(tok::eod) || Tok.is(tok::eof))
623 break;
624 }
625}
626
627StringRef
628Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
629 bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
630 if (LLVM_LIKELY(!NeedsCleaning))
631 return Input.slice(Tok.Offset, Tok.getEnd());
632
633 SmallString<64> Spelling;
634 Spelling.resize(Tok.Length);
635
636 // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
637 // in the Lexer). Currently we cannot see them due to our LangOpts.
638
639 unsigned SpellingLength = 0;
640 const char *BufPtr = Input.begin() + Tok.Offset;
641 const char *AfterIdent = Input.begin() + Tok.getEnd();
642 while (BufPtr < AfterIdent) {
643 auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
644 Spelling[SpellingLength++] = Char;
645 BufPtr += Size;
646 }
647
648 return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
649 .first->first();
650}
651
652std::optional<StringRef>
653Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
654 const dependency_directives_scan::Token &Tok = lexToken(First, End);
655 if (Tok.isNot(tok::raw_identifier)) {
656 if (!Tok.is(tok::eod))
657 skipLine(First, End);
658 return std::nullopt;
659 }
660
661 return cleanStringIfNeeded(Tok);
662}
663
664StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
665 std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
666 assert(Id && "expected identifier token");
667 return *Id;
668}
669
670bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
671 const char *const End) {
672 if (std::optional<StringRef> FoundId =
673 tryLexIdentifierOrSkipLine(First, End)) {
674 if (*FoundId == Id)
675 return true;
676 skipLine(First, End);
677 }
678 return false;
679}
680
681bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
682 const char *const End) {
683 const dependency_directives_scan::Token &Tok = lexToken(First, End);
684 if (Tok.is(K))
685 return true;
686 skipLine(First, End);
687 return false;
688}
689
690std::optional<StringRef>
691Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
692 const char *const End) {
693 const dependency_directives_scan::Token &Tok = lexToken(First, End);
695 if (!Tok.is(tok::eod))
696 skipLine(First, End);
697 return std::nullopt;
698 }
699
700 return cleanStringIfNeeded(Tok);
701}
702
703bool Scanner::lexAt(const char *&First, const char *const End) {
704 // Handle "@import".
705
706 // Lex '@'.
707 const dependency_directives_scan::Token &AtTok = lexToken(First, End);
708 assert(AtTok.is(tok::at));
709 (void)AtTok;
710
711 if (!isNextIdentifierOrSkipLine("import", First, End))
712 return false;
713 return lexModuleDirectiveBody(decl_at_import, First, End);
714}
715
716bool Scanner::lexModule(const char *&First, const char *const End) {
717 StringRef Id = lexIdentifier(First, End);
718 bool Export = false;
719 if (Id == "export") {
720 Export = true;
721 std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
722 if (!NextId)
723 return false;
724 Id = *NextId;
725 }
726
727 StringRef Module =
728 ScanningPreprocessedModuleFile ? "__preprocessed_module" : "module";
729 StringRef Import =
730 ScanningPreprocessedModuleFile ? "__preprocessed_import" : "import";
731
732 if (Id != Module && Id != Import) {
733 skipLine(First, End);
734 return false;
735 }
736
737 skipWhitespace(First, End);
738
739 // Ignore this as a module directive if the next character can't be part of
740 // an import.
741
742 switch (*First) {
743 case ':': {
744 // `module :` is never the start of a valid module declaration.
745 if (Id == Module) {
746 skipLine(First, End);
747 return false;
748 }
749 // A module partition starts with exactly one ':'. If we have '::', this is
750 // a scope resolution instead and shouldn't be recognized as a directive
751 // per P1857R3.
752 if (First + 1 != End && First[1] == ':') {
753 skipLine(First, End);
754 return false;
755 }
756 // `import:(type)name` is a valid ObjC method decl, so check one more token.
757 (void)lexToken(First, End);
758 if (!tryLexIdentifierOrSkipLine(First, End))
759 return false;
760 break;
761 }
762 case ';': {
763 // Handle the global module fragment `module;`.
764 if (Id == Module && !Export)
765 break;
766 skipLine(First, End);
767 return false;
768 }
769 case '<':
770 case '"':
771 break;
772 default:
774 skipLine(First, End);
775 return false;
776 }
777 }
778
779 TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
780
782 if (Id == Module)
784 else
786
787 return lexModuleDirectiveBody(Kind, First, End);
788}
789
/// Handle a _Pragma("...") by de-stringifying the argument and re-scanning it
/// with a nested Scanner; a recognized pragma directive is then replayed into
/// this scanner.
///
/// \returns True on error (propagated from the nested pragma scan).
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
  // Expect '(' <string-literal> ')'; anything else is not a valid _Pragma and
  // the rest of the line is skipped by the helpers.
  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
    return false;

  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);

  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
    return false;

  SmallString<64> Buffer(*Str);
  prepare_PragmaString(Buffer);

  // Use a new scanner instance since the tokens will be inside the allocated
  // string. We should already have captured all the relevant tokens in the
  // current scanner.
  SmallVector<dependency_directives_scan::Token> DiscardTokens;
  const char *Begin = Buffer.c_str();
  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
                        InputSourceLoc};

  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
    return true;

  // Replay a recognized pragma directive into this scanner; otherwise the
  // line is uninteresting.
  DirectiveKind K = PragmaScanner.topDirective();
  if (K == pp_none) {
    skipLine(First, End);
    return false;
  }

  assert(Begin == Buffer.end());
  pushDirective(K);
  return false;
}
824
825bool Scanner::lexPragma(const char *&First, const char *const End) {
826 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
827 if (!FoundId)
828 return false;
829
830 StringRef Id = *FoundId;
831 auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
832 .Case("once", pp_pragma_once)
833 .Case("push_macro", pp_pragma_push_macro)
834 .Case("pop_macro", pp_pragma_pop_macro)
835 .Case("include_alias", pp_pragma_include_alias)
836 .Default(pp_none);
837 if (Kind != pp_none) {
838 lexPPDirectiveBody(First, End);
839 pushDirective(Kind);
840 return false;
841 }
842
843 if (Id != "clang") {
844 skipLine(First, End);
845 return false;
846 }
847
848 FoundId = tryLexIdentifierOrSkipLine(First, End);
849 if (!FoundId)
850 return false;
851 Id = *FoundId;
852
853 // #pragma clang system_header
854 if (Id == "system_header") {
855 lexPPDirectiveBody(First, End);
856 pushDirective(pp_pragma_system_header);
857 return false;
858 }
859
860 if (Id != "module") {
861 skipLine(First, End);
862 return false;
863 }
864
865 // #pragma clang module.
866 if (!isNextIdentifierOrSkipLine("import", First, End))
867 return false;
868
869 // #pragma clang module import.
870 lexPPDirectiveBody(First, End);
871 pushDirective(pp_pragma_import);
872 return false;
873}
874
875bool Scanner::lexEndif(const char *&First, const char *const End) {
876 // Strip out "#else" if it's empty.
877 if (topDirective() == pp_else)
878 popDirective();
879
880 // If "#ifdef" is empty, strip it and skip the "#endif".
881 //
882 // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
883 // we can skip empty `#if` and `#elif` blocks as well after scanning for a
884 // literal __has_include in the condition. Even without that rule we could
885 // drop the tokens if we scan for identifiers in the condition and find none.
886 if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
887 popDirective();
888 skipLine(First, End);
889 return false;
890 }
891
892 return lexDefault(pp_endif, First, End);
893}
894
/// Lex the remaining body of a directive that needs no special handling and
/// commit it as \p Kind.
///
/// \returns False (this path cannot fail).
bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
                         const char *const End) {
  lexPPDirectiveBody(First, End);
  pushDirective(Kind);
  return false;
}
901
/// Fast gate over a line's first character: only lines starting with '#'
/// (directive), '@' (@import), 'i'/'e'/'m' (import/export/module), or '_'
/// (_Pragma, __preprocessed_*) can possibly be relevant to the scan.
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
914
915static inline bool isStartWithPreprocessedModuleDirective(const char *First,
916 const char *End) {
917 assert(First <= End);
918 if (*First == '_') {
919 StringRef Str(First, End - First);
920 return Str.starts_with(
921 tok::getPPKeywordSpelling(tok::pp___preprocessed_module)) ||
922 Str.starts_with(
923 tok::getPPKeywordSpelling(tok::pp___preprocessed_import));
924 }
925 return false;
926}
927
928bool Scanner::lexPPLine(const char *&First, const char *const End) {
929 assert(First != End);
930
931 skipWhitespace(First, End);
932 assert(First <= End);
933 if (First == End)
934 return false;
935
937 skipLine(First, End);
938 assert(First <= End);
939 return false;
940 }
941
942 LastTokenPtr = First;
943
944 TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
945
946 llvm::scope_exit ScEx1([&]() {
947 /// Clear Scanner's CurDirToks before returning, in case we didn't push a
948 /// new directive.
949 CurDirToks.clear();
950 });
951
952 bool IsPreprocessedModule =
954 if (*First == '_' && !IsPreprocessedModule) {
955 if (isNextIdentifierOrSkipLine("_Pragma", First, End))
956 return lex_Pragma(First, End);
957 return false;
958 }
959
960 // Handle preprocessing directives.
961
962 TheLexer.setParsingPreprocessorDirective(true);
963 llvm::scope_exit ScEx2(
964 [&]() { TheLexer.setParsingPreprocessorDirective(false); });
965
966 if (*First == '@')
967 return lexAt(First, End);
968
969 // Handle module directives for C++20 modules.
970 if (*First == 'i' || *First == 'e' || *First == 'm' || IsPreprocessedModule)
971 return lexModule(First, End);
972
973 // Lex '#'.
974 const dependency_directives_scan::Token &HashTok = lexToken(First, End);
975 if (HashTok.is(tok::hashhash)) {
976 // A \p tok::hashhash at this location is passed by the preprocessor to the
977 // parser to interpret, like any other token. So for dependency scanning
978 // skip it like a normal token not affecting the preprocessor.
979 skipLine(First, End);
980 assert(First <= End);
981 return false;
982 }
983 assert(HashTok.is(tok::hash));
984 (void)HashTok;
985
986 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
987 if (!FoundId)
988 return false;
989
990 StringRef Id = *FoundId;
991
992 if (Id == "pragma")
993 return lexPragma(First, End);
994
995 auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
996 .Case("include", pp_include)
997 .Case("__include_macros", pp___include_macros)
998 .Case("define", pp_define)
999 .Case("undef", pp_undef)
1000 .Case("import", pp_import)
1001 .Case("include_next", pp_include_next)
1002 .Case("if", pp_if)
1003 .Case("ifdef", pp_ifdef)
1004 .Case("ifndef", pp_ifndef)
1005 .Case("elif", pp_elif)
1006 .Case("elifdef", pp_elifdef)
1007 .Case("elifndef", pp_elifndef)
1008 .Case("else", pp_else)
1009 .Case("endif", pp_endif)
1010 .Default(pp_none);
1011 if (Kind == pp_none) {
1012 skipDirective(Id, First, End);
1013 return false;
1014 }
1015
1016 if (Kind == pp_endif)
1017 return lexEndif(First, End);
1018
1019 switch (Kind) {
1020 case pp_include:
1022 case pp_include_next:
1023 case pp_import:
1024 // Ignore missing filenames in include or import directives.
1025 if (lexIncludeFilename(First, End).is(tok::eod)) {
1026 return false;
1027 }
1028 break;
1029 default:
1030 break;
1031 }
1032
1033 // Everything else.
1034 return lexDefault(Kind, First, End);
1035}
1036
/// Advance \p First past a UTF-8 byte-order mark (EF BB BF) if one is
/// present at the start of the buffer.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  const bool HasBOM = (End - First) >= 3 && First[0] == '\xef' &&
                      First[1] == '\xbb' && First[2] == '\xbf';
  if (HasBOM)
    First += 3;
}
1042
1043bool Scanner::scanImpl(const char *First, const char *const End) {
1045 while (First != End)
1046 if (lexPPLine(First, End))
1047 return true;
1048 return false;
1049}
1050
1051bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
1052 ScanningPreprocessedModuleFile = clang::isPreprocessedModuleFile(Input);
1053 bool Error = scanImpl(Input.begin(), Input.end());
1054
1055 if (!Error) {
1056 // Add an EOF on success.
1057 if (LastTokenPtr &&
1058 (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
1059 pushDirective(tokens_present_before_eof);
1060 pushDirective(pp_eof);
1061 }
1062
1063 ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
1064 for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
1065 assert(RemainingTokens.size() >= DirWithToks.NumTokens);
1066 Directives.emplace_back(DirWithToks.Kind,
1067 RemainingTokens.take_front(DirWithToks.NumTokens));
1068 RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
1069 }
1070 assert(RemainingTokens.empty());
1071
1072 return Error;
1073}
1074
1078 SourceLocation InputSourceLoc) {
1079 return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
1080}
1081
1083 StringRef Source,
1085 llvm::raw_ostream &OS) {
1086 // Add a space separator where it is convenient for testing purposes.
1087 auto needsSpaceSeparator =
1088 [](tok::TokenKind Prev,
1089 const dependency_directives_scan::Token &Tok) -> bool {
1090 if (Prev == Tok.Kind)
1091 return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
1092 tok::r_square);
1093 if (Prev == tok::raw_identifier &&
1094 Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
1095 tok::char_constant, tok::header_name))
1096 return true;
1097 if (Prev == tok::r_paren &&
1098 Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
1099 tok::char_constant, tok::unknown))
1100 return true;
1101 if (Prev == tok::comma &&
1102 Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
1103 return true;
1104 return false;
1105 };
1106
1107 for (const dependency_directives_scan::Directive &Directive : Directives) {
1109 OS << "<TokBeforeEOF>";
1110 std::optional<tok::TokenKind> PrevTokenKind;
1112 if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
1113 OS << ' ';
1114 PrevTokenKind = Tok.Kind;
1115 OS << Source.slice(Tok.Offset, Tok.getEnd());
1116 }
1117 }
1118}
1119
1121 const char *const End) {
1122 assert(First <= End);
1123 while (First != End) {
1124 if (*First == '#') {
1125 ++First;
1126 skipToNewlineRaw(First, End);
1127 }
1128 skipWhitespace(First, End);
1129 if (const auto Len = isEOL(First, End)) {
1130 First += Len;
1131 continue;
1132 }
1133 break;
1134 }
1135}
1136
1138 const char *First = Source.begin();
1139 const char *const End = Source.end();
1141 if (First == End)
1142 return false;
1143
1144 // Check if the next token can even be a module directive before creating a
1145 // full lexer.
1146 if (!(*First == 'i' || *First == 'e' || *First == 'm'))
1147 return false;
1148
1150 Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation());
1151 S.TheLexer.setParsingPreprocessorDirective(true);
1152 if (S.lexModule(First, End))
1153 return false;
1154 auto IsCXXNamedModuleDirective = [](const DirectiveWithTokens &D) {
1155 switch (D.Kind) {
1160 return true;
1161 default:
1162 return false;
1163 }
1164 };
1165 return llvm::any_of(S.DirsWithToks, IsCXXNamedModuleDirective);
1166}
1167
1168bool clang::isPreprocessedModuleFile(StringRef Source) {
1169 const char *First = Source.begin();
1170 const char *const End = Source.end();
1171
1173 if (First == End)
1174 return false;
1175
1177 Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation());
1178 while (First != End) {
1179 if (*First == '#') {
1180 ++First;
1181 skipToNewlineRaw(First, End);
1182 } else if (*First == 'e') {
1183 S.TheLexer.seek(S.getOffsetAt(First), /*IsAtStartOfLine=*/true);
1184 StringRef Id = S.lexIdentifier(First, End);
1185 if (Id == "export") {
1186 std::optional<StringRef> NextId =
1187 S.tryLexIdentifierOrSkipLine(First, End);
1188 if (!NextId)
1189 return false;
1190 Id = *NextId;
1191 }
1192 if (Id == "__preprocessed_module" || Id == "__preprocessed_import")
1193 return true;
1194 skipToNewlineRaw(First, End);
1196 return true;
1197 else
1198 skipToNewlineRaw(First, End);
1199
1200 skipWhitespace(First, End);
1201 if (const auto Len = isEOL(First, End)) {
1202 First += Len;
1203 continue;
1204 }
1205 break;
1206 }
1207 return false;
1208}
Defines the Diagnostic-related interfaces.
static void skipBlockComment(const char *&First, const char *const End)
static void skipRawString(const char *&First, const char *const End)
static void skipString(const char *&First, const char *const End)
static bool isStartOfRelevantLine(char First)
static bool isStartWithPreprocessedModuleDirective(const char *First, const char *End)
static bool isRawStringLiteral(const char *First, const char *Current)
static void skipUntilMaybeCXX20ModuleDirective(const char *&First, const char *const End)
static void skipOverSpaces(const char *&First, const char *const End)
static unsigned isEOL(const char *First, const char *const End)
static char previousChar(const char *First, const char *&Current)
static void skipToNewlineRaw(const char *&First, const char *const End)
static unsigned skipNewline(const char *&First, const char *End)
static void skipUTF8ByteOrderMark(const char *&First, const char *const End)
static void skipLineComment(const char *&First, const char *const End)
static bool isQuoteCppDigitSeparator(const char *const Start, const char *const Cur, const char *const End)
This is the interface for scanning header and source files to get the minimum necessary preprocessor ...
bool is(tok::TokenKind Kind) const
Token Tok
The Token.
FormatToken * Previous
The previous token in the unwrapped line.
static unsigned skipWhitespace(unsigned Idx, StringRef Str, unsigned Length)
Skip over whitespace in the string, starting at the given index.
Concrete class used by the front-end to report problems and issues.
Definition Diagnostic.h:233
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition Lexer.h:236
static unsigned getEscapedNewLineSize(const char *P)
getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escaped newline.
Definition Lexer.cpp:1308
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition Lexer.cpp:288
unsigned getCurrentBufferOffset()
Returns the current lexing offset.
Definition Lexer.h:311
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition Lexer.h:604
void setParsingPreprocessorDirective(bool f)
Inform the lexer whether or not we are currently lexing a preprocessor directive.
void LexIncludeFilename(Token &FilenameTok)
Lex a token, producing a header-name token if possible.
Encodes a location in the source.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
@ StartOfLine
Definition Token.h:75
@ NeedsCleaning
Definition Token.h:80
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner ...
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition TokenKinds.h:93
const char * getPPKeywordSpelling(PPKeywordKind Kind) LLVM_READNONE
Returns the spelling of preprocessor keywords, such as "else".
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition CharInfo.h:61
void printDependencyDirectivesAsSource(StringRef Source, ArrayRef< dependency_directives_scan::Directive > Directives, llvm::raw_ostream &OS)
Print the previously scanned dependency directives as minimized source text.
bool scanInputForCXX20ModulesUsage(StringRef Source)
Scan an input source buffer for C++20 named module usage.
bool isPreprocessedModuleFile(StringRef Source)
Scan an input source buffer, and check whether the input source is a preprocessed output.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
bool scanSourceForDependencyDirectives(StringRef Input, SmallVectorImpl< dependency_directives_scan::Token > &Tokens, SmallVectorImpl< dependency_directives_scan::Directive > &Directives, DiagnosticsEngine *Diags=nullptr, SourceLocation InputSourceLoc=SourceLocation())
Scan the input for the preprocessor directives that might have an effect on the dependencies for a co...
@ Module
Module linkage, which indicates that the entity can be referred to from other translation units withi...
Definition Linkage.h:54
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition CharInfo.h:168
void prepare_PragmaString(SmallVectorImpl< char > &StrVal)
Destringize a _Pragma("") string according to C11 6.10.9.1: "The string literal is destringized by de...
Definition Pragma.cpp:303
Diagnostic wrappers for TextAPI types for error reporting.
Definition Dominators.h:30
Represents a directive that's lexed as part of the dependency directives scanning.
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.