clang 19.0.0git
SourceCode.cpp
Go to the documentation of this file.
1//===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file provides functions that simplify extraction of source code.
10//
11//===----------------------------------------------------------------------===//
14#include "clang/AST/Attr.h"
15#include "clang/AST/Comment.h"
16#include "clang/AST/Decl.h"
17#include "clang/AST/DeclCXX.h"
19#include "clang/AST/Expr.h"
21#include "clang/Lex/Lexer.h"
22#include "llvm/Support/Errc.h"
23#include "llvm/Support/Error.h"
24#include <set>
25
26using namespace clang;
27
28using llvm::errc;
29using llvm::StringError;
30
32 const ASTContext &Context) {
34 Context.getLangOpts());
35}
36
38 tok::TokenKind Next,
39 ASTContext &Context) {
41 Context.getLangOpts());
42 if (R.isInvalid())
43 return Range;
44 Token Tok;
45 bool Err =
46 Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
47 Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
48 if (Err || !Tok.is(Next))
49 return Range;
50 return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
51}
52
54 const SourceManager &SM,
55 bool AllowSystemHeaders) {
56 if (Range.isInvalid())
57 return llvm::make_error<StringError>(errc::invalid_argument,
58 "Invalid range");
59
60 if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
61 return llvm::make_error<StringError>(
62 errc::invalid_argument, "Range starts or ends in a macro expansion");
63
64 if (!AllowSystemHeaders) {
65 if (SM.isInSystemHeader(Range.getBegin()) ||
66 SM.isInSystemHeader(Range.getEnd()))
67 return llvm::make_error<StringError>(errc::invalid_argument,
68 "Range is in system header");
69 }
70
71 std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
72 std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
73 if (BeginInfo.first != EndInfo.first)
74 return llvm::make_error<StringError>(
75 errc::invalid_argument, "Range begins and ends in different files");
76
77 if (BeginInfo.second > EndInfo.second)
78 return llvm::make_error<StringError>(errc::invalid_argument,
79 "Range's begin is past its end");
80
81 return llvm::Error::success();
82}
83
85 const SourceManager &SM) {
86 return validateRange(Range, SM, /*AllowSystemHeaders=*/false);
87}
88
90 const SourceManager &SM) {
91 while (Loc.isMacroID()) {
92 const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion();
93 if (Expansion.isMacroArgExpansion()) {
94 // Check the spelling location of the macro arg, in case the arg itself is
95 // in a macro expansion.
96 Loc = Expansion.getSpellingLoc();
97 } else {
98 return true;
99 }
100 }
101 return false;
102}
103
105 const SourceManager &SM,
106 const LangOptions &LangOpts,
107 bool IncludeMacroExpansion) {
108 CharSourceRange Range;
109 if (IncludeMacroExpansion) {
110 Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
111 } else {
112 if (spelledInMacroDefinition(EditRange.getBegin(), SM) ||
113 spelledInMacroDefinition(EditRange.getEnd(), SM))
114 return {};
115
116 auto B = SM.getSpellingLoc(EditRange.getBegin());
117 auto E = SM.getSpellingLoc(EditRange.getEnd());
118 if (EditRange.isTokenRange())
119 E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts);
120 Range = CharSourceRange::getCharRange(B, E);
121 }
122 return Range;
123}
124
125std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit(
126 const CharSourceRange &EditRange, const SourceManager &SM,
127 const LangOptions &LangOpts, bool IncludeMacroExpansion) {
129 getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
130 bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
131 if (IsInvalid)
132 return std::nullopt;
133 return Range;
134}
135
136std::optional<CharSourceRange> clang::tooling::getFileRange(
137 const CharSourceRange &EditRange, const SourceManager &SM,
138 const LangOptions &LangOpts, bool IncludeMacroExpansion) {
140 getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
141 bool IsInvalid =
142 llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true));
143 if (IsInvalid)
144 return std::nullopt;
145 return Range;
146}
147
148static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
149 return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
150}
151
152static bool contains(const std::set<tok::TokenKind> &Terminators,
153 const Token &Tok) {
154 return Terminators.count(Tok.getKind()) > 0;
155}
156
157// Returns the exclusive, *file* end location of the entity whose last token is
158// at location 'EntityLast'. That is, it returns the location one past the last
159// relevant character.
160//
161// Associated tokens include comments, horizontal whitespace and 'Terminators'
162// -- optional tokens, which, if any are found, will be included; if
163// 'Terminators' is empty, we will not include any extra tokens beyond comments
164// and horizontal whitespace.
165static SourceLocation
167 const std::set<tok::TokenKind> &Terminators,
168 const LangOptions &LangOpts) {
169 assert(EntityLast.isValid() && "Invalid end location found.");
170
171 // We remember the last location of a non-horizontal-whitespace token we have
172 // lexed; this is the location up to which we will want to delete.
173 // FIXME: Support using the spelling loc here for cases where we want to
174 // analyze the macro text.
175
176 CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
177 // FIXME: Should check isTokenRange(), for the (rare) case that
178 // `ExpansionRange` is a character range.
179 std::unique_ptr<Lexer> Lexer = [&]() {
180 bool Invalid = false;
181 auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
182 llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
183 assert(!Invalid && "Cannot get file/offset");
184 return std::make_unique<clang::Lexer>(
185 SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
186 File.data() + FileOffset.second, File.end());
187 }();
188
189 // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
191
192 // Generally, the code we want to include looks like this ([] are optional),
193 // If Terminators is empty:
194 // [ <comment> ] [ <newline> ]
195 // Otherwise:
196 // ... <terminator> [ <comment> ] [ <newline> ]
197
198 Token Tok;
199 bool Terminated = false;
200
201 // First, lex to the current token (which is the last token of the range that
202 // is definitely associated with the decl). Then, we process the first token
203 // separately from the rest based on conditions that hold specifically for
204 // that first token.
205 //
206 // We do not search for a terminator if none is required or we've already
207 // encountered it. Otherwise, if the original `EntityLast` location was in a
208 // macro expansion, we don't have visibility into the text, so we assume we've
209 // already terminated. However, we note this assumption with
210 // `TerminatedByMacro`, because we'll want to handle it somewhat differently
211 // for the terminators semicolon and comma. These terminators can be safely
212 // associated with the entity when they appear after the macro -- extra
213 // semicolons have no effect on the program and a well-formed program won't
214 // have multiple commas in a row, so we're guaranteed that there is only one.
215 //
216 // FIXME: This handling of macros is more conservative than necessary. When
217 // the end of the expansion coincides with the end of the node, we can still
218 // safely analyze the code. But, it is more complicated, because we need to
219 // start by lexing the spelling loc for the first token and then switch to the
220 // expansion loc.
221 bool TerminatedByMacro = false;
223 if (Terminators.empty() || contains(Terminators, Tok))
224 Terminated = true;
225 else if (EntityLast.isMacroID()) {
226 Terminated = true;
227 TerminatedByMacro = true;
228 }
229
230 // We save the most recent candidate for the exclusive end location.
231 SourceLocation End = Tok.getEndLoc();
232
233 while (!Terminated) {
234 // Lex the next token we want to possibly expand the range with.
236
237 switch (Tok.getKind()) {
238 case tok::eof:
239 // Unexpected separators.
240 case tok::l_brace:
241 case tok::r_brace:
242 case tok::comma:
243 return End;
244 // Whitespace pseudo-tokens.
245 case tok::unknown:
246 if (startsWithNewline(SM, Tok))
247 // Include at least until the end of the line.
248 End = Tok.getEndLoc();
249 break;
250 default:
251 if (contains(Terminators, Tok))
252 Terminated = true;
253 End = Tok.getEndLoc();
254 break;
255 }
256 }
257
258 do {
259 // Lex the next token we want to possibly expand the range with.
261
262 switch (Tok.getKind()) {
263 case tok::unknown:
264 if (startsWithNewline(SM, Tok))
265 // We're done, but include this newline.
266 return Tok.getEndLoc();
267 break;
268 case tok::comment:
269 // Include any comments we find on the way.
270 End = Tok.getEndLoc();
271 break;
272 case tok::semi:
273 case tok::comma:
274 if (TerminatedByMacro && contains(Terminators, Tok)) {
275 End = Tok.getEndLoc();
276 // We've found a real terminator.
277 TerminatedByMacro = false;
278 break;
279 }
280 // Found an unrelated token; stop and don't include it.
281 return End;
282 default:
283 // Found an unrelated token; stop and don't include it.
284 return End;
285 }
286 } while (true);
287}
288
289// Returns the expected terminator tokens for the given declaration.
290//
291// If we do not know the correct terminator token, returns an empty set.
292//
293// There are cases where we have more than one possible terminator (for example,
294// we find either a comma or a semicolon after a VarDecl).
295static std::set<tok::TokenKind> getTerminators(const Decl &D) {
296 if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
297 return {tok::semi};
298
299 if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
300 return {tok::r_brace, tok::semi};
301
302 if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
303 return {tok::comma, tok::semi};
304
305 return {};
306}
307
308// Starting from `Loc`, skips whitespace up to, and including, a single
309// newline. Returns the (exclusive) end of any skipped whitespace (that is, the
310// location immediately after the whitespace).
312 SourceLocation Loc,
313 const LangOptions &LangOpts) {
314 const char *LocChars = SM.getCharacterData(Loc);
315 int i = 0;
316 while (isHorizontalWhitespace(LocChars[i]))
317 ++i;
318 if (isVerticalWhitespace(LocChars[i]))
319 ++i;
320 return Loc.getLocWithOffset(i);
321}
322
323// Is `Loc` separated from any following decl by something meaningful (e.g. an
324// empty line, a comment), ignoring horizontal whitespace? Since this is a
325// heuristic, we return false when in doubt. `Loc` cannot be the first location
326// in the file.
328 const LangOptions &LangOpts) {
329 // If the preceding character is a newline, we'll check for an empty line as a
330 // separator. However, we can't identify an empty line using tokens, so we
331 // analyse the characters. If we try to use tokens, we'll just end up with a
332 // whitespace token, whose characters we'd have to analyse anyhow.
333 bool Invalid = false;
334 const char *LocChars =
335 SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
336 assert(!Invalid &&
337 "Loc must be a valid character and not the first of the source file.");
338 if (isVerticalWhitespace(LocChars[0])) {
339 for (int i = 1; isWhitespace(LocChars[i]); ++i)
340 if (isVerticalWhitespace(LocChars[i]))
341 return true;
342 }
343 // We didn't find an empty line, so lex the next token, skipping past any
344 // whitespace we just scanned.
345 Token Tok;
346 bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
347 /*IgnoreWhiteSpace=*/true);
348 if (Failed)
349 // Any text that confuses the lexer seems fair to consider a separation.
350 return true;
351
352 switch (Tok.getKind()) {
353 case tok::comment:
354 case tok::l_brace:
355 case tok::r_brace:
356 case tok::eof:
357 return true;
358 default:
359 return false;
360 }
361}
362
364 ASTContext &Context) {
365 const SourceManager &SM = Context.getSourceManager();
366 const LangOptions &LangOpts = Context.getLangOpts();
368
369 // First, expand to the start of the template<> declaration if necessary.
370 if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
371 if (const auto *T = Record->getDescribedClassTemplate())
372 if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
373 Range.setBegin(T->getBeginLoc());
374 } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
375 if (const auto *T = F->getDescribedFunctionTemplate())
376 if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
377 Range.setBegin(T->getBeginLoc());
378 }
379
380 // Next, expand the end location past trailing comments to include a potential
381 // newline at the end of the decl's line.
382 Range.setEnd(
384 Range.setTokenRange(false);
385
386 // Expand to include preceding associated comments. We ignore any comments
387 // that are not preceding the decl, since we've already skipped trailing
388 // comments with getEntityEndLoc.
389 if (const RawComment *Comment =
391 // Only include a preceding comment if:
392 // * it is *not* separate from the declaration (not including any newline
393 // that immediately follows the comment),
394 // * the decl *is* separate from any following entity (so, there are no
395 // other entities the comment could refer to), and
396 // * it is not an IfThisThenThat lint check.
397 if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
398 Range.getBegin()) &&
400 SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
401 LangOpts) &&
402 atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
403 const StringRef CommentText = Comment->getRawText(SM);
404 if (!CommentText.contains("LINT.IfChange") &&
405 !CommentText.contains("LINT.ThenChange"))
406 Range.setBegin(Comment->getBeginLoc());
407 }
408 // Add leading attributes.
409 for (auto *Attr : Decl.attrs()) {
410 if (Attr->getLocation().isInvalid() ||
411 !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
412 continue;
413 Range.setBegin(Attr->getLocation());
414
415 // Extend to the left '[[' or '__attribute((' if we saw the attribute,
416 // unless it is not a valid location.
417 bool Invalid;
418 StringRef Source =
419 SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
420 if (Invalid)
421 continue;
422 llvm::StringRef BeforeAttr =
423 Source.substr(0, SM.getFileOffset(Range.getBegin()));
424 llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
425
426 for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
427 // Handle whitespace between attribute prefix and attribute value.
428 if (BeforeAttrStripped.ends_with(Prefix)) {
429 // Move start to start position of prefix, which is
430 // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
431 // positions to the left.
432 Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
433 -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
434 break;
435 // If we didn't see '[[' or '__attribute' it's probably coming from a
436 // macro expansion which is already handled by makeFileCharRange(),
437 // below.
438 }
439 }
440 }
441
442 // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
443 // Range.getBegin() may be inside an expansion.
444 return Lexer::makeFileCharRange(Range, SM, LangOpts);
445}
Defines the clang::ASTContext interface.
#define SM(sm)
Definition: Cuda.cpp:82
Defines the C++ Decl subclasses, other than those for templates (found in DeclTemplate....
Defines the C++ template declaration subclasses.
llvm::MachO::Record Record
Definition: MachO.h:28
static bool contains(const std::set< tok::TokenKind > &Terminators, const Token &Tok)
Definition: SourceCode.cpp:152
static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, const LangOptions &LangOpts)
Definition: SourceCode.cpp:327
static bool startsWithNewline(const SourceManager &SM, const Token &Tok)
Definition: SourceCode.cpp:148
static SourceLocation getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast, const std::set< tok::TokenKind > &Terminators, const LangOptions &LangOpts)
Definition: SourceCode.cpp:166
static bool spelledInMacroDefinition(SourceLocation Loc, const SourceManager &SM)
Definition: SourceCode.cpp:89
static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, SourceLocation Loc, const LangOptions &LangOpts)
Definition: SourceCode.cpp:311
static CharSourceRange getRange(const CharSourceRange &EditRange, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeMacroExpansion)
Definition: SourceCode.cpp:104
static std::set< tok::TokenKind > getTerminators(const Decl &D)
Definition: SourceCode.cpp:295
Defines the SourceManager interface.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:182
SourceManager & getSourceManager()
Definition: ASTContext.h:702
const LangOptions & getLangOpts() const
Definition: ASTContext.h:772
RawComment * getRawCommentForDeclNoCache(const Decl *D) const
Return the documentation comment attached to a given declaration, without looking into cache.
Definition: ASTContext.cpp:293
Attr - This represents one attribute.
Definition: Attr.h:42
SourceLocation getLocation() const
Definition: Attr.h:95
Represents a character-granular source range.
bool isTokenRange() const
Return true if the end of this range specifies the start of the last token.
static CharSourceRange getCharRange(SourceRange R)
static CharSourceRange getTokenRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:85
SourceLocation getEndLoc() const LLVM_READONLY
Definition: DeclBase.h:440
ASTContext & getASTContext() const LLVM_READONLY
Definition: DeclBase.cpp:501
attr_range attrs() const
Definition: DeclBase.h:540
virtual SourceRange getSourceRange() const LLVM_READONLY
Source range that this declaration covers.
Definition: DeclBase.h:432
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:449
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1024
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
static CharSourceRange getAsCharRange(SourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Given a token range, produce a corresponding CharSourceRange that is not a token range.
Definition: Lexer.h:430
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:955
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:510
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:850
Encodes a location in the source.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
SourceLocation getEndLoc() const
Definition: Token.h:159
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
A source range independent of the SourceManager.
Definition: Replacement.h:44
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
std::optional< CharSourceRange > getFileRangeForEdit(const CharSourceRange &EditRange, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeMacroExpansion=true)
Attempts to resolve the given range to one that can be edited by a rewrite; generally,...
Definition: SourceCode.cpp:125
llvm::Error validateRange(const CharSourceRange &Range, const SourceManager &SM, bool AllowSystemHeaders)
Determines whether Range is one that can be read from.
Definition: SourceCode.cpp:53
llvm::Error validateEditRange(const CharSourceRange &Range, const SourceManager &SM)
Determines whether Range is one that can be edited by a rewrite; generally, one that starts and ends ...
Definition: SourceCode.cpp:84
std::optional< CharSourceRange > getFileRange(const CharSourceRange &EditRange, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeMacroExpansion)
Attempts to resolve the given range to one that starts and ends in a particular file.
Definition: SourceCode.cpp:136
CharSourceRange maybeExtendRange(CharSourceRange Range, tok::TokenKind Terminator, ASTContext &Context)
Extends Range to include the token Terminator, if it immediately follows the end of the range.
Definition: SourceCode.cpp:37
StringRef getText(CharSourceRange Range, const ASTContext &Context)
Returns the source-code text in the specified range.
Definition: SourceCode.cpp:31
CharSourceRange getAssociatedRange(const Decl &D, ASTContext &Context)
Returns the logical source range of the node extended to include associated comments and whitespace b...
Definition: SourceCode.cpp:363
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:100
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:92
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:109