clang-tools 23.0.0git
LexerUtils.cpp
Go to the documentation of this file.
1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "LexerUtils.h"
10#include "clang/Basic/SourceManager.h"
11#include <optional>
12#include <utility>
13#include <vector>
14
16
17std::pair<std::optional<Token>, SourceLocation>
18getPreviousTokenAndStart(SourceLocation Location, const SourceManager &SM,
19 const LangOptions &LangOpts, bool SkipComments) {
20 const std::optional<Token> Tok =
21 Lexer::findPreviousToken(Location, SM, LangOpts, !SkipComments);
22
23 if (Tok.has_value())
24 return {*Tok, Lexer::GetBeginningOfToken(Tok->getLocation(), SM, LangOpts)};
25
26 return {std::nullopt, SourceLocation()};
27}
28
29std::optional<Token> getPreviousToken(SourceLocation Location,
30 const SourceManager &SM,
31 const LangOptions &LangOpts,
32 bool SkipComments) {
33 auto [Token, Start] =
34 getPreviousTokenAndStart(Location, SM, LangOpts, SkipComments);
35 return Token;
36}
37
38SourceLocation findPreviousTokenStart(SourceLocation Start,
39 const SourceManager &SM,
40 const LangOptions &LangOpts) {
41 if (Start.isInvalid() || Start.isMacroID())
42 return {};
43
44 const SourceLocation BeforeStart = Start.getLocWithOffset(-1);
45 if (BeforeStart.isInvalid() || BeforeStart.isMacroID())
46 return {};
47
48 return Lexer::GetBeginningOfToken(BeforeStart, SM, LangOpts);
49}
50
51SourceLocation findPreviousTokenKind(SourceLocation Start,
52 const SourceManager &SM,
53 const LangOptions &LangOpts,
54 tok::TokenKind TK) {
55 if (Start.isInvalid() || Start.isMacroID())
56 return {};
57
58 while (true) {
59 const SourceLocation L = findPreviousTokenStart(Start, SM, LangOpts);
60 if (L.isInvalid() || L.isMacroID())
61 return {};
62
63 Token T;
64 if (Lexer::getRawToken(L, T, SM, LangOpts, /*IgnoreWhiteSpace=*/true))
65 return {};
66
67 if (T.is(TK))
68 return T.getLocation();
69
70 Start = L;
71 }
72}
73
74SourceLocation findNextTerminator(SourceLocation Start, const SourceManager &SM,
75 const LangOptions &LangOpts) {
76 return findNextAnyTokenKind(Start, SM, LangOpts, tok::comma, tok::semi);
77}
78
80 const SourceManager &SM,
81 const LangOptions &LangOpts) {
82 assert(Range.isValid() && "Invalid Range for relexing provided");
83 SourceLocation Loc = Range.getBegin();
84
85 while (Loc <= Range.getEnd()) {
86 if (Loc.isMacroID())
87 return true;
88
89 std::optional<Token> Tok = findNextTokenSkippingComments(Loc, SM, LangOpts);
90
91 if (!Tok)
92 return true;
93
94 if (Tok->is(tok::hash))
95 return true;
96
97 Loc = Tok->getLocation();
98 }
99
100 return false;
101}
102
namespace {
/// Selects which comment tokens collectCommentsInRange keeps.
enum class CommentCollectionMode {
  /// Keep every comment token lexed in the range.
  AllComments,
  /// Keep only the comments seen after the last non-comment token.
  TrailingComments
};
} // namespace
106
107static std::vector<CommentToken>
108collectCommentsInRange(CharSourceRange Range, const SourceManager &SM,
109 const LangOptions &LangOpts,
110 CommentCollectionMode Mode) {
111 std::vector<CommentToken> Comments;
112 if (Range.isInvalid())
113 return Comments;
114
115 const CharSourceRange FileRange =
116 Lexer::makeFileCharRange(Range, SM, LangOpts);
117 if (FileRange.isInvalid())
118 return Comments;
119
120 const std::pair<FileID, unsigned> BeginLoc =
121 SM.getDecomposedLoc(FileRange.getBegin());
122 const std::pair<FileID, unsigned> EndLoc =
123 SM.getDecomposedLoc(FileRange.getEnd());
124
125 if (BeginLoc.first != EndLoc.first)
126 return Comments;
127
128 bool Invalid = false;
129 const StringRef Buffer = SM.getBufferData(BeginLoc.first, &Invalid);
130 if (Invalid)
131 return Comments;
132
133 const char *StrData = Buffer.data() + BeginLoc.second;
134
135 Lexer TheLexer(SM.getLocForStartOfFile(BeginLoc.first), LangOpts,
136 Buffer.begin(), StrData, Buffer.end());
137 // Use raw lexing with comment retention so we can see comment tokens without
138 // preprocessing or macro expansion effects.
139 TheLexer.SetCommentRetentionState(true);
140
141 while (true) {
142 Token Tok;
143 if (TheLexer.LexFromRawLexer(Tok))
144 break;
145 if (Tok.is(tok::eof) || Tok.getLocation() == FileRange.getEnd() ||
146 SM.isBeforeInTranslationUnit(FileRange.getEnd(), Tok.getLocation()))
147 break;
148
149 if (Tok.is(tok::comment)) {
150 const std::pair<FileID, unsigned> CommentLoc =
151 SM.getDecomposedLoc(Tok.getLocation());
152 assert(CommentLoc.first == BeginLoc.first);
153 Comments.emplace_back(CommentToken{
154 Tok.getLocation(),
155 StringRef(Buffer.begin() + CommentLoc.second, Tok.getLength()),
156 });
157 } else if (Mode == CommentCollectionMode::TrailingComments) {
158 // Clear comments found before the different token, e.g. comma. Callers
159 // use this to retrieve only the contiguous comment block that directly
160 // precedes a token of interest.
161 Comments.clear();
162 }
163 }
164
165 return Comments;
166}
167
168std::vector<CommentToken> getCommentsInRange(CharSourceRange Range,
169 const SourceManager &SM,
170 const LangOptions &LangOpts) {
171 return collectCommentsInRange(Range, SM, LangOpts,
172 CommentCollectionMode::AllComments);
173}
174
175std::vector<CommentToken>
176getTrailingCommentsInRange(CharSourceRange Range, const SourceManager &SM,
177 const LangOptions &LangOpts) {
178 return collectCommentsInRange(Range, SM, LangOpts,
179 CommentCollectionMode::TrailingComments);
180}
181
182CharSourceRange
183findTokenTextInRange(CharSourceRange Range, const SourceManager &SM,
184 const LangOptions &LangOpts,
185 llvm::function_ref<bool(const Token &)> Pred) {
186 if (Range.isInvalid())
187 return {};
188
189 // Normalize to a file-based char range so raw lexing can operate on one
190 // contiguous buffer and reject unmappable (e.g. macro) ranges.
191 const CharSourceRange FileRange =
192 Lexer::makeFileCharRange(Range, SM, LangOpts);
193 if (FileRange.isInvalid())
194 return {};
195
196 const auto [BeginFID, BeginOffset] =
197 SM.getDecomposedLoc(FileRange.getBegin());
198 const auto [EndFID, EndOffset] = SM.getDecomposedLoc(FileRange.getEnd());
199 if (BeginFID != EndFID || BeginOffset > EndOffset)
200 return {};
201
202 bool Invalid = false;
203 const StringRef Buffer = SM.getBufferData(BeginFID, &Invalid);
204 if (Invalid)
205 return {};
206
207 const char *LexStart = Buffer.data() + BeginOffset;
208 // Re-lex raw tokens in the bounded file buffer while preserving comments so
209 // callers can match tokens regardless of interleaved comments.
210 Lexer TheLexer(SM.getLocForStartOfFile(BeginFID), LangOpts, Buffer.begin(),
211 LexStart, Buffer.end());
212 TheLexer.SetCommentRetentionState(true);
213
214 while (true) {
215 Token Tok;
216 if (TheLexer.LexFromRawLexer(Tok))
217 return {};
218
219 if (Tok.is(tok::eof) || Tok.getLocation() == FileRange.getEnd() ||
220 SM.isBeforeInTranslationUnit(FileRange.getEnd(), Tok.getLocation()))
221 return {};
222
223 if (!Pred(Tok))
224 continue;
225
226 Token NextTok;
227 if (TheLexer.LexFromRawLexer(NextTok))
228 return {};
229 // Return a char range ending at the next token start so trailing trivia of
230 // the matched token is included (useful for fix-it removals).
231 return CharSourceRange::getCharRange(Tok.getLocation(),
232 NextTok.getLocation());
233 }
234}
235
236std::optional<Token> getQualifyingToken(tok::TokenKind TK,
237 CharSourceRange Range,
238 const ASTContext &Context,
239 const SourceManager &SM) {
240 assert((TK == tok::kw_const || TK == tok::kw_volatile ||
241 TK == tok::kw_restrict) &&
242 "TK is not a qualifier keyword");
243 const std::pair<FileID, unsigned> LocInfo =
244 SM.getDecomposedLoc(Range.getBegin());
245 const StringRef File = SM.getBufferData(LocInfo.first);
246 Lexer RawLexer(SM.getLocForStartOfFile(LocInfo.first), Context.getLangOpts(),
247 File.begin(), File.data() + LocInfo.second, File.end());
248 std::optional<Token> LastMatchBeforeTemplate;
249 std::optional<Token> LastMatchAfterTemplate;
250 bool SawTemplate = false;
251 Token Tok;
252 while (!RawLexer.LexFromRawLexer(Tok) &&
253 Range.getEnd() != Tok.getLocation() &&
254 !SM.isBeforeInTranslationUnit(Range.getEnd(), Tok.getLocation())) {
255 if (Tok.is(tok::raw_identifier)) {
256 IdentifierInfo &Info = Context.Idents.get(
257 StringRef(SM.getCharacterData(Tok.getLocation()), Tok.getLength()));
258 Tok.setIdentifierInfo(&Info);
259 Tok.setKind(Info.getTokenID());
260 }
261 if (Tok.is(tok::less)) {
262 SawTemplate = true;
263 } else if (Tok.isOneOf(tok::greater, tok::greatergreater)) {
264 LastMatchAfterTemplate = std::nullopt;
265 } else if (Tok.is(TK)) {
266 if (SawTemplate)
267 LastMatchAfterTemplate = Tok;
268 else
269 LastMatchBeforeTemplate = Tok;
270 }
271 }
272 return LastMatchAfterTemplate != std::nullopt ? LastMatchAfterTemplate
273 : LastMatchBeforeTemplate;
274}
275
276static bool breakAndReturnEnd(const Stmt &S) {
277 return isa<CompoundStmt, DeclStmt, NullStmt>(S);
278}
279
280static bool breakAndReturnEndPlus1Token(const Stmt &S) {
281 return isa<Expr, DoStmt, ReturnStmt, BreakStmt, ContinueStmt, GotoStmt,
282 SEHLeaveStmt>(S);
283}
284
285// Given a Stmt which does not include it's semicolon this method returns the
286// SourceLocation of the semicolon.
287static SourceLocation getSemicolonAfterStmtEndLoc(const SourceLocation &EndLoc,
288 const SourceManager &SM,
289 const LangOptions &LangOpts) {
290 if (EndLoc.isMacroID()) {
291 // Assuming EndLoc points to a function call foo within macro F.
292 // This method is supposed to return location of the semicolon within
293 // those macro arguments:
294 // F ( foo() ; )
295 // ^ EndLoc ^ SpellingLoc ^ next token of SpellingLoc
296 const SourceLocation SpellingLoc = SM.getSpellingLoc(EndLoc);
297 std::optional<Token> NextTok =
298 findNextTokenSkippingComments(SpellingLoc, SM, LangOpts);
299
300 // Was the next token found successfully?
301 // All macro issues are simply resolved by ensuring it's a semicolon.
302 if (NextTok && NextTok->is(tok::TokenKind::semi)) {
303 // Ideally this would return `F` with spelling location `;` (NextTok)
304 // following the example above. For now simply return NextTok location.
305 return NextTok->getLocation();
306 }
307
308 // Fallthrough to 'normal handling'.
309 // F ( foo() ) ;
310 // ^ EndLoc ^ SpellingLoc ) ^ next token of EndLoc
311 }
312
313 std::optional<Token> NextTok =
314 findNextTokenSkippingComments(EndLoc, SM, LangOpts);
315
316 // Testing for semicolon again avoids some issues with macros.
317 if (NextTok && NextTok->is(tok::TokenKind::semi))
318 return NextTok->getLocation();
319
320 return {};
321}
322
323SourceLocation getUnifiedEndLoc(const Stmt &S, const SourceManager &SM,
324 const LangOptions &LangOpts) {
325 const Stmt *LastChild = &S;
326 while (!LastChild->children().empty() && !breakAndReturnEnd(*LastChild) &&
327 !breakAndReturnEndPlus1Token(*LastChild)) {
328 for (const Stmt *Child : LastChild->children())
329 LastChild = Child;
330 }
331
332 if (!breakAndReturnEnd(*LastChild) && breakAndReturnEndPlus1Token(*LastChild))
333 return getSemicolonAfterStmtEndLoc(S.getEndLoc(), SM, LangOpts);
334
335 return S.getEndLoc();
336}
337
338SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl,
339 const SourceManager &SM) {
340 if (!FuncDecl)
341 return {};
342
343 const LangOptions &LangOpts = FuncDecl->getLangOpts();
344
345 if (FuncDecl->getNumParams() == 0) {
346 // Start at the beginning of the function declaration, and find the closing
347 // parenthesis after which we would place the noexcept specifier.
348 Token CurrentToken;
349 SourceLocation CurrentLocation = FuncDecl->getBeginLoc();
350 while (!Lexer::getRawToken(CurrentLocation, CurrentToken, SM, LangOpts,
351 true)) {
352 if (CurrentToken.is(tok::r_paren))
353 return CurrentLocation.getLocWithOffset(1);
354
355 CurrentLocation = CurrentToken.getEndLoc();
356 }
357
358 // Failed to find the closing parenthesis, so just return an invalid
359 // SourceLocation.
360 return {};
361 }
362
363 // FunctionDecl with parameters
364 const SourceLocation NoexceptLoc =
365 FuncDecl->getParamDecl(FuncDecl->getNumParams() - 1)->getEndLoc();
366 if (NoexceptLoc.isValid())
367 return Lexer::findLocationAfterToken(
368 NoexceptLoc, tok::r_paren, SM, LangOpts,
369 /*SkipTrailingWhitespaceAndNewLine=*/true);
370
371 return {};
372}
373
374} // namespace clang::tidy::utils::lexer
std::pair< std::optional< Token >, SourceLocation > getPreviousTokenAndStart(SourceLocation Location, const SourceManager &SM, const LangOptions &LangOpts, bool SkipComments)
SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl, const SourceManager &SM)
For a given FunctionDecl returns the location where you would need to place the noexcept specifier.
SourceLocation getUnifiedEndLoc(const Stmt &S, const SourceManager &SM, const LangOptions &LangOpts)
Stmt->getEndLoc does not always behave the same way depending on Token type.
bool rangeContainsExpansionsOrDirectives(SourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Re-lex the provided Range and return true if either a macro spans multiple tokens,...
SourceLocation findNextTerminator(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts)
std::optional< Token > getPreviousToken(SourceLocation Location, const SourceManager &SM, const LangOptions &LangOpts, bool SkipComments)
Returns previous token or std::nullopt if not found.
std::vector< CommentToken > getTrailingCommentsInRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Returns comment tokens found in the given range. If a non-comment token is encountered,...
SourceLocation findNextAnyTokenKind(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts, TokenKind TK, TokenKinds... TKs)
Definition LexerUtils.h:73
std::optional< Token > findNextTokenSkippingComments(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts)
Definition LexerUtils.h:106
SourceLocation findPreviousTokenStart(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts)
static SourceLocation getSemicolonAfterStmtEndLoc(const SourceLocation &EndLoc, const SourceManager &SM, const LangOptions &LangOpts)
CharSourceRange findTokenTextInRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, llvm::function_ref< bool(const Token &)> Pred)
Returns source range of the first token in Range matching Pred. The returned char range starts at the...
static bool breakAndReturnEndPlus1Token(const Stmt &S)
static bool breakAndReturnEnd(const Stmt &S)
std::vector< CommentToken > getCommentsInRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Returns all comment tokens found in the given range.
static std::vector< CommentToken > collectCommentsInRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, CommentCollectionMode Mode)
SourceLocation findPreviousTokenKind(SourceLocation Start, const SourceManager &SM, const LangOptions &LangOpts, tok::TokenKind TK)
std::optional< Token > getQualifyingToken(tok::TokenKind TK, CharSourceRange Range, const ASTContext &Context, const SourceManager &SM)
Assuming that Range spans a CVR-qualified type, returns the token in Range that is responsible for th...
static constexpr const char FuncDecl[]