clang 18.0.0git
UnwrappedLineParser.h
Go to the documentation of this file.
1//===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the declaration of the UnwrappedLineParser,
11/// which turns a stream of tokens into UnwrappedLines.
12///
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
16#define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
17
18#include "Encoding.h"
19#include "FormatToken.h"
20#include "Macros.h"
22#include "clang/Format/Format.h"
23#include "llvm/ADT/ArrayRef.h"
24#include "llvm/ADT/BitVector.h"
25#include "llvm/Support/Regex.h"
26#include <list>
27#include <stack>
28#include <vector>
29
30namespace clang {
31namespace format {
32
33struct UnwrappedLineNode;
34
35/// An unwrapped line is a sequence of \c Token, that we would like to
36/// put on a single line if there was no column limit.
37///
38/// This is used as a main interface between the \c UnwrappedLineParser and the
39/// \c UnwrappedLineFormatter. The key property is that changing the formatting
40/// within an unwrapped line does not affect any other unwrapped lines.
42 UnwrappedLine() = default;
43
44 /// The \c Tokens comprising this \c UnwrappedLine.
45 std::list<UnwrappedLineNode> Tokens;
46
47 /// The indent level of the \c UnwrappedLine.
48 unsigned Level = 0;
49
50 /// The \c PPBranchLevel (adjusted for header guards) if this line is a
51 /// \c InMacroBody line, and 0 otherwise.
52 unsigned PPLevel = 0;
53
54 /// Whether this \c UnwrappedLine is part of a preprocessor directive.
55 bool InPPDirective = false;
56 /// Whether this \c UnwrappedLine is part of a pragma directive.
57 bool InPragmaDirective = false;
58 /// Whether it is part of a macro body.
59 bool InMacroBody = false;
60
61 bool MustBeDeclaration = false;
62
63 /// Whether the parser has seen \c decltype(auto) in this line.
64 bool SeenDecltypeAuto = false;
65
66 /// \c True if this line should be indented by ContinuationIndent in
67 /// addition to the normal indentation level.
68 bool IsContinuation = false;
69
70 /// If this \c UnwrappedLine closes a block in a sequence of lines,
71 /// \c MatchingOpeningBlockLineIndex stores the index of the corresponding
72 /// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be
73 /// \c kInvalidIndex.
75
76 /// If this \c UnwrappedLine opens a block, stores the index of the
77 /// line with the corresponding closing brace.
79
80 static const size_t kInvalidIndex = -1;
81
82 unsigned FirstStartColumn = 0;
83};
84
85/// Interface for users of the UnwrappedLineParser to receive the parsed lines.
86/// Parsing a single snippet of code can lead to multiple runs, where each
87/// run is a coherent view of the file.
88///
89/// For example, different runs are generated:
90/// - for different combinations of #if blocks
91/// - when macros are involved, for the expanded code and the as-written code
92///
93/// Some tokens will only be visible in a subset of the runs.
94/// For each run, \c UnwrappedLineParser will call \c consumeUnwrappedLine
95/// for each parsed unwrapped line, and then \c finishRun to indicate
96/// that the set of unwrapped lines before is one coherent view of the
97/// code snippet to be formatted.
99public:
101 virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
102 virtual void finishRun() = 0;
103};
104
106
108public:
109 UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style,
110 const AdditionalKeywords &Keywords,
111 unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
112 UnwrappedLineConsumer &Callback,
113 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
114 IdentifierTable &IdentTable);
115
116 void parse();
117
118private:
// Classifies a parsed statement by whether it is an if statement and, if so,
// by what follows it. Several parse functions in this class accept a pointer
// to a value of this type (see parseLevel, parseBlock, parseStructuralElement
// and parseIfThenElse).
119 enum class IfStmtKind {
120 NotIf, // Not an if statement.
121 IfOnly, // An if statement without the else clause.
122 IfElse, // An if statement followed by else but not else if.
123 IfElseIf // An if statement followed by else if.
124 };
125
126 void reset();
127 void parseFile();
128 bool precededByCommentOrPPDirective() const;
129 bool parseLevel(const FormatToken *OpeningBrace = nullptr,
130 IfStmtKind *IfKind = nullptr,
131 FormatToken **IfLeftBrace = nullptr);
132 bool mightFitOnOneLine(UnwrappedLine &Line,
133 const FormatToken *OpeningBrace = nullptr) const;
134 FormatToken *parseBlock(bool MustBeDeclaration = false,
135 unsigned AddLevels = 1u, bool MunchSemi = true,
136 bool KeepBraces = true, IfStmtKind *IfKind = nullptr,
137 bool UnindentWhitesmithsBraces = false);
138 void parseChildBlock();
139 void parsePPDirective();
140 void parsePPDefine();
141 void parsePPIf(bool IfDef);
142 void parsePPElse();
143 void parsePPEndIf();
144 void parsePPPragma();
145 void parsePPUnknown();
146 void readTokenWithJavaScriptASI();
147 void parseStructuralElement(const FormatToken *OpeningBrace = nullptr,
148 IfStmtKind *IfKind = nullptr,
149 FormatToken **IfLeftBrace = nullptr,
150 bool *HasDoWhile = nullptr,
151 bool *HasLabel = nullptr);
152 bool tryToParseBracedList();
153 bool parseBracedList(bool IsAngleBracket = false, bool IsEnum = false);
154 bool parseParens(TokenType AmpAmpTokenType = TT_Unknown);
155 void parseSquare(bool LambdaIntroducer = false);
156 void keepAncestorBraces();
157 void parseUnbracedBody(bool CheckEOF = false);
158 void handleAttributes();
159 bool handleCppAttributes();
160 bool isBlockBegin(const FormatToken &Tok) const;
161 FormatToken *parseIfThenElse(IfStmtKind *IfKind, bool KeepBraces = false,
162 bool IsVerilogAssert = false);
163 void parseTryCatch();
164 void parseLoopBody(bool KeepBraces, bool WrapRightBrace);
165 void parseForOrWhileLoop(bool HasParens = true);
166 void parseDoWhile();
167 void parseLabel(bool LeftAlignLabel = false);
168 void parseCaseLabel();
169 void parseSwitch();
170 void parseNamespace();
171 bool parseModuleImport();
172 void parseNew();
173 void parseAccessSpecifier();
174 bool parseEnum();
175 bool parseStructLike();
176 bool parseRequires();
177 void parseRequiresClause(FormatToken *RequiresToken);
178 void parseRequiresExpression(FormatToken *RequiresToken);
179 void parseConstraintExpression();
180 void parseJavaEnumBody();
181 // Parses a record (aka class) as a top level element. If ParseAsExpr is true,
182 // parses the record as a child block, i.e. if the class declaration is an
183 // expression.
184 void parseRecord(bool ParseAsExpr = false);
185 void parseObjCLightweightGenerics();
186 void parseObjCMethod();
187 void parseObjCProtocolList();
188 void parseObjCUntilAtEnd();
189 void parseObjCInterfaceOrImplementation();
190 bool parseObjCProtocol();
191 void parseJavaScriptEs6ImportExport();
192 void parseStatementMacro();
193 void parseCSharpAttribute();
194 // Parse a C# generic type constraint: `where T : IComparable<T>`.
195 // See:
196 // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint
197 void parseCSharpGenericTypeConstraint();
198 bool tryToParseLambda();
199 bool tryToParseChildBlock();
200 bool tryToParseLambdaIntroducer();
201 bool tryToParsePropertyAccessor();
202 void tryToParseJSFunction();
203 bool tryToParseSimpleAttribute();
204 void parseVerilogHierarchyIdentifier();
205 void parseVerilogSensitivityList();
206 // Returns the number of levels of indentation in addition to the normal 1
207 // level for a block, used for indenting case labels.
208 unsigned parseVerilogHierarchyHeader();
209 void parseVerilogTable();
210 void parseVerilogCaseLabel();
211 std::optional<llvm::SmallVector<llvm::SmallVector<FormatToken *, 8>, 1>>
212 parseMacroCall();
213
214 // Used by addUnwrappedLine to denote whether to keep or remove a level
215 // when resetting the line state.
216 enum class LineLevel { Remove, Keep };
217
218 void addUnwrappedLine(LineLevel AdjustLevel = LineLevel::Remove);
219 bool eof() const;
220 // LevelDifference is the difference of levels after and before the current
221 // token. For example:
222 // - if the token is '{' and opens a block, LevelDifference is 1.
223 // - if the token is '}' and closes a block, LevelDifference is -1.
224 void nextToken(int LevelDifference = 0);
225 void readToken(int LevelDifference = 0);
226
227 // Decides which comment tokens should be added to the current line and which
228 // should be added as comments before the next token.
229 //
230 // Comments specifies the sequence of comment tokens to analyze. They get
231 // either pushed to the current line or added to the comments before the next
232 // token.
233 //
234 // NextTok specifies the next token. A null pointer NextTok is supported, and
235 // signifies either the absence of a next token, or that the next token
236 // shouldn't be taken into account for the analysis.
237 void distributeComments(const SmallVectorImpl<FormatToken *> &Comments,
238 const FormatToken *NextTok);
239
240 // Adds the comment preceding the next token to unwrapped lines.
241 void flushComments(bool NewlineBeforeNext);
242 void pushToken(FormatToken *Tok);
243 void calculateBraceTypes(bool ExpectClassBody = false);
244 void setPreviousRBraceType(TokenType Type);
245
246 // Marks a conditional compilation edge (for example, an '#if', '#ifdef',
247 // '#else' or merge conflict marker). If 'Unreachable' is true, assumes
248 // this branch either cannot be taken (for example '#if false'), or should
249 // not be taken in this round.
250 void conditionalCompilationCondition(bool Unreachable);
251 void conditionalCompilationStart(bool Unreachable);
252 void conditionalCompilationAlternative();
253 void conditionalCompilationEnd();
254
255 bool isOnNewLine(const FormatToken &FormatTok);
256
257 // Returns whether there is a macro expansion in the line, i.e. a token that
258 // was expanded from a macro call.
259 bool containsExpansion(const UnwrappedLine &Line) const;
260
261 // Compute hash of the current preprocessor branch.
262 // This is used to identify the different branches, and thus track if block
263 // open and close in the same branch.
264 size_t computePPHash() const;
265
// Returns true when CurrentLines does not point at Lines, i.e. when newly
// added unwrapped lines are currently being redirected to a different
// container than the main list of parsed lines.
266 bool parsingPPDirective() const { return CurrentLines != &Lines; }
267
268 // FIXME: We are constantly running into bugs where Line.Level is incorrectly
269 // subtracted from beyond 0. Introduce a method to subtract from Line.Level
270 // and use that everywhere in the Parser.
271 std::unique_ptr<UnwrappedLine> Line;
272
273 // Lines that are created by macro expansion.
274 // When formatting code containing macro calls, we first format the expanded
275 // lines to set the token types correctly. Afterwards, we format the
276 // reconstructed macro calls, re-using the token types determined in the first
277 // step.
278 // ExpandedLines will be reset every time we create a new LineAndExpansion
279 // instance once a line containing macro calls has been parsed.
280 SmallVector<UnwrappedLine, 8> CurrentExpandedLines;
281
282 // Maps from the first token of a top-level UnwrappedLine that contains
283 // a macro call to the replacement UnwrappedLines expanded from the macro
284 // call.
285 llvm::DenseMap<FormatToken *, SmallVector<UnwrappedLine, 8>> ExpandedLines;
286
287 // Map from the macro identifier to a line containing the full unexpanded
288 // macro call.
289 llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> Unexpanded;
290
291 // For recursive macro expansions, trigger reconstruction only on the
292 // outermost expansion.
293 bool InExpansion = false;
294
295 // Set while we reconstruct a macro call.
296 // For reconstruction, we feed the expanded lines into the reconstructor
297 // until it is finished.
298 std::optional<MacroCallReconstructor> Reconstruct;
299
300 // Comments are sorted into unwrapped lines by whether they are in the same
301 // line as the previous token, or not. If not, they belong to the next token.
302 // Since the next token might already be in a new unwrapped line, we need to
303 // store the comments belonging to that token.
304 SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
305 FormatToken *FormatTok = nullptr;
306 bool MustBreakBeforeNextToken;
307
308 // The parsed lines. Only added to through \c CurrentLines.
310
311 // Preprocessor directives are parsed out-of-order from other unwrapped lines.
312 // Thus, we need to keep a list of preprocessor directives to be reported
313 // after an unwrapped line that has been started was finished.
314 SmallVector<UnwrappedLine, 4> PreprocessorDirectives;
315
316 // New unwrapped lines are added via CurrentLines.
317 // Usually points to \c &Lines. While parsing a preprocessor directive when
318 // there is an unfinished previous unwrapped line, will point to
319 // \c &PreprocessorDirectives.
320 SmallVectorImpl<UnwrappedLine> *CurrentLines;
321
322 // We store for each line whether it must be a declaration depending on
323 // whether we are in a compound statement or not.
324 llvm::BitVector DeclarationScopeStack;
325
326 const FormatStyle &Style;
327 const AdditionalKeywords &Keywords;
328
329 llvm::Regex CommentPragmasRegex;
330
331 FormatTokenSource *Tokens;
332 UnwrappedLineConsumer &Callback;
333
334 ArrayRef<FormatToken *> AllTokens;
335
336 // Keeps a stack of the states of nested control statements (true if the
337 // statement contains more than some predefined number of nested statements).
338 SmallVector<bool, 8> NestedTooDeep;
339
340 // Keeps a stack of the states of nested lambdas (true if the return type of
341 // the lambda is `decltype(auto)`).
342 SmallVector<bool, 4> NestedLambdas;
343
344 // Whether the parser is parsing the body of a function whose return type is
345 // `decltype(auto)`.
346 bool IsDecltypeAutoFunction = false;
347
348 // Represents preprocessor branch type, so we can find matching
349 // #if/#else/#endif directives.
350 enum PPBranchKind {
351 PP_Conditional, // Any #if, #ifdef, #ifndef or #elif block outside #if 0
352 PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0
353 };
354
// Pairs the kind of a preprocessor branch with a line value.
355 struct PPBranch {
// Stores both arguments unchanged in the members of the same name.
356 PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {}
// Whether the branch is a regular conditional or an unreachable one.
357 PPBranchKind Kind;
// NOTE(review): presumably identifies the line on which the branching
// directive was encountered -- confirm against the code that builds these.
358 size_t Line;
359 };
360
361 // Keeps a stack of currently active preprocessor branching directives.
363
364 // The \c UnwrappedLineParser re-parses the code for each combination
365 // of preprocessor branches that can be taken.
366 // To that end, we take the same branch (#if, #else, or one of the #elif
367 // branches) for each nesting level of preprocessor branches.
368 // \c PPBranchLevel stores the current nesting level of preprocessor
369 // branches during one pass over the code.
370 int PPBranchLevel;
371
372 // Contains the current branch (#if, #else or one of the #elif branches)
373 // for each nesting level.
374 SmallVector<int, 8> PPLevelBranchIndex;
375
376 // Contains the maximum number of branches at each nesting level.
377 SmallVector<int, 8> PPLevelBranchCount;
378
379 // Contains the number of branches per nesting level we are currently
380 // in while parsing a preprocessor branch sequence.
381 // This is used to update PPLevelBranchCount at the end of a branch
382 // sequence.
383 std::stack<int> PPChainBranchIndex;
384
385 // Include guard search state. Used to fixup preprocessor indent levels
386 // so that include guards do not participate in indentation.
387 enum IncludeGuardState {
388 IG_Inited, // Search started, looking for #ifndef.
389 IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition.
390 IG_Defined, // Matching #define found, checking other requirements.
391 IG_Found, // All requirements met, need to fix indents.
392 IG_Rejected, // Search failed or never started.
393 };
394
395 // Current state of include guard search.
396 IncludeGuardState IncludeGuard;
397
398 // Points to the #ifndef condition for a potential include guard. Null unless
399 // IncludeGuardState == IG_IfNdefed.
400 FormatToken *IncludeGuardToken;
401
402 // Contains the first start column where the source begins. This is zero for
403 // normal source code and may be nonzero when formatting a code fragment that
404 // does not start at the beginning of the file.
405 unsigned FirstStartColumn;
406
407 MacroExpander Macros;
408
409 friend class ScopedLineState;
411};
412
414 UnwrappedLineNode() : Tok(nullptr) {}
417 : Tok(Tok), Children(Children.begin(), Children.end()) {}
418
421};
422
423} // end namespace format
424} // end namespace clang
425
426#endif
Contains functions for text encoding manipulation.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
Various functions to configurably format source code.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
This file contains the main building blocks of macro support in clang-format.
Implements an efficient mapping from strings to IdentifierInfo nodes.
This class handles loading and caching of source files into memory.
The base class of the type hierarchy.
Definition: Type.h:1602
Takes a set of macro definitions as strings and allows expanding calls to those macros.
Definition: Macros.h:86
Interface for users of the UnwrappedLineParser to receive the parsed lines.
virtual void consumeUnwrappedLine(const UnwrappedLine &Line)=0
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:184
Represents a complete lambda introducer.
Definition: DeclSpec.h:2750
Encapsulates keywords that are context sensitive or for languages not properly supported by Clang's l...
Definition: FormatToken.h:964
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:55
A wrapper around a Token storing information about the whitespace characters preceding it.
Definition: FormatToken.h:269
SmallVector< UnwrappedLine, 0 > Children
UnwrappedLineNode(FormatToken *Tok, llvm::ArrayRef< UnwrappedLine > Children={})
An unwrapped line is a sequence of Token, that we would like to put on a single line if there was no column limit.
unsigned PPLevel
The PPBranchLevel (adjusted for header guards) if this line is a InMacroBody line, and 0 otherwise.
bool InMacroBody
Whether it is part of a macro body.
std::list< UnwrappedLineNode > Tokens
The Tokens comprising this UnwrappedLine.
bool IsContinuation
True if this line should be indented by ContinuationIndent in addition to the normal indentation level.
unsigned Level
The indent level of the UnwrappedLine.
bool InPragmaDirective
Whether this UnwrappedLine is part of a pragma directive.
bool InPPDirective
Whether this UnwrappedLine is part of a preprocessor directive.
bool SeenDecltypeAuto
Whether the parser has seen decltype(auto) in this line.
size_t MatchingClosingBlockLineIndex
If this UnwrappedLine opens a block, stores the index of the line with the corresponding closing brace.
static const size_t kInvalidIndex
size_t MatchingOpeningBlockLineIndex
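If this UnwrappedLine closes a block in a sequence of lines, MatchingOpeningBlockLineIndex stores the index of the corresponding opening line. Otherwise, MatchingOpeningBlockLineIndex must be kInvalidIndex.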