clang 22.0.0git
UnwrappedLineParser.h
Go to the documentation of this file.
1//===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the declaration of the UnwrappedLineParser,
11/// which turns a stream of tokens into UnwrappedLines.
12///
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
16#define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
17
18#include "Macros.h"
19#include <stack>
20
21namespace clang {
22namespace format {
23
25
26/// An unwrapped line is a sequence of \c Token, that we would like to
27/// put on a single line if there was no column limit.
28///
29/// This is used as a main interface between the \c UnwrappedLineParser and the
30/// \c UnwrappedLineFormatter. The key property is that changing the formatting
31/// within an unwrapped line does not affect any other unwrapped lines.
33 UnwrappedLine() = default;
34
35 /// The \c Tokens comprising this \c UnwrappedLine.
36 std::list<UnwrappedLineNode> Tokens;
37
38 /// The indent level of the \c UnwrappedLine.
39 unsigned Level = 0;
40
41 /// The \c PPBranchLevel (adjusted for header guards) if this line is a
42 /// \c InMacroBody line, and 0 otherwise.
43 unsigned PPLevel = 0;
44
45 /// Whether this \c UnwrappedLine is part of a preprocessor directive.
46 bool InPPDirective = false;
47 /// Whether this \c UnwrappedLine is part of a pragma directive.
48 bool InPragmaDirective = false;
49 /// Whether it is part of a macro body.
50 bool InMacroBody = false;
51
52 /// Nesting level of unbraced body of a control statement.
53 unsigned UnbracedBodyLevel = 0;
54
55 bool MustBeDeclaration = false;
56
57 /// Whether the parser has seen \c decltype(auto) in this line.
58 bool SeenDecltypeAuto = false;
59
60 /// \c True if this line should be indented by ContinuationIndent in
61 /// addition to the normal indentation level.
62 bool IsContinuation = false;
63
64 /// If this \c UnwrappedLine closes a block in a sequence of lines,
65 /// \c MatchingOpeningBlockLineIndex stores the index of the corresponding
66 /// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be
67 /// \c kInvalidIndex.
69
70 /// If this \c UnwrappedLine opens a block, stores the index of the
71 /// line with the corresponding closing brace.
73
74 static const size_t kInvalidIndex = -1;
75
76 unsigned FirstStartColumn = 0;
77};
78
79/// Interface for users of the UnwrappedLineParser to receive the parsed lines.
80/// Parsing a single snippet of code can lead to multiple runs, where each
81/// run is a coherent view of the file.
82///
83/// For example, different runs are generated:
84/// - for different combinations of #if blocks
85/// - when macros are involved, for the expanded code and the as-written code
86///
87/// Some tokens will only be visible in a subset of the runs.
88/// For each run, \c UnwrappedLineParser will call \c consumeUnwrappedLine
89/// for each parsed unwrapped line, and then \c finishRun to indicate
90/// that the set of unwrapped lines before is one coherent view of the
91/// code snippet to be formatted.
93public:
95 virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
96 virtual void finishRun() = 0;
97};
98
100
102public:
103 UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style,
104 const AdditionalKeywords &Keywords,
105 unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
106 UnwrappedLineConsumer &Callback,
107 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
108 IdentifierTable &IdentTable);
109
110 void parse();
111
112private:
113 enum class IfStmtKind {
114 NotIf, // Not an if statement.
115 IfOnly, // An if statement without the else clause.
116 IfElse, // An if statement followed by else but not else if.
117 IfElseIf // An if statement followed by else if.
118 };
119
120 void reset();
121 void parseFile();
122 bool precededByCommentOrPPDirective() const;
123 bool parseLevel(const FormatToken *OpeningBrace = nullptr,
124 IfStmtKind *IfKind = nullptr,
125 FormatToken **IfLeftBrace = nullptr);
126 bool mightFitOnOneLine(UnwrappedLine &Line,
127 const FormatToken *OpeningBrace = nullptr) const;
128 FormatToken *parseBlock(bool MustBeDeclaration = false,
129 unsigned AddLevels = 1u, bool MunchSemi = true,
130 bool KeepBraces = true, IfStmtKind *IfKind = nullptr,
131 bool UnindentWhitesmithsBraces = false);
132 void parseChildBlock();
133 void parsePPDirective();
134 void parsePPDefine();
135 void parsePPIf(bool IfDef);
136 void parsePPElse();
137 void parsePPEndIf();
138 void parsePPPragma();
139 void parsePPUnknown();
140 void readTokenWithJavaScriptASI();
141 void parseStructuralElement(const FormatToken *OpeningBrace = nullptr,
142 IfStmtKind *IfKind = nullptr,
143 FormatToken **IfLeftBrace = nullptr,
144 bool *HasDoWhile = nullptr,
145 bool *HasLabel = nullptr);
146 bool tryToParseBracedList();
147 bool parseBracedList(bool IsAngleBracket = false, bool IsEnum = false);
148 bool parseParens(TokenType AmpAmpTokenType = TT_Unknown,
149 bool InMacroCall = false);
150 void parseSquare(bool LambdaIntroducer = false);
151 void keepAncestorBraces();
152 void parseUnbracedBody(bool CheckEOF = false);
153 void handleAttributes();
154 bool handleCppAttributes();
155 bool isBlockBegin(const FormatToken &Tok) const;
156 FormatToken *parseIfThenElse(IfStmtKind *IfKind, bool KeepBraces = false,
157 bool IsVerilogAssert = false);
158 void parseTryCatch();
159 void parseLoopBody(bool KeepBraces, bool WrapRightBrace);
160 void parseForOrWhileLoop(bool HasParens = true);
161 void parseDoWhile();
162 void parseLabel(bool LeftAlignLabel = false);
163 void parseCaseLabel();
164 void parseSwitch(bool IsExpr);
165 void parseNamespace();
166 bool parseModuleImport();
167 void parseNew();
168 void parseAccessSpecifier();
169 bool parseEnum();
170 bool parseStructLike();
171 bool parseRequires(bool SeenEqual);
172 void parseRequiresClause(FormatToken *RequiresToken);
173 void parseRequiresExpression(FormatToken *RequiresToken);
174 void parseConstraintExpression();
175 void parseCppExportBlock();
176 void parseNamespaceOrExportBlock(unsigned AddLevels);
177 void parseJavaEnumBody();
178 // Parses a record (aka class) as a top level element. If ParseAsExpr is true,
179 // parses the record as a child block, i.e. if the class declaration is an
180 // expression.
181 void parseRecord(bool ParseAsExpr = false, bool IsJavaRecord = false);
182 void parseObjCLightweightGenerics();
183 void parseObjCMethod();
184 void parseObjCProtocolList();
185 void parseObjCUntilAtEnd();
186 void parseObjCInterfaceOrImplementation();
187 bool parseObjCProtocol();
188 void parseJavaScriptEs6ImportExport();
189 void parseStatementMacro();
190 void parseCSharpAttribute();
191 // Parse a C# generic type constraint: `where T : IComparable<T>`.
192 // See:
193 // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint
194 void parseCSharpGenericTypeConstraint();
195 bool tryToParseLambda();
196 bool tryToParseChildBlock();
197 bool tryToParseLambdaIntroducer();
198 bool tryToParsePropertyAccessor();
199 void tryToParseJSFunction();
200 bool tryToParseSimpleAttribute();
201 void parseVerilogHierarchyIdentifier();
202 void parseVerilogSensitivityList();
203 // Returns the number of levels of indentation in addition to the normal 1
204 // level for a block, used for indenting case labels.
205 unsigned parseVerilogHierarchyHeader();
206 void parseVerilogTable();
207 void parseVerilogCaseLabel();
208 // For import, export, and extern.
209 void parseVerilogExtern();
210 std::optional<llvm::SmallVector<llvm::SmallVector<FormatToken *, 8>, 1>>
211 parseMacroCall();
212
213 // Used by addUnwrappedLine to denote whether to keep or remove a level
214 // when resetting the line state.
215 enum class LineLevel { Remove, Keep };
216
217 void addUnwrappedLine(LineLevel AdjustLevel = LineLevel::Remove);
218 bool eof() const;
219 // LevelDifference is the difference of levels after and before the current
220 // token. For example:
221 // - if the token is '{' and opens a block, LevelDifference is 1.
222 // - if the token is '}' and closes a block, LevelDifference is -1.
223 void nextToken(int LevelDifference = 0);
224 void readToken(int LevelDifference = 0);
225
226 // Decides which comment tokens should be added to the current line and which
227 // should be added as comments before the next token.
228 //
229 // Comments specifies the sequence of comment tokens to analyze. They get
230 // either pushed to the current line or added to the comments before the next
231 // token.
232 //
233 // NextTok specifies the next token. A null pointer NextTok is supported, and
234 // signifies either the absence of a next token, or that the next token
235 // shouldn't be taken into account for the analysis.
236 void distributeComments(const ArrayRef<FormatToken *> &Comments,
237 const FormatToken *NextTok);
238
239 // Adds the comment preceding the next token to unwrapped lines.
240 void flushComments(bool NewlineBeforeNext);
241 void pushToken(FormatToken *Tok);
242 void calculateBraceTypes(bool ExpectClassBody = false);
243 void setPreviousRBraceType(TokenType Type);
244
245 // Marks a conditional compilation edge (for example, an '#if', '#ifdef',
246 // '#else' or merge conflict marker). If 'Unreachable' is true, assumes
247 // this branch either cannot be taken (for example '#if false'), or should
248 // not be taken in this round.
249 void conditionalCompilationCondition(bool Unreachable);
250 void conditionalCompilationStart(bool Unreachable);
251 void conditionalCompilationAlternative();
252 void conditionalCompilationEnd();
253
254 bool isOnNewLine(const FormatToken &FormatTok);
255
256 // Returns whether there is a macro expansion in the line, i.e. a token that
257 // was expanded from a macro call.
258 bool containsExpansion(const UnwrappedLine &Line) const;
259
260 // Compute hash of the current preprocessor branch.
261 // This is used to identify the different branches, and thus track if block
262 // open and close in the same branch.
263 size_t computePPHash() const;
264
// Returns true when CurrentLines does not point at Lines. Per the comment on
// CurrentLines, that happens while a preprocessor directive is being parsed
// with an unfinished previous unwrapped line, in which case new lines are
// collected in PreprocessorDirectives instead.
265 bool parsingPPDirective() const { return CurrentLines != &Lines; }
266
267 // FIXME: We are constantly running into bugs where Line.Level is incorrectly
268 // decremented below 0. Introduce a method to subtract from Line.Level
269 // and use that everywhere in the Parser.
270 std::unique_ptr<UnwrappedLine> Line;
271
272 // Lines that are created by macro expansion.
273 // When formatting code containing macro calls, we first format the expanded
274 // lines to set the token types correctly. Afterwards, we format the
275 // reconstructed macro calls, re-using the token types determined in the first
276 // step.
277 // CurrentExpandedLines will be reset every time we create a new LineAndExpansion
278 // instance once a line containing macro calls has been parsed.
279 SmallVector<UnwrappedLine, 8> CurrentExpandedLines;
280
281 // Maps from the first token of a top-level UnwrappedLine that contains
282 // a macro call to the replacement UnwrappedLines expanded from the macro
283 // call.
284 llvm::DenseMap<FormatToken *, SmallVector<UnwrappedLine, 8>> ExpandedLines;
285
286 // Map from the macro identifier to a line containing the full unexpanded
287 // macro call.
288 llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> Unexpanded;
289
290 // For recursive macro expansions, trigger reconstruction only on the
291 // outermost expansion.
292 bool InExpansion = false;
293
294 // Set while we reconstruct a macro call.
295 // For reconstruction, we feed the expanded lines into the reconstructor
296 // until it is finished.
297 std::optional<MacroCallReconstructor> Reconstruct;
298
299 // Comments are sorted into unwrapped lines by whether they are in the same
300 // line as the previous token, or not. If not, they belong to the next token.
301 // Since the next token might already be in a new unwrapped line, we need to
302 // store the comments belonging to that token.
303 SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
304
305 FormatToken *FormatTok = nullptr;
306
307 // Has just finished parsing a preprocessor line.
308 bool AtEndOfPPLine;
309
310 // The parsed lines. Only added to through \c CurrentLines.
312
313 // Preprocessor directives are parsed out-of-order from other unwrapped lines.
314 // Thus, we need to keep a list of preprocessor directives to be reported
315 // after an unwrapped line that has been started was finished.
316 SmallVector<UnwrappedLine, 4> PreprocessorDirectives;
317
318 // New unwrapped lines are added via CurrentLines.
319 // Usually points to \c &Lines. While parsing a preprocessor directive when
320 // there is an unfinished previous unwrapped line, will point to
321 // \c &PreprocessorDirectives.
322 SmallVectorImpl<UnwrappedLine> *CurrentLines;
323
324 // We store for each line whether it must be a declaration depending on
325 // whether we are in a compound statement or not.
326 llvm::BitVector DeclarationScopeStack;
327
328 const FormatStyle &Style;
329 bool IsCpp;
330 LangOptions LangOpts;
331 const AdditionalKeywords &Keywords;
332
333 llvm::Regex CommentPragmasRegex;
334
335 FormatTokenSource *Tokens;
336 UnwrappedLineConsumer &Callback;
337
338 ArrayRef<FormatToken *> AllTokens;
339
340 // Keeps a stack of the states of nested control statements (true if the
341 // statement contains more than some predefined number of nested statements).
342 SmallVector<bool, 8> NestedTooDeep;
343
344 // Keeps a stack of the states of nested lambdas (true if the return type of
345 // the lambda is `decltype(auto)`).
346 SmallVector<bool, 4> NestedLambdas;
347
348 // Whether the parser is parsing the body of a function whose return type is
349 // `decltype(auto)`.
350 bool IsDecltypeAutoFunction = false;
351
352 // Represents preprocessor branch type, so we can find matching
353 // #if/#else/#endif directives.
354 enum PPBranchKind {
355 PP_Conditional, // Any #if, #ifdef, #ifndef or #elif block outside #if 0
356 PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0
357 };
358
// One preprocessor branching directive: its PPBranchKind together with a
// line value supplied by whoever constructs the entry.
// NOTE(review): Line is presumably the index of the unwrapped line at which
// the branch was opened -- confirm against the implementation file.
359 struct PPBranch {
// Stores both arguments unchanged in the members of the same name.
360 PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {}
361 PPBranchKind Kind;
362 size_t Line;
363 };
364
365 // Keeps a stack of currently active preprocessor branching directives.
367
368 // The \c UnwrappedLineParser re-parses the code for each combination
369 // of preprocessor branches that can be taken.
370 // To that end, we take the same branch (#if, #else, or one of the #elif
371 // branches) for each nesting level of preprocessor branches.
372 // \c PPBranchLevel stores the current nesting level of preprocessor
373 // branches during one pass over the code.
374 int PPBranchLevel;
375
376 // Contains the current branch (#if, #else or one of the #elif branches)
377 // for each nesting level.
378 SmallVector<int, 8> PPLevelBranchIndex;
379
380 // Contains the maximum number of branches at each nesting level.
381 SmallVector<int, 8> PPLevelBranchCount;
382
383 // Contains the number of branches per nesting level we are currently
384 // in while parsing a preprocessor branch sequence.
385 // This is used to update PPLevelBranchCount at the end of a branch
386 // sequence.
387 std::stack<int> PPChainBranchIndex;
388
389 // Include guard search state. Used to fixup preprocessor indent levels
390 // so that include guards do not participate in indentation.
391 enum IncludeGuardState {
392 IG_Inited, // Search started, looking for #ifndef.
393 IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition.
394 IG_Defined, // Matching #define found, checking other requirements.
395 IG_Found, // All requirements met, need to fix indents.
396 IG_Rejected, // Search failed or never started.
397 };
398
399 // Current state of include guard search.
400 IncludeGuardState IncludeGuard;
401
// Maps a preprocessor directive indent style to an include guard search
// state: IG_Rejected when the style is PPDIS_None or PPDIS_Leave, and
// IG_Inited for every other style.
// Note that the parameter named Style shadows the Style data member of this
// class; only the argument is inspected here.
402 IncludeGuardState
403 getIncludeGuardState(FormatStyle::PPDirectiveIndentStyle Style) const {
404 return Style == FormatStyle::PPDIS_None || Style == FormatStyle::PPDIS_Leave
405 ? IG_Rejected
406 : IG_Inited;
407 }
408
409 // Points to the #ifndef condition for a potential include guard. Null unless
410 // IncludeGuardState == IG_IfNdefed.
411 FormatToken *IncludeGuardToken;
412
413 // Contains the first start column where the source begins. This is zero for
414 // normal source code and may be nonzero when formatting a code fragment that
415 // does not start at the beginning of the file.
416 unsigned FirstStartColumn;
417
418 MacroExpander Macros;
419
420 friend class ScopedLineState;
422};
423
433
434std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line);
435
436} // end namespace format
437} // end namespace clang
438
439#endif
Token Tok
The Token.
SmallVector< AnnotatedLine *, 1 > Children
If this token starts a block, this contains all the unwrapped lines in it.
This file contains the main building blocks of macro support in clang-format.
Implements an efficient mapping from strings to IdentifierInfo nodes.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
This class handles loading and caching of source files into memory.
The base class of the type hierarchy.
Definition TypeBase.h:1833
Takes a set of macro definitions as strings and allows expanding calls to those macros.
Definition Macros.h:80
Interface for users of the UnwrappedLineParser to receive the parsed lines.
virtual void consumeUnwrappedLine(const UnwrappedLine &Line)=0
UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style, const AdditionalKeywords &Keywords, unsigned FirstStartColumn, ArrayRef< FormatToken * > Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
std::ostream & operator<<(std::ostream &Stream, const UnwrappedLine &Line)
TokenType
Determines the semantic type of a syntactic token, e.g.
The JSON file list parser is used to communicate input to InstallAPI.
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
Encapsulates keywords that are context sensitive or for languages not properly supported by Clang's l...
Represents a complete lambda introducer.
Definition DeclSpec.h:2806
A wrapper around a Token storing information about the whitespace characters preceding it.
SmallVector< UnwrappedLine, 0 > Children
UnwrappedLineNode(FormatToken *Tok, llvm::ArrayRef< UnwrappedLine > Children={})
An unwrapped line is a sequence of Token, that we would like to put on a single line if there was no ...
unsigned PPLevel
The PPBranchLevel (adjusted for header guards) if this line is a InMacroBody line,...
bool InMacroBody
Whether it is part of a macro body.
std::list< UnwrappedLineNode > Tokens
The Tokens comprising this UnwrappedLine.
bool IsContinuation
True if this line should be indented by ContinuationIndent in addition to the normal indentation level.
unsigned Level
The indent level of the UnwrappedLine.
unsigned UnbracedBodyLevel
Nesting level of unbraced body of a control statement.
bool InPragmaDirective
Whether this UnwrappedLine is part of a pragma directive.
bool InPPDirective
Whether this UnwrappedLine is part of a preprocessor directive.
bool SeenDecltypeAuto
Whether the parser has seen decltype(auto) in this line.
size_t MatchingClosingBlockLineIndex
If this UnwrappedLine opens a block, stores the index of the line with the corresponding closing brac...
size_t MatchingOpeningBlockLineIndex
If this UnwrappedLine closes a block in a sequence of lines, MatchingOpeningBlockLineIndex stores the...