clang 17.0.0git
UnwrappedLineParser.h
Go to the documentation of this file.
1//===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the declaration of the UnwrappedLineParser,
11/// which turns a stream of tokens into UnwrappedLines.
12///
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
16#define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
17
18#include "Encoding.h"
19#include "FormatToken.h"
20#include "Macros.h"
22#include "clang/Format/Format.h"
23#include "llvm/ADT/ArrayRef.h"
24#include "llvm/ADT/BitVector.h"
25#include "llvm/ADT/DenseSet.h"
26#include "llvm/Support/Regex.h"
27#include <list>
28#include <stack>
29#include <vector>
30
31namespace clang {
32namespace format {
33
34struct UnwrappedLineNode;
35
36/// An unwrapped line is a sequence of \c Token, that we would like to
37/// put on a single line if there was no column limit.
38///
39/// This is used as a main interface between the \c UnwrappedLineParser and the
40/// \c UnwrappedLineFormatter. The key property is that changing the formatting
41/// within an unwrapped line does not affect any other unwrapped lines.
44
45 /// The \c Tokens comprising this \c UnwrappedLine.
46 std::list<UnwrappedLineNode> Tokens;
47
48 /// The indent level of the \c UnwrappedLine.
49 unsigned Level;
50
51 /// The \c PPBranchLevel (adjusted for header guards) if this line is a
52 /// \c InMacroBody line, and 0 otherwise.
53 unsigned PPLevel;
54
55 /// Whether this \c UnwrappedLine is part of a preprocessor directive.
57 /// Whether this \c UnwrappedLine is part of a pragma directive.
59 /// Whether it is part of a macro body.
61
63
64 /// \c True if this line should be indented by ContinuationIndent in
65 /// addition to the normal indention level.
66 bool IsContinuation = false;
67
68 /// If this \c UnwrappedLine closes a block in a sequence of lines,
69 /// \c MatchingOpeningBlockLineIndex stores the index of the corresponding
70 /// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be
71 /// \c kInvalidIndex.
73
74 /// If this \c UnwrappedLine opens a block, stores the index of the
75 /// line with the corresponding closing brace.
77
78 static const size_t kInvalidIndex = -1;
79
80 unsigned FirstStartColumn = 0;
81};
82
83/// Interface for users of the UnwrappedLineParser to receive the parsed lines.
84/// Parsing a single snippet of code can lead to multiple runs, where each
85/// run is a coherent view of the file.
86///
87/// For example, different runs are generated:
88/// - for different combinations of #if blocks
89/// - when macros are involved, for the expanded code and the as-written code
90///
91/// Some tokens will only be visible in a subset of the runs.
92/// For each run, \c UnwrappedLineParser will call \c consumeUnwrappedLine
93/// for each parsed unwrapped line, and then \c finishRun to indicate
94/// that the set of unwrapped lines before is one coherent view of the
95/// code snippet to be formatted.
97public:
99 virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
100 virtual void finishRun() = 0;
101};
102
104
106public:
107 UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style,
108 const AdditionalKeywords &Keywords,
109 unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
110 UnwrappedLineConsumer &Callback,
111 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
112 IdentifierTable &IdentTable);
113
114 void parse();
115
116private:
// Kinds of if statement distinguished by the parser. Values of this type are
// passed by pointer as the IfKind parameter of parseLevel, parseBlock,
// parseStructuralElement and parseIfThenElse.
117 enum class IfStmtKind {
118 NotIf, // Not an if statement.
119 IfOnly, // An if statement without the else clause.
120 IfElse, // An if statement followed by else but not else if.
121 IfElseIf // An if statement followed by else if.
122 };
123
124 void reset();
125 void parseFile();
126 bool precededByCommentOrPPDirective() const;
127 bool parseLevel(const FormatToken *OpeningBrace = nullptr,
128 bool CanContainBracedList = true,
129 TokenType NextLBracesType = TT_Unknown,
130 IfStmtKind *IfKind = nullptr,
131 FormatToken **IfLeftBrace = nullptr);
132 bool mightFitOnOneLine(UnwrappedLine &Line,
133 const FormatToken *OpeningBrace = nullptr) const;
134 FormatToken *parseBlock(bool MustBeDeclaration = false,
135 unsigned AddLevels = 1u, bool MunchSemi = true,
136 bool KeepBraces = true, IfStmtKind *IfKind = nullptr,
137 bool UnindentWhitesmithsBraces = false,
138 bool CanContainBracedList = true,
139 TokenType NextLBracesType = TT_Unknown);
140 void parseChildBlock(bool CanContainBracedList = true,
141 TokenType NextLBracesType = TT_Unknown);
142 void parsePPDirective();
143 void parsePPDefine();
144 void parsePPIf(bool IfDef);
145 void parsePPElse();
146 void parsePPEndIf();
147 void parsePPPragma();
148 void parsePPUnknown();
149 void readTokenWithJavaScriptASI();
150 void parseStructuralElement(bool IsTopLevel = false,
151 TokenType NextLBracesType = TT_Unknown,
152 IfStmtKind *IfKind = nullptr,
153 FormatToken **IfLeftBrace = nullptr,
154 bool *HasDoWhile = nullptr,
155 bool *HasLabel = nullptr);
156 bool tryToParseBracedList();
157 bool parseBracedList(bool ContinueOnSemicolons = false, bool IsEnum = false,
158 tok::TokenKind ClosingBraceKind = tok::r_brace);
159 void parseParens(TokenType AmpAmpTokenType = TT_Unknown);
160 void parseSquare(bool LambdaIntroducer = false);
161 void keepAncestorBraces();
162 void parseUnbracedBody(bool CheckEOF = false);
163 void handleAttributes();
164 bool handleCppAttributes();
165 bool isBlockBegin(const FormatToken &Tok) const;
166 FormatToken *parseIfThenElse(IfStmtKind *IfKind, bool KeepBraces = false);
167 void parseTryCatch();
168 void parseLoopBody(bool KeepBraces, bool WrapRightBrace);
169 void parseForOrWhileLoop(bool HasParens = true);
170 void parseDoWhile();
171 void parseLabel(bool LeftAlignLabel = false);
172 void parseCaseLabel();
173 void parseSwitch();
174 void parseNamespace();
175 bool parseModuleImport();
176 void parseNew();
177 void parseAccessSpecifier();
178 bool parseEnum();
179 bool parseStructLike();
180 bool parseRequires();
181 void parseRequiresClause(FormatToken *RequiresToken);
182 void parseRequiresExpression(FormatToken *RequiresToken);
183 void parseConstraintExpression();
184 void parseJavaEnumBody();
185 // Parses a record (aka class) as a top level element. If ParseAsExpr is true,
186 // parses the record as a child block, i.e. if the class declaration is an
187 // expression.
188 void parseRecord(bool ParseAsExpr = false);
189 void parseObjCLightweightGenerics();
190 void parseObjCMethod();
191 void parseObjCProtocolList();
192 void parseObjCUntilAtEnd();
193 void parseObjCInterfaceOrImplementation();
194 bool parseObjCProtocol();
195 void parseJavaScriptEs6ImportExport();
196 void parseStatementMacro();
197 void parseCSharpAttribute();
198 // Parse a C# generic type constraint: `where T : IComparable<T>`.
199 // See:
200 // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint
201 void parseCSharpGenericTypeConstraint();
202 bool tryToParseLambda();
203 bool tryToParseChildBlock();
204 bool tryToParseLambdaIntroducer();
205 bool tryToParsePropertyAccessor();
206 void tryToParseJSFunction();
207 bool tryToParseSimpleAttribute();
208 void parseVerilogHierarchyIdentifier();
209 void parseVerilogSensitivityList();
210 // Returns the number of levels of indentation in addition to the normal 1
211 // level for a block, used for indenting case labels.
212 unsigned parseVerilogHierarchyHeader();
213 void parseVerilogTable();
214 void parseVerilogCaseLabel();
215 std::optional<llvm::SmallVector<llvm::SmallVector<FormatToken *, 8>, 1>>
216 parseMacroCall();
217
218 // Used by addUnwrappedLine to denote whether to keep or remove a level
219 // when resetting the line state.
220 enum class LineLevel { Remove, Keep };
221
222 void addUnwrappedLine(LineLevel AdjustLevel = LineLevel::Remove);
223 bool eof() const;
224 // LevelDifference is the difference of levels after and before the current
225 // token. For example:
226 // - if the token is '{' and opens a block, LevelDifference is 1.
227 // - if the token is '}' and closes a block, LevelDifference is -1.
228 void nextToken(int LevelDifference = 0);
229 void readToken(int LevelDifference = 0);
230
231 // Decides which comment tokens should be added to the current line and which
232 // should be added as comments before the next token.
233 //
234 // Comments specifies the sequence of comment tokens to analyze. They get
235 // either pushed to the current line or added to the comments before the next
236 // token.
237 //
238 // NextTok specifies the next token. A null pointer NextTok is supported, and
239 // signifies either the absence of a next token, or that the next token
240 // shouldn't be taken into account for the analysis.
241 void distributeComments(const SmallVectorImpl<FormatToken *> &Comments,
242 const FormatToken *NextTok);
243
244 // Adds the comment preceding the next token to unwrapped lines.
245 void flushComments(bool NewlineBeforeNext);
246 void pushToken(FormatToken *Tok);
247 void calculateBraceTypes(bool ExpectClassBody = false);
248
249 // Marks a conditional compilation edge (for example, an '#if', '#ifdef',
250 // '#else' or merge conflict marker). If 'Unreachable' is true, assumes
251 // this branch either cannot be taken (for example '#if false'), or should
252 // not be taken in this round.
253 void conditionalCompilationCondition(bool Unreachable);
254 void conditionalCompilationStart(bool Unreachable);
255 void conditionalCompilationAlternative();
256 void conditionalCompilationEnd();
257
258 bool isOnNewLine(const FormatToken &FormatTok);
259
260 // Returns whether there is a macro expansion in the line, i.e. a token that
261 // was expanded from a macro call.
262 bool containsExpansion(const UnwrappedLine &Line) const;
263
264 // Compute hash of the current preprocessor branch.
265 // This is used to identify the different branches, and thus track if block
266 // open and close in the same branch.
267 size_t computePPHash() const;
268
// Returns true when CurrentLines does not point at Lines.
// NOTE(review): the declaration of Lines is not visible in this view; the
// comment on CurrentLines says it usually points to &Lines and is redirected
// to &PreprocessorDirectives while a preprocessor directive is parsed.
269 bool parsingPPDirective() const { return CurrentLines != &Lines; }
270
271 // FIXME: We are constantly running into bugs where Line.Level is incorrectly
272 // subtracted from beyond 0. Introduce a method to subtract from Line.Level
273 // and use that everywhere in the Parser.
274 std::unique_ptr<UnwrappedLine> Line;
275
276 // Lines that are created by macro expansion.
277 // When formatting code containing macro calls, we first format the expanded
278 // lines to set the token types correctly. Afterwards, we format the
279 // reconstructed macro calls, re-using the token types determined in the first
280 // step.
281 // ExpandedLines will be reset every time we create a new LineAndExpansion
282 // instance once a line containing macro calls has been parsed.
283 SmallVector<UnwrappedLine, 8> CurrentExpandedLines;
284
285 // Maps from the first token of a top-level UnwrappedLine that contains
286 // a macro call to the replacement UnwrappedLines expanded from the macro
287 // call.
288 llvm::DenseMap<FormatToken *, SmallVector<UnwrappedLine, 8>> ExpandedLines;
289
290 // Map from the macro identifier to a line containing the full unexpanded
291 // macro call.
292 llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> Unexpanded;
293
294 // For recursive macro expansions, trigger reconstruction only on the
295 // outermost expansion.
296 bool InExpansion = false;
297
298 // Set while we reconstruct a macro call.
299 // For reconstruction, we feed the expanded lines into the reconstructor
300 // until it is finished.
301 std::optional<MacroCallReconstructor> Reconstruct;
302
303 // Comments are sorted into unwrapped lines by whether they are in the same
304 // line as the previous token, or not. If not, they belong to the next token.
305 // Since the next token might already be in a new unwrapped line, we need to
306 // store the comments belonging to that token.
307 SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
308 FormatToken *FormatTok;
309 bool MustBreakBeforeNextToken;
310
311 // The parsed lines. Only added to through \c CurrentLines.
313
314 // Preprocessor directives are parsed out-of-order from other unwrapped lines.
315 // Thus, we need to keep a list of preprocessor directives to be reported
316 // after an unwrapped line that has been started was finished.
317 SmallVector<UnwrappedLine, 4> PreprocessorDirectives;
318
319 // New unwrapped lines are added via CurrentLines.
320 // Usually points to \c &Lines. While parsing a preprocessor directive when
321 // there is an unfinished previous unwrapped line, will point to
322 // \c &PreprocessorDirectives.
323 SmallVectorImpl<UnwrappedLine> *CurrentLines;
324
325 // We store for each line whether it must be a declaration depending on
326 // whether we are in a compound statement or not.
327 llvm::BitVector DeclarationScopeStack;
328
329 const FormatStyle &Style;
330 const AdditionalKeywords &Keywords;
331
332 llvm::Regex CommentPragmasRegex;
333
334 FormatTokenSource *Tokens;
335 UnwrappedLineConsumer &Callback;
336
337 ArrayRef<FormatToken *> AllTokens;
338
339 // Keeps a stack of the states of nested control statements (true if the
340 // statement contains more than some predefined number of nested statements).
341 SmallVector<bool, 8> NestedTooDeep;
342
343 // Represents preprocessor branch type, so we can find matching
344 // #if/#else/#endif directives.
// Unscoped enum; stored in PPBranch::Kind.
345 enum PPBranchKind {
346 PP_Conditional, // Any #if, #ifdef, #ifndef or #elif block outside #if 0
347 PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0
348 };
349
// Pairs a preprocessor branch kind with a size_t line value.
350 struct PPBranch {
// Stores both arguments unchanged in the members of the same name.
351 PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {}
// PP_Conditional or PP_Unreachable.
352 PPBranchKind Kind;
// NOTE(review): whether this is a line index or some other per-line
// identifier is not visible here — confirm against the implementation.
353 size_t Line;
354 };
355
356 // Keeps a stack of currently active preprocessor branching directives.
358
359 // The \c UnwrappedLineParser re-parses the code for each combination
360 // of preprocessor branches that can be taken.
361 // To that end, we take the same branch (#if, #else, or one of the #elif
362 // branches) for each nesting level of preprocessor branches.
363 // \c PPBranchLevel stores the current nesting level of preprocessor
364 // branches during one pass over the code.
365 int PPBranchLevel;
366
367 // Contains the current branch (#if, #else or one of the #elif branches)
368 // for each nesting level.
369 SmallVector<int, 8> PPLevelBranchIndex;
370
371 // Contains the maximum number of branches at each nesting level.
372 SmallVector<int, 8> PPLevelBranchCount;
373
374 // Contains the number of branches per nesting level we are currently
375 // in while parsing a preprocessor branch sequence.
376 // This is used to update PPLevelBranchCount at the end of a branch
377 // sequence.
378 std::stack<int> PPChainBranchIndex;
379
380 // Include guard search state. Used to fixup preprocessor indent levels
381 // so that include guards do not participate in indentation.
382 enum IncludeGuardState {
383 IG_Inited, // Search started, looking for #ifndef.
384 IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition.
385 IG_Defined, // Matching #define found, checking other requirements.
386 IG_Found, // All requirements met, need to fix indents.
387 IG_Rejected, // Search failed or never started.
388 };
389
390 // Current state of include guard search.
391 IncludeGuardState IncludeGuard;
392
393 // Points to the #ifndef condition for a potential include guard. Null unless
394 // IncludeGuard == IG_IfNdefed.
395 FormatToken *IncludeGuardToken;
396
397 // Contains the first start column where the source begins. This is zero for
398 // normal source code and may be nonzero when formatting a code fragment that
399 // does not start at the beginning of the file.
400 unsigned FirstStartColumn;
401
402 MacroExpander Macros;
403
404 friend class ScopedLineState;
406};
407
409 UnwrappedLineNode() : Tok(nullptr) {}
412 : Tok(Tok), Children(Children.begin(), Children.end()) {}
413
416};
417
419 : Level(0), PPLevel(0), InPPDirective(false), InPragmaDirective(false),
420 InMacroBody(false), MustBeDeclaration(false),
421 MatchingOpeningBlockLineIndex(kInvalidIndex) {}
422
423} // end namespace format
424} // end namespace clang
425
426#endif
Contains functions for text encoding manipulation.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
Various functions to configurably format source code.
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
This file contains the main building blocks of macro support in clang-format.
Implements an efficient mapping from strings to IdentifierInfo nodes.
This class handles loading and caching of source files into memory.
Takes a set of macro definitions as strings and allows expanding calls to those macros.
Definition: Macros.h:86
Interface for users of the UnwrappedLineParser to receive the parsed lines.
virtual void consumeUnwrappedLine(const UnwrappedLine &Line)=0
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:164
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
#define false
Definition: stdbool.h:22
Represents a complete lambda introducer.
Definition: DeclSpec.h:2755
Encapsulates keywords that are context sensitive or for languages not properly supported by Clang's l...
Definition: FormatToken.h:935
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:55
A wrapper around a Token storing information about the whitespace characters preceding it.
Definition: FormatToken.h:249
SmallVector< UnwrappedLine, 0 > Children
UnwrappedLineNode(FormatToken *Tok, llvm::ArrayRef< UnwrappedLine > Children={})
An unwrapped line is a sequence of Token, that we would like to put on a single line if there was no ...
unsigned PPLevel
The PPBranchLevel (adjusted for header guards) if this line is a InMacroBody line,...
bool InMacroBody
Whether it is part of a macro body.
std::list< UnwrappedLineNode > Tokens
The Tokens comprising this UnwrappedLine.
bool IsContinuation
True if this line should be indented by ContinuationIndent in addition to the normal indention level.
unsigned Level
The indent level of the UnwrappedLine.
bool InPragmaDirective
Whether this UnwrappedLine is part of a pragma directive.
bool InPPDirective
Whether this UnwrappedLine is part of a preprocessor directive.
size_t MatchingClosingBlockLineIndex
If this UnwrappedLine opens a block, stores the index of the line with the corresponding closing brac...
static const size_t kInvalidIndex
size_t MatchingOpeningBlockLineIndex
If this UnwrappedLine closes a block in a sequence of lines, MatchingOpeningBlockLineIndex stores the...