clang  9.0.0svn
BreakableToken.h
Go to the documentation of this file.
1 //===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// Declares BreakableToken, BreakableStringLiteral, BreakableComment,
11 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
12 /// token type-specific logic to break long lines in tokens and reflow content
13 /// between tokens.
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
18 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 
20 #include "Encoding.h"
21 #include "TokenAnnotator.h"
22 #include "WhitespaceManager.h"
23 #include "llvm/ADT/StringSet.h"
24 #include "llvm/Support/Regex.h"
25 #include <utility>
26 
27 namespace clang {
28 namespace format {
29 
30 /// Checks if \p Token switches formatting, like /* clang-format off */.
31 /// \p Token must be a comment.
32 bool switchesFormatting(const FormatToken &Token);
33 
34 struct FormatStyle;
35 
36 /// Base class for tokens / ranges of tokens that can allow breaking
37 /// within the tokens - for example, to avoid whitespace beyond the column
38 /// limit, or to reflow text.
39 ///
40 /// Generally, a breakable token consists of logical lines, addressed by a line
41 /// index. For example, in a sequence of line comments, each line comment is its
42 /// own logical line; similarly, for a block comment, each line in the block
43 /// comment is on its own logical line.
44 ///
45 /// There are two methods to compute the layout of the token:
46 /// - getRangeLength measures the number of columns needed for a range of text
47 /// within a logical line, and
48 /// - getContentStartColumn returns the start column at which we want the
49 /// content of a logical line to start (potentially after introducing a line
50 /// break).
51 ///
52 /// The mechanism to adapt the layout of the breakable token is organised
53 /// around the concept of a \c Split, which is a whitespace range that signifies
54 /// a position of the content of a token where a reformatting might be done.
55 ///
56 /// Operating with splits is divided into two operations:
57 /// - getSplit, for finding a split starting at a position,
58 /// - insertBreak, for executing the split using a whitespace manager.
59 ///
60 /// There is a pair of operations that are used to compress a long whitespace
61 /// range with a single space if that will bring the line length under the
62 /// column limit:
63 /// - getLengthAfterCompression, for calculating the size in columns of the
64 /// line after a whitespace range has been compressed, and
65 /// - compressWhitespace, for executing the whitespace compression using a
66 /// whitespace manager; note that the compressed whitespace may be in the
67 /// middle of the original line and of the reformatted line.
68 ///
69 /// For tokens where the whitespace before each line needs to be also
70 /// reformatted, for example for tokens supporting reflow, there are analogous
71 /// operations that might be executed before the main line breaking occurs:
72 /// - getReflowSplit, for finding a split such that the content preceding it
73 /// needs to be specially reflown,
74 /// - reflow, for executing the split using a whitespace manager,
75 /// - introducesBreakBeforeToken, for checking if reformatting the beginning
76 /// of the content introduces a line break before it,
77 /// - adaptStartOfLine, for executing the reflow using a whitespace
78 /// manager.
79 ///
80 /// For tokens that require the whitespace after the last line to be
81 /// reformatted, for example in multiline jsdoc comments that require the
82 /// trailing '*/' to be on a line of its own, there are analogous operations
83 /// that might be executed after the last line has been reformatted:
84 /// - getSplitAfterLastLine, for finding a split after the last line that needs
85 /// to be reflown,
86 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
87 /// whitespace manager.
88 ///
89 class BreakableToken {
90 public:
91  /// Contains starting character index and length of split.
92  typedef std::pair<StringRef::size_type, unsigned> Split;
93 
94  virtual ~BreakableToken() {}
95 
96  /// Returns the number of lines in this token in the original code.
97  virtual unsigned getLineCount() const = 0;
98 
99  /// Returns the number of columns required to format the text in the
100  /// byte range [\p Offset, \p Offset \c + \p Length).
101  ///
102  /// \p Offset is the byte offset from the start of the content of the line
103  /// at \p LineIndex.
104  ///
105  /// \p StartColumn is the column at which the text starts in the formatted
106  /// file, needed to compute tab stops correctly.
107  virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
108  StringRef::size_type Length,
109  unsigned StartColumn) const = 0;
110 
111  /// Returns the number of columns required to format the text following
112  /// the byte \p Offset in the line \p LineIndex, including potentially
113  /// unbreakable sequences of tokens following after the end of the token.
114  ///
115  /// \p Offset is the byte offset from the start of the content of the line
116  /// at \p LineIndex.
117  ///
118  /// \p StartColumn is the column at which the text starts in the formatted
119  /// file, needed to compute tab stops correctly.
120  ///
121  /// For breakable tokens that never use extra space at the end of a line, this
122  /// is equivalent to getRangeLength with a Length of StringRef::npos.
123  virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
124  unsigned StartColumn) const {
125  return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
126  }
127 
128  /// Returns the column at which content in line \p LineIndex starts,
129  /// assuming no reflow.
130  ///
131  /// If \p Break is true, returns the column at which the line should start
132  /// after the line break.
133  /// If \p Break is false, returns the column at which the line itself will
134  /// start.
135  virtual unsigned getContentStartColumn(unsigned LineIndex,
136  bool Break) const = 0;
137 
138  /// Returns additional content indent required for the second line after the
139  /// content at line \p LineIndex is broken.
140  ///
141  // (Next lines do not start with `///` since otherwise -Wdocumentation picks
142  // up the example annotations and generates warnings for them)
143  // For example, Javadoc @param annotations require an indent of 4 spaces and
144  // in this example getContentIndent(1) returns 4.
145  // /**
146  // * @param loooooooooooooong line
147  // * continuation
148  // */
149  virtual unsigned getContentIndent(unsigned LineIndex) const {
150  return 0;
151  }
152 
153  /// Returns a range (offset, length) at which to break the line at
154  /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
155  /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
156  /// the token is formatted starting at ContentStartColumn in the reformatted
157  /// file.
158  virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
159  unsigned ColumnLimit, unsigned ContentStartColumn,
160  llvm::Regex &CommentPragmasRegex) const = 0;
161 
162  /// Emits the previously retrieved \p Split via \p Whitespaces.
163  virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
164  unsigned ContentIndent,
165  WhitespaceManager &Whitespaces) const = 0;
166 
167  /// Returns the number of columns needed to format
168  /// \p RemainingTokenColumns, assuming that Split is within the range measured
169  /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
170  /// to a single space.
171  unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
172  Split Split) const;
173 
174  /// Replaces the whitespace range described by \p Split with a single
175  /// space.
176  virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
177  Split Split,
178  WhitespaceManager &Whitespaces) const = 0;
179 
180  /// Returns whether the token supports reflowing text.
181  virtual bool supportsReflow() const { return false; }
182 
183  /// Returns a whitespace range (offset, length) of the content at \p
184  /// LineIndex such that the content of that line is reflown to the end of the
185  /// previous one.
186  ///
187  /// Returning (StringRef::npos, 0) indicates reflowing is not possible.
188  ///
189  /// The range will include any whitespace preceding the specified line's
190  /// content.
191  ///
192  /// If the split is not contained within one token, for example when reflowing
193  /// line comments, returns (0, <length>).
194  virtual Split getReflowSplit(unsigned LineIndex,
195  llvm::Regex &CommentPragmasRegex) const {
196  return Split(StringRef::npos, 0);
197  }
198 
199  /// Reflows the current line into the end of the previous one.
200  virtual void reflow(unsigned LineIndex,
201  WhitespaceManager &Whitespaces) const {}
202 
203  /// Returns whether there will be a line break at the start of the
204  /// token.
205  virtual bool introducesBreakBeforeToken() const {
206  return false;
207  }
208 
209  /// Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
210  virtual void adaptStartOfLine(unsigned LineIndex,
211  WhitespaceManager &Whitespaces) const {}
212 
213  /// Returns a whitespace range (offset, length) of the content at
214  /// the last line that needs to be reformatted after the last line has been
215  /// reformatted.
216  ///
217  /// A result having offset == StringRef::npos means that no reformat is
218  /// necessary.
219  virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
220  return Split(StringRef::npos, 0);
221  }
222 
223  /// Replaces the whitespace from \p SplitAfterLastLine on the last line after
224  /// it has been formatted, via insertBreak with a content indent of 0.
225  void replaceWhitespaceAfterLastLine(unsigned TailOffset,
226  Split SplitAfterLastLine,
227  WhitespaceManager &Whitespaces) const {
228  insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
229  /*ContentIndent=*/0, Whitespaces);
230  }
231 
232  /// Updates the next token of \p State to the next token after this
233  /// one. This can be used when this token manages a set of underlying tokens
234  /// as a unit and is responsible for formatting them. The default is a no-op.
235  virtual void updateNextToken(LineState &State) const {}
236 
237 protected:
240  : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
241  Style(Style) {}
242 
243  const FormatToken &Tok;
244  const bool InPPDirective;
247 };
248 
250 public:
251  /// Creates a breakable token for a single line string literal.
252  ///
253  /// \p StartColumn specifies the column in which the token will start
254  /// after formatting.
255  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
256  StringRef Prefix, StringRef Postfix,
257  unsigned UnbreakableTailLength, bool InPPDirective,
259 
260  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
261  unsigned ContentStartColumn,
262  llvm::Regex &CommentPragmasRegex) const override;
263  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
264  unsigned ContentIndent,
265  WhitespaceManager &Whitespaces) const override;
266  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
267  WhitespaceManager &Whitespaces) const override {}
268  unsigned getLineCount() const override;
269  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
270  StringRef::size_type Length,
271  unsigned StartColumn) const override;
272  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
273  unsigned StartColumn) const override;
274  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
275 
276 protected:
277  // The column in which the token starts.
278  unsigned StartColumn;
279  // The prefix a line needs after a break in the token.
280  StringRef Prefix;
281  // The postfix a line needs before introducing a break.
282  StringRef Postfix;
283  // The token text excluding the prefix and postfix.
284  StringRef Line;
285  // Length of the sequence of tokens after this string literal that cannot
286  // contain line breaks.
288 };
289 
291 protected:
292  /// Creates a breakable token for a comment.
293  ///
294  /// \p StartColumn specifies the column in which the comment will start after
295  /// formatting.
296  BreakableComment(const FormatToken &Token, unsigned StartColumn,
298  const FormatStyle &Style);
299 
300 public:
301  bool supportsReflow() const override { return true; }
302  unsigned getLineCount() const override;
303  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
304  unsigned ContentStartColumn,
305  llvm::Regex &CommentPragmasRegex) const override;
306  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
307  WhitespaceManager &Whitespaces) const override;
308 
309 protected:
310  // Returns the token containing the line at LineIndex.
311  const FormatToken &tokenAt(unsigned LineIndex) const;
312 
313  // Checks if the content of line LineIndex may be reflown with the previous
314  // line.
315  virtual bool mayReflow(unsigned LineIndex,
316  llvm::Regex &CommentPragmasRegex) const = 0;
317 
318  // Contains the original text of the lines of the block comment.
319  //
320  // In case of block comments, excludes the leading /* in the first line and
321  // trailing */ in the last line. In case of line comments, excludes the
322  // leading // and spaces.
324 
325  // Contains the text of the lines excluding all leading and trailing
326  // whitespace between the lines. Note that the decoration (if present) is also
327  // not considered part of the text.
329 
330  // Tokens[i] contains a reference to the token containing Lines[i] if the
331  // whitespace range before that token is managed by this block.
332  // Otherwise, Tokens[i] is a null pointer.
334 
335  // ContentColumn[i] is the target column at which Content[i] should be.
336  // Note that this excludes a leading "* " or "*" in case of block comments
337  // where all lines have a "*" prefix, or the leading "// " or "//" in case of
338  // line comments.
339  //
340  // In block comments, the first line's target column is always positive. The
341  // remaining lines' target columns are relative to the first line to allow
342  // correct indentation of comments in \c WhitespaceManager. Thus they can be
343  // negative as well (in case the first line needs to be unindented more than
344  // there's actual whitespace in another line).
346 
347  // The intended start column of the first line of text from this section.
348  unsigned StartColumn;
349 
350  // The prefix to use in front of a line that has been reflown up.
351  // For example, when reflowing the second line after the first here:
352  // // comment 1
353  // // comment 2
354  // we expect:
355  // // comment 1 comment 2
356  // and not:
357  // // comment 1comment 2
358  StringRef ReflowPrefix = " ";
359 };
360 
362 public:
363  BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
364  unsigned OriginalStartColumn, bool FirstInLine,
366  const FormatStyle &Style);
367 
368  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
369  StringRef::size_type Length,
370  unsigned StartColumn) const override;
371  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
372  unsigned StartColumn) const override;
373  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
374  unsigned getContentIndent(unsigned LineIndex) const override;
375  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
376  unsigned ContentIndent,
377  WhitespaceManager &Whitespaces) const override;
378  Split getReflowSplit(unsigned LineIndex,
379  llvm::Regex &CommentPragmasRegex) const override;
380  void reflow(unsigned LineIndex,
381  WhitespaceManager &Whitespaces) const override;
382  bool introducesBreakBeforeToken() const override;
383  void adaptStartOfLine(unsigned LineIndex,
384  WhitespaceManager &Whitespaces) const override;
385  Split getSplitAfterLastLine(unsigned TailOffset) const override;
386 
387  bool mayReflow(unsigned LineIndex,
388  llvm::Regex &CommentPragmasRegex) const override;
389 
390  // Contains Javadoc annotations that require additional indent when continued
391  // on multiple lines.
392  static const llvm::StringSet<> ContentIndentingJavadocAnnotations;
393 
394 private:
395  // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
396  //
397  // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
398  // leading and trailing whitespace.
399  //
400  // Sets ContentColumn to the intended column in which the text at
401  // Lines[LineIndex] starts (note that the decoration, if present, is not
402  // considered part of the text).
403  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
404 
405  // The column at which the text of a broken line should start.
406  // Note that an optional decoration would go before that column.
407  // IndentAtLineBreak is a uniform position for all lines in a block comment,
408  // regardless of their relative position.
409  // FIXME: Revisit the decision to do this; the main reason was to support
410  // patterns like
411  // /**************//**
412  // * Comment
413  // We could also support such patterns by special casing the first line
414  // instead.
415  unsigned IndentAtLineBreak;
416 
417  // This is to distinguish between the case when the last line was empty and
418  // the case when it started with a decoration ("*" or "* ").
419  bool LastLineNeedsDecoration;
420 
421  // Either "* " if all lines begin with a "*", or empty.
422  StringRef Decoration;
423 
424  // If this block comment has decorations, this is the column of the start of
425  // the decorations.
426  unsigned DecorationColumn;
427 
428  // If true, make sure that the opening '/**' and the closing '*/' are each on
429  // their own line. Styles like jsdoc require this for multiline comments.
430  bool DelimitersOnNewline;
431 
432  // Length of the sequence of tokens after this block comment that cannot
433  // contain line breaks.
434  unsigned UnbreakableTailLength;
435 };
436 
438 public:
439  BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
440  unsigned OriginalStartColumn, bool FirstInLine,
442  const FormatStyle &Style);
443 
444  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
445  StringRef::size_type Length,
446  unsigned StartColumn) const override;
447  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
448  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
449  unsigned ContentIndent,
450  WhitespaceManager &Whitespaces) const override;
451  Split getReflowSplit(unsigned LineIndex,
452  llvm::Regex &CommentPragmasRegex) const override;
453  void reflow(unsigned LineIndex,
454  WhitespaceManager &Whitespaces) const override;
455  void adaptStartOfLine(unsigned LineIndex,
456  WhitespaceManager &Whitespaces) const override;
457  void updateNextToken(LineState &State) const override;
458  bool mayReflow(unsigned LineIndex,
459  llvm::Regex &CommentPragmasRegex) const override;
460 
461 private:
462  // OriginalPrefix[i] contains the original prefix of line i, including
463  // trailing whitespace before the start of the content. The indentation
464  // preceding the prefix is not included.
465  // For example, if the line is:
466  // // content
467  // then the original prefix is "// ".
468  SmallVector<StringRef, 16> OriginalPrefix;
469 
470  // Prefix[i] contains the intended leading "//" with trailing spaces to
471  // account for the indentation of content within the comment at line i after
472  // formatting. It can be different than the original prefix when the original
473  // line starts like this:
474  // //content
475  // Then the original prefix is "//", but the prefix is "// ".
477 
478  SmallVector<unsigned, 16> OriginalContentColumn;
479 
480  /// The token to which the last line of this breakable token belongs;
481  /// nullptr if that token is the initial token.
482  ///
483  /// The distinction is because if the token of the last line of this breakable
484  /// token is distinct from the initial token, this breakable token owns the
485  /// whitespace before the token of the last line, and the whitespace manager
486  /// must be able to modify it.
487  FormatToken *LastLineTok = nullptr;
488 };
489 } // namespace format
490 } // namespace clang
491 
492 #endif
virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, unsigned StartColumn) const
Returns the number of columns required to format the text following the byte Offset in the line LineI...
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off */.
virtual Split getSplitAfterLastLine(unsigned TailOffset) const
Returns a whitespace range (offset, length) of the content at the last line that needs to be reformat...
virtual unsigned getContentStartColumn(unsigned LineIndex, bool Break) const =0
Returns the column at which content in line LineIndex starts, assuming no reflow. ...
virtual ~BreakableToken()
static const llvm::StringSet ContentIndentingJavadocAnnotations
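For tokens that require the whitespace after the last line to be reformatted (for example, multiline jsdoc comments that require the trailing '*/' to be on a line of its own), there are analogous operations that might be executed after the last line has been reformatted: getSplitAfterLastLine, for finding a split after the last line that needs to be reflown, and replaceWhitespaceAfterLastLine, for executing the reflow using a whitespace manager.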
SmallVector< int, 16 > ContentColumn
LineState State
Contains functions for text encoding manipulation.
This file implements a token annotator, i.e.
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
virtual void updateNextToken(LineState &State) const
Updates the next token of State to the next token after this one.
Manages the whitespaces around tokens and their replacements.
const FormatToken & Tok
bool supportsReflow() const override
virtual void adaptStartOfLine(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Replaces the whitespace between LineIndex-1 and LineIndex.
The current state when indenting an unwrapped line.
WhitespaceManager class manages whitespace around tokens and their replacements.
unsigned Offset
Definition: Format.cpp:1630
SmallVector< StringRef, 16 > Content
virtual unsigned getContentIndent(unsigned LineIndex) const
Returns additional content indent required for the second line after the content at line LineIndex is...
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:123
virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset, StringRef::size_type Length, unsigned StartColumn) const =0
Returns the number of columns required to format the text in the byte range [Offset, Offset + Length).
SmallVector< FormatToken *, 16 > Tokens
const bool InPPDirective
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const override
virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, unsigned ContentIndent, WhitespaceManager &Whitespaces) const =0
Emits the previously retrieved Split via Whitespaces.
virtual bool supportsReflow() const
Returns whether the token supports reflowing text.
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:49
virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const =0
Replaces the whitespace range described by Split with a single space.
Dataflow Directional Tag Classes.
unsigned getLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns needed to format RemainingTokenColumns, assuming that Split is within t...
virtual unsigned getLineCount() const =0
Returns the number of lines in this token in the original code.
virtual bool introducesBreakBeforeToken() const
Returns whether there will be a line break at the start of the token.
virtual void reflow(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Reflows the current line into the end of the previous one.
BreakableToken(const FormatToken &Tok, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
SmallVector< StringRef, 16 > Lines
virtual Split getReflowSplit(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const
Returns a whitespace range (offset, length) of the content at LineIndex such that the content of that...
const encoding::Encoding Encoding
const FormatStyle & Style
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, unsigned ContentStartColumn, llvm::Regex &CommentPragmasRegex) const =0
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...