clang  8.0.0svn
BreakableToken.h
Go to the documentation of this file.
1 //===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// Declares BreakableToken, BreakableStringLiteral, BreakableComment,
12 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
13 /// token type-specific logic to break long lines in tokens and reflow content
14 /// between tokens.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
20 
21 #include "Encoding.h"
22 #include "TokenAnnotator.h"
23 #include "WhitespaceManager.h"
24 #include "llvm/ADT/StringSet.h"
25 #include "llvm/Support/Regex.h"
26 #include <utility>
27 
28 namespace clang {
29 namespace format {
30 
31 /// Checks if \p Token switches formatting, like /* clang-format off */.
32 /// \p Token must be a comment.
33 bool switchesFormatting(const FormatToken &Token);
34 
35 struct FormatStyle;
36 
37 /// Base class for tokens / ranges of tokens that can allow breaking
38 /// within the tokens - for example, to avoid whitespace beyond the column
39 /// limit, or to reflow text.
40 ///
41 /// Generally, a breakable token consists of logical lines, addressed by a line
42 /// index. For example, in a sequence of line comments, each line comment is its
43 /// own logical line; similarly, for a block comment, each line in the block
44 /// comment is on its own logical line.
45 ///
46 /// There are two methods to compute the layout of the token:
47 /// - getRangeLength measures the number of columns needed for a range of text
48 /// within a logical line, and
49 /// - getContentStartColumn returns the start column at which we want the
50 /// content of a logical line to start (potentially after introducing a line
51 /// break).
52 ///
53 /// The mechanism to adapt the layout of the breakable token is organised
54 /// around the concept of a \c Split, which is a whitespace range that signifies
55 /// a position of the content of a token where a reformatting might be done.
56 ///
57 /// Operating with splits is divided into two operations:
58 /// - getSplit, for finding a split starting at a position,
59 /// - insertBreak, for executing the split using a whitespace manager.
60 ///
61 /// There is a pair of operations that are used to compress a long whitespace
62 /// range to a single space if that will bring the line length under the
63 /// column limit:
64 /// - getLengthAfterCompression, for calculating the size in columns of the
65 /// line after a whitespace range has been compressed, and
66 /// - compressWhitespace, for executing the whitespace compression using a
67 /// whitespace manager; note that the compressed whitespace may be in the
68 /// middle of the original line and of the reformatted line.
69 ///
70 /// For tokens where the whitespace before each line needs to be also
71 /// reformatted, for example for tokens supporting reflow, there are analogous
72 /// operations that might be executed before the main line breaking occurs:
73 /// - getReflowSplit, for finding a split such that the content preceding it
74 /// needs to be specially reflown,
75 /// - reflow, for executing the split using a whitespace manager,
76 /// - introducesBreakBeforeToken, for checking if reformatting the beginning
77 /// of the content introduces a line break before it,
78 /// - adaptStartOfLine, for executing the reflow using a whitespace
79 /// manager.
80 ///
81 /// For tokens that require the whitespace after the last line to be
82 /// reformatted, for example in multiline jsdoc comments that require the
83 /// trailing '*/' to be on a line by itself, there are analogous operations
84 /// that might be executed after the last line has been reformatted:
85 /// - getSplitAfterLastLine, for finding a split after the last line that needs
86 /// to be reflown,
87 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
88 /// whitespace manager.
89 ///
90 class BreakableToken {
91 public:
92  /// Contains starting character index and length of split.
93  typedef std::pair<StringRef::size_type, unsigned> Split;
94 
95  virtual ~BreakableToken() {}
96 
97  /// Returns the number of lines in this token in the original code.
98  virtual unsigned getLineCount() const = 0;
99 
100  /// Returns the number of columns required to format the text in the
101  /// byte range [\p Offset, \p Offset \c + \p Length).
102  ///
103  /// \p Offset is the byte offset from the start of the content of the line
104  /// at \p LineIndex.
105  ///
106  /// \p StartColumn is the column at which the text starts in the formatted
107  /// file, needed to compute tab stops correctly.
108  virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
109  StringRef::size_type Length,
110  unsigned StartColumn) const = 0;
111 
112  /// Returns the number of columns required to format the text following
113  /// the byte \p Offset in the line \p LineIndex, including potentially
114  /// unbreakable sequences of tokens following after the end of the token.
115  ///
116  /// \p Offset is the byte offset from the start of the content of the line
117  /// at \p LineIndex.
118  ///
119  /// \p StartColumn is the column at which the text starts in the formatted
120  /// file, needed to compute tab stops correctly.
121  ///
122  /// For breakable tokens that never use extra space at the end of a line, this
123  /// is equivalent to getRangeLength with a Length of StringRef::npos.
124  virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
125  unsigned StartColumn) const {
126  return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
127  }
128 
129  /// Returns the column at which content in line \p LineIndex starts,
130  /// assuming no reflow.
131  ///
132  /// If \p Break is true, returns the column at which the line should start
133  /// after the line break.
134  /// If \p Break is false, returns the column at which the line itself will
135  /// start.
136  virtual unsigned getContentStartColumn(unsigned LineIndex,
137  bool Break) const = 0;
138 
139  /// Returns additional content indent required for the second line after the
140  /// content at line \p LineIndex is broken.
141  ///
142  // (Next lines do not start with `///` since otherwise -Wdocumentation picks
143  // up the example annotations and generates warnings for them)
144  // For example, Javadoc @param annotations require an indent of 4 spaces and
145  // in this example getContentIndent(1) returns 4.
146  // /**
147  // * @param loooooooooooooong line
148  // * continuation
149  // */
150  virtual unsigned getContentIndent(unsigned LineIndex) const {
151  return 0;
152  }
153 
154  /// Returns a range (offset, length) at which to break the line at
155  /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
156  /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
157  /// the token is formatted starting at ContentStartColumn in the reformatted
158  /// file.
159  virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
160  unsigned ColumnLimit, unsigned ContentStartColumn,
161  llvm::Regex &CommentPragmasRegex) const = 0;
162 
163  /// Emits the previously retrieved \p Split via \p Whitespaces.
164  virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
165  unsigned ContentIndent,
166  WhitespaceManager &Whitespaces) const = 0;
167 
168  /// Returns the number of columns needed to format
169  /// \p RemainingTokenColumns, assuming that Split is within the range measured
170  /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
171  /// to a single space.
172  unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
173  Split Split) const;
174 
175  /// Replaces the whitespace range described by \p Split with a single
176  /// space.
177  virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
178  Split Split,
179  WhitespaceManager &Whitespaces) const = 0;
180 
181  /// Returns whether the token supports reflowing text.
182  virtual bool supportsReflow() const { return false; }
183 
184  /// Returns a whitespace range (offset, length) of the content at \p
185  /// LineIndex such that the content of that line is reflown to the end of the
186  /// previous one.
187  ///
188  /// Returning (StringRef::npos, 0) indicates reflowing is not possible.
189  ///
190  /// The range will include any whitespace preceding the specified line's
191  /// content.
192  ///
193  /// If the split is not contained within one token, for example when reflowing
194  /// line comments, returns (0, <length>).
195  virtual Split getReflowSplit(unsigned LineIndex,
196  llvm::Regex &CommentPragmasRegex) const {
197  return Split(StringRef::npos, 0);
198  }
199 
200  /// Reflows the current line into the end of the previous one.
201  virtual void reflow(unsigned LineIndex,
202  WhitespaceManager &Whitespaces) const {}
203 
204  /// Returns whether there will be a line break at the start of the
205  /// token.
206  virtual bool introducesBreakBeforeToken() const {
207  return false;
208  }
209 
210  /// Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
211  virtual void adaptStartOfLine(unsigned LineIndex,
212  WhitespaceManager &Whitespaces) const {}
213 
214  /// Returns a whitespace range (offset, length) of the content at
215  /// the last line that needs to be reformatted after the last line has been
216  /// reformatted.
217  ///
218  /// A result having offset == StringRef::npos means that no reformat is
219  /// necessary.
220  virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
221  return Split(StringRef::npos, 0);
222  }
223 
224  /// Replaces the whitespace range \p SplitAfterLastLine on the last line by
225  /// inserting a break there; meant to run after the last line is reformatted.
226  void replaceWhitespaceAfterLastLine(unsigned TailOffset,
227  Split SplitAfterLastLine,
228  WhitespaceManager &Whitespaces) const {
229  insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
230  /*ContentIndent=*/0, Whitespaces);
231  }
232 
233  /// Updates the next token of \p State to the next token after this
234  /// one. This can be used when this token manages a set of underlying tokens
235  /// as a unit and is responsible for the formatting of them.
236  virtual void updateNextToken(LineState &State) const {}
237 
238 protected:
241  : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
242  Style(Style) {}
243 
244  const FormatToken &Tok;
245  const bool InPPDirective;
248 };
249 
251 public:
252  /// Creates a breakable token for a single line string literal.
253  ///
254  /// \p StartColumn specifies the column in which the token will start
255  /// after formatting.
256  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
257  StringRef Prefix, StringRef Postfix,
258  unsigned UnbreakableTailLength, bool InPPDirective,
260 
261  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
262  unsigned ContentStartColumn,
263  llvm::Regex &CommentPragmasRegex) const override;
264  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
265  unsigned ContentIndent,
266  WhitespaceManager &Whitespaces) const override;
267  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
268  WhitespaceManager &Whitespaces) const override {}
269  unsigned getLineCount() const override;
270  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
271  StringRef::size_type Length,
272  unsigned StartColumn) const override;
273  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
274  unsigned StartColumn) const override;
275  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
276 
277 protected:
278  // The column in which the token starts.
279  unsigned StartColumn;
280  // The prefix a line needs after a break in the token.
281  StringRef Prefix;
282  // The postfix a line needs before introducing a break.
283  StringRef Postfix;
284  // The token text excluding the prefix and postfix.
285  StringRef Line;
286  // Length of the sequence of tokens after this string literal that cannot
287  // contain line breaks.
289 };
290 
292 protected:
293  /// Creates a breakable token for a comment.
294  ///
295  /// \p StartColumn specifies the column in which the comment will start after
296  /// formatting.
297  BreakableComment(const FormatToken &Token, unsigned StartColumn,
299  const FormatStyle &Style);
300 
301 public:
302  bool supportsReflow() const override { return true; }
303  unsigned getLineCount() const override;
304  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
305  unsigned ContentStartColumn,
306  llvm::Regex &CommentPragmasRegex) const override;
307  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
308  WhitespaceManager &Whitespaces) const override;
309 
310 protected:
311  // Returns the token containing the line at LineIndex.
312  const FormatToken &tokenAt(unsigned LineIndex) const;
313 
314  // Checks if the content of line LineIndex may be reflown with the previous
315  // line.
316  virtual bool mayReflow(unsigned LineIndex,
317  llvm::Regex &CommentPragmasRegex) const = 0;
318 
319  // Contains the original text of the lines of the comment.
320  //
321  // In case of block comments, excludes the leading /* in the first line and
322  // trailing */ in the last line. In case of line comments, excludes the
323  // leading // and spaces.
325 
326  // Contains the text of the lines excluding all leading and trailing
327  // whitespace between the lines. Note that the decoration (if present) is also
328  // not considered part of the text.
330 
331  // Tokens[i] contains a reference to the token containing Lines[i] if the
332  // whitespace range before that token is managed by this block.
333  // Otherwise, Tokens[i] is a null pointer.
335 
336  // ContentColumn[i] is the target column at which Content[i] should be.
337  // Note that this excludes a leading "* " or "*" in case of block comments
338  // where all lines have a "*" prefix, or the leading "// " or "//" in case of
339  // line comments.
340  //
341  // In block comments, the first line's target column is always positive. The
342  // remaining lines' target columns are relative to the first line to allow
343  // correct indentation of comments in \c WhitespaceManager. Thus they can be
344  // negative as well (in case the first line needs to be unindented more than
345  // there's actual whitespace in another line).
347 
348  // The intended start column of the first line of text from this section.
349  unsigned StartColumn;
350 
351  // The prefix to use in front of a line that has been reflown up.
352  // For example, when reflowing the second line after the first here:
353  // // comment 1
354  // // comment 2
355  // we expect:
356  // // comment 1 comment 2
357  // and not:
358  // // comment 1comment 2
359  StringRef ReflowPrefix = " ";
360 };
361 
363 public:
364  BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
365  unsigned OriginalStartColumn, bool FirstInLine,
367  const FormatStyle &Style);
368 
369  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
370  StringRef::size_type Length,
371  unsigned StartColumn) const override;
372  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
373  unsigned StartColumn) const override;
374  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
375  unsigned getContentIndent(unsigned LineIndex) const override;
376  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
377  unsigned ContentIndent,
378  WhitespaceManager &Whitespaces) const override;
379  Split getReflowSplit(unsigned LineIndex,
380  llvm::Regex &CommentPragmasRegex) const override;
381  void reflow(unsigned LineIndex,
382  WhitespaceManager &Whitespaces) const override;
383  bool introducesBreakBeforeToken() const override;
384  void adaptStartOfLine(unsigned LineIndex,
385  WhitespaceManager &Whitespaces) const override;
386  Split getSplitAfterLastLine(unsigned TailOffset) const override;
387 
388  bool mayReflow(unsigned LineIndex,
389  llvm::Regex &CommentPragmasRegex) const override;
390 
391  // Contains Javadoc annotations that require additional indent when continued
392  // on multiple lines.
393  static const llvm::StringSet<> ContentIndentingJavadocAnnotations;
394 
395 private:
396  // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
397  //
398  // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
399  // leading and trailing whitespace.
400  //
401  // Sets ContentColumn to the intended column in which the text at
402  // Lines[LineIndex] starts (note that the decoration, if present, is not
403  // considered part of the text).
404  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
405 
406  // The column at which the text of a broken line should start.
407  // Note that an optional decoration would go before that column.
408  // IndentAtLineBreak is a uniform position for all lines in a block comment,
409  // regardless of their relative position.
410  // FIXME: Revisit the decision to do this; the main reason was to support
411  // patterns like
412  // /**************//**
413  // * Comment
414  // We could also support such patterns by special casing the first line
415  // instead.
416  unsigned IndentAtLineBreak;
417 
418  // This is to distinguish between the case when the last line was empty and
419  // the case when it started with a decoration ("*" or "* ").
420  bool LastLineNeedsDecoration;
421 
422  // Either "* " if all lines begin with a "*", or empty.
423  StringRef Decoration;
424 
425  // If this block comment has decorations, this is the column of the start of
426  // the decorations.
427  unsigned DecorationColumn;
428 
429  // If true, make sure that the opening '/**' and the closing '*/' each end up
430  // on a line by itself. Styles like jsdoc require this for multiline comments.
431  bool DelimitersOnNewline;
432 
433  // Length of the sequence of tokens after this block comment that cannot
434  // contain line breaks.
435  unsigned UnbreakableTailLength;
436 };
437 
439 public:
440  BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
441  unsigned OriginalStartColumn, bool FirstInLine,
443  const FormatStyle &Style);
444 
445  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
446  StringRef::size_type Length,
447  unsigned StartColumn) const override;
448  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
449  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
450  unsigned ContentIndent,
451  WhitespaceManager &Whitespaces) const override;
452  Split getReflowSplit(unsigned LineIndex,
453  llvm::Regex &CommentPragmasRegex) const override;
454  void reflow(unsigned LineIndex,
455  WhitespaceManager &Whitespaces) const override;
456  void adaptStartOfLine(unsigned LineIndex,
457  WhitespaceManager &Whitespaces) const override;
458  void updateNextToken(LineState &State) const override;
459  bool mayReflow(unsigned LineIndex,
460  llvm::Regex &CommentPragmasRegex) const override;
461 
462 private:
463  // OriginalPrefix[i] contains the original prefix of line i, including
464  // trailing whitespace before the start of the content. The indentation
465  // preceding the prefix is not included.
466  // For example, if the line is:
467  // // content
468  // then the original prefix is "// ".
469  SmallVector<StringRef, 16> OriginalPrefix;
470 
471  // Prefix[i] contains the intended leading "//" with trailing spaces to
472  // account for the indentation of content within the comment at line i after
473  // formatting. It can be different than the original prefix when the original
474  // line starts like this:
475  // //content
476  // Then the original prefix is "//", but the prefix is "// ".
478 
479  SmallVector<unsigned, 16> OriginalContentColumn;
480 
481  /// The token to which the last line of this breakable token belongs;
482  /// nullptr if that token is the initial token.
483  ///
484  /// The distinction is because if the token of the last line of this breakable
485  /// token is distinct from the initial token, this breakable token owns the
486  /// whitespace before the token of the last line, and the whitespace manager
487  /// must be able to modify it.
488  FormatToken *LastLineTok = nullptr;
489 };
490 } // namespace format
491 } // namespace clang
492 
493 #endif
virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, unsigned StartColumn) const
Returns the number of columns required to format the text following the byte Offset in the line LineI...
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off */.
virtual Split getSplitAfterLastLine(unsigned TailOffset) const
Returns a whitespace range (offset, length) of the content at the last line that needs to be reformat...
virtual unsigned getContentStartColumn(unsigned LineIndex, bool Break) const =0
Returns the column at which content in line LineIndex starts, assuming no reflow. ...
virtual ~BreakableToken()
static const llvm::StringSet ContentIndentingJavadocAnnotations
to be on a line of there are analogous operations *that might be executed after the last line has been for finding a split after the last line that needs *to be * replaceWhitespaceAfterLastLine
SmallVector< int, 16 > ContentColumn
LineState State
Contains functions for text encoding manipulation.
This file implements a token annotator, i.e.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
virtual void updateNextToken(LineState &State) const
Updates the next token of State to the next token after this one.
Manages the whitespaces around tokens and their replacements.
uint32_t Offset
Definition: CacheTokens.cpp:43
const FormatToken & Tok
bool supportsReflow() const override
virtual void adaptStartOfLine(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Replaces the whitespace between LineIndex-1 and LineIndex.
The current state when indenting an unwrapped line.
WhitespaceManager class manages whitespace around tokens and their replacements.
SmallVector< StringRef, 16 > Content
virtual unsigned getContentIndent(unsigned LineIndex) const
Returns additional content indent required for the second line after the content at line LineIndex is...
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:123
virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset, StringRef::size_type Length, unsigned StartColumn) const =0
Returns the number of columns required to format the text in the byte range [Offset, Offset + Length).
SmallVector< FormatToken *, 16 > Tokens
const bool InPPDirective
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const override
virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, unsigned ContentIndent, WhitespaceManager &Whitespaces) const =0
Emits the previously retrieved Split via Whitespaces.
virtual bool supportsReflow() const
Returns whether the token supports reflowing text.
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:48
virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const =0
Replaces the whitespace range described by Split with a single space.
Dataflow Directional Tag Classes.
unsigned getLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns needed to format RemainingTokenColumns, assuming that Split is within t...
virtual unsigned getLineCount() const =0
Returns the number of lines in this token in the original code.
virtual bool introducesBreakBeforeToken() const
Returns whether there will be a line break at the start of the token.
virtual void reflow(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Reflows the current line into the end of the previous one.
BreakableToken(const FormatToken &Tok, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
SmallVector< StringRef, 16 > Lines
virtual Split getReflowSplit(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const
Returns a whitespace range (offset, length) of the content at LineIndex such that the content of that...
const encoding::Encoding Encoding
const FormatStyle & Style
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, unsigned ContentStartColumn, llvm::Regex &CommentPragmasRegex) const =0
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...