clang  6.0.0svn
BreakableToken.h
Go to the documentation of this file.
1 //===--- BreakableToken.h - Format C++ code -------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Declares BreakableToken, BreakableStringLiteral, BreakableComment,
12 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
13 /// token type-specific logic to break long lines in tokens and reflow content
14 /// between tokens.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
20 
21 #include "Encoding.h"
22 #include "TokenAnnotator.h"
23 #include "WhitespaceManager.h"
24 #include "llvm/Support/Regex.h"
25 #include <utility>
26 
27 namespace clang {
28 namespace format {
29 
30 /// \brief Checks if \p Token switches formatting, like /* clang-format off */.
31 /// \p Token must be a comment.
32 bool switchesFormatting(const FormatToken &Token);
33 
34 struct FormatStyle;
35 
36 /// \brief Base class for strategies on how to break tokens.
37 ///
38 /// This is organised around the concept of a \c Split, which is a whitespace
39 /// range that signifies a position of the content of a token where a
40 /// reformatting might be done. Operating with splits is divided into 3
41 /// operations:
42 /// - getSplit, for finding a split starting at a position,
43 /// - getLineLengthAfterSplit, for calculating the size in columns of the rest
44 /// of the content after a split has been used for breaking, and
45 /// - insertBreak, for executing the split using a whitespace manager.
46 ///
47 /// There is a pair of operations that are used to compress a long whitespace
48 /// range with a single space if that will bring the line length under the
49 /// column limit:
50 /// - getLineLengthAfterCompression, for calculating the size in columns of the
51 /// line after a whitespace range has been compressed, and
52 /// - compressWhitespace, for executing the whitespace compression using a
53 /// whitespace manager; note that the compressed whitespace may be in the
54 /// middle of the original line and of the reformatted line.
55 ///
56 /// For tokens where the whitespace before each line needs to be also
57 /// reformatted, for example for tokens supporting reflow, there are analogous
58 /// operations that might be executed before the main line breaking occurs:
59 /// - getSplitBefore, for finding a split such that the content preceding it
60 /// needs to be specially reflown,
61 /// - introducesBreakBefore, for checking if reformatting the beginning
62 /// of the content introduces a line break before it,
63 /// - getLineLengthAfterSplitBefore, for calculating the line length in columns
64 /// of the remainder of the content after the beginning of the content has
65 /// been reformatted, and
66 /// - replaceWhitespaceBefore, for executing the reflow using a whitespace
67 /// manager.
68 ///
69 /// For tokens that require the whitespace after the last line to be
70 /// reformatted, for example in multiline jsdoc comments that require the
71 /// trailing '*/' to be on a line of its own, there are analogous operations
72 /// that might be executed after the last line has been reformatted:
73 /// - getSplitAfterLastLine, for finding a split after the last line that needs
74 /// to be reflown,
75 /// - getLineLengthAfterSplitAfterLastLine, for calculating the line length in
76 /// columns of the remainder of the token, and
77 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
78 /// whitespace manager.
79 ///
80 /// FIXME: The interface seems set in stone, so we might want to just pull the
81 /// strategy into the class, instead of controlling it from the outside.
82 class BreakableToken {
83 public:
84  /// \brief Contains starting character index and length of split.
85  typedef std::pair<StringRef::size_type, unsigned> Split;
86 
87  virtual ~BreakableToken() {}
88 
89  /// \brief Returns the number of lines in this token in the original code.
90  virtual unsigned getLineCount() const = 0;
91 
92  /// \brief Returns the number of columns required to format the piece of line
93  /// at \p LineIndex, from byte offset \p TailOffset with length \p Length.
94  ///
95  /// Note that previous breaks are not taken into account. \p TailOffset is
96  /// always specified from the start of the (original) line.
97  /// \p Length can be set to StringRef::npos, which means "to the end of line".
98  virtual unsigned
99  getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
100  StringRef::size_type Length) const = 0;
101 
102  /// \brief Returns a range (offset, length) at which to break the line at
103  /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
104  /// violate \p ColumnLimit.
105  virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
106  unsigned ColumnLimit,
107  llvm::Regex &CommentPragmasRegex) const = 0;
108 
109  /// \brief Emits the previously retrieved \p Split via \p Whitespaces.
110  virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
111  WhitespaceManager &Whitespaces) = 0;
112 
113  /// \brief Returns the number of columns required to format the piece of line
114  /// at \p LineIndex, from byte offset \p TailOffset after the whitespace range
115  /// \p Split has been compressed into a single space.
116  unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns,
117  Split Split) const;
118 
119  /// \brief Replaces the whitespace range described by \p Split with a single
120  /// space.
121  virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
122  Split Split,
123  WhitespaceManager &Whitespaces) = 0;
124 
125  /// \brief Returns a whitespace range (offset, length) of the content at
126  /// \p LineIndex such that the content preceding this range needs to be
127  /// reformatted before any breaks are made to this line.
128  ///
129  /// \p PreviousEndColumn is the end column of the previous line after
130  /// formatting.
131  ///
132  /// A result having offset == StringRef::npos means that no piece of the line
133  /// needs to be reformatted before any breaks are made.
134  virtual Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
135  unsigned ColumnLimit,
136  llvm::Regex &CommentPragmasRegex) const {
137  return Split(StringRef::npos, 0);
138  }
139 
140  /// \brief Returns if a break before the content at \p LineIndex will be
141  /// inserted after the whitespace preceding the content has been reformatted.
142  virtual bool introducesBreakBefore(unsigned LineIndex) const {
143  return false;
144  }
145 
146  /// \brief Returns the number of columns required to format the piece of line
147  /// at \p LineIndex after the content preceding the whitespace range specified
148  /// by \p SplitBefore has been reformatted, but before any breaks are made to
149  /// this line.
150  virtual unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
151  unsigned TailOffset,
152  unsigned PreviousEndColumn,
153  unsigned ColumnLimit,
154  Split SplitBefore) const {
155  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
156  }
157 
158  /// \brief Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
159  /// Performs a reformatting of the content at \p LineIndex preceding the
160  /// whitespace range \p SplitBefore.
161  virtual void replaceWhitespaceBefore(unsigned LineIndex,
162  unsigned PreviousEndColumn,
163  unsigned ColumnLimit, Split SplitBefore,
164  WhitespaceManager &Whitespaces) {}
165 
166  /// \brief Returns a whitespace range (offset, length) of the content at
167  /// the last line that needs to be reformatted after the last line has been
168  /// reformatted.
169  ///
170  /// A result having offset == StringRef::npos means that no reformat is
171  /// necessary.
172  virtual Split getSplitAfterLastLine(unsigned TailOffset,
173  unsigned ColumnLimit) const {
174  return Split(StringRef::npos, 0);
175  }
176 
177  /// \brief Returns the number of columns required to format the piece of the
178  /// token after the last line after a reformat of the whitespace range
179  /// \p SplitAfterLastLine on the last line has been performed.
180  virtual unsigned
182  Split SplitAfterLastLine) const {
184  TailOffset + SplitAfterLastLine.first +
185  SplitAfterLastLine.second,
186  StringRef::npos);
187  }
188 
189  /// \brief Replaces the whitespace from \p SplitAfterLastLine on the last line
190  /// after the last line has been formatted by performing a reformatting.
191  virtual void replaceWhitespaceAfterLastLine(unsigned TailOffset,
192  Split SplitAfterLastLine,
193  WhitespaceManager &Whitespaces) {
194  insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
195  Whitespaces);
196  }
197 
198  /// \brief Updates the next token of \p State to the next token after this
199  /// one. This can be used when this token manages a set of underlying tokens
200  /// as a unit and is responsible for the formatting of them.
201  virtual void updateNextToken(LineState &State) const {}
202 
203 protected:
206  : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
207  Style(Style) {}
208 
209  const FormatToken &Tok;
210  const bool InPPDirective;
213 };
214 
215 /// \brief Base class for single line tokens that can be broken.
216 ///
217 /// \c getSplit() needs to be implemented by child classes.
219 public:
220  unsigned getLineCount() const override;
221  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
222  StringRef::size_type Length) const override;
223 
224 protected:
225  BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
226  StringRef Prefix, StringRef Postfix,
228  const FormatStyle &Style);
229 
230  // The column in which the token starts.
231  unsigned StartColumn;
232  // The prefix a line needs after a break in the token.
233  StringRef Prefix;
234  // The postfix a line needs before introducing a break.
235  StringRef Postfix;
236  // The token text excluding the prefix and postfix.
237  StringRef Line;
238 };
239 
241 public:
242  /// \brief Creates a breakable token for a single line string literal.
243  ///
244  /// \p StartColumn specifies the column in which the token will start
245  /// after formatting.
246  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
247  StringRef Prefix, StringRef Postfix,
249  const FormatStyle &Style);
250 
251  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
252  llvm::Regex &CommentPragmasRegex) const override;
253  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
254  WhitespaceManager &Whitespaces) override;
255  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
256  WhitespaceManager &Whitespaces) override {}
257 };
258 
260 protected:
261  /// \brief Creates a breakable token for a comment.
262  ///
263  /// \p StartColumn specifies the column in which the comment will start after
264  /// formatting.
265  BreakableComment(const FormatToken &Token, unsigned StartColumn,
267  const FormatStyle &Style);
268 
269 public:
270  unsigned getLineCount() const override;
271  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
272  llvm::Regex &CommentPragmasRegex) const override;
273  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
274  WhitespaceManager &Whitespaces) override;
275 
276 protected:
277  virtual unsigned getContentStartColumn(unsigned LineIndex,
278  unsigned TailOffset) const = 0;
279 
280  // Returns a split that divides Text into a left and right parts, such that
281  // the left part is suitable for reflowing after PreviousEndColumn.
282  Split getReflowSplit(StringRef Text, StringRef ReflowPrefix,
283  unsigned PreviousEndColumn, unsigned ColumnLimit) const;
284 
285  // Returns the token containing the line at LineIndex.
286  const FormatToken &tokenAt(unsigned LineIndex) const;
287 
288  // Checks if the content of line LineIndex may be reflown with the previous
289  // line.
290  virtual bool mayReflow(unsigned LineIndex,
291  llvm::Regex &CommentPragmasRegex) const = 0;
292 
293  // Contains the original text of the lines of the block comment.
294  //
295  // In case of block comments, excludes the leading /* in the first line and
296  // trailing */ in the last line. In case of line comments, excludes the
297  // leading // and spaces.
299 
300  // Contains the text of the lines excluding all leading and trailing
301  // whitespace between the lines. Note that the decoration (if present) is also
302  // not considered part of the text.
304 
305  // Tokens[i] contains a reference to the token containing Lines[i] if the
306  // whitespace range before that token is managed by this block.
307  // Otherwise, Tokens[i] is a null pointer.
309 
310  // ContentColumn[i] is the target column at which Content[i] should be.
311  // Note that this excludes a leading "* " or "*" in case of block comments
312  // where all lines have a "*" prefix, or the leading "// " or "//" in case of
313  // line comments.
314  //
315  // In block comments, the first line's target column is always positive. The
316  // remaining lines' target columns are relative to the first line to allow
317  // correct indentation of comments in \c WhitespaceManager. Thus they can be
318  // negative as well (in case the first line needs to be unindented more than
319  // there's actual whitespace in another line).
321 
322  // The intended start column of the first line of text from this section.
323  unsigned StartColumn;
324 
325  // The prefix to use in front of a line that has been reflown up.
326  // For example, when reflowing the second line after the first here:
327  // // comment 1
328  // // comment 2
329  // we expect:
330  // // comment 1 comment 2
331  // and not:
332  // // comment 1comment 2
333  StringRef ReflowPrefix = " ";
334 };
335 
337 public:
338  BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
339  unsigned OriginalStartColumn, bool FirstInLine,
341  const FormatStyle &Style);
342 
343  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
344  StringRef::size_type Length) const override;
345  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
346  WhitespaceManager &Whitespaces) override;
347  Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
348  unsigned ColumnLimit,
349  llvm::Regex &CommentPragmasRegex) const override;
350  bool introducesBreakBefore(unsigned LineIndex) const override;
351  unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
352  unsigned TailOffset,
353  unsigned PreviousEndColumn,
354  unsigned ColumnLimit,
355  Split SplitBefore) const override;
356  void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn,
357  unsigned ColumnLimit, Split SplitBefore,
358  WhitespaceManager &Whitespaces) override;
359  Split getSplitAfterLastLine(unsigned TailOffset,
360  unsigned ColumnLimit) const override;
361 
362  bool mayReflow(unsigned LineIndex,
363  llvm::Regex &CommentPragmasRegex) const override;
364 
365 private:
366  // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
367  //
368  // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
369  // leading and trailing whitespace.
370  //
371  // Sets ContentColumn to the intended column in which the text at
372  // Lines[LineIndex] starts (note that the decoration, if present, is not
373  // considered part of the text).
374  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
375 
376  // Computes the end column if the full Content from LineIndex gets reflown
377  // after PreviousEndColumn.
378  unsigned getReflownColumn(StringRef Content, unsigned LineIndex,
379  unsigned PreviousEndColumn) const;
380 
381  unsigned getContentStartColumn(unsigned LineIndex,
382  unsigned TailOffset) const override;
383 
384  // The column at which the text of a broken line should start.
385  // Note that an optional decoration would go before that column.
386  // IndentAtLineBreak is a uniform position for all lines in a block comment,
387  // regardless of their relative position.
388  // FIXME: Revisit the decision to do this; the main reason was to support
389  // patterns like
390  // /**************//**
391  // * Comment
392  // We could also support such patterns by special casing the first line
393  // instead.
394  unsigned IndentAtLineBreak;
395 
396  // This is to distinguish between the case when the last line was empty and
397  // the case when it started with a decoration ("*" or "* ").
398  bool LastLineNeedsDecoration;
399 
400  // Either "* " if all lines begin with a "*", or empty.
401  StringRef Decoration;
402 
403  // If this block comment has decorations, this is the column of the start of
404  // the decorations.
405  unsigned DecorationColumn;
406 
407  // If true, make sure that the opening '/**' and the closing '*/' each end on a
408  // line of its own. Styles like jsdoc require this for multiline comments.
409  bool DelimitersOnNewline;
410 };
411 
413 public:
414  BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
415  unsigned OriginalStartColumn, bool FirstInLine,
417  const FormatStyle &Style);
418 
419  unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset,
420  StringRef::size_type Length) const override;
421  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
422  WhitespaceManager &Whitespaces) override;
423  Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn,
424  unsigned ColumnLimit,
425  llvm::Regex &CommentPragmasRegex) const override;
426  unsigned getLineLengthAfterSplitBefore(unsigned LineIndex,
427  unsigned TailOffset,
428  unsigned PreviousEndColumn,
429  unsigned ColumnLimit,
430  Split SplitBefore) const override;
431  void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn,
432  unsigned ColumnLimit, Split SplitBefore,
433  WhitespaceManager &Whitespaces) override;
434  void updateNextToken(LineState &State) const override;
435  bool mayReflow(unsigned LineIndex,
436  llvm::Regex &CommentPragmasRegex) const override;
437 
438 private:
439  unsigned getContentStartColumn(unsigned LineIndex,
440  unsigned TailOffset) const override;
441 
442  // OriginalPrefix[i] contains the original prefix of line i, including
443  // trailing whitespace before the start of the content. The indentation
444  // preceding the prefix is not included.
445  // For example, if the line is:
446  // // content
447  // then the original prefix is "// ".
448  SmallVector<StringRef, 16> OriginalPrefix;
449 
450  // Prefix[i] contains the intended leading "//" with trailing spaces to
451  // account for the indentation of content within the comment at line i after
452  // formatting. It can be different than the original prefix when the original
453  // line starts like this:
454  // //content
455  // Then the original prefix is "//", but the prefix is "// ".
457 
458  SmallVector<unsigned, 16> OriginalContentColumn;
459 
460  /// \brief The token to which the last line of this breakable token belongs;
461  /// nullptr if that token is the initial token.
462  ///
463  /// The distinction is because if the token of the last line of this breakable
464  /// token is distinct from the initial token, this breakable token owns the
465  /// whitespace before the token of the last line, and the whitespace manager
466  /// must be able to modify it.
467  FormatToken *LastLineTok = nullptr;
468 };
469 } // namespace format
470 } // namespace clang
471 
472 #endif
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off */.
virtual ~BreakableToken()
virtual Split getSplitAfterLastLine(unsigned TailOffset, unsigned ColumnLimit) const
Returns a whitespace range (offset, length) of the content at the last line that needs to be reformat...
unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
SmallVector< int, 16 > ContentColumn
LineState State
Contains functions for text encoding manipulation.
This file implements a token annotator, i.e.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
virtual void updateNextToken(LineState &State) const
Updates the next token of State to the next token after this one.
Manages the whitespaces around tokens and their replacements.
virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces)=0
Replaces the whitespace range described by Split with a single space.
const FormatToken & Tok
virtual unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
Base class for single line tokens that can be broken.
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const =0
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
The current state when indenting a unwrapped line.
WhitespaceManager class manages whitespace around tokens and their replacements.
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
SmallVector< StringRef, 16 > Content
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:120
virtual unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const =0
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
SmallVector< FormatToken *, 16 > Tokens
to be on a line of there are analogous operations *that might be executed after the last line has been for finding a split after the last line that needs *to be * getLineLengthAfterSplitAfterLastLine
const bool InPPDirective
virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces)=0
Emits the previously retrieved Split via Whitespaces.
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
virtual Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
Dataflow Directional Tag Classes.
virtual bool introducesBreakBefore(unsigned LineIndex) const
Returns if a break before the content at LineIndex will be inserted after the whitespace preceding th...
virtual unsigned getLineCount() const =0
Returns the number of lines in this token in the original code.
to be on a line of there are analogous operations *that might be executed after the last line has been for finding a split after the last line that needs *to be for calculating the line length in *columns of the remainder of the and * replaceWhitespaceAfterLastLine
BreakableToken(const FormatToken &Tok, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
StringRef Text
Definition: Format.cpp:1317
SmallVector< StringRef, 16 > Lines
virtual void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces)
Replaces the whitespace between LineIndex-1 and LineIndex.
const encoding::Encoding Encoding
const FormatStyle & Style