clang  6.0.0svn
BreakableToken.h
Go to the documentation of this file.
1 //===--- BreakableToken.h - Format C++ code -------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Declares BreakableToken, BreakableStringLiteral, BreakableComment,
12 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
13 /// token type-specific logic to break long lines in tokens and reflow content
14 /// between tokens.
15 ///
16 //===----------------------------------------------------------------------===//
17 
18 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
20 
21 #include "Encoding.h"
22 #include "TokenAnnotator.h"
23 #include "WhitespaceManager.h"
24 #include "llvm/Support/Regex.h"
25 #include <utility>
26 
27 namespace clang {
28 namespace format {
29 
30 /// \brief Checks if \p Token switches formatting, like /* clang-format off */.
31 /// \p Token must be a comment.
32 bool switchesFormatting(const FormatToken &Token);
33 
34 struct FormatStyle;
35 
36 /// \brief Base class for tokens / ranges of tokens that can allow breaking
37 /// within the tokens - for example, to avoid whitespace beyond the column
38 /// limit, or to reflow text.
39 ///
40 /// Generally, a breakable token consists of logical lines, addressed by a line
41 /// index. For example, in a sequence of line comments, each line comment is its
42 /// own logical line; similarly, for a block comment, each line in the block
43 /// comment is on its own logical line.
44 ///
45 /// There are two methods to compute the layout of the token:
46 /// - getRangeLength measures the number of columns needed for a range of text
47 /// within a logical line, and
48 /// - getContentStartColumn returns the start column at which we want the
49 /// content of a logical line to start (potentially after introducing a line
50 /// break).
51 ///
52 /// The mechanism to adapt the layout of the breakable token is organised
53 /// around the concept of a \c Split, which is a whitespace range that signifies
54 /// a position of the content of a token where a reformatting might be done.
55 ///
56 /// Operating with splits is divided into two operations:
57 /// - getSplit, for finding a split starting at a position,
58 /// - insertBreak, for executing the split using a whitespace manager.
59 ///
60 /// There is a pair of operations that are used to compress a long whitespace
61 /// range with a single space if that will bring the line length under the
62 /// column limit:
63 /// - getLengthAfterCompression, for calculating the size in columns of the
64 /// line after a whitespace range has been compressed, and
65 /// - compressWhitespace, for executing the whitespace compression using a
66 /// whitespace manager; note that the compressed whitespace may be in the
67 /// middle of the original line and of the reformatted line.
68 ///
69 /// For tokens where the whitespace before each line needs to be also
70 /// reformatted, for example for tokens supporting reflow, there are analogous
71 /// operations that might be executed before the main line breaking occurs:
72 /// - getReflowSplit, for finding a split such that the content preceding it
73 /// needs to be specially reflown,
74 /// - reflow, for executing the split using a whitespace manager,
75 /// - introducesBreakBeforeToken, for checking if reformatting the beginning
76 /// of the content introduces a line break before it,
77 /// - adaptStartOfLine, for executing the reflow using a whitespace
78 /// manager.
79 ///
80 /// For tokens that require the whitespace after the last line to be
81 /// reformatted, for example in multiline jsdoc comments that require the
82 /// trailing '*/' to be on a line of itself, there are analogous operations
83 /// that might be executed after the last line has been reformatted:
84 /// - getSplitAfterLastLine, for finding a split after the last line that needs
85 /// to be reflown,
86 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
87 /// whitespace manager.
88 ///
89 class BreakableToken {
90 public:
91  /// \brief Contains starting character index and length of split.
92  typedef std::pair<StringRef::size_type, unsigned> Split;
93 
94  virtual ~BreakableToken() {}
95 
96  /// \brief Returns the number of lines in this token in the original code.
97  virtual unsigned getLineCount() const = 0;
98 
99  /// \brief Returns the number of columns required to format the text in the
100  /// byte range [\p Offset, \p Offset \c + \p Length).
101  ///
102  /// \p Offset is the byte offset from the start of the content of the line
103  /// at \p LineIndex.
104  ///
105  /// \p StartColumn is the column at which the text starts in the formatted
106  /// file, needed to compute tab stops correctly.
107  virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
108  StringRef::size_type Length,
109  unsigned StartColumn) const = 0;
110 
111  /// \brief Returns the number of columns required to format the text following
112  /// the byte \p Offset in the line \p LineIndex, including potentially
113  /// unbreakable sequences of tokens following after the end of the token.
114  ///
115  /// \p Offset is the byte offset from the start of the content of the line
116  /// at \p LineIndex.
117  ///
118  /// \p StartColumn is the column at which the text starts in the formatted
119  /// file, needed to compute tab stops correctly.
120  ///
121  /// For breakable tokens that never use extra space at the end of a line, this
122  /// is equivalent to getRangeLength with a Length of StringRef::npos.
123  virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
124  unsigned StartColumn) const {
125  return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
126  }
127 
128  /// \brief Returns the column at which content in line \p LineIndex starts,
129  /// assuming no reflow.
130  ///
131  /// If \p Break is true, returns the column at which the line should start
132  /// after the line break.
133  /// If \p Break is false, returns the column at which the line itself will
134  /// start.
135  virtual unsigned getContentStartColumn(unsigned LineIndex,
136  bool Break) const = 0;
137 
138  /// \brief Returns a range (offset, length) at which to break the line at
139  /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
140  /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
141  /// the token is formatted starting at ContentStartColumn in the reformatted
142  /// file.
143  virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
144  unsigned ColumnLimit, unsigned ContentStartColumn,
145  llvm::Regex &CommentPragmasRegex) const = 0;
146 
147  /// \brief Emits the previously retrieved \p Split via \p Whitespaces.
148  virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
149  WhitespaceManager &Whitespaces) const = 0;
150 
151  /// \brief Returns the number of columns needed to format
152  /// \p RemainingTokenColumns, assuming that Split is within the range measured
153  /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
154  /// to a single space.
155  unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
156  Split Split) const;
157 
158  /// \brief Replaces the whitespace range described by \p Split with a single
159  /// space.
160  virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
161  Split Split,
162  WhitespaceManager &Whitespaces) const = 0;
163 
164  /// \brief Returns whether the token supports reflowing text.
165  virtual bool supportsReflow() const { return false; }
166 
167  /// \brief Returns a whitespace range (offset, length) of the content at \p
168  /// LineIndex such that the content of that line is reflown to the end of the
169  /// previous one.
170  ///
171  /// Returning (StringRef::npos, 0) indicates reflowing is not possible.
172  ///
173  /// The range will include any whitespace preceding the specified line's
174  /// content.
175  ///
176  /// If the split is not contained within one token, for example when reflowing
177  /// line comments, returns (0, <length>).
178  virtual Split getReflowSplit(unsigned LineIndex,
179  llvm::Regex &CommentPragmasRegex) const {
180  return Split(StringRef::npos, 0);
181  }
182 
183  /// \brief Reflows the current line into the end of the previous one.
184  virtual void reflow(unsigned LineIndex,
185  WhitespaceManager &Whitespaces) const {}
186 
187  /// \brief Returns whether there will be a line break at the start of the
188  /// token.
189  virtual bool introducesBreakBeforeToken() const {
190  return false;
191  }
192 
193  /// \brief Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
194  virtual void adaptStartOfLine(unsigned LineIndex,
195  WhitespaceManager &Whitespaces) const {}
196 
197  /// \brief Returns a whitespace range (offset, length) of the content at
198  /// the last line that needs to be reformatted after the last line has been
199  /// reformatted.
200  ///
201  /// A result having offset == StringRef::npos means that no reformat is
202  /// necessary.
203  virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
204  return Split(StringRef::npos, 0);
205  }
206 
207  /// \brief Replaces the whitespace from \p SplitAfterLastLine on the last line
208  /// after the last line has been formatted by performing a reformatting.
209  void replaceWhitespaceAfterLastLine(unsigned TailOffset,
210  Split SplitAfterLastLine,
211  WhitespaceManager &Whitespaces) const {
212  insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
213  Whitespaces);
214  }
215 
216  /// \brief Updates the next token of \p State to the next token after this
217  /// one. This can be used when this token manages a set of underlying tokens
218  /// as a unit and is responsible for the formatting of them.
219  virtual void updateNextToken(LineState &State) const {}
220 
221 protected:
224  : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
225  Style(Style) {}
226 
227  const FormatToken &Tok;
228  const bool InPPDirective;
231 };
232 
234 public:
235  /// \brief Creates a breakable token for a single line string literal.
236  ///
237  /// \p StartColumn specifies the column in which the token will start
238  /// after formatting.
239  BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
240  StringRef Prefix, StringRef Postfix,
242  const FormatStyle &Style);
243 
244  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
245  unsigned ReflowColumn,
246  llvm::Regex &CommentPragmasRegex) const override;
247  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
248  WhitespaceManager &Whitespaces) const override;
249  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
250  WhitespaceManager &Whitespaces) const override {}
251  unsigned getLineCount() const override;
252  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
253  StringRef::size_type Length,
254  unsigned StartColumn) const override;
255  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
256  unsigned StartColumn) const override;
257  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
258 
259 protected:
260  // The column in which the token starts.
261  unsigned StartColumn;
262  // The prefix a line needs after a break in the token.
263  StringRef Prefix;
264  // The postfix a line needs before introducing a break.
265  StringRef Postfix;
266  // The token text excluding the prefix and postfix.
267  StringRef Line;
268  // Length of the sequence of tokens after this string literal that cannot
269  // contain line breaks.
271 };
272 
274 protected:
275  /// \brief Creates a breakable token for a comment.
276  ///
277  /// \p StartColumn specifies the column in which the comment will start after
278  /// formatting.
279  BreakableComment(const FormatToken &Token, unsigned StartColumn,
281  const FormatStyle &Style);
282 
283 public:
284  bool supportsReflow() const override { return true; }
285  unsigned getLineCount() const override;
286  Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
287  unsigned ReflowColumn,
288  llvm::Regex &CommentPragmasRegex) const override;
289  void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
290  WhitespaceManager &Whitespaces) const override;
291 
292 protected:
293  // Returns the token containing the line at LineIndex.
294  const FormatToken &tokenAt(unsigned LineIndex) const;
295 
296  // Checks if the content of line LineIndex may be reflown with the previous
297  // line.
298  virtual bool mayReflow(unsigned LineIndex,
299  llvm::Regex &CommentPragmasRegex) const = 0;
300 
301  // Contains the original text of the lines of the block comment.
302  //
303  // In case of block comments, excludes the leading /* in the first line and
304  // trailing */ in the last line. In case of line comments, excludes the
305  // leading // and spaces.
307 
308  // Contains the text of the lines excluding all leading and trailing
309  // whitespace between the lines. Note that the decoration (if present) is also
310  // not considered part of the text.
312 
313  // Tokens[i] contains a reference to the token containing Lines[i] if the
314  // whitespace range before that token is managed by this block.
315  // Otherwise, Tokens[i] is a null pointer.
317 
318  // ContentColumn[i] is the target column at which Content[i] should be.
319  // Note that this excludes a leading "* " or "*" in case of block comments
320  // where all lines have a "*" prefix, or the leading "// " or "//" in case of
321  // line comments.
322  //
323  // In block comments, the first line's target column is always positive. The
324  // remaining lines' target columns are relative to the first line to allow
325  // correct indentation of comments in \c WhitespaceManager. Thus they can be
326  // negative as well (in case the first line needs to be unindented more than
327  // there's actual whitespace in another line).
329 
330  // The intended start column of the first line of text from this section.
331  unsigned StartColumn;
332 
333  // The prefix to use in front of a line that has been reflown up.
334  // For example, when reflowing the second line after the first here:
335  // // comment 1
336  // // comment 2
337  // we expect:
338  // // comment 1 comment 2
339  // and not:
340  // // comment 1comment 2
341  StringRef ReflowPrefix = " ";
342 };
343 
345 public:
346  BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
347  unsigned OriginalStartColumn, bool FirstInLine,
349  const FormatStyle &Style);
350 
351  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
352  StringRef::size_type Length,
353  unsigned StartColumn) const override;
354  unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
355  unsigned StartColumn) const override;
356  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
357  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
358  WhitespaceManager &Whitespaces) const override;
359  Split getReflowSplit(unsigned LineIndex,
360  llvm::Regex &CommentPragmasRegex) const override;
361  void reflow(unsigned LineIndex,
362  WhitespaceManager &Whitespaces) const override;
363  bool introducesBreakBeforeToken() const override;
364  void adaptStartOfLine(unsigned LineIndex,
365  WhitespaceManager &Whitespaces) const override;
366  Split getSplitAfterLastLine(unsigned TailOffset) const override;
367 
368  bool mayReflow(unsigned LineIndex,
369  llvm::Regex &CommentPragmasRegex) const override;
370 
371 private:
372  // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
373  //
374  // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
375  // leading and trailing whitespace.
376  //
377  // Sets ContentColumn to the intended column in which the text at
378  // Lines[LineIndex] starts (note that the decoration, if present, is not
379  // considered part of the text).
380  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
381 
382  // The column at which the text of a broken line should start.
383  // Note that an optional decoration would go before that column.
384  // IndentAtLineBreak is a uniform position for all lines in a block comment,
385  // regardless of their relative position.
386  // FIXME: Revisit the decision to do this; the main reason was to support
387  // patterns like
388  // /**************//**
389  // * Comment
390  // We could also support such patterns by special casing the first line
391  // instead.
392  unsigned IndentAtLineBreak;
393 
394  // This is to distinguish between the case when the last line was empty and
395  // the case when it started with a decoration ("*" or "* ").
396  bool LastLineNeedsDecoration;
397 
398  // Either "* " if all lines begin with a "*", or empty.
399  StringRef Decoration;
400 
401  // If this block comment has decorations, this is the column of the start of
402  // the decorations.
403  unsigned DecorationColumn;
404 
405  // If true, make sure that the opening '/**' and the closing '*/' are each on
406  // a line of their own. Styles like jsdoc require this for multiline comments.
407  bool DelimitersOnNewline;
408 
409  // Length of the sequence of tokens after this block comment that cannot
410  // contain line breaks.
411  unsigned UnbreakableTailLength;
412 };
413 
415 public:
416  BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
417  unsigned OriginalStartColumn, bool FirstInLine,
419  const FormatStyle &Style);
420 
421  unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
422  StringRef::size_type Length,
423  unsigned StartColumn) const override;
424  unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
425  void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
426  WhitespaceManager &Whitespaces) const override;
427  Split getReflowSplit(unsigned LineIndex,
428  llvm::Regex &CommentPragmasRegex) const override;
429  void reflow(unsigned LineIndex,
430  WhitespaceManager &Whitespaces) const override;
431  void adaptStartOfLine(unsigned LineIndex,
432  WhitespaceManager &Whitespaces) const override;
433  void updateNextToken(LineState &State) const override;
434  bool mayReflow(unsigned LineIndex,
435  llvm::Regex &CommentPragmasRegex) const override;
436 
437 private:
438  // OriginalPrefix[i] contains the original prefix of line i, including
439  // trailing whitespace before the start of the content. The indentation
440  // preceding the prefix is not included.
441  // For example, if the line is:
442  // // content
443  // then the original prefix is "// ".
444  SmallVector<StringRef, 16> OriginalPrefix;
445 
446  // Prefix[i] contains the intended leading "//" with trailing spaces to
447  // account for the indentation of content within the comment at line i after
448  // formatting. It can be different than the original prefix when the original
449  // line starts like this:
450  // //content
451  // Then the original prefix is "//", but the prefix is "// ".
453 
454  SmallVector<unsigned, 16> OriginalContentColumn;
455 
456  /// \brief The token to which the last line of this breakable token belongs;
457  /// nullptr if that token is the initial token.
458  ///
459  /// The distinction is because if the token of the last line of this breakable
460  /// token is distinct from the initial token, this breakable token owns the
461  /// whitespace before the token of the last line, and the whitespace manager
462  /// must be able to modify it.
463  FormatToken *LastLineTok = nullptr;
464 };
465 } // namespace format
466 } // namespace clang
467 
468 #endif
virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset, unsigned StartColumn) const
Returns the number of columns required to format the text following the byte Offset in the line LineI...
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off */.
virtual Split getSplitAfterLastLine(unsigned TailOffset) const
Returns a whitespace range (offset, length) of the content at the last line that needs to be reformat...
virtual unsigned getContentStartColumn(unsigned LineIndex, bool Break) const =0
Returns the column at which content in line LineIndex starts, assuming no reflow. ...
virtual ~BreakableToken()
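For tokens that require the whitespace after the last line to be reformatted, for example in multiline jsdoc comments that require the trailing '*/' to be on a line of itself, there are analogous operations that might be executed after the last line has been reformatted: getSplitAfterLastLine, for finding a split after the last line that needs to be reflown, and replaceWhitespaceAfterLastLine, for executing the reflow using a whitespace manager.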
SmallVector< int, 16 > ContentColumn
LineState State
Contains functions for text encoding manipulation.
This file implements a token annotator, i.e.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
virtual void updateNextToken(LineState &State) const
Updates the next token of State to the next token after this one.
Manages the whitespaces around tokens and their replacements.
uint32_t Offset
Definition: CacheTokens.cpp:43
const FormatToken & Tok
bool supportsReflow() const override
virtual void adaptStartOfLine(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Replaces the whitespace between LineIndex-1 and LineIndex.
The current state when indenting an unwrapped line.
WhitespaceManager class manages whitespace around tokens and their replacements.
SmallVector< StringRef, 16 > Content
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:120
virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset, StringRef::size_type Length, unsigned StartColumn) const =0
Returns the number of columns required to format the text in the byte range [Offset, Offset + Length).
SmallVector< FormatToken *, 16 > Tokens
const bool InPPDirective
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const override
virtual bool supportsReflow() const
Returns whether the token supports reflowing text.
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const =0
Replaces the whitespace range described by Split with a single space.
Dataflow Directional Tag Classes.
unsigned getLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns needed to format RemainingTokenColumns, assuming that Split is within t...
virtual unsigned getLineCount() const =0
Returns the number of lines in this token in the original code.
virtual bool introducesBreakBeforeToken() const
Returns whether there will be a line break at the start of the token.
virtual void reflow(unsigned LineIndex, WhitespaceManager &Whitespaces) const
Reflows the current line into the end of the previous one.
virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) const =0
Emits the previously retrieved Split via Whitespaces.
BreakableToken(const FormatToken &Tok, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
SmallVector< StringRef, 16 > Lines
virtual Split getReflowSplit(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const
Returns a whitespace range (offset, length) of the content at LineIndex such that the content of that...
const encoding::Encoding Encoding
const FormatStyle & Style
virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, unsigned ContentStartColumn, llvm::Regex &CommentPragmasRegex) const =0
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...