clang  5.0.0svn
BreakableToken.cpp
Go to the documentation of this file.
1 //===--- BreakableToken.cpp - Format C++ code -----------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Contains implementation of BreakableToken class and classes derived
12 /// from it.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "BreakableToken.h"
17 #include "ContinuationIndenter.h"
18 #include "clang/Basic/CharInfo.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Support/Debug.h"
22 #include <algorithm>
23 
24 #define DEBUG_TYPE "format-token-breaker"
25 
26 namespace clang {
27 namespace format {
28 
/// Characters treated as horizontal whitespace when trimming and splitting
/// comment and string text. The newline character is deliberately absent.
static const char *const Blanks = " \t\v\f\r";

/// Returns true if \p C is one of the characters listed in Blanks.
static bool IsBlank(char C) {
  return C == ' ' || C == '\t' || C == '\v' || C == '\f' || C == '\r';
}
42 
43 static StringRef getLineCommentIndentPrefix(StringRef Comment) {
44  static const char *const KnownPrefixes[] = {
45  "///<", "//!<", "///", "//", "//!"};
46  StringRef LongestPrefix;
47  for (StringRef KnownPrefix : KnownPrefixes) {
48  if (Comment.startswith(KnownPrefix)) {
49  size_t PrefixLength = KnownPrefix.size();
50  while (PrefixLength < Comment.size() && Comment[PrefixLength] == ' ')
51  ++PrefixLength;
52  if (PrefixLength > LongestPrefix.size())
53  LongestPrefix = Comment.substr(0, PrefixLength);
54  }
55  }
56  return LongestPrefix;
57 }
58 
// NOTE(review): listing lines 59 and 63 are missing from this extract, so the
// return type, the function name, the first parameter and the last parameter
// are not visible. The body reads Text and Encoding in addition to the
// parameters shown; call sites in this file invoke it as
// getCommentSplit(Text, ContentStartColumn, ColumnLimit, TabWidth, Encoding).
//
// Chooses a run of blanks in Text at which a comment line can be broken.
// Returns Split(offset of the end of the text kept before the break, number of
// blank characters removed at the break), or Split(StringRef::npos, 0) when no
// usable break position exists.
60  unsigned ContentStartColumn,
61  unsigned ColumnLimit,
62  unsigned TabWidth,
64  if (ColumnLimit <= ContentStartColumn + 1)
65  return BreakableToken::Split(StringRef::npos, 0);
66 
67  unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
68  unsigned MaxSplitBytes = 0;
69 
// Walk Text one code point at a time, adding up display columns, until
// MaxSplit columns are used or Text ends. MaxSplitBytes ends up as the
// matching byte offset into Text.
70  for (unsigned NumChars = 0;
71  NumChars < MaxSplit && MaxSplitBytes < Text.size();) {
72  unsigned BytesInChar =
73  encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
74  NumChars +=
75  encoding::columnWidthWithTabs(Text.substr(MaxSplitBytes, BytesInChar),
76  ContentStartColumn, TabWidth, Encoding);
77  MaxSplitBytes += BytesInChar;
78  }
79 
// Prefer the last blank at or before that byte offset.
80  StringRef::size_type SpaceOffset = Text.find_last_of(Blanks, MaxSplitBytes);
81 
82  // Do not split before a number followed by a dot: this would be interpreted
83  // as a numbered list, which would prevent re-flowing in subsequent passes.
84  static llvm::Regex kNumberedListRegexp = llvm::Regex("^[1-9][0-9]?\\.");
85  if (SpaceOffset != StringRef::npos &&
86  kNumberedListRegexp.match(Text.substr(SpaceOffset).ltrim(Blanks)))
87  SpaceOffset = Text.find_last_of(Blanks, SpaceOffset);
88 
89  if (SpaceOffset == StringRef::npos ||
90  // Don't break at leading whitespace.
91  Text.find_last_not_of(Blanks, SpaceOffset) == StringRef::npos) {
92  // Make sure that we don't break at leading whitespace that
93  // reaches past MaxSplit.
94  StringRef::size_type FirstNonWhitespace = Text.find_first_not_of(Blanks);
95  if (FirstNonWhitespace == StringRef::npos)
96  // If the comment is only whitespace, we cannot split.
97  return BreakableToken::Split(StringRef::npos, 0);
// No blank fits within the limit: fall back to the first blank found at or
// after both the limit and the first non-blank character.
98  SpaceOffset = Text.find_first_of(
99  Blanks, std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
100  }
101  if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
// Trim blanks on both sides of the cut so that the reported split covers the
// whole run of blanks surrounding SpaceOffset.
102  StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim(Blanks);
103  StringRef AfterCut = Text.substr(SpaceOffset).ltrim(Blanks);
104  return BreakableToken::Split(BeforeCut.size(),
105  AfterCut.begin() - BeforeCut.end());
106  }
107  return BreakableToken::Split(StringRef::npos, 0);
108 }
109 
// NOTE(review): listing line 110 (the line before the function name, which
// would hold the return type) and listing line 130 (the start of the statement
// whose continuation is listing line 131) are missing from this extract.
//
// Chooses a position at which the string text in Text can be broken so that
// the part before the break fits in ColumnLimit - UsedColumns columns.
// Candidates, in order of preference: after the last blank seen, after the
// last '/', after the last single-byte non-alphanumeric character, and
// otherwise at the last code point / escape sequence boundary reached.
// Offset 0 doubles as "not found", so a candidate at offset 0 is ignored.
// Returns Split(StringRef::npos, 0) when nothing applies.
111 getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit,
112  unsigned TabWidth, encoding::Encoding Encoding) {
113  // FIXME: Reduce unit test case.
114  if (Text.empty())
115  return BreakableToken::Split(StringRef::npos, 0);
116  if (ColumnLimit <= UsedColumns)
117  return BreakableToken::Split(StringRef::npos, 0);
118  unsigned MaxSplit = ColumnLimit - UsedColumns;
119  StringRef::size_type SpaceOffset = 0;
120  StringRef::size_type SlashOffset = 0;
121  StringRef::size_type WordStartOffset = 0;
122  StringRef::size_type SplitPoint = 0;
// Text is consumed from the front on each iteration; SplitPoint tracks how
// many bytes of the original text have been consumed so far.
123  for (unsigned Chars = 0;;) {
124  unsigned Advance;
125  if (Text[0] == '\\') {
// An escape sequence is consumed as a unit, so no split lands inside it.
126  Advance = encoding::getEscapeSequenceLength(Text);
127  Chars += Advance;
128  } else {
129  Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
131  Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
132  }
133 
// Stop once the limit is exceeded or the current unit is the last one.
134  if (Chars > MaxSplit || Text.size() <= Advance)
135  break;
136 
137  if (IsBlank(Text[0]))
138  SpaceOffset = SplitPoint;
139  if (Text[0] == '/')
140  SlashOffset = SplitPoint;
141  if (Advance == 1 && !isAlphanumeric(Text[0]))
142  WordStartOffset = SplitPoint;
143 
144  SplitPoint += Advance;
145  Text = Text.substr(Advance);
146  }
147 
// The "+ 1" places the split after the recorded character.
148  if (SpaceOffset != 0)
149  return BreakableToken::Split(SpaceOffset + 1, 0);
150  if (SlashOffset != 0)
151  return BreakableToken::Split(SlashOffset + 1, 0);
152  if (WordStartOffset != 0)
153  return BreakableToken::Split(WordStartOffset + 1, 0);
154  if (SplitPoint != 0)
155  return BreakableToken::Split(SplitPoint, 0);
156  return BreakableToken::Split(StringRef::npos, 0);
157 }
158 
// NOTE(review): listing line 159, which holds the function's declarator, is
// missing from this extract; another block in this file calls it as
// switchesFormatting(tokenAt(LineIndex)).
//
// Returns true when the text of the comment token Token, after dropping its
// first two characters and any leading whitespace, starts with
// "clang-format on" or "clang-format off". Asserts that Token is a block or
// line comment.
160  assert((Token.is(TT_BlockComment) || Token.is(TT_LineComment)) &&
161  "formatting regions are switched by comment tokens");
162  StringRef Content = Token.TokenText.substr(2).ltrim();
163  return Content.startswith("clang-format on") ||
164  Content.startswith("clang-format off");
165 }
166 
167 unsigned
168 BreakableToken::getLineLengthAfterCompression(unsigned RemainingTokenColumns,
169  Split Split) const {
170  // Example: consider the content
171  // lala lala
172  // - RemainingTokenColumns is the original number of columns, 10;
173  // - Split is (4, 2), denoting the two spaces between the two words;
174  //
175  // We compute the number of columns when the split is compressed into a single
176  // space, like:
177  // lala lala
178  return RemainingTokenColumns + 1 - Split.second;
179 }
180 
181 unsigned BreakableSingleLineToken::getLineCount() const { return 1; }
182 
// NOTE(review): listing lines 183 (start of the declarator) and 189 (the
// remaining arguments of the columnWidthWithTabs call) are missing from this
// extract. The members used (StartColumn, Prefix, Postfix, Line) match those
// initialised by the BreakableSingleLineToken constructor, so this is
// presumably that class's getLineLengthAfterSplit; confirm against the header.
//
// Computes StartColumn + Prefix.size() + Postfix.size() plus the column width
// of Line.substr(TailOffset, Length). LineIndex is not used in the visible
// part of the body.
184  unsigned LineIndex, unsigned TailOffset,
185  StringRef::size_type Length) const {
186  return StartColumn + Prefix.size() + Postfix.size() +
187  encoding::columnWidthWithTabs(Line.substr(TailOffset, Length),
188  StartColumn + Prefix.size(),
190 }
191 
// NOTE(review): listing line 192, which opens this constructor's declarator,
// is missing from this extract; the member initialisers suggest it is the
// BreakableSingleLineToken constructor.
//
// Stores StartColumn, Prefix and Postfix, and sets Line to the text of Tok
// with Prefix removed from the front and Postfix removed from the back.
// Asserts that Tok's text actually starts with Prefix and ends with Postfix.
193  const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
194  StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding,
195  const FormatStyle &Style)
196  : BreakableToken(Tok, InPPDirective, Encoding, Style),
197  StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix) {
198  assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
199  Line = Tok.TokenText.substr(
200  Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
201 }
202 
// NOTE(review): listing lines 203 (start of the declarator) and 205 (part of
// the parameter list; the body uses Postfix, InPPDirective and Encoding, which
// are not declared on the visible lines) are missing from this extract.
// Presumably the BreakableStringLiteral constructor; confirm against the
// header.
//
// Forwards every argument unchanged to BreakableSingleLineToken and does
// nothing else.
204  const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
206  const FormatStyle &Style)
207  : BreakableSingleLineToken(Tok, StartColumn, Prefix, Postfix, InPPDirective,
208  Encoding, Style) {}
209 
// NOTE(review): listing line 210 (the line before the qualified name, which
// would hold the return type) is missing from this extract.
//
// Delegates to getStringSplit on the part of Line starting at TailOffset. The
// columns reported as already used are StartColumn plus the sizes of Prefix
// and Postfix. LineIndex and CommentPragmasRegex are not used.
211 BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
212  unsigned ColumnLimit,
213  llvm::Regex &CommentPragmasRegex) const {
214  return getStringSplit(Line.substr(TailOffset),
215  StartColumn + Prefix.size() + Postfix.size(),
216  ColumnLimit, Style.TabWidth, Encoding);
217 }
218 
// NOTE(review): listing line 224, holding the remaining arguments of the
// replaceWhitespaceInToken call, is missing from this extract.
//
// Requests a replacement inside Tok at the split position. The offset passed
// is Prefix.size() + TailOffset + Split.first, i.e. relative to the start of
// the token text rather than to Line, and Postfix is passed as the text that
// follows it in the argument list. LineIndex is not used in the visible part.
219 void BreakableStringLiteral::insertBreak(unsigned LineIndex,
220  unsigned TailOffset, Split Split,
221  WhitespaceManager &Whitespaces) {
222  Whitespaces.replaceWhitespaceInToken(
223  Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
225 }
226 
// NOTE(review): listing lines 227 (start of the declarator, including the
// Token parameter) and 230 (presumably the Encoding parameter) are missing
// from this extract. The initialiser list suggests this is the
// BreakableComment constructor.
//
// Forwards Token, InPPDirective, Encoding and Style to BreakableToken and
// records StartColumn.
228  unsigned StartColumn,
229  bool InPPDirective,
231  const FormatStyle &Style)
232  : BreakableToken(Token, InPPDirective, Encoding, Style),
233  StartColumn(StartColumn) {}
234 
235 unsigned BreakableComment::getLineCount() const { return Lines.size(); }
236 
// NOTE(review): listing line 237 (the line before the qualified name, which
// would hold the return type) is missing from this extract.
//
// Returns where the content of line LineIndex, starting at TailOffset, may be
// broken. Lines whose whole content matches CommentPragmasRegex are never
// broken. Everything else is delegated to getCommentSplit, with the start
// column taken from getContentStartColumn.
238 BreakableComment::getSplit(unsigned LineIndex, unsigned TailOffset,
239  unsigned ColumnLimit,
240  llvm::Regex &CommentPragmasRegex) const {
241  // Don't break lines matching the comment pragmas regex.
242  if (CommentPragmasRegex.match(Content[LineIndex]))
243  return Split(StringRef::npos, 0);
244  return getCommentSplit(Content[LineIndex].substr(TailOffset),
245  getContentStartColumn(LineIndex, TailOffset),
246  ColumnLimit, Style.TabWidth, Encoding);
247 }
248 
249 void BreakableComment::compressWhitespace(unsigned LineIndex,
250  unsigned TailOffset, Split Split,
251  WhitespaceManager &Whitespaces) {
252  StringRef Text = Content[LineIndex].substr(TailOffset);
253  // Text is relative to the content line, but Whitespaces operates relative to
254  // the start of the corresponding token, so compute the start of the Split
255  // that needs to be compressed into a single space relative to the start of
256  // its token.
257  unsigned BreakOffsetInToken =
258  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
259  unsigned CharsToRemove = Split.second;
260  Whitespaces.replaceWhitespaceInToken(
261  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "", "",
262  /*InPPDirective=*/false, /*Newlines=*/0, /*Spaces=*/1);
263 }
264 
// NOTE(review): several listing lines are missing from this extract: 265-266
// (start of the declarator; the body uses Text and ReflowPrefix, which are not
// declared on the visible lines), 275 and 283 (the trailing arguments of the
// columnWidthWithTabs and getCommentSplit calls). Other blocks in this file
// call it as
// getReflowSplit(Content, ReflowPrefix, PreviousEndColumn, ColumnLimit).
//
// Decides how much of Text can be moved up onto the previous line, which ends
// at PreviousEndColumn, when joined by ReflowPrefix. Returns a split after the
// part that can move up, or Split(StringRef::npos, 0) when nothing can be
// moved without exceeding ColumnLimit.
267  unsigned PreviousEndColumn,
268  unsigned ColumnLimit) const {
269  unsigned ReflowStartColumn = PreviousEndColumn + ReflowPrefix.size();
270  StringRef TrimmedText = Text.rtrim(Blanks);
271  // This is the width of the resulting line in case the full line of Text gets
272  // reflown up starting at ReflowStartColumn.
273  unsigned FullWidth = ReflowStartColumn + encoding::columnWidthWithTabs(
274  TrimmedText, ReflowStartColumn,
276  // If the full line fits up, we return a reflow split after it,
277  // otherwise we compute the largest piece of text that fits after
278  // ReflowStartColumn.
279  Split ReflowSplit =
280  FullWidth <= ColumnLimit
281  ? Split(TrimmedText.size(), Text.size() - TrimmedText.size())
282  : getCommentSplit(Text, ReflowStartColumn, ColumnLimit,
284 
285  // We need to be extra careful here, because while it's OK to keep a long line
286  // if it can't be broken into smaller pieces (like when the first word of a
287  // long line is longer than the column limit), it's not OK to reflow that long
288  // word up. So we recompute the size of the previous line after reflowing and
289  // only return the reflow split if that's under the line limit.
290  if (ReflowSplit.first != StringRef::npos &&
291  // Check if the width of the newly reflown line is under the limit.
292  PreviousEndColumn + ReflowPrefix.size() +
293  encoding::columnWidthWithTabs(Text.substr(0, ReflowSplit.first),
294  PreviousEndColumn +
295  ReflowPrefix.size(),
296  Style.TabWidth, Encoding) <=
297  ColumnLimit) {
298  return ReflowSplit;
299  }
300  return Split(StringRef::npos, 0);
301 }
302 
303 const FormatToken &BreakableComment::tokenAt(unsigned LineIndex) const {
304  return Tokens[LineIndex] ? *Tokens[LineIndex] : Tok;
305 }
306 
307 static bool mayReflowContent(StringRef Content) {
308  Content = Content.trim(Blanks);
309  // Lines starting with '@' commonly have special meaning.
310  // Lines starting with '-', '-#', '+' or '*' are bulleted/numbered lists.
311  static const SmallVector<StringRef, 8> kSpecialMeaningPrefixes = {
312  "@", "TODO", "FIXME", "XXX", "-# ", "- ", "+ ", "* " };
313  bool hasSpecialMeaningPrefix = false;
314  for (StringRef Prefix : kSpecialMeaningPrefixes) {
315  if (Content.startswith(Prefix)) {
316  hasSpecialMeaningPrefix = true;
317  break;
318  }
319  }
320 
321  // Numbered lists may also start with a number followed by '.'
322  // To avoid issues if a line starts with a number which is actually the end
323  // of a previous line, we only consider numbers with up to 2 digits.
324  static llvm::Regex kNumberedListRegexp = llvm::Regex("^[1-9][0-9]?\\. ");
325  hasSpecialMeaningPrefix = hasSpecialMeaningPrefix ||
326  kNumberedListRegexp.match(Content);
327 
328  // Simple heuristic for what to reflow: content should contain at least two
329  // characters and either the first or second character must be
330  // non-punctuation.
331  return Content.size() >= 2 && !hasSpecialMeaningPrefix &&
332  !Content.endswith("\\") &&
333  // Note that this is UTF-8 safe, since if isPunctuation(Content[0]) is
334  // true, then the first code point must be 1 byte long.
335  (!isPunctuation(Content[0]) || !isPunctuation(Content[1]));
336 }
337 
// NOTE(review): listing lines 338 (start of the declarator) and 341 (the rest
// of the parameter list; the initialiser uses Encoding and Style, which are
// not declared on the visible lines) are missing from this extract.
// Presumably the BreakableBlockComment constructor; confirm against the
// header.
//
// Splits the text between "/*" and "*/" into Lines at '\n' and fills in
// Content and ContentColumn for each line. From these it derives the
// decoration shared by the lines (Decoration, DecorationColumn), whether the
// last line needs a decoration (LastLineNeedsDecoration), and the indent used
// when a line is broken (IndentAtLineBreak).
339  const FormatToken &Token, unsigned StartColumn,
340  unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
342  : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style) {
343  assert(Tok.is(TT_BlockComment) &&
344  "block comment section must start with a block comment");
345 
346  StringRef TokenText(Tok.TokenText);
347  assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
348  TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
349 
// IndentDelta is how far the comment moved horizontally; adjustWhitespace
// applies it to every line after the first.
350  int IndentDelta = StartColumn - OriginalStartColumn;
351  Content.resize(Lines.size());
352  Content[0] = Lines[0];
353  ContentColumn.resize(Lines.size());
354  // Account for the initial '/*'.
355  ContentColumn[0] = StartColumn + 2;
356  Tokens.resize(Lines.size());
357  for (size_t i = 1; i < Lines.size(); ++i)
358  adjustWhitespace(i, IndentDelta);
359 
360  // Align decorations with the column of the star on the first line,
361  // that is one column after the start "/*".
362  DecorationColumn = StartColumn + 1;
363 
364  // Account for comment decoration patterns like this:
365  //
366  // /*
367  // ** blah blah blah
368  // */
369  if (Lines.size() >= 2 && Content[1].startswith("**") &&
370  static_cast<unsigned>(ContentColumn[1]) == StartColumn) {
371  DecorationColumn = StartColumn;
372  }
373 
// Start from the candidate decoration "* " and shorten it until every
// relevant line starts with it; it may end up empty.
374  Decoration = "* ";
375  if (Lines.size() == 1 && !FirstInLine) {
376  // Comments for which FirstInLine is false can start on arbitrary column,
377  // and available horizontal space can be too small to align consecutive
378  // lines with the first one.
379  // FIXME: We could, probably, align them to current indentation level, but
380  // now we just wrap them without stars.
381  Decoration = "";
382  }
383  for (size_t i = 1, e = Lines.size(); i < e && !Decoration.empty(); ++i) {
384  // If the last line is empty, the closing "*/" will have a star.
385  if (i + 1 == e && Content[i].empty())
386  break;
// A non-empty, non-last line that is itself a prefix of the decoration
// (for example a lone "*") does not force the decoration to shrink.
387  if (!Content[i].empty() && i + 1 != e &&
388  Decoration.startswith(Content[i]))
389  continue;
390  while (!Content[i].startswith(Decoration))
391  Decoration = Decoration.substr(0, Decoration.size() - 1);
392  }
393 
394  LastLineNeedsDecoration = true;
395  IndentAtLineBreak = ContentColumn[0] + 1;
396  for (size_t i = 1, e = Lines.size(); i < e; ++i) {
397  if (Content[i].empty()) {
398  if (i + 1 == e) {
399  // Empty last line means that we already have a star as a part of the
400  // trailing */. We also need to preserve whitespace, so that */ is
401  // correctly indented.
402  LastLineNeedsDecoration = false;
403  // Align the star in the last '*/' with the stars on the previous lines.
404  if (e >= 2 && !Decoration.empty()) {
405  ContentColumn[i] = DecorationColumn;
406  }
407  } else if (Decoration.empty()) {
408  // For all other lines, set the start column to 0 if they're empty, so
409  // we do not insert trailing whitespace anywhere.
410  ContentColumn[i] = 0;
411  }
412  continue;
413  }
414 
415  // The first line already excludes the star.
416  // The last line excludes the star if LastLineNeedsDecoration is false.
417  // For all other lines, adjust the line to exclude the star and
418  // (optionally) the first whitespace.
419  unsigned DecorationSize = Decoration.startswith(Content[i])
420  ? Content[i].size()
421  : Decoration.size();
422  if (DecorationSize) {
423  ContentColumn[i] = DecorationColumn + DecorationSize;
424  }
425  Content[i] = Content[i].substr(DecorationSize);
// Lines that still carry text pull IndentAtLineBreak down to their content
// column; negative columns are clamped to 0.
426  if (!Decoration.startswith(Content[i]))
427  IndentAtLineBreak =
428  std::min<int>(IndentAtLineBreak, std::max(0, ContentColumn[i]));
429  }
// The break indent is never smaller than the decoration itself.
430  IndentAtLineBreak =
431  std::max<unsigned>(IndentAtLineBreak, Decoration.size());
432 
433  DEBUG({
434  llvm::dbgs() << "IndentAtLineBreak " << IndentAtLineBreak << "\n";
435  for (size_t i = 0; i < Lines.size(); ++i) {
436  llvm::dbgs() << i << " |" << Content[i] << "| "
437  << "CC=" << ContentColumn[i] << "| "
438  << "IN=" << (Content[i].data() - Lines[i].data()) << "\n";
439  }
440  });
441 }
442 
// NOTE(review): listing line 476, the middle of the final assignment to
// ContentColumn[LineIndex], is missing from this extract; only its first and
// last lines are visible.
//
// Recomputes the content of line LineIndex and of the line before it. The
// previous line's Content is cut at its last non-blank character (and, inside
// a preprocessor directive, before a trailing backslash). The current line's
// Content starts at its first non-blank character. The current line's
// ContentColumn is then set by the final assignment, which involves
// IndentDelta. Reads Lines[LineIndex - 1], so the constructor only calls it
// with LineIndex >= 1.
443 void BreakableBlockComment::adjustWhitespace(unsigned LineIndex,
444  int IndentDelta) {
445  // When in a preprocessor directive, the trailing backslash in a block comment
446  // is not needed, but can serve a purpose of uniformity with necessary escaped
447  // newlines outside the comment. In this case we remove it here before
448  // trimming the trailing whitespace. The backslash will be re-added later when
449  // inserting a line break.
450  size_t EndOfPreviousLine = Lines[LineIndex - 1].size();
451  if (InPPDirective && Lines[LineIndex - 1].endswith("\\"))
452  --EndOfPreviousLine;
453 
454  // Calculate the end of the non-whitespace text in the previous line.
455  EndOfPreviousLine =
456  Lines[LineIndex - 1].find_last_not_of(Blanks, EndOfPreviousLine);
457  if (EndOfPreviousLine == StringRef::npos)
458  EndOfPreviousLine = 0;
459  else
460  ++EndOfPreviousLine;
461  // Calculate the start of the non-whitespace text in the current line.
462  size_t StartOfLine = Lines[LineIndex].find_first_not_of(Blanks);
463  if (StartOfLine == StringRef::npos)
464  StartOfLine = Lines[LineIndex].rtrim("\r\n").size();
465 
// The leading blanks of the current line; the visible lines do not use this
// variable, so it is presumably consumed by the missing listing line 476.
466  StringRef Whitespace = Lines[LineIndex].substr(0, StartOfLine);
467  // Adjust Lines to only contain relevant text.
468  size_t PreviousContentOffset =
469  Content[LineIndex - 1].data() - Lines[LineIndex - 1].data();
470  Content[LineIndex - 1] = Lines[LineIndex - 1].substr(
471  PreviousContentOffset, EndOfPreviousLine - PreviousContentOffset);
472  Content[LineIndex] = Lines[LineIndex].substr(StartOfLine);
473 
474  // Adjust the start column uniformly across all lines.
475  ContentColumn[LineIndex] =
477  IndentDelta;
478 }
479 
// NOTE(review): listing line 480, which opens the declarator, is missing from
// this extract. The use of Decoration and Lines suggests this is
// BreakableBlockComment::getLineLengthAfterSplit; confirm against the header.
//
// Returns the end column of the piece of line LineIndex that starts at
// TailOffset and is Length bytes long: the content start column plus the
// piece's display width. On the last line, 2 is added for the "*/" postfix;
// if the piece is empty, Decoration.size() is subtracted again.
481  unsigned LineIndex, unsigned TailOffset,
482  StringRef::size_type Length) const {
483  unsigned ContentStartColumn = getContentStartColumn(LineIndex, TailOffset);
484  unsigned LineLength =
485  ContentStartColumn + encoding::columnWidthWithTabs(
486  Content[LineIndex].substr(TailOffset, Length),
487  ContentStartColumn, Style.TabWidth, Encoding);
488  // The last line gets a "*/" postfix.
489  if (LineIndex + 1 == Lines.size()) {
490  LineLength += 2;
491  // We never need a decoration when breaking just the trailing "*/" postfix.
492  // Note that checking that Length == 0 is not enough, since Length could
493  // also be StringRef::npos.
494  if (Content[LineIndex].substr(TailOffset, Length).empty()) {
495  LineLength -= Decoration.size();
496  }
497  }
498  return LineLength;
499 }
500 
501 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
502  Split Split,
503  WhitespaceManager &Whitespaces) {
504  StringRef Text = Content[LineIndex].substr(TailOffset);
505  StringRef Prefix = Decoration;
506  // We need this to account for the case when we have a decoration "* " for all
507  // the lines except for the last one, where the star in "*/" acts as a
508  // decoration.
509  unsigned LocalIndentAtLineBreak = IndentAtLineBreak;
510  if (LineIndex + 1 == Lines.size() &&
511  Text.size() == Split.first + Split.second) {
512  // For the last line we need to break before "*/", but not to add "* ".
513  Prefix = "";
514  if (LocalIndentAtLineBreak >= 2)
515  LocalIndentAtLineBreak -= 2;
516  }
517  // The split offset is from the beginning of the line. Convert it to an offset
518  // from the beginning of the token text.
519  unsigned BreakOffsetInToken =
520  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
521  unsigned CharsToRemove = Split.second;
522  assert(LocalIndentAtLineBreak >= Prefix.size());
523  Whitespaces.replaceWhitespaceInToken(
524  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "", Prefix,
525  InPPDirective, /*Newlines=*/1,
526  /*Spaces=*/LocalIndentAtLineBreak - Prefix.size());
527 }
528 
// NOTE(review): listing line 529, which opens the declarator, is missing from
// this extract; presumably BreakableBlockComment::getSplitBefore.
//
// Returns the reflow split for line LineIndex relative to the previous line
// ending at PreviousEndColumn. Yields Split(StringRef::npos, 0) when
// mayReflow rejects the line. The split is computed on the content with its
// leading blanks removed, so its offsets are relative to that trimmed text.
530  unsigned LineIndex,
531  unsigned PreviousEndColumn,
532  unsigned ColumnLimit,
533  llvm::Regex &CommentPragmasRegex) const {
534  if (!mayReflow(LineIndex, CommentPragmasRegex))
535  return Split(StringRef::npos, 0);
536  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
537  return getReflowSplit(TrimmedContent, ReflowPrefix, PreviousEndColumn,
538  ColumnLimit);
539 }
540 
// NOTE(review): listing line 552, holding the remaining arguments of the
// columnWidthWithTabs call and the "+" that joins PostfixLength, is missing
// from this extract.
//
// Returns the end column reached when Content is appended to a previous line
// ending at PreviousEndColumn, joined by ReflowPrefix. If LineIndex is the
// last line, two columns are added for the trailing "*/".
541 unsigned BreakableBlockComment::getReflownColumn(
542  StringRef Content,
543  unsigned LineIndex,
544  unsigned PreviousEndColumn) const {
545  unsigned StartColumn = PreviousEndColumn + ReflowPrefix.size();
546  // If this is the last line, it will carry around its '*/' postfix.
547  unsigned PostfixLength = (LineIndex + 1 == Lines.size() ? 2 : 0);
548  // The line is composed of previous text, reflow prefix, reflown text and
549  // postfix.
550  unsigned ReflownColumn =
551  StartColumn + encoding::columnWidthWithTabs(Content, StartColumn,
553  PostfixLength;
554  return ReflownColumn;
555 }
556 
// NOTE(review): listing line 557, which opens the declarator, is missing from
// this extract; presumably BreakableBlockComment::getLineLengthAfterSplitBefore.
//
// Returns the length of line LineIndex after SplitBefore has been applied.
// If there is no reflow, or only part of the line is reflown, this is the
// ordinary length from TailOffset to the end of the line. If the whole line is
// reflown, it is the reflown end column, provided that fits in ColumnLimit;
// otherwise the ordinary length is returned.
558  unsigned LineIndex, unsigned TailOffset,
559  unsigned PreviousEndColumn,
560  unsigned ColumnLimit,
561  Split SplitBefore) const {
562  if (SplitBefore.first == StringRef::npos ||
563  // Block comment line contents contain the trailing whitespace after the
564  // decoration, so the need of left trim. Note that this behavior is
565  // consistent with the breaking of block comments where the indentation of
566  // a broken line is uniform across all the lines of the block comment.
567  SplitBefore.first + SplitBefore.second <
568  Content[LineIndex].ltrim().size()) {
569  // A piece of line, not the whole, gets reflown.
570  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
571  } else {
572  // The whole line gets reflown, need to check if we need to insert a break
573  // for the postfix or not.
574  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
575  unsigned ReflownColumn =
576  getReflownColumn(TrimmedContent, LineIndex, PreviousEndColumn);
577  if (ReflownColumn <= ColumnLimit) {
578  return ReflownColumn;
579  }
580  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
581  }
582 }
// NOTE(review): listing line 583, which opens the declarator, is missing from
// this extract; presumably BreakableBlockComment::replaceWhitespaceBefore.
//
// Rewrites the whitespace between line LineIndex - 1 and line LineIndex. If
// SplitBefore denotes a reflow, the range between the end of the previous
// line's content and the first non-blank of this line is replaced by
// ReflowPrefix; a break is then inserted at the split when the fully reflown
// line would exceed ColumnLimit. Otherwise the range is replaced by a newline,
// the indentation and the decoration prefix appropriate for this line.
// Does nothing for the first line.
584  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
585  Split SplitBefore, WhitespaceManager &Whitespaces) {
586  if (LineIndex == 0) return;
587  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
588  if (SplitBefore.first != StringRef::npos) {
589  // Here we need to reflow.
590  assert(Tokens[LineIndex - 1] == Tokens[LineIndex] &&
591  "Reflowing whitespace within a token");
592  // This is the offset of the end of the last line relative to the start of
593  // the token text in the token.
594  unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
595  Content[LineIndex - 1].size() -
596  tokenAt(LineIndex).TokenText.data();
597  unsigned WhitespaceLength = TrimmedContent.data() -
598  tokenAt(LineIndex).TokenText.data() -
599  WhitespaceOffsetInToken;
600  Whitespaces.replaceWhitespaceInToken(
601  tokenAt(LineIndex), WhitespaceOffsetInToken,
602  /*ReplaceChars=*/WhitespaceLength, /*PreviousPostfix=*/"",
603  /*CurrentPrefix=*/ReflowPrefix, InPPDirective, /*Newlines=*/0,
604  /*Spaces=*/0);
605  // Check if we need to also insert a break at the whitespace range.
606  // For this we first adapt the reflow split relative to the beginning of the
607  // content.
608  // Note that we don't need a penalty for this break, since it doesn't change
609  // the total number of lines.
610  Split BreakSplit = SplitBefore;
611  BreakSplit.first += TrimmedContent.data() - Content[LineIndex].data();
612  unsigned ReflownColumn =
613  getReflownColumn(TrimmedContent, LineIndex, PreviousEndColumn);
614  if (ReflownColumn > ColumnLimit) {
615  insertBreak(LineIndex, 0, BreakSplit, Whitespaces);
616  }
617  return;
618  }
619 
620  // Here no reflow with the previous line will happen.
621  // Fix the decoration of the line at LineIndex.
622  StringRef Prefix = Decoration;
623  if (Content[LineIndex].empty()) {
624  if (LineIndex + 1 == Lines.size()) {
625  if (!LastLineNeedsDecoration) {
626  // If the last line was empty, we don't need a prefix, as the */ will
627  // line up with the decoration (if it exists).
628  Prefix = "";
629  }
630  } else if (!Decoration.empty()) {
631  // For other empty lines, if we do have a decoration, adapt it to not
632  // contain a trailing whitespace.
633  Prefix = Prefix.substr(0, 1);
634  }
635  } else {
636  if (ContentColumn[LineIndex] == 1) {
637  // This line starts immediately after the decorating *.
638  Prefix = Prefix.substr(0, 1);
639  }
640  }
641  // This is the offset of the end of the last line relative to the start of the
642  // token text in the token.
643  unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
644  Content[LineIndex - 1].size() -
645  tokenAt(LineIndex).TokenText.data();
646  unsigned WhitespaceLength = Content[LineIndex].data() -
647  tokenAt(LineIndex).TokenText.data() -
648  WhitespaceOffsetInToken;
// The number of spaces requested is the content column minus the prefix, so
// that prefix plus spaces ends exactly at ContentColumn[LineIndex].
649  Whitespaces.replaceWhitespaceInToken(
650  tokenAt(LineIndex), WhitespaceOffsetInToken, WhitespaceLength, "", Prefix,
651  InPPDirective, /*Newlines=*/1, ContentColumn[LineIndex] - Prefix.size());
652 }
653 
654 bool BreakableBlockComment::mayReflow(unsigned LineIndex,
655  llvm::Regex &CommentPragmasRegex) const {
656  // Content[LineIndex] may exclude the indent after the '*' decoration. In that
657  // case, we compute the start of the comment pragma manually.
658  StringRef IndentContent = Content[LineIndex];
659  if (Lines[LineIndex].ltrim(Blanks).startswith("*")) {
660  IndentContent = Lines[LineIndex].ltrim(Blanks).substr(1);
661  }
662  return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
663  mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
664  !switchesFormatting(tokenAt(LineIndex));
665 }
666 
667 unsigned
668 BreakableBlockComment::getContentStartColumn(unsigned LineIndex,
669  unsigned TailOffset) const {
670  // If we break, we always break at the predefined indent.
671  if (TailOffset != 0)
672  return IndentAtLineBreak;
673  return std::max(0, ContentColumn[LineIndex]);
674 }
675 
// NOTE(review): listing lines 676 (start of the declarator), 679 (the rest of
// the parameter list; the initialiser uses Encoding and Style) and 729 (the
// start of the columnWidthWithTabs call that computes ContentColumn[i]) are
// missing from this extract. Presumably the BreakableLineCommentSection
// constructor; confirm against the header.
//
// Collects consecutive line-comment tokens, starting at Tok and following
// Next, into one section. For every line it records the original and the
// possibly space-extended comment prefix, the content after the prefix with
// trailing blanks removed, the owning token and the content columns.
// Collection stops at the first token that is not a line comment, or after a
// token whose successor does not continue the line comment section.
677  const FormatToken &Token, unsigned StartColumn,
678  unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
680  : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style) {
681  assert(Tok.is(TT_LineComment) &&
682  "line comment section must start with a line comment");
// LineTok is null for the first token (tokenAt then falls back to Tok) and is
// set to the next token at the end of each iteration.
683  FormatToken *LineTok = nullptr;
684  for (const FormatToken *CurrentTok = &Tok;
685  CurrentTok && CurrentTok->is(TT_LineComment);
686  CurrentTok = CurrentTok->Next) {
687  LastLineTok = LineTok;
688  StringRef TokenText(CurrentTok->TokenText);
689  assert(TokenText.startswith("//"));
// Lines is appended to, so FirstLineIndex marks where this token's lines
// begin.
690  size_t FirstLineIndex = Lines.size();
691  TokenText.split(Lines, "\n");
692  Content.resize(Lines.size());
693  ContentColumn.resize(Lines.size());
694  OriginalContentColumn.resize(Lines.size());
695  Tokens.resize(Lines.size());
696  Prefix.resize(Lines.size());
697  OriginalPrefix.resize(Lines.size());
698  for (size_t i = FirstLineIndex, e = Lines.size(); i < e; ++i) {
699  // We need to trim the blanks in case this is not the first line in a
700  // multiline comment. Then the indent is included in Lines[i].
701  StringRef IndentPrefix =
702  getLineCommentIndentPrefix(Lines[i].ltrim(Blanks));
703  assert(IndentPrefix.startswith("//"));
704  OriginalPrefix[i] = Prefix[i] = IndentPrefix;
// NOTE(review): IndentPrefix was computed on the blank-trimmed line, but the
// index below and the substr() that sets Content[i] apply its size to the
// untrimmed Lines[i]. The two agree only when Lines[i] has no leading blanks;
// confirm this is intended.
// If the text follows the prefix without a space, use a prefix with one
// trailing space for the reformatted line.
705  if (Lines[i].size() > Prefix[i].size() &&
706  isAlphanumeric(Lines[i][Prefix[i].size()])) {
707  if (Prefix[i] == "//")
708  Prefix[i] = "// ";
709  else if (Prefix[i] == "///")
710  Prefix[i] = "/// ";
711  else if (Prefix[i] == "//!")
712  Prefix[i] = "//! ";
713  else if (Prefix[i] == "///<")
714  Prefix[i] = "///< ";
715  else if (Prefix[i] == "//!<")
716  Prefix[i] = "//!< ";
717  }
718 
719  Tokens[i] = LineTok;
720  Content[i] = Lines[i].substr(IndentPrefix.size());
721  OriginalContentColumn[i] =
722  StartColumn +
723  encoding::columnWidthWithTabs(OriginalPrefix[i],
724  StartColumn,
725  Style.TabWidth,
726  Encoding);
727  ContentColumn[i] =
728  StartColumn +
730  StartColumn,
731  Style.TabWidth,
732  Encoding);
733 
734  // Calculate the end of the non-whitespace text in this line.
735  size_t EndOfLine = Content[i].find_last_not_of(Blanks);
736  if (EndOfLine == StringRef::npos)
737  EndOfLine = Content[i].size();
738  else
739  ++EndOfLine;
740  Content[i] = Content[i].substr(0, EndOfLine);
741  }
742  LineTok = CurrentTok->Next;
743  if (CurrentTok->Next && !CurrentTok->Next->ContinuesLineCommentSection) {
744  // A line comment section needs to broken by a line comment that is
745  // preceded by at least two newlines. Note that we put this break here
746  // instead of breaking at a previous stage during parsing, since that
747  // would split the contents of the enum into two unwrapped lines in this
748  // example, which is undesirable:
749  // enum A {
750  // a, // comment about a
751  //
752  // // comment about b
753  // b
754  // };
755  //
756  // FIXME: Consider putting separate line comment sections as children to
757  // the unwrapped line instead.
758  break;
759  }
760  }
761 }
762 
// NOTE(review): listing line 763, which opens the declarator, is missing from
// this extract. The use of OriginalContentColumn suggests this is
// BreakableLineCommentSection::getLineLengthAfterSplit; confirm against the
// header.
//
// Returns the end column of the piece of line LineIndex that starts at
// TailOffset and is Length bytes long. An unbroken line (TailOffset == 0)
// starts at ContentColumn; a continuation starts at OriginalContentColumn.
764  unsigned LineIndex, unsigned TailOffset,
765  StringRef::size_type Length) const {
766  unsigned ContentStartColumn =
767  (TailOffset == 0 ? ContentColumn[LineIndex]
768  : OriginalContentColumn[LineIndex]);
769  return ContentStartColumn + encoding::columnWidthWithTabs(
770  Content[LineIndex].substr(TailOffset, Length),
771  ContentStartColumn, Style.TabWidth, Encoding);
772 }
773 
// NOTE(review): listing line 774, which opens the declarator (including the
// LineIndex parameter used below), is missing from this extract; presumably
// BreakableLineCommentSection::insertBreak.
//
// Inserts a line break at Split inside the content of line LineIndex. The new
// line starts with Prefix[LineIndex] and is indented so that its content lands
// at the original content column, shifted by the difference in length between
// the new and the original prefix.
775  unsigned TailOffset, Split Split,
776  WhitespaceManager &Whitespaces) {
777  StringRef Text = Content[LineIndex].substr(TailOffset);
778  // Compute the offset of the split relative to the beginning of the token
779  // text.
780  unsigned BreakOffsetInToken =
781  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
782  unsigned CharsToRemove = Split.second;
783  // Compute the size of the new indent, including the size of the new prefix of
784  // the newly broken line.
785  unsigned IndentAtLineBreak = OriginalContentColumn[LineIndex] +
786  Prefix[LineIndex].size() -
787  OriginalPrefix[LineIndex].size();
788  assert(IndentAtLineBreak >= Prefix[LineIndex].size());
789  Whitespaces.replaceWhitespaceInToken(
790  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "",
791  Prefix[LineIndex], InPPDirective, /*Newlines=*/1,
792  /*Spaces=*/IndentAtLineBreak - Prefix[LineIndex].size());
793 }
794 
// NOTE(review): listing line 795, which opens the declarator, is missing from
// this extract; presumably BreakableLineCommentSection::getSplitBefore.
//
// Returns the reflow split for line LineIndex relative to the previous line
// ending at PreviousEndColumn, or Split(StringRef::npos, 0) when mayReflow
// rejects the line. The line's content is passed to getReflowSplit as is,
// without trimming leading blanks.
796  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
797  llvm::Regex &CommentPragmasRegex) const {
798  if (!mayReflow(LineIndex, CommentPragmasRegex))
799  return Split(StringRef::npos, 0);
800  return getReflowSplit(Content[LineIndex], ReflowPrefix, PreviousEndColumn,
801  ColumnLimit);
802 }
803 
// NOTE(review): listing line 804, which opens the declarator, is missing from
// this extract; presumably
// BreakableLineCommentSection::getLineLengthAfterSplitBefore.
//
// Returns the length of line LineIndex after SplitBefore has been applied.
// If there is no reflow, or only part of the line is reflown, this is the
// ordinary length from TailOffset to the end of the line. If the whole line is
// reflown, it is the end column of the content placed after PreviousEndColumn
// and ReflowPrefix. ColumnLimit is not consulted here.
805  unsigned LineIndex, unsigned TailOffset,
806  unsigned PreviousEndColumn,
807  unsigned ColumnLimit,
808  Split SplitBefore) const {
809  if (SplitBefore.first == StringRef::npos ||
810  SplitBefore.first + SplitBefore.second < Content[LineIndex].size()) {
811  // A piece of line, not the whole line, gets reflown.
812  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
813  } else {
814  // The whole line gets reflown.
815  unsigned StartColumn = PreviousEndColumn + ReflowPrefix.size();
816  return StartColumn + encoding::columnWidthWithTabs(Content[LineIndex],
817  StartColumn,
818  Style.TabWidth,
819  Encoding);
820  }
821 }
822 
824  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
825  Split SplitBefore, WhitespaceManager &Whitespaces) {
     // Emits, through Whitespaces, the replacements needed in front of line
     // LineIndex. Three independent steps run in order: (1) handle the first
     // line of a new token, (2) insert a space after the prefix when the prefix
     // was changed, (3) break the line after a partial reflow split.
     // PreviousEndColumn and ColumnLimit are not read in this body.
826  // If this is the first line of a token, we need to inform Whitespace Manager
827  // about it: either adapt the whitespace range preceding it, or mark it as an
828  // untouchable token.
829  // This happens for instance here:
830  // // line 1 \
831  // // line 2
832  if (LineIndex > 0 && Tokens[LineIndex] != Tokens[LineIndex - 1]) {
833  if (SplitBefore.first != StringRef::npos) {
834  // Reflow happens between tokens. Replace the whitespace between the
835  // tokens by the empty string.
836  Whitespaces.replaceWhitespace(
837  *Tokens[LineIndex], /*Newlines=*/0, /*Spaces=*/0,
838  /*StartOfTokenColumn=*/StartColumn, /*InPPDirective=*/false);
839  // Replace the indent and prefix of the token with the reflow prefix.
     // WhitespaceLength is the distance from the start of the token text to
     // the start of this line's content.
840  unsigned WhitespaceLength =
841  Content[LineIndex].data() - tokenAt(LineIndex).TokenText.data();
842  Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex],
843  /*Offset=*/0,
844  /*ReplaceChars=*/WhitespaceLength,
845  /*PreviousPostfix=*/"",
846  /*CurrentPrefix=*/ReflowPrefix,
847  /*InPPDirective=*/false,
848  /*Newlines=*/0,
849  /*Spaces=*/0);
850  } else {
851  // This is the first line for the current token, but no reflow with the
852  // previous token is necessary. However, we still may need to adjust the
853  // start column. Note that ContentColumn[LineIndex] is the expected
854  // content column after a possible update to the prefix, hence the prefix
855  // length change is included.
856  unsigned LineColumn =
857  ContentColumn[LineIndex] -
858  (Content[LineIndex].data() - Lines[LineIndex].data()) +
859  (OriginalPrefix[LineIndex].size() - Prefix[LineIndex].size());
860 
861  // We always want to create a replacement instead of adding an untouchable
862  // token, even if LineColumn is the same as the original column of the
863  // token. This is because WhitespaceManager doesn't align trailing
864  // comments if they are untouchable.
865  Whitespaces.replaceWhitespace(*Tokens[LineIndex],
866  /*Newlines=*/1,
867  /*Spaces=*/LineColumn,
868  /*StartOfTokenColumn=*/LineColumn,
869  /*InPPDirective=*/false);
870  }
871  }
872  if (OriginalPrefix[LineIndex] != Prefix[LineIndex]) {
873  // Adjust the prefix if necessary.
874 
875  // Take care of the space possibly introduced after a decoration.
876  assert(Prefix[LineIndex] == (OriginalPrefix[LineIndex] + " ").str() &&
877  "Expecting a line comment prefix to differ from original by at most "
878  "a space");
     // Insert exactly one space directly after the original prefix; no
     // characters are replaced (ReplaceChars is 0).
879  Whitespaces.replaceWhitespaceInToken(
880  tokenAt(LineIndex), OriginalPrefix[LineIndex].size(), 0, "", "",
881  /*InPPDirective=*/false, /*Newlines=*/0, /*Spaces=*/1);
882  }
883  // Add a break after a reflow split has been introduced, if necessary.
884  // Note that this break doesn't need to be penalized, since it doesn't change
885  // the number of lines.
886  if (SplitBefore.first != StringRef::npos &&
887  SplitBefore.first + SplitBefore.second < Content[LineIndex].size()) {
     // The split ends before the end of the content, so only part of the line
     // is reflown; the break is emitted with a TailOffset of 0.
888  insertBreak(LineIndex, 0, SplitBefore, Whitespaces);
889  }
890 }
891 
893  if (LastLineTok) {
     // When a last-line token is recorded, the next token to format becomes
     // the one that follows it; otherwise State is left unchanged.
894  State.NextToken = LastLineTok->Next;
895  }
896 }
897 
899  unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const {
900  // Line comments have the indent as part of the prefix, so we need to
901  // recompute the start of the line.
902  StringRef IndentContent = Content[LineIndex];
903  if (Lines[LineIndex].startswith("//")) {
     // Match the pragma regex against everything after the leading "//".
904  IndentContent = Lines[LineIndex].substr(2);
905  }
     // A line may be reflowed only if all of the following hold:
     //  - it is not the first line (checked first, so the LineIndex - 1 access
     //    below is only evaluated for LineIndex > 0);
     //  - the text after "//" does not match CommentPragmasRegex;
     //  - mayReflowContent accepts its content;
     //  - Tok is not finalized;
     //  - its token does not switch formatting;
     //  - its original prefix equals the previous line's original prefix.
906  return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
907  mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
908  !switchesFormatting(tokenAt(LineIndex)) &&
909  OriginalPrefix[LineIndex] == OriginalPrefix[LineIndex - 1];
910 }
911 
912 unsigned
913 BreakableLineCommentSection::getContentStartColumn(unsigned LineIndex,
914  unsigned TailOffset) const {
915  if (TailOffset != 0) {
916  return OriginalContentColumn[LineIndex];
917  }
918  return ContentColumn[LineIndex];
919 }
920 
921 } // namespace format
922 } // namespace clang
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
std::pair< StringRef::size_type, unsigned > Split
Contains starting character index and length of split.
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off.
Declares BreakableToken, BreakableStringLiteral, BreakableComment, BreakableBlockComment and Breakabl...
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
static const char *const Blanks
static StringRef getLineCommentIndentPrefix(StringRef Comment)
const FormatToken & tokenAt(unsigned LineIndex) const
BreakableBlockComment(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
FormatToken * Next
The next token in the unwrapped line.
Definition: FormatToken.h:280
void replaceWhitespaceInToken(const FormatToken &Tok, unsigned Offset, unsigned ReplaceChars, StringRef PreviousPostfix, StringRef CurrentPrefix, bool InPPDirective, unsigned Newlines, int Spaces)
Inserts or replaces whitespace in the middle of a token.
SmallVector< int, 16 > ContentColumn
const encoding::Encoding Encoding
LineState State
void updateNextToken(LineState &State) const override
Updates the next token of State to the next token after this one.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
Manages the whitespaces around tokens and their replacements.
BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a single line string literal.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
Base class for single line tokens that can be broken.
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
The current state when indenting an unwrapped line.
SmallVector< StringRef, 16 > Content
unsigned getEscapeSequenceLength(StringRef Text)
Gets the length of an escape sequence inside a C++ string literal.
Definition: Encoding.h:97
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:117
unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding)
Gets the number of bytes in a sequence representing a single codepoint and starting with FirstChar in...
Definition: Encoding.h:78
void replaceWhitespace(FormatToken &Tok, unsigned Newlines, unsigned Spaces, unsigned StartOfTokenColumn, bool InPPDirective=false)
Replaces the whitespace in front of Tok.
SmallVector< FormatToken *, 16 > Tokens
static LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:118
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:294
Various functions to configurably format source code.
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
BreakableComment(const FormatToken &Token, unsigned StartColumn, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a comment.
static LLVM_READONLY bool isPunctuation(unsigned char c)
Return true if this character is an ASCII punctuation character.
Definition: CharInfo.h:132
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:165
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Replaces the whitespace range described by Split with a single space.
static bool mayReflowContent(StringRef Content)
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
\file This file defines classes for searching and analyzing source code clones.
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
Split getReflowSplit(StringRef Text, StringRef ReflowPrefix, unsigned PreviousEndColumn, unsigned ColumnLimit) const
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:292
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1366
Base class for strategies on how to break tokens.
char __ovld __cnfn max(char x, char y)
Returns y if x < y, otherwise it returns x.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
static bool IsBlank(char C)
BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
static BreakableToken::Split getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit, unsigned TabWidth, encoding::Encoding Encoding)
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
virtual unsigned getContentStartColumn(unsigned LineIndex, unsigned TailOffset) const =0
StringRef Text
Definition: Format.cpp:1281
FormatToken * NextToken
The token that needs to be next formatted.
static BreakableToken::Split getCommentSplit(StringRef Text, unsigned ContentStartColumn, unsigned ColumnLimit, unsigned TabWidth, encoding::Encoding Encoding)
This file implements an indenter that manages the indentation of continuations.
SmallVector< StringRef, 16 > Lines