clang  5.0.0svn
BreakableToken.cpp
Go to the documentation of this file.
1 //===--- BreakableToken.cpp - Format C++ code -----------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Contains implementation of BreakableToken class and classes derived
12 /// from it.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "BreakableToken.h"
17 #include "ContinuationIndenter.h"
18 #include "clang/Basic/CharInfo.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/Support/Debug.h"
22 #include <algorithm>
23 
24 #define DEBUG_TYPE "format-token-breaker"
25 
26 namespace clang {
27 namespace format {
28 
// Set of characters treated as blanks by the trimming and splitting helpers in
// this file: space, horizontal tab, vertical tab, form feed and carriage
// return. The newline character is deliberately not part of the set.
29 static const char *const Blanks = " \t\v\f\r";
/// \brief Returns true if \p C is a space, horizontal tab, vertical tab, form
/// feed or carriage return; every other character (including newline and the
/// NUL character) yields false.
static bool IsBlank(char C) {
  return C == ' ' || C == '\t' || C == '\v' || C == '\f' || C == '\r';
}
42 
43 static StringRef getLineCommentIndentPrefix(StringRef Comment) {
44  static const char *const KnownPrefixes[] = {"///", "//", "//!"};
45  StringRef LongestPrefix;
46  for (StringRef KnownPrefix : KnownPrefixes) {
47  if (Comment.startswith(KnownPrefix)) {
48  size_t PrefixLength = KnownPrefix.size();
49  while (PrefixLength < Comment.size() && Comment[PrefixLength] == ' ')
50  ++PrefixLength;
51  if (PrefixLength > LongestPrefix.size())
52  LongestPrefix = Comment.substr(0, PrefixLength);
53  }
54  }
55  return LongestPrefix;
56 }
57 
59  unsigned ContentStartColumn,
60  unsigned ColumnLimit,
61  unsigned TabWidth,
63  if (ColumnLimit <= ContentStartColumn + 1)
64  return BreakableToken::Split(StringRef::npos, 0);
65 
66  unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
67  unsigned MaxSplitBytes = 0;
68 
69  for (unsigned NumChars = 0;
70  NumChars < MaxSplit && MaxSplitBytes < Text.size();) {
71  unsigned BytesInChar =
72  encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
73  NumChars +=
74  encoding::columnWidthWithTabs(Text.substr(MaxSplitBytes, BytesInChar),
75  ContentStartColumn, TabWidth, Encoding);
76  MaxSplitBytes += BytesInChar;
77  }
78 
79  StringRef::size_type SpaceOffset = Text.find_last_of(Blanks, MaxSplitBytes);
80  if (SpaceOffset == StringRef::npos ||
81  // Don't break at leading whitespace.
82  Text.find_last_not_of(Blanks, SpaceOffset) == StringRef::npos) {
83  // Make sure that we don't break at leading whitespace that
84  // reaches past MaxSplit.
85  StringRef::size_type FirstNonWhitespace = Text.find_first_not_of(Blanks);
86  if (FirstNonWhitespace == StringRef::npos)
87  // If the comment is only whitespace, we cannot split.
88  return BreakableToken::Split(StringRef::npos, 0);
89  SpaceOffset = Text.find_first_of(
90  Blanks, std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
91  }
92  if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
93  StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim(Blanks);
94  StringRef AfterCut = Text.substr(SpaceOffset).ltrim(Blanks);
95  return BreakableToken::Split(BeforeCut.size(),
96  AfterCut.begin() - BeforeCut.end());
97  }
98  return BreakableToken::Split(StringRef::npos, 0);
99 }
100 
102 getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit,
103  unsigned TabWidth, encoding::Encoding Encoding) {
104  // FIXME: Reduce unit test case.
105  if (Text.empty())
106  return BreakableToken::Split(StringRef::npos, 0);
107  if (ColumnLimit <= UsedColumns)
108  return BreakableToken::Split(StringRef::npos, 0);
109  unsigned MaxSplit = ColumnLimit - UsedColumns;
110  StringRef::size_type SpaceOffset = 0;
111  StringRef::size_type SlashOffset = 0;
112  StringRef::size_type WordStartOffset = 0;
113  StringRef::size_type SplitPoint = 0;
114  for (unsigned Chars = 0;;) {
115  unsigned Advance;
116  if (Text[0] == '\\') {
117  Advance = encoding::getEscapeSequenceLength(Text);
118  Chars += Advance;
119  } else {
120  Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
122  Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
123  }
124 
125  if (Chars > MaxSplit || Text.size() <= Advance)
126  break;
127 
128  if (IsBlank(Text[0]))
129  SpaceOffset = SplitPoint;
130  if (Text[0] == '/')
131  SlashOffset = SplitPoint;
132  if (Advance == 1 && !isAlphanumeric(Text[0]))
133  WordStartOffset = SplitPoint;
134 
135  SplitPoint += Advance;
136  Text = Text.substr(Advance);
137  }
138 
139  if (SpaceOffset != 0)
140  return BreakableToken::Split(SpaceOffset + 1, 0);
141  if (SlashOffset != 0)
142  return BreakableToken::Split(SlashOffset + 1, 0);
143  if (WordStartOffset != 0)
144  return BreakableToken::Split(WordStartOffset + 1, 0);
145  if (SplitPoint != 0)
146  return BreakableToken::Split(SplitPoint, 0);
147  return BreakableToken::Split(StringRef::npos, 0);
148 }
149 
151  assert((Token.is(TT_BlockComment) || Token.is(TT_LineComment)) &&
152  "formatting regions are switched by comment tokens");
153  StringRef Content = Token.TokenText.substr(2).ltrim();
154  return Content.startswith("clang-format on") ||
155  Content.startswith("clang-format off");
156 }
157 
158 unsigned
159 BreakableToken::getLineLengthAfterCompression(unsigned RemainingTokenColumns,
160  Split Split) const {
161  // Example: consider the content
162  // lala lala
163  // - RemainingTokenColumns is the original number of columns, 10;
164  // - Split is (4, 2), denoting the two spaces between the two words;
165  //
166  // We compute the number of columns when the split is compressed into a single
167  // space, like:
168  // lala lala
169  return RemainingTokenColumns + 1 - Split.second;
170 }
171 
172 unsigned BreakableSingleLineToken::getLineCount() const { return 1; }
173 
175  unsigned LineIndex, unsigned TailOffset,
176  StringRef::size_type Length) const {
177  return StartColumn + Prefix.size() + Postfix.size() +
178  encoding::columnWidthWithTabs(Line.substr(TailOffset, Length),
179  StartColumn + Prefix.size(),
181 }
182 
184  const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
185  StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding,
186  const FormatStyle &Style)
187  : BreakableToken(Tok, InPPDirective, Encoding, Style),
188  StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix) {
189  assert(Tok.TokenText.endswith(Postfix));
190  Line = Tok.TokenText.substr(
191  Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
192 }
193 
195  const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
197  const FormatStyle &Style)
198  : BreakableSingleLineToken(Tok, StartColumn, Prefix, Postfix, InPPDirective,
199  Encoding, Style) {}
200 
202 BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
203  unsigned ColumnLimit,
204  llvm::Regex &CommentPragmasRegex) const {
205  return getStringSplit(Line.substr(TailOffset),
206  StartColumn + Prefix.size() + Postfix.size(),
207  ColumnLimit, Style.TabWidth, Encoding);
208 }
209 
210 void BreakableStringLiteral::insertBreak(unsigned LineIndex,
211  unsigned TailOffset, Split Split,
212  WhitespaceManager &Whitespaces) {
213  unsigned LeadingSpaces = StartColumn;
214  // The '@' of an ObjC string literal (@"Test") does not become part of the
215  // string token.
216  // FIXME: It might be a cleaner solution to merge the tokens as a
217  // precomputation step.
218  if (Prefix.startswith("@"))
219  --LeadingSpaces;
220  Whitespaces.replaceWhitespaceInToken(
221  Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
222  Prefix, InPPDirective, 1, LeadingSpaces);
223 }
224 
226  unsigned StartColumn,
227  bool InPPDirective,
229  const FormatStyle &Style)
230  : BreakableToken(Token, InPPDirective, Encoding, Style),
231  StartColumn(StartColumn) {}
232 
233 unsigned BreakableComment::getLineCount() const { return Lines.size(); }
234 
236 BreakableComment::getSplit(unsigned LineIndex, unsigned TailOffset,
237  unsigned ColumnLimit,
238  llvm::Regex &CommentPragmasRegex) const {
239  // Don't break lines matching the comment pragmas regex.
240  if (CommentPragmasRegex.match(Content[LineIndex]))
241  return Split(StringRef::npos, 0);
242  return getCommentSplit(Content[LineIndex].substr(TailOffset),
243  getContentStartColumn(LineIndex, TailOffset),
244  ColumnLimit, Style.TabWidth, Encoding);
245 }
246 
247 void BreakableComment::compressWhitespace(unsigned LineIndex,
248  unsigned TailOffset, Split Split,
249  WhitespaceManager &Whitespaces) {
250  StringRef Text = Content[LineIndex].substr(TailOffset);
251  // Text is relative to the content line, but Whitespaces operates relative to
252  // the start of the corresponding token, so compute the start of the Split
253  // that needs to be compressed into a single space relative to the start of
254  // its token.
255  unsigned BreakOffsetInToken =
256  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
257  unsigned CharsToRemove = Split.second;
258  Whitespaces.replaceWhitespaceInToken(
259  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "", "",
260  /*InPPDirective=*/false, /*Newlines=*/0, /*Spaces=*/1);
261 }
262 
265  unsigned PreviousEndColumn,
266  unsigned ColumnLimit) const {
267  unsigned ReflowStartColumn = PreviousEndColumn + ReflowPrefix.size();
268  StringRef TrimmedText = Text.rtrim(Blanks);
269  // This is the width of the resulting line in case the full line of Text gets
270  // reflown up starting at ReflowStartColumn.
271  unsigned FullWidth = ReflowStartColumn + encoding::columnWidthWithTabs(
272  TrimmedText, ReflowStartColumn,
274  // If the full line fits up, we return a reflow split after it,
275  // otherwise we compute the largest piece of text that fits after
276  // ReflowStartColumn.
277  Split ReflowSplit =
278  FullWidth <= ColumnLimit
279  ? Split(TrimmedText.size(), Text.size() - TrimmedText.size())
280  : getCommentSplit(Text, ReflowStartColumn, ColumnLimit,
282 
283  // We need to be extra careful here, because while it's OK to keep a long line
284  // if it can't be broken into smaller pieces (like when the first word of a
285  // long line is longer than the column limit), it's not OK to reflow that long
286  // word up. So we recompute the size of the previous line after reflowing and
287  // only return the reflow split if that's under the line limit.
288  if (ReflowSplit.first != StringRef::npos &&
289  // Check if the width of the newly reflown line is under the limit.
290  PreviousEndColumn + ReflowPrefix.size() +
291  encoding::columnWidthWithTabs(Text.substr(0, ReflowSplit.first),
292  PreviousEndColumn +
293  ReflowPrefix.size(),
294  Style.TabWidth, Encoding) <=
295  ColumnLimit) {
296  return ReflowSplit;
297  }
298  return Split(StringRef::npos, 0);
299 }
300 
301 const FormatToken &BreakableComment::tokenAt(unsigned LineIndex) const {
302  return Tokens[LineIndex] ? *Tokens[LineIndex] : Tok;
303 }
304 
305 static bool mayReflowContent(StringRef Content) {
306  Content = Content.trim(Blanks);
307  // Lines starting with '@' commonly have special meaning.
308  static const SmallVector<StringRef, 4> kSpecialMeaningPrefixes = {
309  "@", "TODO", "FIXME", "XXX"};
310  bool hasSpecialMeaningPrefix = false;
311  for (StringRef Prefix : kSpecialMeaningPrefixes) {
312  if (Content.startswith(Prefix)) {
313  hasSpecialMeaningPrefix = true;
314  break;
315  }
316  }
317  // Simple heuristic for what to reflow: content should contain at least two
318  // characters and either the first or second character must be
319  // non-punctuation.
320  return Content.size() >= 2 && !hasSpecialMeaningPrefix &&
321  !Content.endswith("\\") &&
322  // Note that this is UTF-8 safe, since if isPunctuation(Content[0]) is
323  // true, then the first code point must be 1 byte long.
324  (!isPunctuation(Content[0]) || !isPunctuation(Content[1]));
325 }
326 
328  const FormatToken &Token, unsigned StartColumn,
329  unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
331  : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style) {
332  assert(Tok.is(TT_BlockComment) &&
333  "block comment section must start with a block comment");
334 
335  StringRef TokenText(Tok.TokenText);
336  assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
337  TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
338 
339  int IndentDelta = StartColumn - OriginalStartColumn;
340  Content.resize(Lines.size());
341  Content[0] = Lines[0];
342  ContentColumn.resize(Lines.size());
343  // Account for the initial '/*'.
344  ContentColumn[0] = StartColumn + 2;
345  Tokens.resize(Lines.size());
346  for (size_t i = 1; i < Lines.size(); ++i)
347  adjustWhitespace(i, IndentDelta);
348 
349  // Align decorations with the column of the star on the first line,
350  // that is one column after the start "/*".
351  DecorationColumn = StartColumn + 1;
352 
353  // Account for comment decoration patterns like this:
354  //
355  // /*
356  // ** blah blah blah
357  // */
358  if (Lines.size() >= 2 && Content[1].startswith("**") &&
359  static_cast<unsigned>(ContentColumn[1]) == StartColumn) {
360  DecorationColumn = StartColumn;
361  }
362 
363  Decoration = "* ";
364  if (Lines.size() == 1 && !FirstInLine) {
365  // Comments for which FirstInLine is false can start on arbitrary column,
366  // and available horizontal space can be too small to align consecutive
367  // lines with the first one.
368  // FIXME: We could, probably, align them to current indentation level, but
369  // now we just wrap them without stars.
370  Decoration = "";
371  }
372  for (size_t i = 1, e = Lines.size(); i < e && !Decoration.empty(); ++i) {
373  // If the last line is empty, the closing "*/" will have a star.
374  if (i + 1 == e && Content[i].empty())
375  break;
376  if (!Content[i].empty() && i + 1 != e &&
377  Decoration.startswith(Content[i]))
378  continue;
379  while (!Content[i].startswith(Decoration))
380  Decoration = Decoration.substr(0, Decoration.size() - 1);
381  }
382 
383  LastLineNeedsDecoration = true;
384  IndentAtLineBreak = ContentColumn[0] + 1;
385  for (size_t i = 1, e = Lines.size(); i < e; ++i) {
386  if (Content[i].empty()) {
387  if (i + 1 == e) {
388  // Empty last line means that we already have a star as a part of the
389  // trailing */. We also need to preserve whitespace, so that */ is
390  // correctly indented.
391  LastLineNeedsDecoration = false;
392  // Align the star in the last '*/' with the stars on the previous lines.
393  if (e >= 2 && !Decoration.empty()) {
394  ContentColumn[i] = DecorationColumn;
395  }
396  } else if (Decoration.empty()) {
397  // For all other lines, set the start column to 0 if they're empty, so
398  // we do not insert trailing whitespace anywhere.
399  ContentColumn[i] = 0;
400  }
401  continue;
402  }
403 
404  // The first line already excludes the star.
405  // The last line excludes the star if LastLineNeedsDecoration is false.
406  // For all other lines, adjust the line to exclude the star and
407  // (optionally) the first whitespace.
408  unsigned DecorationSize = Decoration.startswith(Content[i])
409  ? Content[i].size()
410  : Decoration.size();
411  if (DecorationSize) {
412  ContentColumn[i] = DecorationColumn + DecorationSize;
413  }
414  Content[i] = Content[i].substr(DecorationSize);
415  if (!Decoration.startswith(Content[i]))
416  IndentAtLineBreak =
417  std::min<int>(IndentAtLineBreak, std::max(0, ContentColumn[i]));
418  }
419  IndentAtLineBreak =
420  std::max<unsigned>(IndentAtLineBreak, Decoration.size());
421 
422  DEBUG({
423  llvm::dbgs() << "IndentAtLineBreak " << IndentAtLineBreak << "\n";
424  for (size_t i = 0; i < Lines.size(); ++i) {
425  llvm::dbgs() << i << " |" << Content[i] << "| "
426  << "CC=" << ContentColumn[i] << "| "
427  << "IN=" << (Content[i].data() - Lines[i].data()) << "\n";
428  }
429  });
430 }
431 
432 void BreakableBlockComment::adjustWhitespace(unsigned LineIndex,
433  int IndentDelta) {
434  // When in a preprocessor directive, the trailing backslash in a block comment
435  // is not needed, but can serve a purpose of uniformity with necessary escaped
436  // newlines outside the comment. In this case we remove it here before
437  // trimming the trailing whitespace. The backslash will be re-added later when
438  // inserting a line break.
439  size_t EndOfPreviousLine = Lines[LineIndex - 1].size();
440  if (InPPDirective && Lines[LineIndex - 1].endswith("\\"))
441  --EndOfPreviousLine;
442 
443  // Calculate the end of the non-whitespace text in the previous line.
444  EndOfPreviousLine =
445  Lines[LineIndex - 1].find_last_not_of(Blanks, EndOfPreviousLine);
446  if (EndOfPreviousLine == StringRef::npos)
447  EndOfPreviousLine = 0;
448  else
449  ++EndOfPreviousLine;
450  // Calculate the start of the non-whitespace text in the current line.
451  size_t StartOfLine = Lines[LineIndex].find_first_not_of(Blanks);
452  if (StartOfLine == StringRef::npos)
453  StartOfLine = Lines[LineIndex].rtrim("\r\n").size();
454 
455  StringRef Whitespace = Lines[LineIndex].substr(0, StartOfLine);
456  // Adjust Lines to only contain relevant text.
457  size_t PreviousContentOffset =
458  Content[LineIndex - 1].data() - Lines[LineIndex - 1].data();
459  Content[LineIndex - 1] = Lines[LineIndex - 1].substr(
460  PreviousContentOffset, EndOfPreviousLine - PreviousContentOffset);
461  Content[LineIndex] = Lines[LineIndex].substr(StartOfLine);
462 
463  // Adjust the start column uniformly across all lines.
464  ContentColumn[LineIndex] =
466  IndentDelta;
467 }
468 
470  unsigned LineIndex, unsigned TailOffset,
471  StringRef::size_type Length) const {
472  unsigned ContentStartColumn = getContentStartColumn(LineIndex, TailOffset);
473  unsigned LineLength =
474  ContentStartColumn + encoding::columnWidthWithTabs(
475  Content[LineIndex].substr(TailOffset, Length),
476  ContentStartColumn, Style.TabWidth, Encoding);
477  // The last line gets a "*/" postfix.
478  if (LineIndex + 1 == Lines.size()) {
479  LineLength += 2;
480  // We never need a decoration when breaking just the trailing "*/" postfix.
481  // Note that checking that Length == 0 is not enough, since Length could
482  // also be StringRef::npos.
483  if (Content[LineIndex].substr(TailOffset, Length).empty()) {
484  LineLength -= Decoration.size();
485  }
486  }
487  return LineLength;
488 }
489 
490 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
491  Split Split,
492  WhitespaceManager &Whitespaces) {
493  StringRef Text = Content[LineIndex].substr(TailOffset);
494  StringRef Prefix = Decoration;
495  // We need this to account for the case when we have a decoration "* " for all
496  // the lines except for the last one, where the star in "*/" acts as a
497  // decoration.
498  unsigned LocalIndentAtLineBreak = IndentAtLineBreak;
499  if (LineIndex + 1 == Lines.size() &&
500  Text.size() == Split.first + Split.second) {
501  // For the last line we need to break before "*/", but not to add "* ".
502  Prefix = "";
503  if (LocalIndentAtLineBreak >= 2)
504  LocalIndentAtLineBreak -= 2;
505  }
506  // The split offset is from the beginning of the line. Convert it to an offset
507  // from the beginning of the token text.
508  unsigned BreakOffsetInToken =
509  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
510  unsigned CharsToRemove = Split.second;
511  assert(LocalIndentAtLineBreak >= Prefix.size());
512  Whitespaces.replaceWhitespaceInToken(
513  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "", Prefix,
514  InPPDirective, /*Newlines=*/1,
515  /*Spaces=*/LocalIndentAtLineBreak - Prefix.size());
516 }
517 
519  unsigned LineIndex,
520  unsigned PreviousEndColumn,
521  unsigned ColumnLimit,
522  llvm::Regex &CommentPragmasRegex) const {
523  if (!mayReflow(LineIndex, CommentPragmasRegex))
524  return Split(StringRef::npos, 0);
525  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
526  return getReflowSplit(TrimmedContent, ReflowPrefix, PreviousEndColumn,
527  ColumnLimit);
528 }
529 
530 unsigned BreakableBlockComment::getReflownColumn(
531  StringRef Content,
532  unsigned LineIndex,
533  unsigned PreviousEndColumn) const {
534  unsigned StartColumn = PreviousEndColumn + ReflowPrefix.size();
535  // If this is the last line, it will carry around its '*/' postfix.
536  unsigned PostfixLength = (LineIndex + 1 == Lines.size() ? 2 : 0);
537  // The line is composed of previous text, reflow prefix, reflown text and
538  // postfix.
539  unsigned ReflownColumn =
540  StartColumn + encoding::columnWidthWithTabs(Content, StartColumn,
542  PostfixLength;
543  return ReflownColumn;
544 }
545 
547  unsigned LineIndex, unsigned TailOffset,
548  unsigned PreviousEndColumn,
549  unsigned ColumnLimit,
550  Split SplitBefore) const {
551  if (SplitBefore.first == StringRef::npos ||
552  // Block comment line contents contain the trailing whitespace after the
553  // decoration, so the need of left trim. Note that this behavior is
554  // consistent with the breaking of block comments where the indentation of
555  // a broken line is uniform across all the lines of the block comment.
556  SplitBefore.first + SplitBefore.second <
557  Content[LineIndex].ltrim().size()) {
558  // A piece of line, not the whole, gets reflown.
559  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
560  } else {
561  // The whole line gets reflown, need to check if we need to insert a break
562  // for the postfix or not.
563  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
564  unsigned ReflownColumn =
565  getReflownColumn(TrimmedContent, LineIndex, PreviousEndColumn);
566  if (ReflownColumn <= ColumnLimit) {
567  return ReflownColumn;
568  }
569  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
570  }
571 }
573  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
574  Split SplitBefore, WhitespaceManager &Whitespaces) {
575  if (LineIndex == 0) return;
576  StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
577  if (SplitBefore.first != StringRef::npos) {
578  // Here we need to reflow.
579  assert(Tokens[LineIndex - 1] == Tokens[LineIndex] &&
580  "Reflowing whitespace within a token");
581  // This is the offset of the end of the last line relative to the start of
582  // the token text in the token.
583  unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
584  Content[LineIndex - 1].size() -
585  tokenAt(LineIndex).TokenText.data();
586  unsigned WhitespaceLength = TrimmedContent.data() -
587  tokenAt(LineIndex).TokenText.data() -
588  WhitespaceOffsetInToken;
589  Whitespaces.replaceWhitespaceInToken(
590  tokenAt(LineIndex), WhitespaceOffsetInToken,
591  /*ReplaceChars=*/WhitespaceLength, /*PreviousPostfix=*/"",
592  /*CurrentPrefix=*/ReflowPrefix, InPPDirective, /*Newlines=*/0,
593  /*Spaces=*/0);
594  // Check if we need to also insert a break at the whitespace range.
595  // For this we first adapt the reflow split relative to the beginning of the
596  // content.
597  // Note that we don't need a penalty for this break, since it doesn't change
598  // the total number of lines.
599  Split BreakSplit = SplitBefore;
600  BreakSplit.first += TrimmedContent.data() - Content[LineIndex].data();
601  unsigned ReflownColumn =
602  getReflownColumn(TrimmedContent, LineIndex, PreviousEndColumn);
603  if (ReflownColumn > ColumnLimit) {
604  insertBreak(LineIndex, 0, BreakSplit, Whitespaces);
605  }
606  return;
607  }
608 
609  // Here no reflow with the previous line will happen.
610  // Fix the decoration of the line at LineIndex.
611  StringRef Prefix = Decoration;
612  if (Content[LineIndex].empty()) {
613  if (LineIndex + 1 == Lines.size()) {
614  if (!LastLineNeedsDecoration) {
615  // If the last line was empty, we don't need a prefix, as the */ will
616  // line up with the decoration (if it exists).
617  Prefix = "";
618  }
619  } else if (!Decoration.empty()) {
620  // For other empty lines, if we do have a decoration, adapt it to not
621  // contain a trailing whitespace.
622  Prefix = Prefix.substr(0, 1);
623  }
624  } else {
625  if (ContentColumn[LineIndex] == 1) {
626  // This line starts immediately after the decorating *.
627  Prefix = Prefix.substr(0, 1);
628  }
629  }
630  // This is the offset of the end of the last line relative to the start of the
631  // token text in the token.
632  unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
633  Content[LineIndex - 1].size() -
634  tokenAt(LineIndex).TokenText.data();
635  unsigned WhitespaceLength = Content[LineIndex].data() -
636  tokenAt(LineIndex).TokenText.data() -
637  WhitespaceOffsetInToken;
638  Whitespaces.replaceWhitespaceInToken(
639  tokenAt(LineIndex), WhitespaceOffsetInToken, WhitespaceLength, "", Prefix,
640  InPPDirective, /*Newlines=*/1, ContentColumn[LineIndex] - Prefix.size());
641 }
642 
643 bool BreakableBlockComment::mayReflow(unsigned LineIndex,
644  llvm::Regex &CommentPragmasRegex) const {
645  // Content[LineIndex] may exclude the indent after the '*' decoration. In that
646  // case, we compute the start of the comment pragma manually.
647  StringRef IndentContent = Content[LineIndex];
648  if (Lines[LineIndex].ltrim(Blanks).startswith("*")) {
649  IndentContent = Lines[LineIndex].ltrim(Blanks).substr(1);
650  }
651  return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
652  mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
653  !switchesFormatting(tokenAt(LineIndex));
654 }
655 
656 unsigned
657 BreakableBlockComment::getContentStartColumn(unsigned LineIndex,
658  unsigned TailOffset) const {
659  // If we break, we always break at the predefined indent.
660  if (TailOffset != 0)
661  return IndentAtLineBreak;
662  return std::max(0, ContentColumn[LineIndex]);
663 }
664 
666  const FormatToken &Token, unsigned StartColumn,
667  unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
669  : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style) {
670  assert(Tok.is(TT_LineComment) &&
671  "line comment section must start with a line comment");
672  FormatToken *LineTok = nullptr;
673  for (const FormatToken *CurrentTok = &Tok;
674  CurrentTok && CurrentTok->is(TT_LineComment);
675  CurrentTok = CurrentTok->Next) {
676  LastLineTok = LineTok;
677  StringRef TokenText(CurrentTok->TokenText);
678  assert(TokenText.startswith("//"));
679  size_t FirstLineIndex = Lines.size();
680  TokenText.split(Lines, "\n");
681  Content.resize(Lines.size());
682  ContentColumn.resize(Lines.size());
683  OriginalContentColumn.resize(Lines.size());
684  Tokens.resize(Lines.size());
685  Prefix.resize(Lines.size());
686  OriginalPrefix.resize(Lines.size());
687  for (size_t i = FirstLineIndex, e = Lines.size(); i < e; ++i) {
688  // We need to trim the blanks in case this is not the first line in a
689  // multiline comment. Then the indent is included in Lines[i].
690  StringRef IndentPrefix =
691  getLineCommentIndentPrefix(Lines[i].ltrim(Blanks));
692  assert(IndentPrefix.startswith("//"));
693  OriginalPrefix[i] = Prefix[i] = IndentPrefix;
694  if (Lines[i].size() > Prefix[i].size() &&
695  isAlphanumeric(Lines[i][Prefix[i].size()])) {
696  if (Prefix[i] == "//")
697  Prefix[i] = "// ";
698  else if (Prefix[i] == "///")
699  Prefix[i] = "/// ";
700  else if (Prefix[i] == "//!")
701  Prefix[i] = "//! ";
702  }
703 
704  Tokens[i] = LineTok;
705  Content[i] = Lines[i].substr(IndentPrefix.size());
706  OriginalContentColumn[i] =
707  StartColumn +
708  encoding::columnWidthWithTabs(OriginalPrefix[i],
709  StartColumn,
710  Style.TabWidth,
711  Encoding);
712  ContentColumn[i] =
713  StartColumn +
715  StartColumn,
716  Style.TabWidth,
717  Encoding);
718 
719  // Calculate the end of the non-whitespace text in this line.
720  size_t EndOfLine = Content[i].find_last_not_of(Blanks);
721  if (EndOfLine == StringRef::npos)
722  EndOfLine = Content[i].size();
723  else
724  ++EndOfLine;
725  Content[i] = Content[i].substr(0, EndOfLine);
726  }
727  LineTok = CurrentTok->Next;
728  if (CurrentTok->Next && !CurrentTok->Next->ContinuesLineCommentSection) {
729  // A line comment section needs to broken by a line comment that is
730  // preceded by at least two newlines. Note that we put this break here
731  // instead of breaking at a previous stage during parsing, since that
732  // would split the contents of the enum into two unwrapped lines in this
733  // example, which is undesirable:
734  // enum A {
735  // a, // comment about a
736  //
737  // // comment about b
738  // b
739  // };
740  //
741  // FIXME: Consider putting separate line comment sections as children to
742  // the unwrapped line instead.
743  break;
744  }
745  }
746 }
747 
749  unsigned LineIndex, unsigned TailOffset,
750  StringRef::size_type Length) const {
751  unsigned ContentStartColumn =
752  (TailOffset == 0 ? ContentColumn[LineIndex]
753  : OriginalContentColumn[LineIndex]);
754  return ContentStartColumn + encoding::columnWidthWithTabs(
755  Content[LineIndex].substr(TailOffset, Length),
756  ContentStartColumn, Style.TabWidth, Encoding);
757 }
758 
760  unsigned TailOffset, Split Split,
761  WhitespaceManager &Whitespaces) {
762  StringRef Text = Content[LineIndex].substr(TailOffset);
763  // Compute the offset of the split relative to the beginning of the token
764  // text.
765  unsigned BreakOffsetInToken =
766  Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
767  unsigned CharsToRemove = Split.second;
768  // Compute the size of the new indent, including the size of the new prefix of
769  // the newly broken line.
770  unsigned IndentAtLineBreak = OriginalContentColumn[LineIndex] +
771  Prefix[LineIndex].size() -
772  OriginalPrefix[LineIndex].size();
773  assert(IndentAtLineBreak >= Prefix[LineIndex].size());
774  Whitespaces.replaceWhitespaceInToken(
775  tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "",
776  Prefix[LineIndex], InPPDirective, /*Newlines=*/1,
777  /*Spaces=*/IndentAtLineBreak - Prefix[LineIndex].size());
778 }
779 
781  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
782  llvm::Regex &CommentPragmasRegex) const {
783  if (!mayReflow(LineIndex, CommentPragmasRegex))
784  return Split(StringRef::npos, 0);
785  return getReflowSplit(Content[LineIndex], ReflowPrefix, PreviousEndColumn,
786  ColumnLimit);
787 }
788 
790  unsigned LineIndex, unsigned TailOffset,
791  unsigned PreviousEndColumn,
792  unsigned ColumnLimit,
793  Split SplitBefore) const {
794  if (SplitBefore.first == StringRef::npos ||
795  SplitBefore.first + SplitBefore.second < Content[LineIndex].size()) {
796  // A piece of line, not the whole line, gets reflown.
797  return getLineLengthAfterSplit(LineIndex, TailOffset, StringRef::npos);
798  } else {
799  // The whole line gets reflown.
800  unsigned StartColumn = PreviousEndColumn + ReflowPrefix.size();
801  return StartColumn + encoding::columnWidthWithTabs(Content[LineIndex],
802  StartColumn,
803  Style.TabWidth,
804  Encoding);
805  }
806 }
807 
809  unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit,
810  Split SplitBefore, WhitespaceManager &Whitespaces) {
811  // If this is the first line of a token, we need to inform Whitespace Manager
812  // about it: either adapt the whitespace range preceding it, or mark it as an
813  // untouchable token.
814  // This happens for instance here:
815  // // line 1 \
816  // // line 2
817  if (LineIndex > 0 && Tokens[LineIndex] != Tokens[LineIndex - 1]) {
818  if (SplitBefore.first != StringRef::npos) {
819  // Reflow happens between tokens. Replace the whitespace between the
820  // tokens by the empty string.
821  Whitespaces.replaceWhitespace(
822  *Tokens[LineIndex], /*Newlines=*/0, /*Spaces=*/0,
823  /*StartOfTokenColumn=*/StartColumn, /*InPPDirective=*/false);
824  // Replace the indent and prefix of the token with the reflow prefix.
825  unsigned WhitespaceLength =
826  Content[LineIndex].data() - tokenAt(LineIndex).TokenText.data();
827  Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex],
828  /*Offset=*/0,
829  /*ReplaceChars=*/WhitespaceLength,
830  /*PreviousPostfix=*/"",
831  /*CurrentPrefix=*/ReflowPrefix,
832  /*InPPDirective=*/false,
833  /*Newlines=*/0,
834  /*Spaces=*/0);
835  } else {
836  // This is the first line for the current token, but no reflow with the
837  // previous token is necessary. However, we still may need to adjust the
838  // start column. Note that ContentColumn[LineIndex] is the expected
839  // content column after a possible update to the prefix, hence the prefix
840  // length change is included.
841  unsigned LineColumn =
842  ContentColumn[LineIndex] -
843  (Content[LineIndex].data() - Lines[LineIndex].data()) +
844  (OriginalPrefix[LineIndex].size() - Prefix[LineIndex].size());
845 
846  // We always want to create a replacement instead of adding an untouchable
847  // token, even if LineColumn is the same as the original column of the
848  // token. This is because WhitespaceManager doesn't align trailing
849  // comments if they are untouchable.
850  Whitespaces.replaceWhitespace(*Tokens[LineIndex],
851  /*Newlines=*/1,
852  /*Spaces=*/LineColumn,
853  /*StartOfTokenColumn=*/LineColumn,
854  /*InPPDirective=*/false);
855  }
856  }
857  if (OriginalPrefix[LineIndex] != Prefix[LineIndex]) {
858  // Adjust the prefix if necessary.
859 
860  // Take care of the space possibly introduced after a decoration.
861  assert(Prefix[LineIndex] == (OriginalPrefix[LineIndex] + " ").str() &&
862  "Expecting a line comment prefix to differ from original by at most "
863  "a space");
864  Whitespaces.replaceWhitespaceInToken(
865  tokenAt(LineIndex), OriginalPrefix[LineIndex].size(), 0, "", "",
866  /*InPPDirective=*/false, /*Newlines=*/0, /*Spaces=*/1);
867  }
868  // Add a break after a reflow split has been introduced, if necessary.
869  // Note that this break doesn't need to be penalized, since it doesn't change
870  // the number of lines.
871  if (SplitBefore.first != StringRef::npos &&
872  SplitBefore.first + SplitBefore.second < Content[LineIndex].size()) {
873  insertBreak(LineIndex, 0, SplitBefore, Whitespaces);
874  }
875 }
876 
878  if (LastLineTok) {
879  State.NextToken = LastLineTok->Next;
880  }
881 }
882 
884  unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const {
885  // Line comments have the indent as part of the prefix, so we need to
886  // recompute the start of the line.
887  StringRef IndentContent = Content[LineIndex];
888  if (Lines[LineIndex].startswith("//")) {
889  IndentContent = Lines[LineIndex].substr(2);
890  }
891  return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
892  mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
893  !switchesFormatting(tokenAt(LineIndex)) &&
894  OriginalPrefix[LineIndex] == OriginalPrefix[LineIndex - 1];
895 }
896 
897 unsigned
898 BreakableLineCommentSection::getContentStartColumn(unsigned LineIndex,
899  unsigned TailOffset) const {
900  if (TailOffset != 0) {
901  return OriginalContentColumn[LineIndex];
902  }
903  return ContentColumn[LineIndex];
904 }
905 
906 } // namespace format
907 } // namespace clang
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
std::pair< StringRef::size_type, unsigned > Split
Contains starting character index and length of split.
bool switchesFormatting(const FormatToken &Token)
Checks if Token switches formatting, like /* clang-format off.
Declares BreakableToken, BreakableStringLiteral, BreakableComment, BreakableBlockComment and Breakabl...
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
static const char *const Blanks
static StringRef getLineCommentIndentPrefix(StringRef Comment)
const FormatToken & tokenAt(unsigned LineIndex) const
BreakableBlockComment(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
FormatToken * Next
The next token in the unwrapped line.
Definition: FormatToken.h:278
void replaceWhitespaceInToken(const FormatToken &Tok, unsigned Offset, unsigned ReplaceChars, StringRef PreviousPostfix, StringRef CurrentPrefix, bool InPPDirective, unsigned Newlines, int Spaces)
Inserts or replaces whitespace in the middle of a token.
SmallVector< int, 16 > ContentColumn
const encoding::Encoding Encoding
LineState State
void updateNextToken(LineState &State) const override
Updates the next token of State to the next token after this one.
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
Manages the whitespaces around tokens and their replacements.
BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a single line string literal.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
Base class for single line tokens that can be broken.
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
unsigned getLineCount() const override
Returns the number of lines in this token in the original code.
The current state when indenting an unwrapped line.
SmallVector< StringRef, 16 > Content
unsigned getEscapeSequenceLength(StringRef Text)
Gets the length of an escape sequence inside a C++ string literal.
Definition: Encoding.h:97
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:115
unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding)
Gets the number of bytes in a sequence representing a single codepoint and starting with FirstChar in...
Definition: Encoding.h:78
void replaceWhitespace(FormatToken &Tok, unsigned Newlines, unsigned Spaces, unsigned StartOfTokenColumn, bool InPPDirective=false)
Replaces the whitespace in front of Tok.
SmallVector< FormatToken *, 16 > Tokens
static LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
Definition: CharInfo.h:118
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:292
Various functions to configurably format source code.
void replaceWhitespaceBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore, WhitespaceManager &Whitespaces) override
Replaces the whitespace between LineIndex-1 and LineIndex.
BreakableComment(const FormatToken &Token, unsigned StartColumn, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Creates a breakable token for a comment.
static LLVM_READONLY bool isPunctuation(unsigned char c)
Return true if this character is an ASCII punctuation character.
Definition: CharInfo.h:132
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:163
Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a range (offset, length) at which to break the line at LineIndex, if previously broken at Tai...
BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn, unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Replaces the whitespace range described by Split with a single space.
static bool mayReflowContent(StringRef Content)
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
\file This file defines classes for searching and analyzing source code clones.
Split getSplitBefore(unsigned LineIndex, unsigned PreviousEndColumn, unsigned ColumnLimit, llvm::Regex &CommentPragmasRegex) const override
Returns a whitespace range (offset, length) of the content at LineIndex such that the content precedi...
Split getReflowSplit(StringRef Text, StringRef ReflowPrefix, unsigned PreviousEndColumn, unsigned ColumnLimit) const
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:290
unsigned getLineLengthAfterSplitBefore(unsigned LineIndex, unsigned TailOffset, unsigned PreviousEndColumn, unsigned ColumnLimit, Split SplitBefore) const override
Returns the number of columns required to format the piece of line at LineIndex after the content pre...
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1270
Base class for strategies on how to break tokens.
char __ovld __cnfn max(char x, char y)
Returns y if x < y, otherwise it returns x.
bool mayReflow(unsigned LineIndex, llvm::Regex &CommentPragmasRegex) const override
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
unsigned getLineLengthAfterCompression(unsigned RemainingTokenColumns, Split Split) const
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
static bool IsBlank(char C)
BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style)
unsigned getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset, StringRef::size_type Length) const override
Returns the number of columns required to format the piece of line at LineIndex, from byte offset Tai...
static BreakableToken::Split getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit, unsigned TabWidth, encoding::Encoding Encoding)
void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, WhitespaceManager &Whitespaces) override
Emits the previously retrieved Split via Whitespaces.
virtual unsigned getContentStartColumn(unsigned LineIndex, unsigned TailOffset) const =0
StringRef Text
Definition: Format.cpp:1245
FormatToken * NextToken
The token that needs to be next formatted.
static BreakableToken::Split getCommentSplit(StringRef Text, unsigned ContentStartColumn, unsigned ColumnLimit, unsigned TabWidth, encoding::Encoding Encoding)
This file implements an indenter that manages the indentation of continuations.
SmallVector< StringRef, 16 > Lines