clang  5.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30  Column(0), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31  Style(Style), IdentTable(getFormattingLangOpts(Style)),
32  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36  getFormattingLangOpts(Style)));
37  Lex->SetKeepWhitespaceMode(true);
38 
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41  std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
53  tryMergePreviousTokens();
54  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
55  FirstInLineIndex = Tokens.size() - 1;
56  } while (Tokens.back()->Tok.isNot(tok::eof));
57  return Tokens;
58 }
59 
60 void FormatTokenLexer::tryMergePreviousTokens() {
61  if (tryMerge_TMacro())
62  return;
63  if (tryMergeConflictMarkers())
64  return;
65  if (tryMergeLessLess())
66  return;
67  if (tryMergeNSStringLiteral())
68  return;
69 
70  if (Style.Language == FormatStyle::LK_JavaScript) {
71  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
72  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
73  tok::equal};
74  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
75  tok::greaterequal};
76  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
77  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
78  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
79  tok::starequal};
80 
81  // FIXME: Investigate what token type gives the correct operator priority.
82  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
83  return;
84  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
85  return;
86  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
87  return;
88  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
89  return;
90  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
91  return;
92  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
93  Tokens.back()->Tok.setKind(tok::starequal);
94  return;
95  }
96  }
97 
98  if (Style.Language == FormatStyle::LK_Java) {
99  static const tok::TokenKind JavaRightLogicalShift[] = {tok::greater,
100  tok::greater,
101  tok::greater};
102  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {tok::greater,
103  tok::greater,
104  tok::greaterequal};
105  if (tryMergeTokens(JavaRightLogicalShift, TT_BinaryOperator))
106  return;
107  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
108  return;
109  }
110 }
111 
112 bool FormatTokenLexer::tryMergeNSStringLiteral() {
113  if (Tokens.size() < 2)
114  return false;
115  auto &At = *(Tokens.end() - 2);
116  auto &String = *(Tokens.end() - 1);
117  if (!At->is(tok::at) || !String->is(tok::string_literal))
118  return false;
119  At->Tok.setKind(tok::string_literal);
120  At->TokenText = StringRef(At->TokenText.begin(),
121  String->TokenText.end() - At->TokenText.begin());
122  At->ColumnWidth += String->ColumnWidth;
123  At->Type = TT_ObjCStringLiteral;
124  Tokens.erase(Tokens.end() - 1);
125  return true;
126 }
127 
128 bool FormatTokenLexer::tryMergeLessLess() {
129  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
130  if (Tokens.size() < 3)
131  return false;
132 
133  bool FourthTokenIsLess = false;
134  if (Tokens.size() > 3)
135  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
136 
137  auto First = Tokens.end() - 3;
138  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
139  First[0]->isNot(tok::less) || FourthTokenIsLess)
140  return false;
141 
142  // Only merge if there currently is no whitespace between the two "<".
143  if (First[1]->WhitespaceRange.getBegin() !=
144  First[1]->WhitespaceRange.getEnd())
145  return false;
146 
147  First[0]->Tok.setKind(tok::lessless);
148  First[0]->TokenText = "<<";
149  First[0]->ColumnWidth += 1;
150  Tokens.erase(Tokens.end() - 2);
151  return true;
152 }
153 
154 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
155  TokenType NewType) {
156  if (Tokens.size() < Kinds.size())
157  return false;
158 
160  Tokens.end() - Kinds.size();
161  if (!First[0]->is(Kinds[0]))
162  return false;
163  unsigned AddLength = 0;
164  for (unsigned i = 1; i < Kinds.size(); ++i) {
165  if (!First[i]->is(Kinds[i]) ||
166  First[i]->WhitespaceRange.getBegin() !=
167  First[i]->WhitespaceRange.getEnd())
168  return false;
169  AddLength += First[i]->TokenText.size();
170  }
171  Tokens.resize(Tokens.size() - Kinds.size() + 1);
172  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
173  First[0]->TokenText.size() + AddLength);
174  First[0]->ColumnWidth += AddLength;
175  First[0]->Type = NewType;
176  return true;
177 }
178 
179 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
180 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
181  // NB: This is not entirely correct, as an r_paren can introduce an operand
182  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
183  // corner case to not matter in practice, though.
184  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
185  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
186  tok::colon, tok::question, tok::tilde) ||
187  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
188  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
189  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
190  Tok->isBinaryOperator();
191 }
192 
193 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
194  if (!Prev)
195  return true;
196 
197  // Regex literals can only follow after prefix unary operators, not after
198  // postfix unary operators. If the '++' is followed by a non-operand
199  // introducing token, the slash here is the operand and not the start of a
200  // regex.
201  // `!` is an unary prefix operator, but also a post-fix operator that casts
202  // away nullability, so the same check applies.
203  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
204  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
205 
206  // The previous token must introduce an operand location where regex
207  // literals can occur.
208  if (!precedesOperand(Prev))
209  return false;
210 
211  return true;
212 }
213 
214 // Tries to parse a JavaScript Regex literal starting at the current token,
215 // if that begins with a slash and is in a location where JavaScript allows
216 // regex literals. Changes the current token to a regex literal and updates
217 // its text if successful.
218 void FormatTokenLexer::tryParseJSRegexLiteral() {
219  FormatToken *RegexToken = Tokens.back();
220  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
221  return;
222 
223  FormatToken *Prev = nullptr;
224  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
225  // NB: Because previous pointers are not initialized yet, this cannot use
226  // Token.getPreviousNonComment.
227  if ((*I)->isNot(tok::comment)) {
228  Prev = *I;
229  break;
230  }
231  }
232 
233  if (!canPrecedeRegexLiteral(Prev))
234  return;
235 
236  // 'Manually' lex ahead in the current file buffer.
237  const char *Offset = Lex->getBufferLocation();
238  const char *RegexBegin = Offset - RegexToken->TokenText.size();
239  StringRef Buffer = Lex->getBuffer();
240  bool InCharacterClass = false;
241  bool HaveClosingSlash = false;
242  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
243  // Regular expressions are terminated with a '/', which can only be
244  // escaped using '\' or a character class between '[' and ']'.
245  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
246  switch (*Offset) {
247  case '\\':
248  // Skip the escaped character.
249  ++Offset;
250  break;
251  case '[':
252  InCharacterClass = true;
253  break;
254  case ']':
255  InCharacterClass = false;
256  break;
257  case '/':
258  if (!InCharacterClass)
259  HaveClosingSlash = true;
260  break;
261  }
262  }
263 
264  RegexToken->Type = TT_RegexLiteral;
265  // Treat regex literals like other string_literals.
266  RegexToken->Tok.setKind(tok::string_literal);
267  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
268  RegexToken->ColumnWidth = RegexToken->TokenText.size();
269 
270  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
271 }
272 
273 void FormatTokenLexer::handleTemplateStrings() {
274  FormatToken *BacktickToken = Tokens.back();
275 
276  if (BacktickToken->is(tok::l_brace)) {
277  StateStack.push(LexerState::NORMAL);
278  return;
279  }
280  if (BacktickToken->is(tok::r_brace)) {
281  if (StateStack.size() == 1)
282  return;
283  StateStack.pop();
284  if (StateStack.top() != LexerState::TEMPLATE_STRING)
285  return;
286  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
287  } else if (BacktickToken->is(tok::unknown) &&
288  BacktickToken->TokenText == "`") {
289  StateStack.push(LexerState::TEMPLATE_STRING);
290  } else {
291  return; // Not actually a template
292  }
293 
294  // 'Manually' lex ahead in the current file buffer.
295  const char *Offset = Lex->getBufferLocation();
296  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
297  for (; Offset != Lex->getBuffer().end(); ++Offset) {
298  if (Offset[0] == '`') {
299  StateStack.pop();
300  break;
301  }
302  if (Offset[0] == '\\') {
303  ++Offset; // Skip the escaped character.
304  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
305  Offset[1] == '{') {
306  // '${' introduces an expression interpolation in the template string.
307  StateStack.push(LexerState::NORMAL);
308  ++Offset;
309  break;
310  }
311  }
312 
313  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
314  BacktickToken->Type = TT_TemplateString;
315  BacktickToken->Tok.setKind(tok::string_literal);
316  BacktickToken->TokenText = LiteralText;
317 
318  // Adjust width for potentially multiline string literals.
319  size_t FirstBreak = LiteralText.find('\n');
320  StringRef FirstLineText = FirstBreak == StringRef::npos
321  ? LiteralText
322  : LiteralText.substr(0, FirstBreak);
324  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
325  size_t LastBreak = LiteralText.rfind('\n');
326  if (LastBreak != StringRef::npos) {
327  BacktickToken->IsMultiline = true;
328  unsigned StartColumn = 0; // The template tail spans the entire line.
330  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
331  Style.TabWidth, Encoding);
332  }
333 
334  SourceLocation loc = Offset < Lex->getBuffer().end()
335  ? Lex->getSourceLocation(Offset + 1)
336  : SourceMgr.getLocForEndOfFile(ID);
337  resetLexer(SourceMgr.getFileOffset(loc));
338 }
339 
340 bool FormatTokenLexer::tryMerge_TMacro() {
341  if (Tokens.size() < 4)
342  return false;
343  FormatToken *Last = Tokens.back();
344  if (!Last->is(tok::r_paren))
345  return false;
346 
347  FormatToken *String = Tokens[Tokens.size() - 2];
348  if (!String->is(tok::string_literal) || String->IsMultiline)
349  return false;
350 
351  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
352  return false;
353 
354  FormatToken *Macro = Tokens[Tokens.size() - 4];
355  if (Macro->TokenText != "_T")
356  return false;
357 
358  const char *Start = Macro->TokenText.data();
359  const char *End = Last->TokenText.data() + Last->TokenText.size();
360  String->TokenText = StringRef(Start, End - Start);
361  String->IsFirst = Macro->IsFirst;
362  String->LastNewlineOffset = Macro->LastNewlineOffset;
363  String->WhitespaceRange = Macro->WhitespaceRange;
364  String->OriginalColumn = Macro->OriginalColumn;
366  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
367  String->NewlinesBefore = Macro->NewlinesBefore;
368  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
369 
370  Tokens.pop_back();
371  Tokens.pop_back();
372  Tokens.pop_back();
373  Tokens.back() = String;
374  return true;
375 }
376 
377 bool FormatTokenLexer::tryMergeConflictMarkers() {
378  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
379  return false;
380 
381  // Conflict lines look like:
382  // <marker> <text from the vcs>
383  // For example:
384  // >>>>>>> /file/in/file/system at revision 1234
385  //
386  // We merge all tokens in a line that starts with a conflict marker
387  // into a single token with a special token type that the unwrapped line
388  // parser will use to correctly rebuild the underlying code.
389 
390  FileID ID;
391  // Get the position of the first token in the line.
392  unsigned FirstInLineOffset;
393  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
394  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
395  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
396  // Calculate the offset of the start of the current line.
397  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
398  if (LineOffset == StringRef::npos) {
399  LineOffset = 0;
400  } else {
401  ++LineOffset;
402  }
403 
404  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
405  StringRef LineStart;
406  if (FirstSpace == StringRef::npos) {
407  LineStart = Buffer.substr(LineOffset);
408  } else {
409  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
410  }
411 
412  TokenType Type = TT_Unknown;
413  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
414  Type = TT_ConflictStart;
415  } else if (LineStart == "|||||||" || LineStart == "=======" ||
416  LineStart == "====") {
417  Type = TT_ConflictAlternative;
418  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
419  Type = TT_ConflictEnd;
420  }
421 
422  if (Type != TT_Unknown) {
423  FormatToken *Next = Tokens.back();
424 
425  Tokens.resize(FirstInLineIndex + 1);
426  // We do not need to build a complete token here, as we will skip it
427  // during parsing anyway (as we must not touch whitespace around conflict
428  // markers).
429  Tokens.back()->Type = Type;
430  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
431 
432  Tokens.push_back(Next);
433  return true;
434  }
435 
436  return false;
437 }
438 
439 FormatToken *FormatTokenLexer::getStashedToken() {
440  // Create a synthesized second '>' or '<' token.
441  Token Tok = FormatTok->Tok;
442  StringRef TokenText = FormatTok->TokenText;
443 
444  unsigned OriginalColumn = FormatTok->OriginalColumn;
445  FormatTok = new (Allocator.Allocate()) FormatToken;
446  FormatTok->Tok = Tok;
447  SourceLocation TokLocation =
448  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
449  FormatTok->Tok.setLocation(TokLocation);
450  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
451  FormatTok->TokenText = TokenText;
452  FormatTok->ColumnWidth = 1;
453  FormatTok->OriginalColumn = OriginalColumn + 1;
454 
455  return FormatTok;
456 }
457 
458 FormatToken *FormatTokenLexer::getNextToken() {
459  if (StateStack.top() == LexerState::TOKEN_STASHED) {
460  StateStack.pop();
461  return getStashedToken();
462  }
463 
464  FormatTok = new (Allocator.Allocate()) FormatToken;
465  readRawToken(*FormatTok);
466  SourceLocation WhitespaceStart =
467  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
468  FormatTok->IsFirst = IsFirstToken;
469  IsFirstToken = false;
470 
471  // Consume and record whitespace until we find a significant token.
472  unsigned WhitespaceLength = TrailingWhitespace;
473  while (FormatTok->Tok.is(tok::unknown)) {
474  StringRef Text = FormatTok->TokenText;
475  auto EscapesNewline = [&](int pos) {
476  // A '\r' here is just part of '\r\n'. Skip it.
477  if (pos >= 0 && Text[pos] == '\r')
478  --pos;
479  // See whether there is an odd number of '\' before this.
480  // FIXME: This is wrong. A '\' followed by a newline is always removed,
481  // regardless of whether there is another '\' before it.
482  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
483  unsigned count = 0;
484  for (; pos >= 0; --pos, ++count)
485  if (Text[pos] != '\\')
486  break;
487  return count & 1;
488  };
489  // FIXME: This miscounts tok:unknown tokens that are not just
490  // whitespace, e.g. a '`' character.
491  for (int i = 0, e = Text.size(); i != e; ++i) {
492  switch (Text[i]) {
493  case '\n':
494  ++FormatTok->NewlinesBefore;
495  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
496  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
497  Column = 0;
498  break;
499  case '\r':
500  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
501  Column = 0;
502  break;
503  case '\f':
504  case '\v':
505  Column = 0;
506  break;
507  case ' ':
508  ++Column;
509  break;
510  case '\t':
511  Column += Style.TabWidth - Column % Style.TabWidth;
512  break;
513  case '\\':
514  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
515  FormatTok->Type = TT_ImplicitStringLiteral;
516  break;
517  default:
518  FormatTok->Type = TT_ImplicitStringLiteral;
519  break;
520  }
521  if (FormatTok->Type == TT_ImplicitStringLiteral)
522  break;
523  }
524 
525  if (FormatTok->is(TT_ImplicitStringLiteral))
526  break;
527  WhitespaceLength += FormatTok->Tok.getLength();
528 
529  readRawToken(*FormatTok);
530  }
531 
532  // In case the token starts with escaped newlines, we want to
533  // take them into account as whitespace - this pattern is quite frequent
534  // in macro definitions.
535  // FIXME: Add a more explicit test.
536  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
537  FormatTok->TokenText[1] == '\n') {
538  ++FormatTok->NewlinesBefore;
539  WhitespaceLength += 2;
540  FormatTok->LastNewlineOffset = 2;
541  Column = 0;
542  FormatTok->TokenText = FormatTok->TokenText.substr(2);
543  }
544 
545  FormatTok->WhitespaceRange = SourceRange(
546  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
547 
548  FormatTok->OriginalColumn = Column;
549 
550  TrailingWhitespace = 0;
551  if (FormatTok->Tok.is(tok::comment)) {
552  // FIXME: Add the trimmed whitespace to Column.
553  StringRef UntrimmedText = FormatTok->TokenText;
554  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
555  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
556  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
557  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
558  FormatTok->Tok.setIdentifierInfo(&Info);
559  FormatTok->Tok.setKind(Info.getTokenID());
560  if (Style.Language == FormatStyle::LK_Java &&
561  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
562  tok::kw_operator)) {
563  FormatTok->Tok.setKind(tok::identifier);
564  FormatTok->Tok.setIdentifierInfo(nullptr);
565  } else if (Style.Language == FormatStyle::LK_JavaScript &&
566  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
567  tok::kw_operator)) {
568  FormatTok->Tok.setKind(tok::identifier);
569  FormatTok->Tok.setIdentifierInfo(nullptr);
570  }
571  } else if (FormatTok->Tok.is(tok::greatergreater)) {
572  FormatTok->Tok.setKind(tok::greater);
573  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
574  ++Column;
575  StateStack.push(LexerState::TOKEN_STASHED);
576  } else if (FormatTok->Tok.is(tok::lessless)) {
577  FormatTok->Tok.setKind(tok::less);
578  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
579  ++Column;
580  StateStack.push(LexerState::TOKEN_STASHED);
581  }
582 
583  // Now FormatTok is the next non-whitespace token.
584 
585  StringRef Text = FormatTok->TokenText;
586  size_t FirstNewlinePos = Text.find('\n');
587  if (FirstNewlinePos == StringRef::npos) {
588  // FIXME: ColumnWidth actually depends on the start column, we need to
589  // take this into account when the token is moved.
590  FormatTok->ColumnWidth =
591  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
592  Column += FormatTok->ColumnWidth;
593  } else {
594  FormatTok->IsMultiline = true;
595  // FIXME: ColumnWidth actually depends on the start column, we need to
596  // take this into account when the token is moved.
598  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
599 
600  // The last line of the token always starts in column 0.
601  // Thus, the length can be precomputed even in the presence of tabs.
603  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
604  Column = FormatTok->LastLineColumnWidth;
605  }
606 
607  if (Style.isCpp()) {
608  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
609  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
610  tok::pp_define) &&
611  std::find(ForEachMacros.begin(), ForEachMacros.end(),
612  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
613  FormatTok->Type = TT_ForEachMacro;
614  } else if (FormatTok->is(tok::identifier)) {
615  if (MacroBlockBeginRegex.match(Text)) {
616  FormatTok->Type = TT_MacroBlockBegin;
617  } else if (MacroBlockEndRegex.match(Text)) {
618  FormatTok->Type = TT_MacroBlockEnd;
619  }
620  }
621  }
622 
623  return FormatTok;
624 }
625 
626 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
627  Lex->LexFromRawLexer(Tok.Tok);
628  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
629  Tok.Tok.getLength());
630  // For formatting, treat unterminated string literals like normal string
631  // literals.
632  if (Tok.is(tok::unknown)) {
633  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
634  Tok.Tok.setKind(tok::string_literal);
635  Tok.IsUnterminatedLiteral = true;
636  } else if (Style.Language == FormatStyle::LK_JavaScript &&
637  Tok.TokenText == "''") {
638  Tok.Tok.setKind(tok::string_literal);
639  }
640  }
641 
642  if (Style.Language == FormatStyle::LK_JavaScript &&
643  Tok.is(tok::char_constant)) {
644  Tok.Tok.setKind(tok::string_literal);
645  }
646 
647  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
648  Tok.TokenText == "/* clang-format on */")) {
649  FormattingDisabled = false;
650  }
651 
652  Tok.Finalized = FormattingDisabled;
653 
654  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
655  Tok.TokenText == "/* clang-format off */")) {
656  FormattingDisabled = true;
657  }
658 }
659 
660 void FormatTokenLexer::resetLexer(unsigned Offset) {
661  StringRef Buffer = SourceMgr.getBufferData(ID);
662  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
663  getFormattingLangOpts(Style), Buffer.begin(),
664  Buffer.begin() + Offset, Buffer.end()));
665  Lex->SetKeepWhitespaceMode(true);
666  TrailingWhitespace = 0;
667 }
668 
669 } // namespace format
670 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:46
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:121
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:95
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:212
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1303
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:150
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:153
bool isBinaryOperator() const
Definition: FormatToken.h:385
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:127
One of these records is kept for each identifier that is lexed.
Should be used for Java.
Definition: Format.h:1040
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:91
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
uint32_t Offset
Definition: CacheTokens.cpp:43
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls...
Definition: Format.h:883
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:303
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:1085
ID
Defines the address space values used by the address space qualifier of QualType. ...
Definition: AddressSpaces.h:26
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:1082
Should be used for JavaScript.
Definition: Format.h:1042
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:1939
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:124
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:117
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:138
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:294
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:177
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:186
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:134
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:168
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:165
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, const FormatStyle &Style, encoding::Encoding Encoding)
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:1054
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
\file This file defines classes for searching and analyzing source code clones.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:143
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:292
unsigned getLength() const
Definition: Token.h:127
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1366
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1281
void setLocation(SourceLocation L)
Definition: Token.h:132
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:131
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:147