clang  8.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  unsigned Column, const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31  Style(Style), IdentTable(getFormattingLangOpts(Style)),
32  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
    // NOTE(review): the embedded listing numbering skips 26 and 28, so the
    // first and third lines of this constructor's signature are absent from
    // this view. The member index near the end of this listing gives the full
    // signature as (SourceMgr, ID, Column, Style, Encoding) - confirm against
    // the header before relying on it.
    //
    // Create the raw lexer over the buffer of file ID, using the language
    // options derived from Style. Whitespace is kept as tokens so that
    // getNextToken() can account for it.
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36  getFormattingLangOpts(Style)));
37  Lex->SetKeepWhitespaceMode(true);
38 
    // Register the user-configured macro names, keyed by their IdentifierInfo,
    // so that getNextToken() can tag matching identifiers with the
    // corresponding token type.
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
41  for (const std::string &StatementMacro : Style.StatementMacros)
42  Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
43 }
44 
// NOTE(review): the embedded listing numbering jumps from 44 to 46, so the
// line that opens this function is absent from this view; the member index
// near the end of this listing names it `ArrayRef< FormatToken * > lex()`.
//
// Lexes the whole buffer: appends tokens to Tokens until an eof token has
// been appended, then returns Tokens. Must be called with an empty token list.
46  assert(Tokens.empty());
47  assert(FirstInLineIndex == 0);
48  do {
49  Tokens.push_back(getNextToken());
50  if (Style.Language == FormatStyle::LK_JavaScript) {
    // JavaScript only: both helpers inspect Tokens.back() and may re-lex
    // ahead in the buffer, replacing the token just appended.
51  tryParseJSRegexLiteral();
52  handleTemplateStrings();
53  }
    // NOTE(review): listing line 54 is missing here. It presumably holds a
    // language condition guarding the next call - confirm against the
    // original file; as shown, the call would run for every language.
55  tryParsePythonComment();
56  tryMergePreviousTokens();
    // Track the index of the token that starts the current line;
    // tryMergeConflictMarkers() reads it to find the beginning of the line.
57  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
58  FirstInLineIndex = Tokens.size() - 1;
59  } while (Tokens.back()->Tok.isNot(tok::eof));
60  return Tokens;
61 }
62 
63 void FormatTokenLexer::tryMergePreviousTokens() {
64  if (tryMerge_TMacro())
65  return;
66  if (tryMergeConflictMarkers())
67  return;
68  if (tryMergeLessLess())
69  return;
70  if (tryMergeNSStringLiteral())
71  return;
72 
73  if (Style.Language == FormatStyle::LK_JavaScript) {
74  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
75  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
76  tok::equal};
77  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
78  tok::greaterequal};
79  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
80  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
81  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
82  tok::starequal};
83 
84  // FIXME: Investigate what token type gives the correct operator priority.
85  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
86  return;
87  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
88  return;
89  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
90  return;
91  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
92  return;
93  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
94  return;
95  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
96  Tokens.back()->Tok.setKind(tok::starequal);
97  return;
98  }
99  }
100 
101  if (Style.Language == FormatStyle::LK_Java) {
102  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
103  tok::greater, tok::greater, tok::greaterequal};
104  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
105  return;
106  }
107 }
108 
109 bool FormatTokenLexer::tryMergeNSStringLiteral() {
110  if (Tokens.size() < 2)
111  return false;
112  auto &At = *(Tokens.end() - 2);
113  auto &String = *(Tokens.end() - 1);
114  if (!At->is(tok::at) || !String->is(tok::string_literal))
115  return false;
116  At->Tok.setKind(tok::string_literal);
117  At->TokenText = StringRef(At->TokenText.begin(),
118  String->TokenText.end() - At->TokenText.begin());
119  At->ColumnWidth += String->ColumnWidth;
120  At->Type = TT_ObjCStringLiteral;
121  Tokens.erase(Tokens.end() - 1);
122  return true;
123 }
124 
125 bool FormatTokenLexer::tryMergeLessLess() {
126  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
127  if (Tokens.size() < 3)
128  return false;
129 
130  bool FourthTokenIsLess = false;
131  if (Tokens.size() > 3)
132  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
133 
134  auto First = Tokens.end() - 3;
135  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
136  First[0]->isNot(tok::less) || FourthTokenIsLess)
137  return false;
138 
139  // Only merge if there currently is no whitespace between the two "<".
140  if (First[1]->WhitespaceRange.getBegin() !=
141  First[1]->WhitespaceRange.getEnd())
142  return false;
143 
144  First[0]->Tok.setKind(tok::lessless);
145  First[0]->TokenText = "<<";
146  First[0]->ColumnWidth += 1;
147  Tokens.erase(Tokens.end() - 2);
148  return true;
149 }
150 
// Merges the last Kinds.size() tokens into one token of type NewType if their
// kinds equal Kinds element by element and no token after the first has
// whitespace in front of it.
//
// Kinds:   the token kinds to match, in source order.
// NewType: the type given to the merged token.
// Returns true if the tokens were merged; the merged token then sits at the
// back of Tokens and still carries the token kind of the first matched token.
151 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
152  TokenType NewType) {
153  if (Tokens.size() < Kinds.size())
154  return false;
155 
    // NOTE(review): listing line 156 is missing here. Judging by the uses of
    // `First` below, it presumably declares `First` and initialises it with
    // the expression on the next line - confirm against the original file.
157  Tokens.end() - Kinds.size();
158  if (!First[0]->is(Kinds[0]))
159  return false;
    // Total text length of the tokens that get folded into the first one.
160  unsigned AddLength = 0;
161  for (unsigned i = 1; i < Kinds.size(); ++i) {
    // An empty whitespace range (begin == end) means the token directly
    // follows its predecessor.
162  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
163  First[i]->WhitespaceRange.getEnd())
164  return false;
165  AddLength += First[i]->TokenText.size();
166  }
    // Drop all matched tokens except the first, then widen the first token's
    // text and column width to cover the removed ones.
167  Tokens.resize(Tokens.size() - Kinds.size() + 1);
168  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
169  First[0]->TokenText.size() + AddLength);
170  First[0]->ColumnWidth += AddLength;
171  First[0]->Type = NewType;
172  return true;
173 }
174 
175 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
176 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
177  // NB: This is not entirely correct, as an r_paren can introduce an operand
178  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
179  // corner case to not matter in practice, though.
180  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
181  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
182  tok::colon, tok::question, tok::tilde) ||
183  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
184  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
185  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
186  Tok->isBinaryOperator();
187 }
188 
189 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
190  if (!Prev)
191  return true;
192 
193  // Regex literals can only follow after prefix unary operators, not after
194  // postfix unary operators. If the '++' is followed by a non-operand
195  // introducing token, the slash here is the operand and not the start of a
196  // regex.
197  // `!` is an unary prefix operator, but also a post-fix operator that casts
198  // away nullability, so the same check applies.
199  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
200  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
201 
202  // The previous token must introduce an operand location where regex
203  // literals can occur.
204  if (!precedesOperand(Prev))
205  return false;
206 
207  return true;
208 }
209 
210 // Tries to parse a JavaScript Regex literal starting at the current token,
211 // if that begins with a slash and is in a location where JavaScript allows
212 // regex literals. Changes the current token to a regex literal and updates
213 // its text if successful.
214 void FormatTokenLexer::tryParseJSRegexLiteral() {
215  FormatToken *RegexToken = Tokens.back();
216  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
217  return;
218 
219  FormatToken *Prev = nullptr;
220  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
221  // NB: Because previous pointers are not initialized yet, this cannot use
222  // Token.getPreviousNonComment.
223  if ((*I)->isNot(tok::comment)) {
224  Prev = *I;
225  break;
226  }
227  }
228 
229  if (!canPrecedeRegexLiteral(Prev))
230  return;
231 
232  // 'Manually' lex ahead in the current file buffer.
233  const char *Offset = Lex->getBufferLocation();
234  const char *RegexBegin = Offset - RegexToken->TokenText.size();
235  StringRef Buffer = Lex->getBuffer();
236  bool InCharacterClass = false;
237  bool HaveClosingSlash = false;
238  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
239  // Regular expressions are terminated with a '/', which can only be
240  // escaped using '\' or a character class between '[' and ']'.
241  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
242  switch (*Offset) {
243  case '\\':
244  // Skip the escaped character.
245  ++Offset;
246  break;
247  case '[':
248  InCharacterClass = true;
249  break;
250  case ']':
251  InCharacterClass = false;
252  break;
253  case '/':
254  if (!InCharacterClass)
255  HaveClosingSlash = true;
256  break;
257  }
258  }
259 
260  RegexToken->Type = TT_RegexLiteral;
261  // Treat regex literals like other string_literals.
262  RegexToken->Tok.setKind(tok::string_literal);
263  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
264  RegexToken->ColumnWidth = RegexToken->TokenText.size();
265 
266  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
267 }
268 
// Handles backtick-delimited template strings, using StateStack to track
// whether the lexer is inside a template string or inside a `${...}`
// expression nested in one. When the last token opens or resumes a template
// string, the string text is scanned by hand in the file buffer, the token is
// turned into a string_literal of type TT_TemplateString, and the lexer is
// restarted behind the scanned text.
269 void FormatTokenLexer::handleTemplateStrings() {
270  FormatToken *BacktickToken = Tokens.back();
271 
    // Every '{' pushes a NORMAL state so that the matching '}' can be told
    // apart from the '}' that closes a '${' interpolation.
272  if (BacktickToken->is(tok::l_brace)) {
273  StateStack.push(LexerState::NORMAL);
274  return;
275  }
276  if (BacktickToken->is(tok::r_brace)) {
    // Never pop the bottom entry of the stack.
277  if (StateStack.size() == 1)
278  return;
279  StateStack.pop();
280  if (StateStack.top() != LexerState::TEMPLATE_STRING)
281  return;
282  // If back in TEMPLATE_STRING, fall through and continue parsing the
    // rest of the template string below.
283  } else if (BacktickToken->is(tok::unknown) &&
284  BacktickToken->TokenText == "`") {
285  StateStack.push(LexerState::TEMPLATE_STRING);
286  } else {
287  return; // Not actually a template
288  }
289 
290  // 'Manually' lex ahead in the current file buffer.
291  const char *Offset = Lex->getBufferLocation();
292  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
293  for (; Offset != Lex->getBuffer().end(); ++Offset) {
294  if (Offset[0] == '`') {
    // Closing backtick: leave the TEMPLATE_STRING state.
295  StateStack.pop();
296  break;
297  }
298  if (Offset[0] == '\\') {
    // NOTE(review): if this backslash is the last character before
    // Lex->getBuffer().end(), this increment followed by the loop's ++Offset
    // appears to step past end(), so the '!=' test above would not stop the
    // loop - confirm and guard if so.
299  ++Offset; // Skip the escaped character.
300  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
301  Offset[1] == '{') {
302  // '${' introduces an expression interpolation in the template string.
303  StateStack.push(LexerState::NORMAL);
304  ++Offset;
305  break;
306  }
307  }
308 
    // Offset points at the closing '`' or at the '{' of '${' when the loop
    // ended through a break; the '+ 1' includes that character in the text.
    // NOTE(review): when the loop ran to end() without a break, the '+ 1'
    // makes the text extend one character past end() - confirm this is
    // intended.
309  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
310  BacktickToken->Type = TT_TemplateString;
311  BacktickToken->Tok.setKind(tok::string_literal);
312  BacktickToken->TokenText = LiteralText;
313 
314  // Adjust width for potentially multiline string literals.
315  size_t FirstBreak = LiteralText.find('\n');
316  StringRef FirstLineText = FirstBreak == StringRef::npos
317  ? LiteralText
318  : LiteralText.substr(0, FirstBreak);
    // NOTE(review): listing line 319 is missing here. The next line is the
    // argument list of a call whose start (and assignment target, if any) is
    // not visible; presumably it computes the width of the first line -
    // confirm against the original file.
320  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
321  size_t LastBreak = LiteralText.rfind('\n');
322  if (LastBreak != StringRef::npos) {
323  BacktickToken->IsMultiline = true;
324  unsigned StartColumn = 0; // The template tail spans the entire line.
    // NOTE(review): listing line 325 is missing here as well; presumably it
    // starts the call that computes the width of the last line - confirm.
326  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
327  Style.TabWidth, Encoding);
328  }
329 
    // Restart the lexer behind the last character consumed above, or at the
    // end of the file if the scan reached the end of the buffer.
330  SourceLocation loc = Offset < Lex->getBuffer().end()
331  ? Lex->getSourceLocation(Offset + 1)
332  : SourceMgr.getLocForEndOfFile(ID);
333  resetLexer(SourceMgr.getFileOffset(loc));
334 }
335 
336 void FormatTokenLexer::tryParsePythonComment() {
337  FormatToken *HashToken = Tokens.back();
338  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
339  return;
340  // Turn the remainder of this line into a comment.
341  const char *CommentBegin =
342  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
343  size_t From = CommentBegin - Lex->getBuffer().begin();
344  size_t To = Lex->getBuffer().find_first_of('\n', From);
345  if (To == StringRef::npos)
346  To = Lex->getBuffer().size();
347  size_t Len = To - From;
348  HashToken->Type = TT_LineComment;
349  HashToken->Tok.setKind(tok::comment);
350  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
351  SourceLocation Loc = To < Lex->getBuffer().size()
352  ? Lex->getSourceLocation(CommentBegin + Len)
353  : SourceMgr.getLocForEndOfFile(ID);
354  resetLexer(SourceMgr.getFileOffset(Loc));
355 }
356 
// Merges the four trailing tokens `_T`, `(`, <single-line string literal>,
// `)` into one token. The string literal token is reused as the result: its
// text is widened to span from `_T` through `)` and it takes over the
// position and whitespace information of the `_T` token.
// Returns true if the merge took place.
357 bool FormatTokenLexer::tryMerge_TMacro() {
358  if (Tokens.size() < 4)
359  return false;
    // Match the pattern back to front: ')' ...
360  FormatToken *Last = Tokens.back();
361  if (!Last->is(tok::r_paren))
362  return false;
363 
    // ... a string literal that does not span lines ...
364  FormatToken *String = Tokens[Tokens.size() - 2];
365  if (!String->is(tok::string_literal) || String->IsMultiline)
366  return false;
367 
    // ... '(' ...
368  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
369  return false;
370 
    // ... and a token spelled exactly "_T".
371  FormatToken *Macro = Tokens[Tokens.size() - 4];
372  if (Macro->TokenText != "_T")
373  return false;
374 
    // New text: from the first character of `_T` to just past `)`.
375  const char *Start = Macro->TokenText.data();
376  const char *End = Last->TokenText.data() + Last->TokenText.size();
377  String->TokenText = StringRef(Start, End - Start);
378  String->IsFirst = Macro->IsFirst;
379  String->LastNewlineOffset = Macro->LastNewlineOffset;
380  String->WhitespaceRange = Macro->WhitespaceRange;
381  String->OriginalColumn = Macro->OriginalColumn;
    // NOTE(review): listing line 382 is missing here. The next line is the
    // argument list of a call whose start (and assignment target, if any) is
    // not visible; presumably it recomputes the column width of the merged
    // text - confirm against the original file.
383  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
384  String->NewlinesBefore = Macro->NewlinesBefore;
385  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
386 
    // Remove ')', the string and '(' from the list, then overwrite the `_T`
    // slot with the merged string token.
387  Tokens.pop_back();
388  Tokens.pop_back();
389  Tokens.pop_back();
390  Tokens.back() = String;
391  return true;
392 }
393 
394 bool FormatTokenLexer::tryMergeConflictMarkers() {
395  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
396  return false;
397 
398  // Conflict lines look like:
399  // <marker> <text from the vcs>
400  // For example:
401  // >>>>>>> /file/in/file/system at revision 1234
402  //
403  // We merge all tokens in a line that starts with a conflict marker
404  // into a single token with a special token type that the unwrapped line
405  // parser will use to correctly rebuild the underlying code.
406 
407  FileID ID;
408  // Get the position of the first token in the line.
409  unsigned FirstInLineOffset;
410  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
411  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
412  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
413  // Calculate the offset of the start of the current line.
414  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
415  if (LineOffset == StringRef::npos) {
416  LineOffset = 0;
417  } else {
418  ++LineOffset;
419  }
420 
421  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
422  StringRef LineStart;
423  if (FirstSpace == StringRef::npos) {
424  LineStart = Buffer.substr(LineOffset);
425  } else {
426  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
427  }
428 
429  TokenType Type = TT_Unknown;
430  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
431  Type = TT_ConflictStart;
432  } else if (LineStart == "|||||||" || LineStart == "=======" ||
433  LineStart == "====") {
434  Type = TT_ConflictAlternative;
435  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
436  Type = TT_ConflictEnd;
437  }
438 
439  if (Type != TT_Unknown) {
440  FormatToken *Next = Tokens.back();
441 
442  Tokens.resize(FirstInLineIndex + 1);
443  // We do not need to build a complete token here, as we will skip it
444  // during parsing anyway (as we must not touch whitespace around conflict
445  // markers).
446  Tokens.back()->Type = Type;
447  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
448 
449  Tokens.push_back(Next);
450  return true;
451  }
452 
453  return false;
454 }
455 
456 FormatToken *FormatTokenLexer::getStashedToken() {
457  // Create a synthesized second '>' or '<' token.
458  Token Tok = FormatTok->Tok;
459  StringRef TokenText = FormatTok->TokenText;
460 
461  unsigned OriginalColumn = FormatTok->OriginalColumn;
462  FormatTok = new (Allocator.Allocate()) FormatToken;
463  FormatTok->Tok = Tok;
464  SourceLocation TokLocation =
465  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
466  FormatTok->Tok.setLocation(TokLocation);
467  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
468  FormatTok->TokenText = TokenText;
469  FormatTok->ColumnWidth = 1;
470  FormatTok->OriginalColumn = OriginalColumn + 1;
471 
472  return FormatTok;
473 }
474 
// Produces the next FormatToken.
//
// If a token was stashed (the second half of a split '>>' or '<<'), that one
// is returned. Otherwise a raw token is lexed, leading whitespace tokens are
// consumed and recorded on the token (newline count, whitespace range,
// original column), a few token kinds are adjusted depending on the language,
// the column width is computed, and macro-related token types are assigned.
// Returns the new token, which is also stored in FormatTok.
475 FormatToken *FormatTokenLexer::getNextToken() {
476  if (StateStack.top() == LexerState::TOKEN_STASHED) {
477  StateStack.pop();
478  return getStashedToken();
479  }
480 
481  FormatTok = new (Allocator.Allocate()) FormatToken;
482  readRawToken(*FormatTok);
    // Whitespace trimmed off the end of the previous comment token counts as
    // whitespace in front of this token.
483  SourceLocation WhitespaceStart =
484  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
485  FormatTok->IsFirst = IsFirstToken;
486  IsFirstToken = false;
487 
488  // Consume and record whitespace until we find a significant token.
489  unsigned WhitespaceLength = TrailingWhitespace;
490  while (FormatTok->Tok.is(tok::unknown)) {
491  StringRef Text = FormatTok->TokenText;
    // Reports whether the characters ending at index pos (just before a
    // '\n') form an odd-length run of backslashes.
492  auto EscapesNewline = [&](int pos) {
493  // A '\r' here is just part of '\r\n'. Skip it.
494  if (pos >= 0 && Text[pos] == '\r')
495  --pos;
496  // See whether there is an odd number of '\' before this.
497  // FIXME: This is wrong. A '\' followed by a newline is always removed,
498  // regardless of whether there is another '\' before it.
499  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
500  unsigned count = 0;
501  for (; pos >= 0; --pos, ++count)
502  if (Text[pos] != '\\')
503  break;
504  return count & 1;
505  };
506  // FIXME: This miscounts tok:unknown tokens that are not just
507  // whitespace, e.g. a '`' character.
508  for (int i = 0, e = Text.size(); i != e; ++i) {
509  switch (Text[i]) {
510  case '\n':
511  ++FormatTok->NewlinesBefore;
512  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
513  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
514  Column = 0;
515  break;
516  case '\r':
517  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
518  Column = 0;
519  break;
520  case '\f':
521  case '\v':
522  Column = 0;
523  break;
524  case ' ':
525  ++Column;
526  break;
527  case '\t':
    // Advance to the next tab stop.
528  Column += Style.TabWidth - Column % Style.TabWidth;
529  break;
530  case '\\':
    // A backslash that is not directly followed by a line break is not
    // whitespace.
531  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
532  FormatTok->Type = TT_ImplicitStringLiteral;
533  break;
534  default:
    // Any other character means this unknown token is not whitespace.
535  FormatTok->Type = TT_ImplicitStringLiteral;
536  break;
537  }
538  if (FormatTok->Type == TT_ImplicitStringLiteral)
539  break;
540  }
541 
542  if (FormatTok->is(TT_ImplicitStringLiteral))
543  break;
544  WhitespaceLength += FormatTok->Tok.getLength();
545 
546  readRawToken(*FormatTok);
547  }
548 
549  // JavaScript and Java do not allow to escape the end of the line with a
550  // backslash. Backslashes are syntax errors in plain source, but can occur in
551  // comments. When a single line comment ends with a \, it'll cause the next
552  // line of code to be lexed as a comment, breaking formatting. The code below
553  // finds comments that contain a backslash followed by a line break, truncates
554  // the comment token at the backslash, and resets the lexer to restart behind
555  // the backslash.
556  if ((Style.Language == FormatStyle::LK_JavaScript ||
557  Style.Language == FormatStyle::LK_Java) &&
558  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
559  size_t BackslashPos = FormatTok->TokenText.find('\\');
560  while (BackslashPos != StringRef::npos) {
561  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
562  FormatTok->TokenText[BackslashPos + 1] == '\n') {
563  const char *Offset = Lex->getBufferLocation();
564  Offset -= FormatTok->TokenText.size();
565  Offset += BackslashPos + 1;
566  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
567  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
    // NOTE(review): listing line 568 is missing here. The next two lines are
    // the argument list of a call whose start (and assignment target, if any)
    // is not visible; presumably it recomputes the column width of the
    // truncated comment - confirm against the original file.
569  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
570  Encoding);
571  break;
572  }
573  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
574  }
575  }
576 
577  // In case the token starts with escaped newlines, we want to
578  // take them into account as whitespace - this pattern is quite frequent
579  // in macro definitions.
580  // FIXME: Add a more explicit test.
581  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
582  unsigned SkippedWhitespace = 0;
583  if (FormatTok->TokenText.size() > 2 &&
584  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
585  SkippedWhitespace = 3;
586  else if (FormatTok->TokenText[1] == '\n')
587  SkippedWhitespace = 2;
588  else
589  break;
590 
591  ++FormatTok->NewlinesBefore;
592  WhitespaceLength += SkippedWhitespace;
593  FormatTok->LastNewlineOffset = SkippedWhitespace;
594  Column = 0;
595  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
596  }
597 
598  FormatTok->WhitespaceRange = SourceRange(
599  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
600 
601  FormatTok->OriginalColumn = Column;
602 
603  TrailingWhitespace = 0;
604  if (FormatTok->Tok.is(tok::comment)) {
605  // FIXME: Add the trimmed whitespace to Column.
606  StringRef UntrimmedText = FormatTok->TokenText;
607  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
608  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
609  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
    // Resolve the raw identifier through the identifier table so that
    // keywords get their keyword token kind.
610  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
611  FormatTok->Tok.setIdentifierInfo(&Info);
612  FormatTok->Tok.setKind(Info.getTokenID());
    // For Java and JavaScript, the keywords listed below are turned back
    // into plain identifiers.
613  if (Style.Language == FormatStyle::LK_Java &&
614  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
615  tok::kw_operator)) {
616  FormatTok->Tok.setKind(tok::identifier);
617  FormatTok->Tok.setIdentifierInfo(nullptr);
618  } else if (Style.Language == FormatStyle::LK_JavaScript &&
619  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
620  tok::kw_operator)) {
621  FormatTok->Tok.setKind(tok::identifier);
622  FormatTok->Tok.setIdentifierInfo(nullptr);
623  }
624  } else if (FormatTok->Tok.is(tok::greatergreater)) {
    // Split '>>' into two '>' tokens: return the first now and push
    // TOKEN_STASHED so that the next call synthesizes the second.
625  FormatTok->Tok.setKind(tok::greater);
626  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
627  ++Column;
628  StateStack.push(LexerState::TOKEN_STASHED);
629  } else if (FormatTok->Tok.is(tok::lessless)) {
    // Same split for '<<'.
630  FormatTok->Tok.setKind(tok::less);
631  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
632  ++Column;
633  StateStack.push(LexerState::TOKEN_STASHED);
634  }
635 
636  // Now FormatTok is the next non-whitespace token.
637 
638  StringRef Text = FormatTok->TokenText;
639  size_t FirstNewlinePos = Text.find('\n');
640  if (FirstNewlinePos == StringRef::npos) {
641  // FIXME: ColumnWidth actually depends on the start column, we need to
642  // take this into account when the token is moved.
643  FormatTok->ColumnWidth =
644  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
645  Column += FormatTok->ColumnWidth;
646  } else {
647  FormatTok->IsMultiline = true;
648  // FIXME: ColumnWidth actually depends on the start column, we need to
649  // take this into account when the token is moved.
    // NOTE(review): listing line 650 is missing here. The next line is the
    // argument list of a call whose start (and assignment target, if any) is
    // not visible; presumably it computes the width of the token's first
    // line - confirm against the original file.
651  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
652 
653  // The last line of the token always starts in column 0.
654  // Thus, the length can be precomputed even in the presence of tabs.
    // NOTE(review): listing line 655 is missing here as well; presumably it
    // starts the call whose result the next statement reads through
    // LastLineColumnWidth - confirm.
656  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
657  Column = FormatTok->LastLineColumnWidth;
658  }
659 
660  if (Style.isCpp()) {
    // Tag identifiers registered in Macros with their configured type, unless
    // the previous token is the `define` preprocessor keyword.
661  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
662  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
663  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
664  tok::pp_define) &&
665  it != Macros.end()) {
666  FormatTok->Type = it->second;
667  } else if (FormatTok->is(tok::identifier)) {
    // Otherwise match the identifier against the configured block-begin and
    // block-end regular expressions.
668  if (MacroBlockBeginRegex.match(Text)) {
669  FormatTok->Type = TT_MacroBlockBegin;
670  } else if (MacroBlockEndRegex.match(Text)) {
671  FormatTok->Type = TT_MacroBlockEnd;
672  }
673  }
674  }
675 
676  return FormatTok;
677 }
678 
// Lexes one raw token into Tok and post-processes it.
//
// Sets Tok.TokenText to the token's source text, reclassifies some tokens as
// string literals, and maintains the FormattingDisabled flag driven by the
// "clang-format on" / "clang-format off" comments.
//
// Tok: the token to fill in; its Tok, TokenText, Finalized and possibly
//      IsUnterminatedLiteral fields are written.
679 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
680  Lex->LexFromRawLexer(Tok.Tok);
681  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
682  Tok.Tok.getLength());
683  // For formatting, treat unterminated string literals like normal string
684  // literals.
685  if (Tok.is(tok::unknown)) {
686  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
687  Tok.Tok.setKind(tok::string_literal);
688  Tok.IsUnterminatedLiteral = true;
689  } else if (Style.Language == FormatStyle::LK_JavaScript &&
690  Tok.TokenText == "''") {
    // In JavaScript an unknown token spelled '' is treated as a string
    // literal as well.
691  Tok.Tok.setKind(tok::string_literal);
692  }
693  }
694 
    // For the listed languages, character constants are reclassified as
    // string literals.
    // NOTE(review): listing line 697 is missing inside the condition below.
    // It presumably holds a further language alternative and the closing of
    // the parenthesised group - confirm against the original file.
695  if ((Style.Language == FormatStyle::LK_JavaScript ||
696  Style.Language == FormatStyle::LK_Proto ||
698  Tok.is(tok::char_constant)) {
699  Tok.Tok.setKind(tok::string_literal);
700  }
701 
    // The "on" check runs before Finalized is set and the "off" check after
    // it, so neither marker comment is itself marked as finalized; only the
    // tokens read while FormattingDisabled is true are.
702  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
703  Tok.TokenText == "/* clang-format on */")) {
704  FormattingDisabled = false;
705  }
706 
707  Tok.Finalized = FormattingDisabled;
708 
709  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
710  Tok.TokenText == "/* clang-format off */")) {
711  FormattingDisabled = true;
712  }
713 }
714 
715 void FormatTokenLexer::resetLexer(unsigned Offset) {
716  StringRef Buffer = SourceMgr.getBufferData(ID);
717  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
718  getFormattingLangOpts(Style), Buffer.begin(),
719  Buffer.begin() + Offset, Buffer.end()));
720  Lex->SetKeepWhitespaceMode(true);
721  TrailingWhitespace = 0;
722 }
723 
724 } // namespace format
725 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:77
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:128
Should be used for Protocol Buffers (https://developers.google.com/protocol-buffers/).
Definition: Format.h:1232
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:95
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:215
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1415
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:157
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:160
bool isBinaryOperator() const
Definition: FormatToken.h:413
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:134
One of these records is kept for each identifier that is lexed.
Should be used for Java.
Definition: Format.h:1225
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:91
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
uint32_t Offset
Definition: CacheTokens.cpp:43
const FormatToken & Tok
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls...
Definition: Format.h:1054
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:316
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:1273
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:1270
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
Should be used for JavaScript.
Definition: Format.h:1227
std::vector< std::string > StatementMacros
A vector of macros that should be interpreted as complete statements.
Definition: Format.h:1064
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2256
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:124
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:124
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:145
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:307
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:177
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:186
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:141
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:175
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:172
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:50
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:1242
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:150
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:305
unsigned getLength() const
Definition: Token.h:127
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1698
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
Should be used for Protocol Buffer messages in text format (https://developers.google.com/protocol-buffers/).
Definition: Format.h:1237
StringRef Text
Definition: Format.cpp:1621
void setLocation(SourceLocation L)
Definition: Token.h:132
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:138
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:154
const encoding::Encoding Encoding
const FormatStyle & Style