clang  6.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  unsigned Column, const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31  Style(Style), IdentTable(getFormattingLangOpts(Style)),
32  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36  getFormattingLangOpts(Style)));
37  Lex->SetKeepWhitespaceMode(true);
38 
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41  std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
54  tryParsePythonComment();
55  tryMergePreviousTokens();
56  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
57  FirstInLineIndex = Tokens.size() - 1;
58  } while (Tokens.back()->Tok.isNot(tok::eof));
59  return Tokens;
60 }
61 
62 void FormatTokenLexer::tryMergePreviousTokens() {
63  if (tryMerge_TMacro())
64  return;
65  if (tryMergeConflictMarkers())
66  return;
67  if (tryMergeLessLess())
68  return;
69  if (tryMergeNSStringLiteral())
70  return;
71 
72  if (Style.Language == FormatStyle::LK_JavaScript) {
73  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
74  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
75  tok::equal};
76  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
77  tok::greaterequal};
78  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
79  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
80  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
81  tok::starequal};
82 
83  // FIXME: Investigate what token type gives the correct operator priority.
84  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
85  return;
86  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
87  return;
88  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
89  return;
90  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
91  return;
92  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
93  return;
94  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
95  Tokens.back()->Tok.setKind(tok::starequal);
96  return;
97  }
98  }
99 
100  if (Style.Language == FormatStyle::LK_Java) {
101  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
102  tok::greater, tok::greater, tok::greaterequal};
103  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
104  return;
105  }
106 }
107 
108 bool FormatTokenLexer::tryMergeNSStringLiteral() {
109  if (Tokens.size() < 2)
110  return false;
111  auto &At = *(Tokens.end() - 2);
112  auto &String = *(Tokens.end() - 1);
113  if (!At->is(tok::at) || !String->is(tok::string_literal))
114  return false;
115  At->Tok.setKind(tok::string_literal);
116  At->TokenText = StringRef(At->TokenText.begin(),
117  String->TokenText.end() - At->TokenText.begin());
118  At->ColumnWidth += String->ColumnWidth;
119  At->Type = TT_ObjCStringLiteral;
120  Tokens.erase(Tokens.end() - 1);
121  return true;
122 }
123 
124 bool FormatTokenLexer::tryMergeLessLess() {
125  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
126  if (Tokens.size() < 3)
127  return false;
128 
129  bool FourthTokenIsLess = false;
130  if (Tokens.size() > 3)
131  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
132 
133  auto First = Tokens.end() - 3;
134  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
135  First[0]->isNot(tok::less) || FourthTokenIsLess)
136  return false;
137 
138  // Only merge if there currently is no whitespace between the two "<".
139  if (First[1]->WhitespaceRange.getBegin() !=
140  First[1]->WhitespaceRange.getEnd())
141  return false;
142 
143  First[0]->Tok.setKind(tok::lessless);
144  First[0]->TokenText = "<<";
145  First[0]->ColumnWidth += 1;
146  Tokens.erase(Tokens.end() - 2);
147  return true;
148 }
149 
150 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
151  TokenType NewType) {
152  if (Tokens.size() < Kinds.size())
153  return false;
154 
156  Tokens.end() - Kinds.size();
157  if (!First[0]->is(Kinds[0]))
158  return false;
159  unsigned AddLength = 0;
160  for (unsigned i = 1; i < Kinds.size(); ++i) {
161  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
162  First[i]->WhitespaceRange.getEnd())
163  return false;
164  AddLength += First[i]->TokenText.size();
165  }
166  Tokens.resize(Tokens.size() - Kinds.size() + 1);
167  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
168  First[0]->TokenText.size() + AddLength);
169  First[0]->ColumnWidth += AddLength;
170  First[0]->Type = NewType;
171  return true;
172 }
173 
174 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
175 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
176  // NB: This is not entirely correct, as an r_paren can introduce an operand
177  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
178  // corner case to not matter in practice, though.
179  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
180  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
181  tok::colon, tok::question, tok::tilde) ||
182  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
183  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
184  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
185  Tok->isBinaryOperator();
186 }
187 
188 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
189  if (!Prev)
190  return true;
191 
192  // Regex literals can only follow after prefix unary operators, not after
193  // postfix unary operators. If the '++' is followed by a non-operand
194  // introducing token, the slash here is the operand and not the start of a
195  // regex.
196  // `!` is an unary prefix operator, but also a post-fix operator that casts
197  // away nullability, so the same check applies.
198  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
199  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
200 
201  // The previous token must introduce an operand location where regex
202  // literals can occur.
203  if (!precedesOperand(Prev))
204  return false;
205 
206  return true;
207 }
208 
209 // Tries to parse a JavaScript Regex literal starting at the current token,
210 // if that begins with a slash and is in a location where JavaScript allows
211 // regex literals. Changes the current token to a regex literal and updates
212 // its text if successful.
213 void FormatTokenLexer::tryParseJSRegexLiteral() {
214  FormatToken *RegexToken = Tokens.back();
215  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
216  return;
217 
218  FormatToken *Prev = nullptr;
219  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
220  // NB: Because previous pointers are not initialized yet, this cannot use
221  // Token.getPreviousNonComment.
222  if ((*I)->isNot(tok::comment)) {
223  Prev = *I;
224  break;
225  }
226  }
227 
228  if (!canPrecedeRegexLiteral(Prev))
229  return;
230 
231  // 'Manually' lex ahead in the current file buffer.
232  const char *Offset = Lex->getBufferLocation();
233  const char *RegexBegin = Offset - RegexToken->TokenText.size();
234  StringRef Buffer = Lex->getBuffer();
235  bool InCharacterClass = false;
236  bool HaveClosingSlash = false;
237  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
238  // Regular expressions are terminated with a '/', which can only be
239  // escaped using '\' or a character class between '[' and ']'.
240  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
241  switch (*Offset) {
242  case '\\':
243  // Skip the escaped character.
244  ++Offset;
245  break;
246  case '[':
247  InCharacterClass = true;
248  break;
249  case ']':
250  InCharacterClass = false;
251  break;
252  case '/':
253  if (!InCharacterClass)
254  HaveClosingSlash = true;
255  break;
256  }
257  }
258 
259  RegexToken->Type = TT_RegexLiteral;
260  // Treat regex literals like other string_literals.
261  RegexToken->Tok.setKind(tok::string_literal);
262  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
263  RegexToken->ColumnWidth = RegexToken->TokenText.size();
264 
265  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
266 }
267 
268 void FormatTokenLexer::handleTemplateStrings() {
269  FormatToken *BacktickToken = Tokens.back();
270 
271  if (BacktickToken->is(tok::l_brace)) {
272  StateStack.push(LexerState::NORMAL);
273  return;
274  }
275  if (BacktickToken->is(tok::r_brace)) {
276  if (StateStack.size() == 1)
277  return;
278  StateStack.pop();
279  if (StateStack.top() != LexerState::TEMPLATE_STRING)
280  return;
281  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
282  } else if (BacktickToken->is(tok::unknown) &&
283  BacktickToken->TokenText == "`") {
284  StateStack.push(LexerState::TEMPLATE_STRING);
285  } else {
286  return; // Not actually a template
287  }
288 
289  // 'Manually' lex ahead in the current file buffer.
290  const char *Offset = Lex->getBufferLocation();
291  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
292  for (; Offset != Lex->getBuffer().end(); ++Offset) {
293  if (Offset[0] == '`') {
294  StateStack.pop();
295  break;
296  }
297  if (Offset[0] == '\\') {
298  ++Offset; // Skip the escaped character.
299  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
300  Offset[1] == '{') {
301  // '${' introduces an expression interpolation in the template string.
302  StateStack.push(LexerState::NORMAL);
303  ++Offset;
304  break;
305  }
306  }
307 
308  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
309  BacktickToken->Type = TT_TemplateString;
310  BacktickToken->Tok.setKind(tok::string_literal);
311  BacktickToken->TokenText = LiteralText;
312 
313  // Adjust width for potentially multiline string literals.
314  size_t FirstBreak = LiteralText.find('\n');
315  StringRef FirstLineText = FirstBreak == StringRef::npos
316  ? LiteralText
317  : LiteralText.substr(0, FirstBreak);
319  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
320  size_t LastBreak = LiteralText.rfind('\n');
321  if (LastBreak != StringRef::npos) {
322  BacktickToken->IsMultiline = true;
323  unsigned StartColumn = 0; // The template tail spans the entire line.
325  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
326  Style.TabWidth, Encoding);
327  }
328 
329  SourceLocation loc = Offset < Lex->getBuffer().end()
330  ? Lex->getSourceLocation(Offset + 1)
331  : SourceMgr.getLocForEndOfFile(ID);
332  resetLexer(SourceMgr.getFileOffset(loc));
333 }
334 
335 void FormatTokenLexer::tryParsePythonComment() {
336  FormatToken *HashToken = Tokens.back();
337  if (HashToken->isNot(tok::hash))
338  return;
339  // Turn the remainder of this line into a comment.
340  const char *CommentBegin =
341  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
342  size_t From = CommentBegin - Lex->getBuffer().begin();
343  size_t To = Lex->getBuffer().find_first_of('\n', From);
344  if (To == StringRef::npos)
345  To = Lex->getBuffer().size();
346  size_t Len = To - From;
347  HashToken->Type = TT_LineComment;
348  HashToken->Tok.setKind(tok::comment);
349  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
350  SourceLocation Loc = To < Lex->getBuffer().size()
351  ? Lex->getSourceLocation(CommentBegin + Len)
352  : SourceMgr.getLocForEndOfFile(ID);
353  resetLexer(SourceMgr.getFileOffset(Loc));
354 }
355 
356 bool FormatTokenLexer::tryMerge_TMacro() {
357  if (Tokens.size() < 4)
358  return false;
359  FormatToken *Last = Tokens.back();
360  if (!Last->is(tok::r_paren))
361  return false;
362 
363  FormatToken *String = Tokens[Tokens.size() - 2];
364  if (!String->is(tok::string_literal) || String->IsMultiline)
365  return false;
366 
367  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
368  return false;
369 
370  FormatToken *Macro = Tokens[Tokens.size() - 4];
371  if (Macro->TokenText != "_T")
372  return false;
373 
374  const char *Start = Macro->TokenText.data();
375  const char *End = Last->TokenText.data() + Last->TokenText.size();
376  String->TokenText = StringRef(Start, End - Start);
377  String->IsFirst = Macro->IsFirst;
378  String->LastNewlineOffset = Macro->LastNewlineOffset;
379  String->WhitespaceRange = Macro->WhitespaceRange;
380  String->OriginalColumn = Macro->OriginalColumn;
382  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
383  String->NewlinesBefore = Macro->NewlinesBefore;
384  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
385 
386  Tokens.pop_back();
387  Tokens.pop_back();
388  Tokens.pop_back();
389  Tokens.back() = String;
390  return true;
391 }
392 
393 bool FormatTokenLexer::tryMergeConflictMarkers() {
394  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
395  return false;
396 
397  // Conflict lines look like:
398  // <marker> <text from the vcs>
399  // For example:
400  // >>>>>>> /file/in/file/system at revision 1234
401  //
402  // We merge all tokens in a line that starts with a conflict marker
403  // into a single token with a special token type that the unwrapped line
404  // parser will use to correctly rebuild the underlying code.
405 
406  FileID ID;
407  // Get the position of the first token in the line.
408  unsigned FirstInLineOffset;
409  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
410  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
411  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
412  // Calculate the offset of the start of the current line.
413  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
414  if (LineOffset == StringRef::npos) {
415  LineOffset = 0;
416  } else {
417  ++LineOffset;
418  }
419 
420  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
421  StringRef LineStart;
422  if (FirstSpace == StringRef::npos) {
423  LineStart = Buffer.substr(LineOffset);
424  } else {
425  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
426  }
427 
428  TokenType Type = TT_Unknown;
429  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
430  Type = TT_ConflictStart;
431  } else if (LineStart == "|||||||" || LineStart == "=======" ||
432  LineStart == "====") {
433  Type = TT_ConflictAlternative;
434  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
435  Type = TT_ConflictEnd;
436  }
437 
438  if (Type != TT_Unknown) {
439  FormatToken *Next = Tokens.back();
440 
441  Tokens.resize(FirstInLineIndex + 1);
442  // We do not need to build a complete token here, as we will skip it
443  // during parsing anyway (as we must not touch whitespace around conflict
444  // markers).
445  Tokens.back()->Type = Type;
446  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
447 
448  Tokens.push_back(Next);
449  return true;
450  }
451 
452  return false;
453 }
454 
455 FormatToken *FormatTokenLexer::getStashedToken() {
456  // Create a synthesized second '>' or '<' token.
457  Token Tok = FormatTok->Tok;
458  StringRef TokenText = FormatTok->TokenText;
459 
460  unsigned OriginalColumn = FormatTok->OriginalColumn;
461  FormatTok = new (Allocator.Allocate()) FormatToken;
462  FormatTok->Tok = Tok;
463  SourceLocation TokLocation =
464  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
465  FormatTok->Tok.setLocation(TokLocation);
466  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
467  FormatTok->TokenText = TokenText;
468  FormatTok->ColumnWidth = 1;
469  FormatTok->OriginalColumn = OriginalColumn + 1;
470 
471  return FormatTok;
472 }
473 
474 FormatToken *FormatTokenLexer::getNextToken() {
475  if (StateStack.top() == LexerState::TOKEN_STASHED) {
476  StateStack.pop();
477  return getStashedToken();
478  }
479 
480  FormatTok = new (Allocator.Allocate()) FormatToken;
481  readRawToken(*FormatTok);
482  SourceLocation WhitespaceStart =
483  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
484  FormatTok->IsFirst = IsFirstToken;
485  IsFirstToken = false;
486 
487  // Consume and record whitespace until we find a significant token.
488  unsigned WhitespaceLength = TrailingWhitespace;
489  while (FormatTok->Tok.is(tok::unknown)) {
490  StringRef Text = FormatTok->TokenText;
491  auto EscapesNewline = [&](int pos) {
492  // A '\r' here is just part of '\r\n'. Skip it.
493  if (pos >= 0 && Text[pos] == '\r')
494  --pos;
495  // See whether there is an odd number of '\' before this.
496  // FIXME: This is wrong. A '\' followed by a newline is always removed,
497  // regardless of whether there is another '\' before it.
498  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
499  unsigned count = 0;
500  for (; pos >= 0; --pos, ++count)
501  if (Text[pos] != '\\')
502  break;
503  return count & 1;
504  };
505  // FIXME: This miscounts tok:unknown tokens that are not just
506  // whitespace, e.g. a '`' character.
507  for (int i = 0, e = Text.size(); i != e; ++i) {
508  switch (Text[i]) {
509  case '\n':
510  ++FormatTok->NewlinesBefore;
511  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
512  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
513  Column = 0;
514  break;
515  case '\r':
516  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
517  Column = 0;
518  break;
519  case '\f':
520  case '\v':
521  Column = 0;
522  break;
523  case ' ':
524  ++Column;
525  break;
526  case '\t':
527  Column += Style.TabWidth - Column % Style.TabWidth;
528  break;
529  case '\\':
530  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
531  FormatTok->Type = TT_ImplicitStringLiteral;
532  break;
533  default:
534  FormatTok->Type = TT_ImplicitStringLiteral;
535  break;
536  }
537  if (FormatTok->Type == TT_ImplicitStringLiteral)
538  break;
539  }
540 
541  if (FormatTok->is(TT_ImplicitStringLiteral))
542  break;
543  WhitespaceLength += FormatTok->Tok.getLength();
544 
545  readRawToken(*FormatTok);
546  }
547 
548  // JavaScript and Java do not allow to escape the end of the line with a
549  // backslash. Backslashes are syntax errors in plain source, but can occur in
550  // comments. When a single line comment ends with a \, it'll cause the next
551  // line of code to be lexed as a comment, breaking formatting. The code below
552  // finds comments that contain a backslash followed by a line break, truncates
553  // the comment token at the backslash, and resets the lexer to restart behind
554  // the backslash.
555  if ((Style.Language == FormatStyle::LK_JavaScript ||
556  Style.Language == FormatStyle::LK_Java) &&
557  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
558  size_t BackslashPos = FormatTok->TokenText.find('\\');
559  while (BackslashPos != StringRef::npos) {
560  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
561  FormatTok->TokenText[BackslashPos + 1] == '\n') {
562  const char *Offset = Lex->getBufferLocation();
563  Offset -= FormatTok->TokenText.size();
564  Offset += BackslashPos + 1;
565  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
566  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
568  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
569  Encoding);
570  break;
571  }
572  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
573  }
574  }
575 
576  // In case the token starts with escaped newlines, we want to
577  // take them into account as whitespace - this pattern is quite frequent
578  // in macro definitions.
579  // FIXME: Add a more explicit test.
580  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
581  unsigned SkippedWhitespace = 0;
582  if (FormatTok->TokenText.size() > 2 &&
583  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
584  SkippedWhitespace = 3;
585  else if (FormatTok->TokenText[1] == '\n')
586  SkippedWhitespace = 2;
587  else
588  break;
589 
590  ++FormatTok->NewlinesBefore;
591  WhitespaceLength += SkippedWhitespace;
592  FormatTok->LastNewlineOffset = SkippedWhitespace;
593  Column = 0;
594  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
595  }
596 
597  FormatTok->WhitespaceRange = SourceRange(
598  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
599 
600  FormatTok->OriginalColumn = Column;
601 
602  TrailingWhitespace = 0;
603  if (FormatTok->Tok.is(tok::comment)) {
604  // FIXME: Add the trimmed whitespace to Column.
605  StringRef UntrimmedText = FormatTok->TokenText;
606  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
607  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
608  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
609  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
610  FormatTok->Tok.setIdentifierInfo(&Info);
611  FormatTok->Tok.setKind(Info.getTokenID());
612  if (Style.Language == FormatStyle::LK_Java &&
613  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
614  tok::kw_operator)) {
615  FormatTok->Tok.setKind(tok::identifier);
616  FormatTok->Tok.setIdentifierInfo(nullptr);
617  } else if (Style.Language == FormatStyle::LK_JavaScript &&
618  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
619  tok::kw_operator)) {
620  FormatTok->Tok.setKind(tok::identifier);
621  FormatTok->Tok.setIdentifierInfo(nullptr);
622  }
623  } else if (FormatTok->Tok.is(tok::greatergreater)) {
624  FormatTok->Tok.setKind(tok::greater);
625  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
626  ++Column;
627  StateStack.push(LexerState::TOKEN_STASHED);
628  } else if (FormatTok->Tok.is(tok::lessless)) {
629  FormatTok->Tok.setKind(tok::less);
630  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
631  ++Column;
632  StateStack.push(LexerState::TOKEN_STASHED);
633  }
634 
635  // Now FormatTok is the next non-whitespace token.
636 
637  StringRef Text = FormatTok->TokenText;
638  size_t FirstNewlinePos = Text.find('\n');
639  if (FirstNewlinePos == StringRef::npos) {
640  // FIXME: ColumnWidth actually depends on the start column, we need to
641  // take this into account when the token is moved.
642  FormatTok->ColumnWidth =
643  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
644  Column += FormatTok->ColumnWidth;
645  } else {
646  FormatTok->IsMultiline = true;
647  // FIXME: ColumnWidth actually depends on the start column, we need to
648  // take this into account when the token is moved.
650  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
651 
652  // The last line of the token always starts in column 0.
653  // Thus, the length can be precomputed even in the presence of tabs.
655  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
656  Column = FormatTok->LastLineColumnWidth;
657  }
658 
659  if (Style.isCpp()) {
660  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
661  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
662  tok::pp_define) &&
663  std::find(ForEachMacros.begin(), ForEachMacros.end(),
664  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
665  FormatTok->Type = TT_ForEachMacro;
666  } else if (FormatTok->is(tok::identifier)) {
667  if (MacroBlockBeginRegex.match(Text)) {
668  FormatTok->Type = TT_MacroBlockBegin;
669  } else if (MacroBlockEndRegex.match(Text)) {
670  FormatTok->Type = TT_MacroBlockEnd;
671  }
672  }
673  }
674 
675  return FormatTok;
676 }
677 
678 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
679  Lex->LexFromRawLexer(Tok.Tok);
680  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
681  Tok.Tok.getLength());
682  // For formatting, treat unterminated string literals like normal string
683  // literals.
684  if (Tok.is(tok::unknown)) {
685  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
686  Tok.Tok.setKind(tok::string_literal);
687  Tok.IsUnterminatedLiteral = true;
688  } else if (Style.Language == FormatStyle::LK_JavaScript &&
689  Tok.TokenText == "''") {
690  Tok.Tok.setKind(tok::string_literal);
691  }
692  }
693 
694  if (Style.Language == FormatStyle::LK_JavaScript &&
695  Tok.is(tok::char_constant)) {
696  Tok.Tok.setKind(tok::string_literal);
697  }
698 
699  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
700  Tok.TokenText == "/* clang-format on */")) {
701  FormattingDisabled = false;
702  }
703 
704  Tok.Finalized = FormattingDisabled;
705 
706  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
707  Tok.TokenText == "/* clang-format off */")) {
708  FormattingDisabled = true;
709  }
710 }
711 
712 void FormatTokenLexer::resetLexer(unsigned Offset) {
713  StringRef Buffer = SourceMgr.getBufferData(ID);
714  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
715  getFormattingLangOpts(Style), Buffer.begin(),
716  Buffer.begin() + Offset, Buffer.end()));
717  Lex->SetKeepWhitespaceMode(true);
718  TrailingWhitespace = 0;
719 }
720 
721 } // namespace format
722 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:63
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:124
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:95
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:215
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1360
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:153
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:156
bool isBinaryOperator() const
Definition: FormatToken.h:389
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:130
One of these records is kept for each identifier that is lexed.
Should be used for Java.
Definition: Format.h:1168
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:91
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
bool isNot(T Kind) const
Definition: FormatToken.h:313
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
uint32_t Offset
Definition: CacheTokens.cpp:43
const FormatToken & Tok
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls...
Definition: Format.h:986
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:306
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:1216
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:1213
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
Should be used for JavaScript.
Definition: Format.h:1170
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2049
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:124
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:120
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:141
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:297
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:177
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:186
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:137
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:171
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:168
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:1185
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:146
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:295
unsigned getLength() const
Definition: Token.h:127
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1548
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
Should be used for Protocol Buffer messages in text format (https://developers.google.com/protocol-buffers/).
Definition: Format.h:1180
StringRef Text
Definition: Format.cpp:1336
void setLocation(SourceLocation L)
Definition: Token.h:132
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:134
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:150
const encoding::Encoding Encoding
const FormatStyle & Style