clang  6.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30  Column(0), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31  Style(Style), IdentTable(getFormattingLangOpts(Style)),
32  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36  getFormattingLangOpts(Style)));
37  Lex->SetKeepWhitespaceMode(true);
38 
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41  std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
53  tryMergePreviousTokens();
54  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
55  FirstInLineIndex = Tokens.size() - 1;
56  } while (Tokens.back()->Tok.isNot(tok::eof));
57  return Tokens;
58 }
59 
60 void FormatTokenLexer::tryMergePreviousTokens() {
61  if (tryMerge_TMacro())
62  return;
63  if (tryMergeConflictMarkers())
64  return;
65  if (tryMergeLessLess())
66  return;
67  if (tryMergeNSStringLiteral())
68  return;
69 
70  if (Style.Language == FormatStyle::LK_JavaScript) {
71  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
72  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
73  tok::equal};
74  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
75  tok::greaterequal};
76  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
77  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
78  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
79  tok::starequal};
80 
81  // FIXME: Investigate what token type gives the correct operator priority.
82  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
83  return;
84  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
85  return;
86  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
87  return;
88  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
89  return;
90  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
91  return;
92  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
93  Tokens.back()->Tok.setKind(tok::starequal);
94  return;
95  }
96  }
97 
98  if (Style.Language == FormatStyle::LK_Java) {
99  static const tok::TokenKind JavaRightLogicalShift[] = {
100  tok::greater, tok::greater, tok::greater};
101  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
102  tok::greater, tok::greater, tok::greaterequal};
103  if (tryMergeTokens(JavaRightLogicalShift, TT_BinaryOperator))
104  return;
105  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
106  return;
107  }
108 }
109 
110 bool FormatTokenLexer::tryMergeNSStringLiteral() {
111  if (Tokens.size() < 2)
112  return false;
113  auto &At = *(Tokens.end() - 2);
114  auto &String = *(Tokens.end() - 1);
115  if (!At->is(tok::at) || !String->is(tok::string_literal))
116  return false;
117  At->Tok.setKind(tok::string_literal);
118  At->TokenText = StringRef(At->TokenText.begin(),
119  String->TokenText.end() - At->TokenText.begin());
120  At->ColumnWidth += String->ColumnWidth;
121  At->Type = TT_ObjCStringLiteral;
122  Tokens.erase(Tokens.end() - 1);
123  return true;
124 }
125 
126 bool FormatTokenLexer::tryMergeLessLess() {
127  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
128  if (Tokens.size() < 3)
129  return false;
130 
131  bool FourthTokenIsLess = false;
132  if (Tokens.size() > 3)
133  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
134 
135  auto First = Tokens.end() - 3;
136  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
137  First[0]->isNot(tok::less) || FourthTokenIsLess)
138  return false;
139 
140  // Only merge if there currently is no whitespace between the two "<".
141  if (First[1]->WhitespaceRange.getBegin() !=
142  First[1]->WhitespaceRange.getEnd())
143  return false;
144 
145  First[0]->Tok.setKind(tok::lessless);
146  First[0]->TokenText = "<<";
147  First[0]->ColumnWidth += 1;
148  Tokens.erase(Tokens.end() - 2);
149  return true;
150 }
151 
152 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
153  TokenType NewType) {
154  if (Tokens.size() < Kinds.size())
155  return false;
156 
158  Tokens.end() - Kinds.size();
159  if (!First[0]->is(Kinds[0]))
160  return false;
161  unsigned AddLength = 0;
162  for (unsigned i = 1; i < Kinds.size(); ++i) {
163  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
164  First[i]->WhitespaceRange.getEnd())
165  return false;
166  AddLength += First[i]->TokenText.size();
167  }
168  Tokens.resize(Tokens.size() - Kinds.size() + 1);
169  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
170  First[0]->TokenText.size() + AddLength);
171  First[0]->ColumnWidth += AddLength;
172  First[0]->Type = NewType;
173  return true;
174 }
175 
176 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
177 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
178  // NB: This is not entirely correct, as an r_paren can introduce an operand
179  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
180  // corner case to not matter in practice, though.
181  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
182  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
183  tok::colon, tok::question, tok::tilde) ||
184  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
185  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
186  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
187  Tok->isBinaryOperator();
188 }
189 
190 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
191  if (!Prev)
192  return true;
193 
194  // Regex literals can only follow after prefix unary operators, not after
195  // postfix unary operators. If the '++' is followed by a non-operand
196  // introducing token, the slash here is the operand and not the start of a
197  // regex.
198  // `!` is an unary prefix operator, but also a post-fix operator that casts
199  // away nullability, so the same check applies.
200  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
201  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
202 
203  // The previous token must introduce an operand location where regex
204  // literals can occur.
205  if (!precedesOperand(Prev))
206  return false;
207 
208  return true;
209 }
210 
211 // Tries to parse a JavaScript Regex literal starting at the current token,
212 // if that begins with a slash and is in a location where JavaScript allows
213 // regex literals. Changes the current token to a regex literal and updates
214 // its text if successful.
215 void FormatTokenLexer::tryParseJSRegexLiteral() {
216  FormatToken *RegexToken = Tokens.back();
217  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
218  return;
219 
220  FormatToken *Prev = nullptr;
221  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
222  // NB: Because previous pointers are not initialized yet, this cannot use
223  // Token.getPreviousNonComment.
224  if ((*I)->isNot(tok::comment)) {
225  Prev = *I;
226  break;
227  }
228  }
229 
230  if (!canPrecedeRegexLiteral(Prev))
231  return;
232 
233  // 'Manually' lex ahead in the current file buffer.
234  const char *Offset = Lex->getBufferLocation();
235  const char *RegexBegin = Offset - RegexToken->TokenText.size();
236  StringRef Buffer = Lex->getBuffer();
237  bool InCharacterClass = false;
238  bool HaveClosingSlash = false;
239  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
240  // Regular expressions are terminated with a '/', which can only be
241  // escaped using '\' or a character class between '[' and ']'.
242  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
243  switch (*Offset) {
244  case '\\':
245  // Skip the escaped character.
246  ++Offset;
247  break;
248  case '[':
249  InCharacterClass = true;
250  break;
251  case ']':
252  InCharacterClass = false;
253  break;
254  case '/':
255  if (!InCharacterClass)
256  HaveClosingSlash = true;
257  break;
258  }
259  }
260 
261  RegexToken->Type = TT_RegexLiteral;
262  // Treat regex literals like other string_literals.
263  RegexToken->Tok.setKind(tok::string_literal);
264  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
265  RegexToken->ColumnWidth = RegexToken->TokenText.size();
266 
267  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
268 }
269 
270 void FormatTokenLexer::handleTemplateStrings() {
271  FormatToken *BacktickToken = Tokens.back();
272 
273  if (BacktickToken->is(tok::l_brace)) {
274  StateStack.push(LexerState::NORMAL);
275  return;
276  }
277  if (BacktickToken->is(tok::r_brace)) {
278  if (StateStack.size() == 1)
279  return;
280  StateStack.pop();
281  if (StateStack.top() != LexerState::TEMPLATE_STRING)
282  return;
283  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
284  } else if (BacktickToken->is(tok::unknown) &&
285  BacktickToken->TokenText == "`") {
286  StateStack.push(LexerState::TEMPLATE_STRING);
287  } else {
288  return; // Not actually a template
289  }
290 
291  // 'Manually' lex ahead in the current file buffer.
292  const char *Offset = Lex->getBufferLocation();
293  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
294  for (; Offset != Lex->getBuffer().end(); ++Offset) {
295  if (Offset[0] == '`') {
296  StateStack.pop();
297  break;
298  }
299  if (Offset[0] == '\\') {
300  ++Offset; // Skip the escaped character.
301  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
302  Offset[1] == '{') {
303  // '${' introduces an expression interpolation in the template string.
304  StateStack.push(LexerState::NORMAL);
305  ++Offset;
306  break;
307  }
308  }
309 
310  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
311  BacktickToken->Type = TT_TemplateString;
312  BacktickToken->Tok.setKind(tok::string_literal);
313  BacktickToken->TokenText = LiteralText;
314 
315  // Adjust width for potentially multiline string literals.
316  size_t FirstBreak = LiteralText.find('\n');
317  StringRef FirstLineText = FirstBreak == StringRef::npos
318  ? LiteralText
319  : LiteralText.substr(0, FirstBreak);
321  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
322  size_t LastBreak = LiteralText.rfind('\n');
323  if (LastBreak != StringRef::npos) {
324  BacktickToken->IsMultiline = true;
325  unsigned StartColumn = 0; // The template tail spans the entire line.
327  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
328  Style.TabWidth, Encoding);
329  }
330 
331  SourceLocation loc = Offset < Lex->getBuffer().end()
332  ? Lex->getSourceLocation(Offset + 1)
333  : SourceMgr.getLocForEndOfFile(ID);
334  resetLexer(SourceMgr.getFileOffset(loc));
335 }
336 
337 bool FormatTokenLexer::tryMerge_TMacro() {
338  if (Tokens.size() < 4)
339  return false;
340  FormatToken *Last = Tokens.back();
341  if (!Last->is(tok::r_paren))
342  return false;
343 
344  FormatToken *String = Tokens[Tokens.size() - 2];
345  if (!String->is(tok::string_literal) || String->IsMultiline)
346  return false;
347 
348  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
349  return false;
350 
351  FormatToken *Macro = Tokens[Tokens.size() - 4];
352  if (Macro->TokenText != "_T")
353  return false;
354 
355  const char *Start = Macro->TokenText.data();
356  const char *End = Last->TokenText.data() + Last->TokenText.size();
357  String->TokenText = StringRef(Start, End - Start);
358  String->IsFirst = Macro->IsFirst;
359  String->LastNewlineOffset = Macro->LastNewlineOffset;
360  String->WhitespaceRange = Macro->WhitespaceRange;
361  String->OriginalColumn = Macro->OriginalColumn;
363  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
364  String->NewlinesBefore = Macro->NewlinesBefore;
365  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
366 
367  Tokens.pop_back();
368  Tokens.pop_back();
369  Tokens.pop_back();
370  Tokens.back() = String;
371  return true;
372 }
373 
374 bool FormatTokenLexer::tryMergeConflictMarkers() {
375  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
376  return false;
377 
378  // Conflict lines look like:
379  // <marker> <text from the vcs>
380  // For example:
381  // >>>>>>> /file/in/file/system at revision 1234
382  //
383  // We merge all tokens in a line that starts with a conflict marker
384  // into a single token with a special token type that the unwrapped line
385  // parser will use to correctly rebuild the underlying code.
386 
387  FileID ID;
388  // Get the position of the first token in the line.
389  unsigned FirstInLineOffset;
390  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
391  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
392  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
393  // Calculate the offset of the start of the current line.
394  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
395  if (LineOffset == StringRef::npos) {
396  LineOffset = 0;
397  } else {
398  ++LineOffset;
399  }
400 
401  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
402  StringRef LineStart;
403  if (FirstSpace == StringRef::npos) {
404  LineStart = Buffer.substr(LineOffset);
405  } else {
406  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
407  }
408 
409  TokenType Type = TT_Unknown;
410  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
411  Type = TT_ConflictStart;
412  } else if (LineStart == "|||||||" || LineStart == "=======" ||
413  LineStart == "====") {
414  Type = TT_ConflictAlternative;
415  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
416  Type = TT_ConflictEnd;
417  }
418 
419  if (Type != TT_Unknown) {
420  FormatToken *Next = Tokens.back();
421 
422  Tokens.resize(FirstInLineIndex + 1);
423  // We do not need to build a complete token here, as we will skip it
424  // during parsing anyway (as we must not touch whitespace around conflict
425  // markers).
426  Tokens.back()->Type = Type;
427  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
428 
429  Tokens.push_back(Next);
430  return true;
431  }
432 
433  return false;
434 }
435 
436 FormatToken *FormatTokenLexer::getStashedToken() {
437  // Create a synthesized second '>' or '<' token.
438  Token Tok = FormatTok->Tok;
439  StringRef TokenText = FormatTok->TokenText;
440 
441  unsigned OriginalColumn = FormatTok->OriginalColumn;
442  FormatTok = new (Allocator.Allocate()) FormatToken;
443  FormatTok->Tok = Tok;
444  SourceLocation TokLocation =
445  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
446  FormatTok->Tok.setLocation(TokLocation);
447  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
448  FormatTok->TokenText = TokenText;
449  FormatTok->ColumnWidth = 1;
450  FormatTok->OriginalColumn = OriginalColumn + 1;
451 
452  return FormatTok;
453 }
454 
455 FormatToken *FormatTokenLexer::getNextToken() {
456  if (StateStack.top() == LexerState::TOKEN_STASHED) {
457  StateStack.pop();
458  return getStashedToken();
459  }
460 
461  FormatTok = new (Allocator.Allocate()) FormatToken;
462  readRawToken(*FormatTok);
463  SourceLocation WhitespaceStart =
464  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
465  FormatTok->IsFirst = IsFirstToken;
466  IsFirstToken = false;
467 
468  // Consume and record whitespace until we find a significant token.
469  unsigned WhitespaceLength = TrailingWhitespace;
470  while (FormatTok->Tok.is(tok::unknown)) {
471  StringRef Text = FormatTok->TokenText;
472  auto EscapesNewline = [&](int pos) {
473  // A '\r' here is just part of '\r\n'. Skip it.
474  if (pos >= 0 && Text[pos] == '\r')
475  --pos;
476  // See whether there is an odd number of '\' before this.
477  // FIXME: This is wrong. A '\' followed by a newline is always removed,
478  // regardless of whether there is another '\' before it.
479  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
480  unsigned count = 0;
481  for (; pos >= 0; --pos, ++count)
482  if (Text[pos] != '\\')
483  break;
484  return count & 1;
485  };
486  // FIXME: This miscounts tok:unknown tokens that are not just
487  // whitespace, e.g. a '`' character.
488  for (int i = 0, e = Text.size(); i != e; ++i) {
489  switch (Text[i]) {
490  case '\n':
491  ++FormatTok->NewlinesBefore;
492  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
493  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
494  Column = 0;
495  break;
496  case '\r':
497  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
498  Column = 0;
499  break;
500  case '\f':
501  case '\v':
502  Column = 0;
503  break;
504  case ' ':
505  ++Column;
506  break;
507  case '\t':
508  Column += Style.TabWidth - Column % Style.TabWidth;
509  break;
510  case '\\':
511  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
512  FormatTok->Type = TT_ImplicitStringLiteral;
513  break;
514  default:
515  FormatTok->Type = TT_ImplicitStringLiteral;
516  break;
517  }
518  if (FormatTok->Type == TT_ImplicitStringLiteral)
519  break;
520  }
521 
522  if (FormatTok->is(TT_ImplicitStringLiteral))
523  break;
524  WhitespaceLength += FormatTok->Tok.getLength();
525 
526  readRawToken(*FormatTok);
527  }
528 
529  // JavaScript and Java do not allow to escape the end of the line with a
530  // backslash. Backslashes are syntax errors in plain source, but can occur in
531  // comments. When a single line comment ends with a \, it'll cause the next
532  // line of code to be lexed as a comment, breaking formatting. The code below
533  // finds comments that contain a backslash followed by a line break, truncates
534  // the comment token at the backslash, and resets the lexer to restart behind
535  // the backslash.
536  if ((Style.Language == FormatStyle::LK_JavaScript ||
537  Style.Language == FormatStyle::LK_Java) &&
538  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
539  size_t BackslashPos = FormatTok->TokenText.find('\\');
540  while (BackslashPos != StringRef::npos) {
541  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
542  FormatTok->TokenText[BackslashPos + 1] == '\n') {
543  const char *Offset = Lex->getBufferLocation();
544  Offset -= FormatTok->TokenText.size();
545  Offset += BackslashPos + 1;
546  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
547  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
549  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
550  Encoding);
551  break;
552  }
553  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
554  }
555  }
556 
557  // In case the token starts with escaped newlines, we want to
558  // take them into account as whitespace - this pattern is quite frequent
559  // in macro definitions.
560  // FIXME: Add a more explicit test.
561  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
562  FormatTok->TokenText[1] == '\n') {
563  ++FormatTok->NewlinesBefore;
564  WhitespaceLength += 2;
565  FormatTok->LastNewlineOffset = 2;
566  Column = 0;
567  FormatTok->TokenText = FormatTok->TokenText.substr(2);
568  }
569 
570  FormatTok->WhitespaceRange = SourceRange(
571  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
572 
573  FormatTok->OriginalColumn = Column;
574 
575  TrailingWhitespace = 0;
576  if (FormatTok->Tok.is(tok::comment)) {
577  // FIXME: Add the trimmed whitespace to Column.
578  StringRef UntrimmedText = FormatTok->TokenText;
579  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
580  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
581  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
582  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
583  FormatTok->Tok.setIdentifierInfo(&Info);
584  FormatTok->Tok.setKind(Info.getTokenID());
585  if (Style.Language == FormatStyle::LK_Java &&
586  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
587  tok::kw_operator)) {
588  FormatTok->Tok.setKind(tok::identifier);
589  FormatTok->Tok.setIdentifierInfo(nullptr);
590  } else if (Style.Language == FormatStyle::LK_JavaScript &&
591  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
592  tok::kw_operator)) {
593  FormatTok->Tok.setKind(tok::identifier);
594  FormatTok->Tok.setIdentifierInfo(nullptr);
595  }
596  } else if (FormatTok->Tok.is(tok::greatergreater)) {
597  FormatTok->Tok.setKind(tok::greater);
598  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
599  ++Column;
600  StateStack.push(LexerState::TOKEN_STASHED);
601  } else if (FormatTok->Tok.is(tok::lessless)) {
602  FormatTok->Tok.setKind(tok::less);
603  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
604  ++Column;
605  StateStack.push(LexerState::TOKEN_STASHED);
606  }
607 
608  // Now FormatTok is the next non-whitespace token.
609 
610  StringRef Text = FormatTok->TokenText;
611  size_t FirstNewlinePos = Text.find('\n');
612  if (FirstNewlinePos == StringRef::npos) {
613  // FIXME: ColumnWidth actually depends on the start column, we need to
614  // take this into account when the token is moved.
615  FormatTok->ColumnWidth =
616  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
617  Column += FormatTok->ColumnWidth;
618  } else {
619  FormatTok->IsMultiline = true;
620  // FIXME: ColumnWidth actually depends on the start column, we need to
621  // take this into account when the token is moved.
623  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
624 
625  // The last line of the token always starts in column 0.
626  // Thus, the length can be precomputed even in the presence of tabs.
628  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
629  Column = FormatTok->LastLineColumnWidth;
630  }
631 
632  if (Style.isCpp()) {
633  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
634  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
635  tok::pp_define) &&
636  std::find(ForEachMacros.begin(), ForEachMacros.end(),
637  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
638  FormatTok->Type = TT_ForEachMacro;
639  } else if (FormatTok->is(tok::identifier)) {
640  if (MacroBlockBeginRegex.match(Text)) {
641  FormatTok->Type = TT_MacroBlockBegin;
642  } else if (MacroBlockEndRegex.match(Text)) {
643  FormatTok->Type = TT_MacroBlockEnd;
644  }
645  }
646  }
647 
648  return FormatTok;
649 }
650 
651 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
652  Lex->LexFromRawLexer(Tok.Tok);
653  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
654  Tok.Tok.getLength());
655  // For formatting, treat unterminated string literals like normal string
656  // literals.
657  if (Tok.is(tok::unknown)) {
658  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
659  Tok.Tok.setKind(tok::string_literal);
660  Tok.IsUnterminatedLiteral = true;
661  } else if (Style.Language == FormatStyle::LK_JavaScript &&
662  Tok.TokenText == "''") {
663  Tok.Tok.setKind(tok::string_literal);
664  }
665  }
666 
667  if (Style.Language == FormatStyle::LK_JavaScript &&
668  Tok.is(tok::char_constant)) {
669  Tok.Tok.setKind(tok::string_literal);
670  }
671 
672  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
673  Tok.TokenText == "/* clang-format on */")) {
674  FormattingDisabled = false;
675  }
676 
677  Tok.Finalized = FormattingDisabled;
678 
679  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
680  Tok.TokenText == "/* clang-format off */")) {
681  FormattingDisabled = true;
682  }
683 }
684 
685 void FormatTokenLexer::resetLexer(unsigned Offset) {
686  StringRef Buffer = SourceMgr.getBufferData(ID);
687  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
688  getFormattingLangOpts(Style), Buffer.begin(),
689  Buffer.begin() + Offset, Buffer.end()));
690  Lex->SetKeepWhitespaceMode(true);
691  TrailingWhitespace = 0;
692 }
693 
694 } // namespace format
695 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:63
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:124
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:95
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:215
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1300
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:153
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:156
bool isBinaryOperator() const
Definition: FormatToken.h:389
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:130
One of these records is kept for each identifier that is lexed.
Should be used for Java.
Definition: Format.h:1168
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:91
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
uint32_t Offset
Definition: CacheTokens.cpp:43
const FormatToken & Tok
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls...
Definition: Format.h:986
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:306
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:1216
ID
Defines the address space values used by the address space qualifier of QualType. ...
Definition: AddressSpaces.h:26
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:1213
SourceLocation End
Should be used for JavaScript.
Definition: Format.h:1170
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2010
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:124
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:120
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:141
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:297
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:177
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:186
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:137
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:171
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:168
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, const FormatStyle &Style, encoding::Encoding Encoding)
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:1185
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:146
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:295
unsigned getLength() const
Definition: Token.h:127
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1505
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1316
void setLocation(SourceLocation L)
Definition: Token.h:132
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:134
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:150
const encoding::Encoding Encoding
const FormatStyle & Style