clang  9.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
26  unsigned Column, const FormatStyle &Style,
28  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30  Style(Style), IdentTable(getFormattingLangOpts(Style)),
31  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33  MacroBlockEndRegex(Style.MacroBlockEnd) {
34  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35  getFormattingLangOpts(Style)));
36  Lex->SetKeepWhitespaceMode(true);
37 
38  for (const std::string &ForEachMacro : Style.ForEachMacros)
39  Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40  for (const std::string &StatementMacro : Style.StatementMacros)
41  Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
53  if (Style.Language == FormatStyle::LK_TextProto)
54  tryParsePythonComment();
55  tryMergePreviousTokens();
56  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
57  FirstInLineIndex = Tokens.size() - 1;
58  } while (Tokens.back()->Tok.isNot(tok::eof));
59  return Tokens;
60 }
61 
62 void FormatTokenLexer::tryMergePreviousTokens() {
63  if (tryMerge_TMacro())
64  return;
65  if (tryMergeConflictMarkers())
66  return;
67  if (tryMergeLessLess())
68  return;
69 
70  if (Style.isCSharp()) {
71  if (tryMergeCSharpKeywordVariables())
72  return;
73  if (tryMergeCSharpVerbatimStringLiteral())
74  return;
75  if (tryMergeCSharpDoubleQuestion())
76  return;
77  if (tryMergeCSharpNullConditionals())
78  return;
79  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
80  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
81  return;
82  }
83 
84  if (tryMergeNSStringLiteral())
85  return;
86 
87  if (Style.Language == FormatStyle::LK_JavaScript) {
88  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
89  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
90  tok::equal};
91  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
92  tok::greaterequal};
93  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
94  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
95  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
96  tok::starequal};
97 
98  // FIXME: Investigate what token type gives the correct operator priority.
99  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
100  return;
101  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
102  return;
103  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
104  return;
105  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
106  return;
107  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
108  return;
109  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
110  Tokens.back()->Tok.setKind(tok::starequal);
111  return;
112  }
113  if (tryMergeJSPrivateIdentifier())
114  return;
115  }
116 
117  if (Style.Language == FormatStyle::LK_Java) {
118  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
119  tok::greater, tok::greater, tok::greaterequal};
120  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
121  return;
122  }
123 }
124 
125 bool FormatTokenLexer::tryMergeNSStringLiteral() {
126  if (Tokens.size() < 2)
127  return false;
128  auto &At = *(Tokens.end() - 2);
129  auto &String = *(Tokens.end() - 1);
130  if (!At->is(tok::at) || !String->is(tok::string_literal))
131  return false;
132  At->Tok.setKind(tok::string_literal);
133  At->TokenText = StringRef(At->TokenText.begin(),
134  String->TokenText.end() - At->TokenText.begin());
135  At->ColumnWidth += String->ColumnWidth;
136  At->Type = TT_ObjCStringLiteral;
137  Tokens.erase(Tokens.end() - 1);
138  return true;
139 }
140 
141 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
142  // Merges #idenfier into a single identifier with the text #identifier
143  // but the token tok::identifier.
144  if (Tokens.size() < 2)
145  return false;
146  auto &Hash = *(Tokens.end() - 2);
147  auto &Identifier = *(Tokens.end() - 1);
148  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
149  return false;
150  Hash->Tok.setKind(tok::identifier);
151  Hash->TokenText =
152  StringRef(Hash->TokenText.begin(),
153  Identifier->TokenText.end() - Hash->TokenText.begin());
154  Hash->ColumnWidth += Identifier->ColumnWidth;
155  Hash->Type = TT_JsPrivateIdentifier;
156  Tokens.erase(Tokens.end() - 1);
157  return true;
158 }
159 
160 // Search for verbatim or interpolated string literals @"ABC" or
161 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
162 // prevent splitting of @, $ and ".
163 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
164  if (Tokens.size() < 2)
165  return false;
166  auto &At = *(Tokens.end() - 2);
167  auto &String = *(Tokens.end() - 1);
168 
169  // Look for $"aaaaaa" @"aaaaaa".
170  if (!(At->is(tok::at) || At->TokenText == "$") ||
171  !String->is(tok::string_literal))
172  return false;
173 
174  if (Tokens.size() >= 2 && At->is(tok::at)) {
175  auto &Dollar = *(Tokens.end() - 3);
176  if (Dollar->TokenText == "$") {
177  // This looks like $@"aaaaa" so we need to combine all 3 tokens.
178  Dollar->Tok.setKind(tok::string_literal);
179  Dollar->TokenText =
180  StringRef(Dollar->TokenText.begin(),
181  String->TokenText.end() - Dollar->TokenText.begin());
182  Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
183  Dollar->Type = TT_CSharpStringLiteral;
184  Tokens.erase(Tokens.end() - 2);
185  Tokens.erase(Tokens.end() - 1);
186  return true;
187  }
188  }
189 
190  // Convert back into just a string_literal.
191  At->Tok.setKind(tok::string_literal);
192  At->TokenText = StringRef(At->TokenText.begin(),
193  String->TokenText.end() - At->TokenText.begin());
194  At->ColumnWidth += String->ColumnWidth;
195  At->Type = TT_CSharpStringLiteral;
196  Tokens.erase(Tokens.end() - 1);
197  return true;
198 }
199 
200 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
201  if (Tokens.size() < 2)
202  return false;
203  auto &FirstQuestion = *(Tokens.end() - 2);
204  auto &SecondQuestion = *(Tokens.end() - 1);
205  if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
206  return false;
207  FirstQuestion->Tok.setKind(tok::question);
208  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
209  SecondQuestion->TokenText.end() -
210  FirstQuestion->TokenText.begin());
211  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
212  FirstQuestion->Type = TT_CSharpNullCoalescing;
213  Tokens.erase(Tokens.end() - 1);
214  return true;
215 }
216 
217 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
218  if (Tokens.size() < 2)
219  return false;
220  auto &At = *(Tokens.end() - 2);
221  auto &Keyword = *(Tokens.end() - 1);
222  if (!At->is(tok::at))
223  return false;
224  if (!Keywords.isCSharpKeyword(*Keyword))
225  return false;
226 
227  At->Tok.setKind(tok::identifier);
228  At->TokenText = StringRef(At->TokenText.begin(),
229  Keyword->TokenText.end() - At->TokenText.begin());
230  At->ColumnWidth += Keyword->ColumnWidth;
231  At->Type = Keyword->Type;
232  Tokens.erase(Tokens.end() - 1);
233  return true;
234 }
235 
236 // In C# merge the Identifier and the ? together e.g. arg?.
237 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
238  if (Tokens.size() < 2)
239  return false;
240  auto &Identifier = *(Tokens.end() - 2);
241  auto &Question = *(Tokens.end() - 1);
242  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
243  !Question->is(tok::question))
244  return false;
245  Identifier->TokenText =
246  StringRef(Identifier->TokenText.begin(),
247  Question->TokenText.end() - Identifier->TokenText.begin());
248  Identifier->ColumnWidth += Question->ColumnWidth;
249  Identifier->Type = Identifier->Type;
250  Tokens.erase(Tokens.end() - 1);
251  return true;
252 }
253 
254 bool FormatTokenLexer::tryMergeLessLess() {
255  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
256  if (Tokens.size() < 3)
257  return false;
258 
259  bool FourthTokenIsLess = false;
260  if (Tokens.size() > 3)
261  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
262 
263  auto First = Tokens.end() - 3;
264  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
265  First[0]->isNot(tok::less) || FourthTokenIsLess)
266  return false;
267 
268  // Only merge if there currently is no whitespace between the two "<".
269  if (First[1]->WhitespaceRange.getBegin() !=
270  First[1]->WhitespaceRange.getEnd())
271  return false;
272 
273  First[0]->Tok.setKind(tok::lessless);
274  First[0]->TokenText = "<<";
275  First[0]->ColumnWidth += 1;
276  Tokens.erase(Tokens.end() - 2);
277  return true;
278 }
279 
280 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
281  TokenType NewType) {
282  if (Tokens.size() < Kinds.size())
283  return false;
284 
286  Tokens.end() - Kinds.size();
287  if (!First[0]->is(Kinds[0]))
288  return false;
289  unsigned AddLength = 0;
290  for (unsigned i = 1; i < Kinds.size(); ++i) {
291  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
292  First[i]->WhitespaceRange.getEnd())
293  return false;
294  AddLength += First[i]->TokenText.size();
295  }
296  Tokens.resize(Tokens.size() - Kinds.size() + 1);
297  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
298  First[0]->TokenText.size() + AddLength);
299  First[0]->ColumnWidth += AddLength;
300  First[0]->Type = NewType;
301  return true;
302 }
303 
304 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
305 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
306  // NB: This is not entirely correct, as an r_paren can introduce an operand
307  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
308  // corner case to not matter in practice, though.
309  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
310  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
311  tok::colon, tok::question, tok::tilde) ||
312  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
313  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
314  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
315  Tok->isBinaryOperator();
316 }
317 
318 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
319  if (!Prev)
320  return true;
321 
322  // Regex literals can only follow after prefix unary operators, not after
323  // postfix unary operators. If the '++' is followed by a non-operand
324  // introducing token, the slash here is the operand and not the start of a
325  // regex.
326  // `!` is an unary prefix operator, but also a post-fix operator that casts
327  // away nullability, so the same check applies.
328  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
329  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
330 
331  // The previous token must introduce an operand location where regex
332  // literals can occur.
333  if (!precedesOperand(Prev))
334  return false;
335 
336  return true;
337 }
338 
339 // Tries to parse a JavaScript Regex literal starting at the current token,
340 // if that begins with a slash and is in a location where JavaScript allows
341 // regex literals. Changes the current token to a regex literal and updates
342 // its text if successful.
343 void FormatTokenLexer::tryParseJSRegexLiteral() {
344  FormatToken *RegexToken = Tokens.back();
345  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
346  return;
347 
348  FormatToken *Prev = nullptr;
349  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
350  // NB: Because previous pointers are not initialized yet, this cannot use
351  // Token.getPreviousNonComment.
352  if ((*I)->isNot(tok::comment)) {
353  Prev = *I;
354  break;
355  }
356  }
357 
358  if (!canPrecedeRegexLiteral(Prev))
359  return;
360 
361  // 'Manually' lex ahead in the current file buffer.
362  const char *Offset = Lex->getBufferLocation();
363  const char *RegexBegin = Offset - RegexToken->TokenText.size();
364  StringRef Buffer = Lex->getBuffer();
365  bool InCharacterClass = false;
366  bool HaveClosingSlash = false;
367  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
368  // Regular expressions are terminated with a '/', which can only be
369  // escaped using '\' or a character class between '[' and ']'.
370  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
371  switch (*Offset) {
372  case '\\':
373  // Skip the escaped character.
374  ++Offset;
375  break;
376  case '[':
377  InCharacterClass = true;
378  break;
379  case ']':
380  InCharacterClass = false;
381  break;
382  case '/':
383  if (!InCharacterClass)
384  HaveClosingSlash = true;
385  break;
386  }
387  }
388 
389  RegexToken->Type = TT_RegexLiteral;
390  // Treat regex literals like other string_literals.
391  RegexToken->Tok.setKind(tok::string_literal);
392  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
393  RegexToken->ColumnWidth = RegexToken->TokenText.size();
394 
395  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
396 }
397 
398 void FormatTokenLexer::handleTemplateStrings() {
399  FormatToken *BacktickToken = Tokens.back();
400 
401  if (BacktickToken->is(tok::l_brace)) {
402  StateStack.push(LexerState::NORMAL);
403  return;
404  }
405  if (BacktickToken->is(tok::r_brace)) {
406  if (StateStack.size() == 1)
407  return;
408  StateStack.pop();
409  if (StateStack.top() != LexerState::TEMPLATE_STRING)
410  return;
411  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
412  } else if (BacktickToken->is(tok::unknown) &&
413  BacktickToken->TokenText == "`") {
414  StateStack.push(LexerState::TEMPLATE_STRING);
415  } else {
416  return; // Not actually a template
417  }
418 
419  // 'Manually' lex ahead in the current file buffer.
420  const char *Offset = Lex->getBufferLocation();
421  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
422  for (; Offset != Lex->getBuffer().end(); ++Offset) {
423  if (Offset[0] == '`') {
424  StateStack.pop();
425  break;
426  }
427  if (Offset[0] == '\\') {
428  ++Offset; // Skip the escaped character.
429  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
430  Offset[1] == '{') {
431  // '${' introduces an expression interpolation in the template string.
432  StateStack.push(LexerState::NORMAL);
433  ++Offset;
434  break;
435  }
436  }
437 
438  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
439  BacktickToken->Type = TT_TemplateString;
440  BacktickToken->Tok.setKind(tok::string_literal);
441  BacktickToken->TokenText = LiteralText;
442 
443  // Adjust width for potentially multiline string literals.
444  size_t FirstBreak = LiteralText.find('\n');
445  StringRef FirstLineText = FirstBreak == StringRef::npos
446  ? LiteralText
447  : LiteralText.substr(0, FirstBreak);
449  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
450  size_t LastBreak = LiteralText.rfind('\n');
451  if (LastBreak != StringRef::npos) {
452  BacktickToken->IsMultiline = true;
453  unsigned StartColumn = 0; // The template tail spans the entire line.
455  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
456  Style.TabWidth, Encoding);
457  }
458 
459  SourceLocation loc = Offset < Lex->getBuffer().end()
460  ? Lex->getSourceLocation(Offset + 1)
461  : SourceMgr.getLocForEndOfFile(ID);
462  resetLexer(SourceMgr.getFileOffset(loc));
463 }
464 
465 void FormatTokenLexer::tryParsePythonComment() {
466  FormatToken *HashToken = Tokens.back();
467  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
468  return;
469  // Turn the remainder of this line into a comment.
470  const char *CommentBegin =
471  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
472  size_t From = CommentBegin - Lex->getBuffer().begin();
473  size_t To = Lex->getBuffer().find_first_of('\n', From);
474  if (To == StringRef::npos)
475  To = Lex->getBuffer().size();
476  size_t Len = To - From;
477  HashToken->Type = TT_LineComment;
478  HashToken->Tok.setKind(tok::comment);
479  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
480  SourceLocation Loc = To < Lex->getBuffer().size()
481  ? Lex->getSourceLocation(CommentBegin + Len)
482  : SourceMgr.getLocForEndOfFile(ID);
483  resetLexer(SourceMgr.getFileOffset(Loc));
484 }
485 
486 bool FormatTokenLexer::tryMerge_TMacro() {
487  if (Tokens.size() < 4)
488  return false;
489  FormatToken *Last = Tokens.back();
490  if (!Last->is(tok::r_paren))
491  return false;
492 
493  FormatToken *String = Tokens[Tokens.size() - 2];
494  if (!String->is(tok::string_literal) || String->IsMultiline)
495  return false;
496 
497  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
498  return false;
499 
500  FormatToken *Macro = Tokens[Tokens.size() - 4];
501  if (Macro->TokenText != "_T")
502  return false;
503 
504  const char *Start = Macro->TokenText.data();
505  const char *End = Last->TokenText.data() + Last->TokenText.size();
506  String->TokenText = StringRef(Start, End - Start);
507  String->IsFirst = Macro->IsFirst;
508  String->LastNewlineOffset = Macro->LastNewlineOffset;
509  String->WhitespaceRange = Macro->WhitespaceRange;
510  String->OriginalColumn = Macro->OriginalColumn;
512  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
513  String->NewlinesBefore = Macro->NewlinesBefore;
514  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
515 
516  Tokens.pop_back();
517  Tokens.pop_back();
518  Tokens.pop_back();
519  Tokens.back() = String;
520  return true;
521 }
522 
523 bool FormatTokenLexer::tryMergeConflictMarkers() {
524  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
525  return false;
526 
527  // Conflict lines look like:
528  // <marker> <text from the vcs>
529  // For example:
530  // >>>>>>> /file/in/file/system at revision 1234
531  //
532  // We merge all tokens in a line that starts with a conflict marker
533  // into a single token with a special token type that the unwrapped line
534  // parser will use to correctly rebuild the underlying code.
535 
536  FileID ID;
537  // Get the position of the first token in the line.
538  unsigned FirstInLineOffset;
539  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
540  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
541  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
542  // Calculate the offset of the start of the current line.
543  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
544  if (LineOffset == StringRef::npos) {
545  LineOffset = 0;
546  } else {
547  ++LineOffset;
548  }
549 
550  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
551  StringRef LineStart;
552  if (FirstSpace == StringRef::npos) {
553  LineStart = Buffer.substr(LineOffset);
554  } else {
555  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
556  }
557 
558  TokenType Type = TT_Unknown;
559  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
560  Type = TT_ConflictStart;
561  } else if (LineStart == "|||||||" || LineStart == "=======" ||
562  LineStart == "====") {
563  Type = TT_ConflictAlternative;
564  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
565  Type = TT_ConflictEnd;
566  }
567 
568  if (Type != TT_Unknown) {
569  FormatToken *Next = Tokens.back();
570 
571  Tokens.resize(FirstInLineIndex + 1);
572  // We do not need to build a complete token here, as we will skip it
573  // during parsing anyway (as we must not touch whitespace around conflict
574  // markers).
575  Tokens.back()->Type = Type;
576  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
577 
578  Tokens.push_back(Next);
579  return true;
580  }
581 
582  return false;
583 }
584 
585 FormatToken *FormatTokenLexer::getStashedToken() {
586  // Create a synthesized second '>' or '<' token.
587  Token Tok = FormatTok->Tok;
588  StringRef TokenText = FormatTok->TokenText;
589 
590  unsigned OriginalColumn = FormatTok->OriginalColumn;
591  FormatTok = new (Allocator.Allocate()) FormatToken;
592  FormatTok->Tok = Tok;
593  SourceLocation TokLocation =
594  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
595  FormatTok->Tok.setLocation(TokLocation);
596  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
597  FormatTok->TokenText = TokenText;
598  FormatTok->ColumnWidth = 1;
599  FormatTok->OriginalColumn = OriginalColumn + 1;
600 
601  return FormatTok;
602 }
603 
604 FormatToken *FormatTokenLexer::getNextToken() {
605  if (StateStack.top() == LexerState::TOKEN_STASHED) {
606  StateStack.pop();
607  return getStashedToken();
608  }
609 
610  FormatTok = new (Allocator.Allocate()) FormatToken;
611  readRawToken(*FormatTok);
612  SourceLocation WhitespaceStart =
613  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
614  FormatTok->IsFirst = IsFirstToken;
615  IsFirstToken = false;
616 
617  // Consume and record whitespace until we find a significant token.
618  unsigned WhitespaceLength = TrailingWhitespace;
619  while (FormatTok->Tok.is(tok::unknown)) {
620  StringRef Text = FormatTok->TokenText;
621  auto EscapesNewline = [&](int pos) {
622  // A '\r' here is just part of '\r\n'. Skip it.
623  if (pos >= 0 && Text[pos] == '\r')
624  --pos;
625  // See whether there is an odd number of '\' before this.
626  // FIXME: This is wrong. A '\' followed by a newline is always removed,
627  // regardless of whether there is another '\' before it.
628  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
629  unsigned count = 0;
630  for (; pos >= 0; --pos, ++count)
631  if (Text[pos] != '\\')
632  break;
633  return count & 1;
634  };
635  // FIXME: This miscounts tok:unknown tokens that are not just
636  // whitespace, e.g. a '`' character.
637  for (int i = 0, e = Text.size(); i != e; ++i) {
638  switch (Text[i]) {
639  case '\n':
640  ++FormatTok->NewlinesBefore;
641  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
642  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
643  Column = 0;
644  break;
645  case '\r':
646  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
647  Column = 0;
648  break;
649  case '\f':
650  case '\v':
651  Column = 0;
652  break;
653  case ' ':
654  ++Column;
655  break;
656  case '\t':
657  Column += Style.TabWidth - Column % Style.TabWidth;
658  break;
659  case '\\':
660  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
661  FormatTok->Type = TT_ImplicitStringLiteral;
662  break;
663  default:
664  FormatTok->Type = TT_ImplicitStringLiteral;
665  break;
666  }
667  if (FormatTok->Type == TT_ImplicitStringLiteral)
668  break;
669  }
670 
671  if (FormatTok->is(TT_ImplicitStringLiteral))
672  break;
673  WhitespaceLength += FormatTok->Tok.getLength();
674 
675  readRawToken(*FormatTok);
676  }
677 
678  // JavaScript and Java do not allow to escape the end of the line with a
679  // backslash. Backslashes are syntax errors in plain source, but can occur in
680  // comments. When a single line comment ends with a \, it'll cause the next
681  // line of code to be lexed as a comment, breaking formatting. The code below
682  // finds comments that contain a backslash followed by a line break, truncates
683  // the comment token at the backslash, and resets the lexer to restart behind
684  // the backslash.
685  if ((Style.Language == FormatStyle::LK_JavaScript ||
686  Style.Language == FormatStyle::LK_Java) &&
687  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
688  size_t BackslashPos = FormatTok->TokenText.find('\\');
689  while (BackslashPos != StringRef::npos) {
690  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
691  FormatTok->TokenText[BackslashPos + 1] == '\n') {
692  const char *Offset = Lex->getBufferLocation();
693  Offset -= FormatTok->TokenText.size();
694  Offset += BackslashPos + 1;
695  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
696  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
698  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
699  Encoding);
700  break;
701  }
702  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
703  }
704  }
705 
706  // In case the token starts with escaped newlines, we want to
707  // take them into account as whitespace - this pattern is quite frequent
708  // in macro definitions.
709  // FIXME: Add a more explicit test.
710  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
711  unsigned SkippedWhitespace = 0;
712  if (FormatTok->TokenText.size() > 2 &&
713  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
714  SkippedWhitespace = 3;
715  else if (FormatTok->TokenText[1] == '\n')
716  SkippedWhitespace = 2;
717  else
718  break;
719 
720  ++FormatTok->NewlinesBefore;
721  WhitespaceLength += SkippedWhitespace;
722  FormatTok->LastNewlineOffset = SkippedWhitespace;
723  Column = 0;
724  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
725  }
726 
727  FormatTok->WhitespaceRange = SourceRange(
728  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
729 
730  FormatTok->OriginalColumn = Column;
731 
732  TrailingWhitespace = 0;
733  if (FormatTok->Tok.is(tok::comment)) {
734  // FIXME: Add the trimmed whitespace to Column.
735  StringRef UntrimmedText = FormatTok->TokenText;
736  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
737  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
738  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
739  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
740  FormatTok->Tok.setIdentifierInfo(&Info);
741  FormatTok->Tok.setKind(Info.getTokenID());
742  if (Style.Language == FormatStyle::LK_Java &&
743  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
744  tok::kw_operator)) {
745  FormatTok->Tok.setKind(tok::identifier);
746  FormatTok->Tok.setIdentifierInfo(nullptr);
747  } else if (Style.Language == FormatStyle::LK_JavaScript &&
748  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
749  tok::kw_operator)) {
750  FormatTok->Tok.setKind(tok::identifier);
751  FormatTok->Tok.setIdentifierInfo(nullptr);
752  }
753  } else if (FormatTok->Tok.is(tok::greatergreater)) {
754  FormatTok->Tok.setKind(tok::greater);
755  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
756  ++Column;
757  StateStack.push(LexerState::TOKEN_STASHED);
758  } else if (FormatTok->Tok.is(tok::lessless)) {
759  FormatTok->Tok.setKind(tok::less);
760  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
761  ++Column;
762  StateStack.push(LexerState::TOKEN_STASHED);
763  }
764 
765  // Now FormatTok is the next non-whitespace token.
766 
767  StringRef Text = FormatTok->TokenText;
768  size_t FirstNewlinePos = Text.find('\n');
769  if (FirstNewlinePos == StringRef::npos) {
770  // FIXME: ColumnWidth actually depends on the start column, we need to
771  // take this into account when the token is moved.
772  FormatTok->ColumnWidth =
773  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
774  Column += FormatTok->ColumnWidth;
775  } else {
776  FormatTok->IsMultiline = true;
777  // FIXME: ColumnWidth actually depends on the start column, we need to
778  // take this into account when the token is moved.
780  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
781 
782  // The last line of the token always starts in column 0.
783  // Thus, the length can be precomputed even in the presence of tabs.
785  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
786  Column = FormatTok->LastLineColumnWidth;
787  }
788 
789  if (Style.isCpp()) {
790  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
791  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
792  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
793  tok::pp_define) &&
794  it != Macros.end()) {
795  FormatTok->Type = it->second;
796  } else if (FormatTok->is(tok::identifier)) {
797  if (MacroBlockBeginRegex.match(Text)) {
798  FormatTok->Type = TT_MacroBlockBegin;
799  } else if (MacroBlockEndRegex.match(Text)) {
800  FormatTok->Type = TT_MacroBlockEnd;
801  }
802  }
803  }
804 
805  return FormatTok;
806 }
807 
808 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
809  Lex->LexFromRawLexer(Tok.Tok);
810  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
811  Tok.Tok.getLength());
812  // For formatting, treat unterminated string literals like normal string
813  // literals.
814  if (Tok.is(tok::unknown)) {
815  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
816  Tok.Tok.setKind(tok::string_literal);
817  Tok.IsUnterminatedLiteral = true;
818  } else if (Style.Language == FormatStyle::LK_JavaScript &&
819  Tok.TokenText == "''") {
820  Tok.Tok.setKind(tok::string_literal);
821  }
822  }
823 
824  if ((Style.Language == FormatStyle::LK_JavaScript ||
825  Style.Language == FormatStyle::LK_Proto ||
826  Style.Language == FormatStyle::LK_TextProto) &&
827  Tok.is(tok::char_constant)) {
828  Tok.Tok.setKind(tok::string_literal);
829  }
830 
831  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
832  Tok.TokenText == "/* clang-format on */")) {
833  FormattingDisabled = false;
834  }
835 
836  Tok.Finalized = FormattingDisabled;
837 
838  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
839  Tok.TokenText == "/* clang-format off */")) {
840  FormattingDisabled = true;
841  }
842 }
843 
844 void FormatTokenLexer::resetLexer(unsigned Offset) {
845  StringRef Buffer = SourceMgr.getBufferData(ID);
846  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
847  getFormattingLangOpts(Style), Buffer.begin(),
848  Buffer.begin() + Offset, Buffer.end()));
849  Lex->SetKeepWhitespaceMode(true);
850  TrailingWhitespace = 0;
851 }
852 
853 } // namespace format
854 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
StringRef Identifier
Definition: Format.cpp:1681
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:76
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:130
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:94
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:217
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1414
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:159
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:162
bool isBinaryOperator() const
Definition: FormatToken.h:415
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:136
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
void setKind(tok::TokenKind K)
Definition: Token.h:90
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
const FormatToken & Tok
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is anything else.
Definition: FormatToken.h:884
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:318
unsigned Offset
Definition: Format.cpp:1676
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2317
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:123
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:126
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:147
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:309
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:176
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:185
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:143
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:177
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:174
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:49
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:152
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:307
unsigned getLength() const
Definition: Token.h:126
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1675
void setLocation(SourceLocation L)
Definition: Token.h:131
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:140
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:156
const encoding::Encoding Encoding
const FormatStyle & Style