clang  10.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
26  unsigned Column, const FormatStyle &Style,
28  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29  Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30  Style(Style), IdentTable(getFormattingLangOpts(Style)),
31  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33  MacroBlockEndRegex(Style.MacroBlockEnd) {
34  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35  getFormattingLangOpts(Style)));
36  Lex->SetKeepWhitespaceMode(true);
37 
38  for (const std::string &ForEachMacro : Style.ForEachMacros)
39  Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40  for (const std::string &StatementMacro : Style.StatementMacros)
41  Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42  for (const std::string &TypenameMacro : Style.TypenameMacros)
43  Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45  Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46 }
47 
49  assert(Tokens.empty());
50  assert(FirstInLineIndex == 0);
51  do {
52  Tokens.push_back(getNextToken());
53  if (Style.Language == FormatStyle::LK_JavaScript) {
54  tryParseJSRegexLiteral();
55  handleTemplateStrings();
56  }
57  if (Style.Language == FormatStyle::LK_TextProto)
58  tryParsePythonComment();
59  tryMergePreviousTokens();
60  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
61  FirstInLineIndex = Tokens.size() - 1;
62  } while (Tokens.back()->Tok.isNot(tok::eof));
63  return Tokens;
64 }
65 
66 void FormatTokenLexer::tryMergePreviousTokens() {
67  if (tryMerge_TMacro())
68  return;
69  if (tryMergeConflictMarkers())
70  return;
71  if (tryMergeLessLess())
72  return;
73 
74  if (Style.isCSharp()) {
75  if (tryMergeCSharpKeywordVariables())
76  return;
77  if (tryMergeCSharpVerbatimStringLiteral())
78  return;
79  if (tryMergeCSharpDoubleQuestion())
80  return;
81  if (tryMergeCSharpNullConditionals())
82  return;
83  if (tryTransformCSharpForEach())
84  return;
85  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
86  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
87  return;
88  }
89 
90  if (tryMergeNSStringLiteral())
91  return;
92 
93  if (Style.Language == FormatStyle::LK_JavaScript) {
94  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
95  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
96  tok::equal};
97  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
98  tok::greaterequal};
99  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
100  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
101  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
102  tok::starequal};
103 
104  // FIXME: Investigate what token type gives the correct operator priority.
105  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
106  return;
107  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
108  return;
109  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
110  return;
111  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
112  return;
113  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
114  return;
115  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
116  Tokens.back()->Tok.setKind(tok::starequal);
117  return;
118  }
119  if (tryMergeJSPrivateIdentifier())
120  return;
121  }
122 
123  if (Style.Language == FormatStyle::LK_Java) {
124  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
125  tok::greater, tok::greater, tok::greaterequal};
126  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
127  return;
128  }
129 }
130 
131 bool FormatTokenLexer::tryMergeNSStringLiteral() {
132  if (Tokens.size() < 2)
133  return false;
134  auto &At = *(Tokens.end() - 2);
135  auto &String = *(Tokens.end() - 1);
136  if (!At->is(tok::at) || !String->is(tok::string_literal))
137  return false;
138  At->Tok.setKind(tok::string_literal);
139  At->TokenText = StringRef(At->TokenText.begin(),
140  String->TokenText.end() - At->TokenText.begin());
141  At->ColumnWidth += String->ColumnWidth;
142  At->Type = TT_ObjCStringLiteral;
143  Tokens.erase(Tokens.end() - 1);
144  return true;
145 }
146 
147 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
148  // Merges #idenfier into a single identifier with the text #identifier
149  // but the token tok::identifier.
150  if (Tokens.size() < 2)
151  return false;
152  auto &Hash = *(Tokens.end() - 2);
153  auto &Identifier = *(Tokens.end() - 1);
154  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
155  return false;
156  Hash->Tok.setKind(tok::identifier);
157  Hash->TokenText =
158  StringRef(Hash->TokenText.begin(),
159  Identifier->TokenText.end() - Hash->TokenText.begin());
160  Hash->ColumnWidth += Identifier->ColumnWidth;
161  Hash->Type = TT_JsPrivateIdentifier;
162  Tokens.erase(Tokens.end() - 1);
163  return true;
164 }
165 
166 // Search for verbatim or interpolated string literals @"ABC" or
167 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
168 // prevent splitting of @, $ and ".
169 bool FormatTokenLexer::tryMergeCSharpVerbatimStringLiteral() {
170  if (Tokens.size() < 2)
171  return false;
172  auto &At = *(Tokens.end() - 2);
173  auto &String = *(Tokens.end() - 1);
174 
175  // Look for $"aaaaaa" @"aaaaaa".
176  if (!(At->is(tok::at) || At->TokenText == "$") ||
177  !String->is(tok::string_literal))
178  return false;
179 
180  if (Tokens.size() >= 2 && At->is(tok::at)) {
181  auto &Dollar = *(Tokens.end() - 3);
182  if (Dollar->TokenText == "$") {
183  // This looks like $@"aaaaa" so we need to combine all 3 tokens.
184  Dollar->Tok.setKind(tok::string_literal);
185  Dollar->TokenText =
186  StringRef(Dollar->TokenText.begin(),
187  String->TokenText.end() - Dollar->TokenText.begin());
188  Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
189  Dollar->Type = TT_CSharpStringLiteral;
190  Tokens.erase(Tokens.end() - 2);
191  Tokens.erase(Tokens.end() - 1);
192  return true;
193  }
194  }
195 
196  // Convert back into just a string_literal.
197  At->Tok.setKind(tok::string_literal);
198  At->TokenText = StringRef(At->TokenText.begin(),
199  String->TokenText.end() - At->TokenText.begin());
200  At->ColumnWidth += String->ColumnWidth;
201  At->Type = TT_CSharpStringLiteral;
202  Tokens.erase(Tokens.end() - 1);
203  return true;
204 }
205 
206 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
207  if (Tokens.size() < 2)
208  return false;
209  auto &FirstQuestion = *(Tokens.end() - 2);
210  auto &SecondQuestion = *(Tokens.end() - 1);
211  if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question))
212  return false;
213  FirstQuestion->Tok.setKind(tok::question);
214  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
215  SecondQuestion->TokenText.end() -
216  FirstQuestion->TokenText.begin());
217  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
218  FirstQuestion->Type = TT_CSharpNullCoalescing;
219  Tokens.erase(Tokens.end() - 1);
220  return true;
221 }
222 
223 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
224  if (Tokens.size() < 2)
225  return false;
226  auto &At = *(Tokens.end() - 2);
227  auto &Keyword = *(Tokens.end() - 1);
228  if (!At->is(tok::at))
229  return false;
230  if (!Keywords.isCSharpKeyword(*Keyword))
231  return false;
232 
233  At->Tok.setKind(tok::identifier);
234  At->TokenText = StringRef(At->TokenText.begin(),
235  Keyword->TokenText.end() - At->TokenText.begin());
236  At->ColumnWidth += Keyword->ColumnWidth;
237  At->Type = Keyword->Type;
238  Tokens.erase(Tokens.end() - 1);
239  return true;
240 }
241 
242 // In C# merge the Identifier and the ? together e.g. arg?.
243 bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
244  if (Tokens.size() < 2)
245  return false;
246  auto &Identifier = *(Tokens.end() - 2);
247  auto &Question = *(Tokens.end() - 1);
248  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
249  !Question->is(tok::question))
250  return false;
251  Identifier->TokenText =
252  StringRef(Identifier->TokenText.begin(),
253  Question->TokenText.end() - Identifier->TokenText.begin());
254  Identifier->ColumnWidth += Question->ColumnWidth;
255  Tokens.erase(Tokens.end() - 1);
256  return true;
257 }
258 
259 // In C# transform identifier foreach into kw_foreach
260 bool FormatTokenLexer::tryTransformCSharpForEach() {
261  if (Tokens.size() < 1)
262  return false;
263  auto &Identifier = *(Tokens.end() - 1);
264  if (!Identifier->is(tok::identifier))
265  return false;
266  if (Identifier->TokenText != "foreach")
267  return false;
268 
269  Identifier->Type = TT_ForEachMacro;
270  Identifier->Tok.setKind(tok::kw_for);
271  return true;
272 }
273 
274 bool FormatTokenLexer::tryMergeLessLess() {
275  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
276  if (Tokens.size() < 3)
277  return false;
278 
279  bool FourthTokenIsLess = false;
280  if (Tokens.size() > 3)
281  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
282 
283  auto First = Tokens.end() - 3;
284  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
285  First[0]->isNot(tok::less) || FourthTokenIsLess)
286  return false;
287 
288  // Only merge if there currently is no whitespace between the two "<".
289  if (First[1]->WhitespaceRange.getBegin() !=
290  First[1]->WhitespaceRange.getEnd())
291  return false;
292 
293  First[0]->Tok.setKind(tok::lessless);
294  First[0]->TokenText = "<<";
295  First[0]->ColumnWidth += 1;
296  Tokens.erase(Tokens.end() - 2);
297  return true;
298 }
299 
300 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
301  TokenType NewType) {
302  if (Tokens.size() < Kinds.size())
303  return false;
304 
306  Tokens.end() - Kinds.size();
307  if (!First[0]->is(Kinds[0]))
308  return false;
309  unsigned AddLength = 0;
310  for (unsigned i = 1; i < Kinds.size(); ++i) {
311  if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
312  First[i]->WhitespaceRange.getEnd())
313  return false;
314  AddLength += First[i]->TokenText.size();
315  }
316  Tokens.resize(Tokens.size() - Kinds.size() + 1);
317  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
318  First[0]->TokenText.size() + AddLength);
319  First[0]->ColumnWidth += AddLength;
320  First[0]->Type = NewType;
321  return true;
322 }
323 
324 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
325 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
326  // NB: This is not entirely correct, as an r_paren can introduce an operand
327  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
328  // corner case to not matter in practice, though.
329  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
330  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
331  tok::colon, tok::question, tok::tilde) ||
332  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
333  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
334  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
335  Tok->isBinaryOperator();
336 }
337 
338 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
339  if (!Prev)
340  return true;
341 
342  // Regex literals can only follow after prefix unary operators, not after
343  // postfix unary operators. If the '++' is followed by a non-operand
344  // introducing token, the slash here is the operand and not the start of a
345  // regex.
346  // `!` is an unary prefix operator, but also a post-fix operator that casts
347  // away nullability, so the same check applies.
348  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
349  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
350 
351  // The previous token must introduce an operand location where regex
352  // literals can occur.
353  if (!precedesOperand(Prev))
354  return false;
355 
356  return true;
357 }
358 
359 // Tries to parse a JavaScript Regex literal starting at the current token,
360 // if that begins with a slash and is in a location where JavaScript allows
361 // regex literals. Changes the current token to a regex literal and updates
362 // its text if successful.
363 void FormatTokenLexer::tryParseJSRegexLiteral() {
364  FormatToken *RegexToken = Tokens.back();
365  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
366  return;
367 
368  FormatToken *Prev = nullptr;
369  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
370  // NB: Because previous pointers are not initialized yet, this cannot use
371  // Token.getPreviousNonComment.
372  if ((*I)->isNot(tok::comment)) {
373  Prev = *I;
374  break;
375  }
376  }
377 
378  if (!canPrecedeRegexLiteral(Prev))
379  return;
380 
381  // 'Manually' lex ahead in the current file buffer.
382  const char *Offset = Lex->getBufferLocation();
383  const char *RegexBegin = Offset - RegexToken->TokenText.size();
384  StringRef Buffer = Lex->getBuffer();
385  bool InCharacterClass = false;
386  bool HaveClosingSlash = false;
387  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
388  // Regular expressions are terminated with a '/', which can only be
389  // escaped using '\' or a character class between '[' and ']'.
390  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
391  switch (*Offset) {
392  case '\\':
393  // Skip the escaped character.
394  ++Offset;
395  break;
396  case '[':
397  InCharacterClass = true;
398  break;
399  case ']':
400  InCharacterClass = false;
401  break;
402  case '/':
403  if (!InCharacterClass)
404  HaveClosingSlash = true;
405  break;
406  }
407  }
408 
409  RegexToken->Type = TT_RegexLiteral;
410  // Treat regex literals like other string_literals.
411  RegexToken->Tok.setKind(tok::string_literal);
412  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
413  RegexToken->ColumnWidth = RegexToken->TokenText.size();
414 
415  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
416 }
417 
418 void FormatTokenLexer::handleTemplateStrings() {
419  FormatToken *BacktickToken = Tokens.back();
420 
421  if (BacktickToken->is(tok::l_brace)) {
422  StateStack.push(LexerState::NORMAL);
423  return;
424  }
425  if (BacktickToken->is(tok::r_brace)) {
426  if (StateStack.size() == 1)
427  return;
428  StateStack.pop();
429  if (StateStack.top() != LexerState::TEMPLATE_STRING)
430  return;
431  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
432  } else if (BacktickToken->is(tok::unknown) &&
433  BacktickToken->TokenText == "`") {
434  StateStack.push(LexerState::TEMPLATE_STRING);
435  } else {
436  return; // Not actually a template
437  }
438 
439  // 'Manually' lex ahead in the current file buffer.
440  const char *Offset = Lex->getBufferLocation();
441  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
442  for (; Offset != Lex->getBuffer().end(); ++Offset) {
443  if (Offset[0] == '`') {
444  StateStack.pop();
445  break;
446  }
447  if (Offset[0] == '\\') {
448  ++Offset; // Skip the escaped character.
449  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
450  Offset[1] == '{') {
451  // '${' introduces an expression interpolation in the template string.
452  StateStack.push(LexerState::NORMAL);
453  ++Offset;
454  break;
455  }
456  }
457 
458  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
459  BacktickToken->Type = TT_TemplateString;
460  BacktickToken->Tok.setKind(tok::string_literal);
461  BacktickToken->TokenText = LiteralText;
462 
463  // Adjust width for potentially multiline string literals.
464  size_t FirstBreak = LiteralText.find('\n');
465  StringRef FirstLineText = FirstBreak == StringRef::npos
466  ? LiteralText
467  : LiteralText.substr(0, FirstBreak);
469  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
470  size_t LastBreak = LiteralText.rfind('\n');
471  if (LastBreak != StringRef::npos) {
472  BacktickToken->IsMultiline = true;
473  unsigned StartColumn = 0; // The template tail spans the entire line.
475  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
476  Style.TabWidth, Encoding);
477  }
478 
479  SourceLocation loc = Offset < Lex->getBuffer().end()
480  ? Lex->getSourceLocation(Offset + 1)
481  : SourceMgr.getLocForEndOfFile(ID);
482  resetLexer(SourceMgr.getFileOffset(loc));
483 }
484 
485 void FormatTokenLexer::tryParsePythonComment() {
486  FormatToken *HashToken = Tokens.back();
487  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
488  return;
489  // Turn the remainder of this line into a comment.
490  const char *CommentBegin =
491  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
492  size_t From = CommentBegin - Lex->getBuffer().begin();
493  size_t To = Lex->getBuffer().find_first_of('\n', From);
494  if (To == StringRef::npos)
495  To = Lex->getBuffer().size();
496  size_t Len = To - From;
497  HashToken->Type = TT_LineComment;
498  HashToken->Tok.setKind(tok::comment);
499  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
500  SourceLocation Loc = To < Lex->getBuffer().size()
501  ? Lex->getSourceLocation(CommentBegin + Len)
502  : SourceMgr.getLocForEndOfFile(ID);
503  resetLexer(SourceMgr.getFileOffset(Loc));
504 }
505 
506 bool FormatTokenLexer::tryMerge_TMacro() {
507  if (Tokens.size() < 4)
508  return false;
509  FormatToken *Last = Tokens.back();
510  if (!Last->is(tok::r_paren))
511  return false;
512 
513  FormatToken *String = Tokens[Tokens.size() - 2];
514  if (!String->is(tok::string_literal) || String->IsMultiline)
515  return false;
516 
517  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
518  return false;
519 
520  FormatToken *Macro = Tokens[Tokens.size() - 4];
521  if (Macro->TokenText != "_T")
522  return false;
523 
524  const char *Start = Macro->TokenText.data();
525  const char *End = Last->TokenText.data() + Last->TokenText.size();
526  String->TokenText = StringRef(Start, End - Start);
527  String->IsFirst = Macro->IsFirst;
528  String->LastNewlineOffset = Macro->LastNewlineOffset;
529  String->WhitespaceRange = Macro->WhitespaceRange;
530  String->OriginalColumn = Macro->OriginalColumn;
532  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
533  String->NewlinesBefore = Macro->NewlinesBefore;
534  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
535 
536  Tokens.pop_back();
537  Tokens.pop_back();
538  Tokens.pop_back();
539  Tokens.back() = String;
540  return true;
541 }
542 
543 bool FormatTokenLexer::tryMergeConflictMarkers() {
544  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
545  return false;
546 
547  // Conflict lines look like:
548  // <marker> <text from the vcs>
549  // For example:
550  // >>>>>>> /file/in/file/system at revision 1234
551  //
552  // We merge all tokens in a line that starts with a conflict marker
553  // into a single token with a special token type that the unwrapped line
554  // parser will use to correctly rebuild the underlying code.
555 
556  FileID ID;
557  // Get the position of the first token in the line.
558  unsigned FirstInLineOffset;
559  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
560  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
561  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
562  // Calculate the offset of the start of the current line.
563  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
564  if (LineOffset == StringRef::npos) {
565  LineOffset = 0;
566  } else {
567  ++LineOffset;
568  }
569 
570  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
571  StringRef LineStart;
572  if (FirstSpace == StringRef::npos) {
573  LineStart = Buffer.substr(LineOffset);
574  } else {
575  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
576  }
577 
578  TokenType Type = TT_Unknown;
579  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
580  Type = TT_ConflictStart;
581  } else if (LineStart == "|||||||" || LineStart == "=======" ||
582  LineStart == "====") {
583  Type = TT_ConflictAlternative;
584  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
585  Type = TT_ConflictEnd;
586  }
587 
588  if (Type != TT_Unknown) {
589  FormatToken *Next = Tokens.back();
590 
591  Tokens.resize(FirstInLineIndex + 1);
592  // We do not need to build a complete token here, as we will skip it
593  // during parsing anyway (as we must not touch whitespace around conflict
594  // markers).
595  Tokens.back()->Type = Type;
596  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
597 
598  Tokens.push_back(Next);
599  return true;
600  }
601 
602  return false;
603 }
604 
605 FormatToken *FormatTokenLexer::getStashedToken() {
606  // Create a synthesized second '>' or '<' token.
607  Token Tok = FormatTok->Tok;
608  StringRef TokenText = FormatTok->TokenText;
609 
610  unsigned OriginalColumn = FormatTok->OriginalColumn;
611  FormatTok = new (Allocator.Allocate()) FormatToken;
612  FormatTok->Tok = Tok;
613  SourceLocation TokLocation =
614  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
615  FormatTok->Tok.setLocation(TokLocation);
616  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
617  FormatTok->TokenText = TokenText;
618  FormatTok->ColumnWidth = 1;
619  FormatTok->OriginalColumn = OriginalColumn + 1;
620 
621  return FormatTok;
622 }
623 
624 FormatToken *FormatTokenLexer::getNextToken() {
625  if (StateStack.top() == LexerState::TOKEN_STASHED) {
626  StateStack.pop();
627  return getStashedToken();
628  }
629 
630  FormatTok = new (Allocator.Allocate()) FormatToken;
631  readRawToken(*FormatTok);
632  SourceLocation WhitespaceStart =
633  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
634  FormatTok->IsFirst = IsFirstToken;
635  IsFirstToken = false;
636 
637  // Consume and record whitespace until we find a significant token.
638  unsigned WhitespaceLength = TrailingWhitespace;
639  while (FormatTok->Tok.is(tok::unknown)) {
640  StringRef Text = FormatTok->TokenText;
641  auto EscapesNewline = [&](int pos) {
642  // A '\r' here is just part of '\r\n'. Skip it.
643  if (pos >= 0 && Text[pos] == '\r')
644  --pos;
645  // See whether there is an odd number of '\' before this.
646  // FIXME: This is wrong. A '\' followed by a newline is always removed,
647  // regardless of whether there is another '\' before it.
648  // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
649  unsigned count = 0;
650  for (; pos >= 0; --pos, ++count)
651  if (Text[pos] != '\\')
652  break;
653  return count & 1;
654  };
655  // FIXME: This miscounts tok:unknown tokens that are not just
656  // whitespace, e.g. a '`' character.
657  for (int i = 0, e = Text.size(); i != e; ++i) {
658  switch (Text[i]) {
659  case '\n':
660  ++FormatTok->NewlinesBefore;
661  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
662  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
663  Column = 0;
664  break;
665  case '\r':
666  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
667  Column = 0;
668  break;
669  case '\f':
670  case '\v':
671  Column = 0;
672  break;
673  case ' ':
674  ++Column;
675  break;
676  case '\t':
677  Column +=
678  Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
679  break;
680  case '\\':
681  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
682  FormatTok->Type = TT_ImplicitStringLiteral;
683  break;
684  default:
685  FormatTok->Type = TT_ImplicitStringLiteral;
686  break;
687  }
688  if (FormatTok->Type == TT_ImplicitStringLiteral)
689  break;
690  }
691 
692  if (FormatTok->is(TT_ImplicitStringLiteral))
693  break;
694  WhitespaceLength += FormatTok->Tok.getLength();
695 
696  readRawToken(*FormatTok);
697  }
698 
699  // JavaScript and Java do not allow to escape the end of the line with a
700  // backslash. Backslashes are syntax errors in plain source, but can occur in
701  // comments. When a single line comment ends with a \, it'll cause the next
702  // line of code to be lexed as a comment, breaking formatting. The code below
703  // finds comments that contain a backslash followed by a line break, truncates
704  // the comment token at the backslash, and resets the lexer to restart behind
705  // the backslash.
706  if ((Style.Language == FormatStyle::LK_JavaScript ||
707  Style.Language == FormatStyle::LK_Java) &&
708  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
709  size_t BackslashPos = FormatTok->TokenText.find('\\');
710  while (BackslashPos != StringRef::npos) {
711  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
712  FormatTok->TokenText[BackslashPos + 1] == '\n') {
713  const char *Offset = Lex->getBufferLocation();
714  Offset -= FormatTok->TokenText.size();
715  Offset += BackslashPos + 1;
716  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
717  FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
719  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
720  Encoding);
721  break;
722  }
723  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
724  }
725  }
726 
727  // In case the token starts with escaped newlines, we want to
728  // take them into account as whitespace - this pattern is quite frequent
729  // in macro definitions.
730  // FIXME: Add a more explicit test.
731  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
732  unsigned SkippedWhitespace = 0;
733  if (FormatTok->TokenText.size() > 2 &&
734  (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
735  SkippedWhitespace = 3;
736  else if (FormatTok->TokenText[1] == '\n')
737  SkippedWhitespace = 2;
738  else
739  break;
740 
741  ++FormatTok->NewlinesBefore;
742  WhitespaceLength += SkippedWhitespace;
743  FormatTok->LastNewlineOffset = SkippedWhitespace;
744  Column = 0;
745  FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
746  }
747 
748  FormatTok->WhitespaceRange = SourceRange(
749  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
750 
751  FormatTok->OriginalColumn = Column;
752 
753  TrailingWhitespace = 0;
754  if (FormatTok->Tok.is(tok::comment)) {
755  // FIXME: Add the trimmed whitespace to Column.
756  StringRef UntrimmedText = FormatTok->TokenText;
757  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
758  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
759  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
760  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
761  FormatTok->Tok.setIdentifierInfo(&Info);
762  FormatTok->Tok.setKind(Info.getTokenID());
763  if (Style.Language == FormatStyle::LK_Java &&
764  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
765  tok::kw_operator)) {
766  FormatTok->Tok.setKind(tok::identifier);
767  FormatTok->Tok.setIdentifierInfo(nullptr);
768  } else if (Style.Language == FormatStyle::LK_JavaScript &&
769  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
770  tok::kw_operator)) {
771  FormatTok->Tok.setKind(tok::identifier);
772  FormatTok->Tok.setIdentifierInfo(nullptr);
773  }
774  } else if (FormatTok->Tok.is(tok::greatergreater)) {
775  FormatTok->Tok.setKind(tok::greater);
776  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
777  ++Column;
778  StateStack.push(LexerState::TOKEN_STASHED);
779  } else if (FormatTok->Tok.is(tok::lessless)) {
780  FormatTok->Tok.setKind(tok::less);
781  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
782  ++Column;
783  StateStack.push(LexerState::TOKEN_STASHED);
784  }
785 
786  // Now FormatTok is the next non-whitespace token.
787 
788  StringRef Text = FormatTok->TokenText;
789  size_t FirstNewlinePos = Text.find('\n');
790  if (FirstNewlinePos == StringRef::npos) {
791  // FIXME: ColumnWidth actually depends on the start column, we need to
792  // take this into account when the token is moved.
793  FormatTok->ColumnWidth =
794  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
795  Column += FormatTok->ColumnWidth;
796  } else {
797  FormatTok->IsMultiline = true;
798  // FIXME: ColumnWidth actually depends on the start column, we need to
799  // take this into account when the token is moved.
801  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
802 
803  // The last line of the token always starts in column 0.
804  // Thus, the length can be precomputed even in the presence of tabs.
806  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
807  Column = FormatTok->LastLineColumnWidth;
808  }
809 
810  if (Style.isCpp()) {
811  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
812  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
813  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
814  tok::pp_define) &&
815  it != Macros.end()) {
816  FormatTok->Type = it->second;
817  } else if (FormatTok->is(tok::identifier)) {
818  if (MacroBlockBeginRegex.match(Text)) {
819  FormatTok->Type = TT_MacroBlockBegin;
820  } else if (MacroBlockEndRegex.match(Text)) {
821  FormatTok->Type = TT_MacroBlockEnd;
822  }
823  }
824  }
825 
826  return FormatTok;
827 }
828 
829 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
830  Lex->LexFromRawLexer(Tok.Tok);
831  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
832  Tok.Tok.getLength());
833  // For formatting, treat unterminated string literals like normal string
834  // literals.
835  if (Tok.is(tok::unknown)) {
836  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
837  Tok.Tok.setKind(tok::string_literal);
838  Tok.IsUnterminatedLiteral = true;
839  } else if (Style.Language == FormatStyle::LK_JavaScript &&
840  Tok.TokenText == "''") {
841  Tok.Tok.setKind(tok::string_literal);
842  }
843  }
844 
845  if ((Style.Language == FormatStyle::LK_JavaScript ||
846  Style.Language == FormatStyle::LK_Proto ||
847  Style.Language == FormatStyle::LK_TextProto) &&
848  Tok.is(tok::char_constant)) {
849  Tok.Tok.setKind(tok::string_literal);
850  }
851 
852  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
853  Tok.TokenText == "/* clang-format on */")) {
854  FormattingDisabled = false;
855  }
856 
857  Tok.Finalized = FormattingDisabled;
858 
859  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
860  Tok.TokenText == "/* clang-format off */")) {
861  FormattingDisabled = true;
862  }
863 }
864 
865 void FormatTokenLexer::resetLexer(unsigned Offset) {
866  StringRef Buffer = SourceMgr.getBufferData(ID);
867  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
868  getFormattingLangOpts(Style), Buffer.begin(),
869  Buffer.begin() + Offset, Buffer.end()));
870  Lex->SetKeepWhitespaceMode(true);
871  TrailingWhitespace = 0;
872 }
873 
874 } // namespace format
875 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
StringRef Identifier
Definition: Format.cpp:1815
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:76
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:133
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:97
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:220
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1436
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:162
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:165
bool isBinaryOperator() const
Definition: FormatToken.h:427
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:139
One of these records is kept for each identifier that is lexed.
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
void setKind(tok::TokenKind K)
Definition: Token.h:93
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
const FormatToken & Tok
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, and false if it is anything else.
Definition: FormatToken.h:902
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:321
unsigned Offset
Definition: Format.cpp:1809
SourceLocation End
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding)
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:2491
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:126
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:129
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:150
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:312
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:179
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:188
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:146
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:180
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:177
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:49
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
const llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Dataflow Directional Tag Classes.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:155
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:310
unsigned getLength() const
Definition: Token.h:129
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1808
void setLocation(SourceLocation L)
Definition: Token.h:134
#define true
Definition: stdbool.h:16
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:143
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:159
const encoding::Encoding Encoding
const FormatStyle & Style