clang  16.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
26  const SourceManager &SourceMgr, FileID ID, unsigned Column,
28  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29  IdentifierTable &IdentTable)
30  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32  LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33  Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34  Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36  MacroBlockEndRegex(Style.MacroBlockEnd) {
37  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38  Lex->SetKeepWhitespaceMode(true);
39 
40  for (const std::string &ForEachMacro : Style.ForEachMacros) {
41  auto Identifier = &IdentTable.get(ForEachMacro);
42  Macros.insert({Identifier, TT_ForEachMacro});
43  }
44  for (const std::string &IfMacro : Style.IfMacros) {
45  auto Identifier = &IdentTable.get(IfMacro);
46  Macros.insert({Identifier, TT_IfMacro});
47  }
48  for (const std::string &AttributeMacro : Style.AttributeMacros) {
49  auto Identifier = &IdentTable.get(AttributeMacro);
50  Macros.insert({Identifier, TT_AttributeMacro});
51  }
52  for (const std::string &StatementMacro : Style.StatementMacros) {
53  auto Identifier = &IdentTable.get(StatementMacro);
54  Macros.insert({Identifier, TT_StatementMacro});
55  }
56  for (const std::string &TypenameMacro : Style.TypenameMacros) {
57  auto Identifier = &IdentTable.get(TypenameMacro);
58  Macros.insert({Identifier, TT_TypenameMacro});
59  }
60  for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61  auto Identifier = &IdentTable.get(NamespaceMacro);
62  Macros.insert({Identifier, TT_NamespaceMacro});
63  }
64  for (const std::string &WhitespaceSensitiveMacro :
66  auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67  Macros.insert({Identifier, TT_UntouchableMacroFunc});
68  }
69  for (const std::string &StatementAttributeLikeMacro :
71  auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72  Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73  }
74 }
75 
77  assert(Tokens.empty());
78  assert(FirstInLineIndex == 0);
79  do {
80  Tokens.push_back(getNextToken());
81  if (Style.isJavaScript()) {
82  tryParseJSRegexLiteral();
83  handleTemplateStrings();
84  }
86  tryParsePythonComment();
87  tryMergePreviousTokens();
88  if (Style.isCSharp()) {
89  // This needs to come after tokens have been merged so that C#
90  // string literals are correctly identified.
91  handleCSharpVerbatimAndInterpolatedStrings();
92  }
93  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
94  FirstInLineIndex = Tokens.size() - 1;
95  } while (Tokens.back()->isNot(tok::eof));
96  return Tokens;
97 }
98 
99 void FormatTokenLexer::tryMergePreviousTokens() {
100  if (tryMerge_TMacro())
101  return;
102  if (tryMergeConflictMarkers())
103  return;
104  if (tryMergeLessLess())
105  return;
106  if (tryMergeForEach())
107  return;
108  if (Style.isCpp() && tryTransformTryUsageForC())
109  return;
110 
111  if (Style.isJavaScript() || Style.isCSharp()) {
112  static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
113  tok::question};
114  static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
115  tok::period};
116  static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
117 
118  if (tryMergeTokens(FatArrow, TT_FatArrow))
119  return;
120  if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
121  // Treat like the "||" operator (as opposed to the ternary ?).
122  Tokens.back()->Tok.setKind(tok::pipepipe);
123  return;
124  }
125  if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
126  // Treat like a regular "." access.
127  Tokens.back()->Tok.setKind(tok::period);
128  return;
129  }
130  if (tryMergeNullishCoalescingEqual())
131  return;
132  }
133 
134  if (Style.isCSharp()) {
135  static const tok::TokenKind CSharpNullConditionalLSquare[] = {
136  tok::question, tok::l_square};
137 
138  if (tryMergeCSharpKeywordVariables())
139  return;
140  if (tryMergeCSharpStringLiteral())
141  return;
142  if (tryTransformCSharpForEach())
143  return;
144  if (tryMergeTokens(CSharpNullConditionalLSquare,
145  TT_CSharpNullConditionalLSquare)) {
146  // Treat like a regular "[" operator.
147  Tokens.back()->Tok.setKind(tok::l_square);
148  return;
149  }
150  }
151 
152  if (tryMergeNSStringLiteral())
153  return;
154 
155  if (Style.isJavaScript()) {
156  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
157  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
158  tok::equal};
159  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
160  tok::greaterequal};
161  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
162  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
163  tok::starequal};
164  static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
165  static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
166 
167  // FIXME: Investigate what token type gives the correct operator priority.
168  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
169  return;
170  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
171  return;
172  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
173  return;
174  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
175  return;
176  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
177  Tokens.back()->Tok.setKind(tok::starequal);
178  return;
179  }
180  if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
181  tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
182  // Treat like the "=" assignment operator.
183  Tokens.back()->Tok.setKind(tok::equal);
184  return;
185  }
186  if (tryMergeJSPrivateIdentifier())
187  return;
188  }
189 
190  if (Style.Language == FormatStyle::LK_Java) {
191  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
192  tok::greater, tok::greater, tok::greaterequal};
193  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
194  return;
195  }
196 
197  if (Style.isVerilog()) {
198  // Merge the number following a base like `'h?a0`.
199  if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
200  Tokens.end()[-2]->is(tok::numeric_constant) &&
201  Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
202  tok::question) &&
203  tryMergeTokens(2, TT_Unknown)) {
204  return;
205  }
206  // Part select.
207  if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
208  TT_BitFieldColon)) {
209  return;
210  }
211  // Xnor. The combined token is treated as a caret which can also be either a
212  // unary or binary operator. The actual type is determined in
213  // TokenAnnotator. We also check the token length so we know it is not
214  // already a merged token.
215  if (Tokens.back()->TokenText.size() == 1 &&
216  tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
217  TT_BinaryOperator)) {
218  Tokens.back()->Tok.setKind(tok::caret);
219  return;
220  }
221  // Signed shift and distribution weight.
222  if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
223  Tokens.back()->Tok.setKind(tok::lessless);
224  return;
225  }
226  if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
227  Tokens.back()->Tok.setKind(tok::greatergreater);
228  return;
229  }
230  if (tryMergeTokensAny({{tok::lessless, tok::equal},
231  {tok::lessless, tok::lessequal},
232  {tok::greatergreater, tok::equal},
233  {tok::greatergreater, tok::greaterequal},
234  {tok::colon, tok::equal},
235  {tok::colon, tok::slash}},
236  TT_BinaryOperator)) {
237  Tokens.back()->ForcedPrecedence = prec::Assignment;
238  return;
239  }
240  // Exponentiation, signed shift, case equality, and wildcard equality.
241  if (tryMergeTokensAny({{tok::star, tok::star},
242  {tok::lessless, tok::less},
243  {tok::greatergreater, tok::greater},
244  {tok::exclaimequal, tok::equal},
245  {tok::exclaimequal, tok::question},
246  {tok::equalequal, tok::equal},
247  {tok::equalequal, tok::question}},
248  TT_BinaryOperator)) {
249  return;
250  }
251  // Module paths in specify blocks and implications in properties.
252  if (tryMergeTokensAny({{tok::plusequal, tok::greater},
253  {tok::plus, tok::star, tok::greater},
254  {tok::minusequal, tok::greater},
255  {tok::minus, tok::star, tok::greater},
256  {tok::less, tok::arrow},
257  {tok::equal, tok::greater},
258  {tok::star, tok::greater},
259  {tok::pipeequal, tok::greater},
260  {tok::pipe, tok::arrow},
261  {tok::hash, tok::minus, tok::hash},
262  {tok::hash, tok::equal, tok::hash}},
263  TT_BinaryOperator)) {
264  Tokens.back()->ForcedPrecedence = prec::Comma;
265  return;
266  }
267  }
268 }
269 
270 bool FormatTokenLexer::tryMergeNSStringLiteral() {
271  if (Tokens.size() < 2)
272  return false;
273  auto &At = *(Tokens.end() - 2);
274  auto &String = *(Tokens.end() - 1);
275  if (!At->is(tok::at) || !String->is(tok::string_literal))
276  return false;
277  At->Tok.setKind(tok::string_literal);
278  At->TokenText = StringRef(At->TokenText.begin(),
279  String->TokenText.end() - At->TokenText.begin());
280  At->ColumnWidth += String->ColumnWidth;
281  At->setType(TT_ObjCStringLiteral);
282  Tokens.erase(Tokens.end() - 1);
283  return true;
284 }
285 
286 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
287  // Merges #idenfier into a single identifier with the text #identifier
288  // but the token tok::identifier.
289  if (Tokens.size() < 2)
290  return false;
291  auto &Hash = *(Tokens.end() - 2);
292  auto &Identifier = *(Tokens.end() - 1);
293  if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
294  return false;
295  Hash->Tok.setKind(tok::identifier);
296  Hash->TokenText =
297  StringRef(Hash->TokenText.begin(),
298  Identifier->TokenText.end() - Hash->TokenText.begin());
299  Hash->ColumnWidth += Identifier->ColumnWidth;
300  Hash->setType(TT_JsPrivateIdentifier);
301  Tokens.erase(Tokens.end() - 1);
302  return true;
303 }
304 
305 // Search for verbatim or interpolated string literals @"ABC" or
306 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
307 // prevent splitting of @, $ and ".
308 // Merging of multiline verbatim strings with embedded '"' is handled in
309 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
310 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
311  if (Tokens.size() < 2)
312  return false;
313 
314  // Look for @"aaaaaa" or $"aaaaaa".
315  auto &String = *(Tokens.end() - 1);
316  if (!String->is(tok::string_literal))
317  return false;
318 
319  auto &At = *(Tokens.end() - 2);
320  if (!(At->is(tok::at) || At->TokenText == "$"))
321  return false;
322 
323  if (Tokens.size() > 2 && At->is(tok::at)) {
324  auto &Dollar = *(Tokens.end() - 3);
325  if (Dollar->TokenText == "$") {
326  // This looks like $@"aaaaa" so we need to combine all 3 tokens.
327  Dollar->Tok.setKind(tok::string_literal);
328  Dollar->TokenText =
329  StringRef(Dollar->TokenText.begin(),
330  String->TokenText.end() - Dollar->TokenText.begin());
331  Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
332  Dollar->setType(TT_CSharpStringLiteral);
333  Tokens.erase(Tokens.end() - 2);
334  Tokens.erase(Tokens.end() - 1);
335  return true;
336  }
337  }
338 
339  // Convert back into just a string_literal.
340  At->Tok.setKind(tok::string_literal);
341  At->TokenText = StringRef(At->TokenText.begin(),
342  String->TokenText.end() - At->TokenText.begin());
343  At->ColumnWidth += String->ColumnWidth;
344  At->setType(TT_CSharpStringLiteral);
345  Tokens.erase(Tokens.end() - 1);
346  return true;
347 }
348 
349 // Valid C# attribute targets:
350 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
351 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
352  "assembly", "module", "field", "event", "method",
353  "param", "property", "return", "type",
354 };
355 
356 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
357  if (Tokens.size() < 2)
358  return false;
359  auto &NullishCoalescing = *(Tokens.end() - 2);
360  auto &Equal = *(Tokens.end() - 1);
361  if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
362  !Equal->is(tok::equal)) {
363  return false;
364  }
365  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
366  NullishCoalescing->TokenText =
367  StringRef(NullishCoalescing->TokenText.begin(),
368  Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
369  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
370  NullishCoalescing->setType(TT_NullCoalescingEqual);
371  Tokens.erase(Tokens.end() - 1);
372  return true;
373 }
374 
375 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
376  if (Tokens.size() < 2)
377  return false;
378  auto &At = *(Tokens.end() - 2);
379  auto &Keyword = *(Tokens.end() - 1);
380  if (!At->is(tok::at))
381  return false;
382  if (!Keywords.isCSharpKeyword(*Keyword))
383  return false;
384 
385  At->Tok.setKind(tok::identifier);
386  At->TokenText = StringRef(At->TokenText.begin(),
387  Keyword->TokenText.end() - At->TokenText.begin());
388  At->ColumnWidth += Keyword->ColumnWidth;
389  At->setType(Keyword->getType());
390  Tokens.erase(Tokens.end() - 1);
391  return true;
392 }
393 
394 // In C# transform identifier foreach into kw_foreach
395 bool FormatTokenLexer::tryTransformCSharpForEach() {
396  if (Tokens.size() < 1)
397  return false;
398  auto &Identifier = *(Tokens.end() - 1);
399  if (!Identifier->is(tok::identifier))
400  return false;
401  if (Identifier->TokenText != "foreach")
402  return false;
403 
404  Identifier->setType(TT_ForEachMacro);
405  Identifier->Tok.setKind(tok::kw_for);
406  return true;
407 }
408 
409 bool FormatTokenLexer::tryMergeForEach() {
410  if (Tokens.size() < 2)
411  return false;
412  auto &For = *(Tokens.end() - 2);
413  auto &Each = *(Tokens.end() - 1);
414  if (!For->is(tok::kw_for))
415  return false;
416  if (!Each->is(tok::identifier))
417  return false;
418  if (Each->TokenText != "each")
419  return false;
420 
421  For->setType(TT_ForEachMacro);
422  For->Tok.setKind(tok::kw_for);
423 
424  For->TokenText = StringRef(For->TokenText.begin(),
425  Each->TokenText.end() - For->TokenText.begin());
426  For->ColumnWidth += Each->ColumnWidth;
427  Tokens.erase(Tokens.end() - 1);
428  return true;
429 }
430 
431 bool FormatTokenLexer::tryTransformTryUsageForC() {
432  if (Tokens.size() < 2)
433  return false;
434  auto &Try = *(Tokens.end() - 2);
435  if (!Try->is(tok::kw_try))
436  return false;
437  auto &Next = *(Tokens.end() - 1);
438  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
439  return false;
440 
441  if (Tokens.size() > 2) {
442  auto &At = *(Tokens.end() - 3);
443  if (At->is(tok::at))
444  return false;
445  }
446 
447  Try->Tok.setKind(tok::identifier);
448  return true;
449 }
450 
451 bool FormatTokenLexer::tryMergeLessLess() {
452  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
453  if (Tokens.size() < 3)
454  return false;
455 
456  auto First = Tokens.end() - 3;
457  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
458  return false;
459 
460  // Only merge if there currently is no whitespace between the two "<".
461  if (First[1]->hasWhitespaceBefore())
462  return false;
463 
464  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
465  auto Y = First[2];
466  if ((X && X->is(tok::less)) || Y->is(tok::less))
467  return false;
468 
469  // Do not remove a whitespace between the two "<" e.g. "operator< <>".
470  if (X && X->is(tok::kw_operator) && Y->is(tok::greater))
471  return false;
472 
473  First[0]->Tok.setKind(tok::lessless);
474  First[0]->TokenText = "<<";
475  First[0]->ColumnWidth += 1;
476  Tokens.erase(Tokens.end() - 2);
477  return true;
478 }
479 
480 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
481  TokenType NewType) {
482  if (Tokens.size() < Kinds.size())
483  return false;
484 
485  SmallVectorImpl<FormatToken *>::const_iterator First =
486  Tokens.end() - Kinds.size();
487  for (unsigned i = 0; i < Kinds.size(); ++i)
488  if (!First[i]->is(Kinds[i]))
489  return false;
490 
491  return tryMergeTokens(Kinds.size(), NewType);
492 }
493 
494 bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
495  if (Tokens.size() < Count)
496  return false;
497 
498  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
499  unsigned AddLength = 0;
500  for (size_t i = 1; i < Count; ++i) {
501  // If there is whitespace separating the token and the previous one,
502  // they should not be merged.
503  if (First[i]->hasWhitespaceBefore())
504  return false;
505  AddLength += First[i]->TokenText.size();
506  }
507 
508  Tokens.resize(Tokens.size() - Count + 1);
509  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
510  First[0]->TokenText.size() + AddLength);
511  First[0]->ColumnWidth += AddLength;
512  First[0]->setType(NewType);
513  return true;
514 }
515 
516 bool FormatTokenLexer::tryMergeTokensAny(
517  ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
518  return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
519  return tryMergeTokens(Kinds, NewType);
520  });
521 }
522 
523 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
524 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
525  // NB: This is not entirely correct, as an r_paren can introduce an operand
526  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
527  // corner case to not matter in practice, though.
528  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
529  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
530  tok::colon, tok::question, tok::tilde) ||
531  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
532  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
533  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
534  Tok->isBinaryOperator();
535 }
536 
537 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
538  if (!Prev)
539  return true;
540 
541  // Regex literals can only follow after prefix unary operators, not after
542  // postfix unary operators. If the '++' is followed by a non-operand
543  // introducing token, the slash here is the operand and not the start of a
544  // regex.
545  // `!` is an unary prefix operator, but also a post-fix operator that casts
546  // away nullability, so the same check applies.
547  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
548  return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
549 
550  // The previous token must introduce an operand location where regex
551  // literals can occur.
552  if (!precedesOperand(Prev))
553  return false;
554 
555  return true;
556 }
557 
558 // Tries to parse a JavaScript Regex literal starting at the current token,
559 // if that begins with a slash and is in a location where JavaScript allows
560 // regex literals. Changes the current token to a regex literal and updates
561 // its text if successful.
562 void FormatTokenLexer::tryParseJSRegexLiteral() {
563  FormatToken *RegexToken = Tokens.back();
564  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
565  return;
566 
567  FormatToken *Prev = nullptr;
568  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
569  // NB: Because previous pointers are not initialized yet, this cannot use
570  // Token.getPreviousNonComment.
571  if (FT->isNot(tok::comment)) {
572  Prev = FT;
573  break;
574  }
575  }
576 
577  if (!canPrecedeRegexLiteral(Prev))
578  return;
579 
580  // 'Manually' lex ahead in the current file buffer.
581  const char *Offset = Lex->getBufferLocation();
582  const char *RegexBegin = Offset - RegexToken->TokenText.size();
583  StringRef Buffer = Lex->getBuffer();
584  bool InCharacterClass = false;
585  bool HaveClosingSlash = false;
586  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
587  // Regular expressions are terminated with a '/', which can only be
588  // escaped using '\' or a character class between '[' and ']'.
589  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
590  switch (*Offset) {
591  case '\\':
592  // Skip the escaped character.
593  ++Offset;
594  break;
595  case '[':
596  InCharacterClass = true;
597  break;
598  case ']':
599  InCharacterClass = false;
600  break;
601  case '/':
602  if (!InCharacterClass)
603  HaveClosingSlash = true;
604  break;
605  }
606  }
607 
608  RegexToken->setType(TT_RegexLiteral);
609  // Treat regex literals like other string_literals.
610  RegexToken->Tok.setKind(tok::string_literal);
611  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
612  RegexToken->ColumnWidth = RegexToken->TokenText.size();
613 
614  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
615 }
616 
// Scans [Begin, End) for the '"' that terminates a C# string literal and
// returns a pointer to it, or \p End if the string is not terminated.
//
// \p Begin must point just behind the opening quote. No attempt is made to
// format code inside an interpolated or verbatim string.
//
// Escaping rules applied while scanning:
//  - Outside verbatim strings a backslash escapes the next character.
//  - In verbatim strings `""` is an escaped quote.
//  - In interpolated strings `{{` and `}}` are escaped braces. Quotes between
//    an unescaped `{` and its matching `}` belong to the embedded expression,
//    so that $"{x ?? "null"}" is a single literal. A `}` without a matching
//    `{` makes the scan give up and return \p End.
//
// Formatting the expressions inside `{}` would need work similar to what
// `handleTemplateStrings()` does for JavaScript template strings.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character after the cursor equals the one under the cursor.
  const auto NextIsSameChar = [&Begin, End] {
    return Begin + 1 < End && Begin[0] == Begin[1];
  };

  int OpenBraces = 0;
  for (; Begin < End; ++Begin) {
    const char Current = *Begin;

    if (Current == '\\') {
      if (!Verbatim)
        ++Begin; // Step over the escaped character.
      continue;
    }

    if (Current == '{' || Current == '}') {
      if (!Interpolated)
        continue;
      if (NextIsSameChar()) {
        ++Begin; // {{ or }} is an escaped brace.
        continue;
      }
      if (Current == '{') {
        ++OpenBraces;
        continue;
      }
      if (OpenBraces == 0)
        return End; // Stray closing brace.
      --OpenBraces;
      continue;
    }

    if (Current != '"' || OpenBraces > 0)
      continue;

    if (Verbatim && NextIsSameChar()) {
      ++Begin; // "" is an escaped quote in a verbatim string.
      continue;
    }
    return Begin;
  }

  return End;
}
675 
676 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
677  FormatToken *CSharpStringLiteral = Tokens.back();
678 
679  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
680  return;
681 
682  auto &TokenText = CSharpStringLiteral->TokenText;
683 
684  bool Verbatim = false;
685  bool Interpolated = false;
686  if (TokenText.startswith(R"($@")")) {
687  Verbatim = true;
688  Interpolated = true;
689  } else if (TokenText.startswith(R"(@")")) {
690  Verbatim = true;
691  } else if (TokenText.startswith(R"($")")) {
692  Interpolated = true;
693  }
694 
695  // Deal with multiline strings.
696  if (!Verbatim && !Interpolated)
697  return;
698 
699  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
700  const char *Offset = StrBegin;
701  if (Verbatim && Interpolated)
702  Offset += 3;
703  else
704  Offset += 2;
705 
706  const auto End = Lex->getBuffer().end();
707  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
708 
709  // Make no attempt to format code properly if a verbatim string is
710  // unterminated.
711  if (Offset >= End)
712  return;
713 
714  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
715  TokenText = LiteralText;
716 
717  // Adjust width for potentially multiline string literals.
718  size_t FirstBreak = LiteralText.find('\n');
719  StringRef FirstLineText = FirstBreak == StringRef::npos
720  ? LiteralText
721  : LiteralText.substr(0, FirstBreak);
722  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
723  FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
724  Encoding);
725  size_t LastBreak = LiteralText.rfind('\n');
726  if (LastBreak != StringRef::npos) {
727  CSharpStringLiteral->IsMultiline = true;
728  unsigned StartColumn = 0;
729  CSharpStringLiteral->LastLineColumnWidth =
730  encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
731  StartColumn, Style.TabWidth, Encoding);
732  }
733 
734  assert(Offset < End);
735  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
736 }
737 
738 void FormatTokenLexer::handleTemplateStrings() {
739  FormatToken *BacktickToken = Tokens.back();
740 
741  if (BacktickToken->is(tok::l_brace)) {
742  StateStack.push(LexerState::NORMAL);
743  return;
744  }
745  if (BacktickToken->is(tok::r_brace)) {
746  if (StateStack.size() == 1)
747  return;
748  StateStack.pop();
749  if (StateStack.top() != LexerState::TEMPLATE_STRING)
750  return;
751  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
752  } else if (BacktickToken->is(tok::unknown) &&
753  BacktickToken->TokenText == "`") {
754  StateStack.push(LexerState::TEMPLATE_STRING);
755  } else {
756  return; // Not actually a template
757  }
758 
759  // 'Manually' lex ahead in the current file buffer.
760  const char *Offset = Lex->getBufferLocation();
761  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
762  for (; Offset != Lex->getBuffer().end(); ++Offset) {
763  if (Offset[0] == '`') {
764  StateStack.pop();
765  break;
766  }
767  if (Offset[0] == '\\') {
768  ++Offset; // Skip the escaped character.
769  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
770  Offset[1] == '{') {
771  // '${' introduces an expression interpolation in the template string.
772  StateStack.push(LexerState::NORMAL);
773  ++Offset;
774  break;
775  }
776  }
777 
778  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
779  BacktickToken->setType(TT_TemplateString);
780  BacktickToken->Tok.setKind(tok::string_literal);
781  BacktickToken->TokenText = LiteralText;
782 
783  // Adjust width for potentially multiline string literals.
784  size_t FirstBreak = LiteralText.find('\n');
785  StringRef FirstLineText = FirstBreak == StringRef::npos
786  ? LiteralText
787  : LiteralText.substr(0, FirstBreak);
788  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
789  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
790  size_t LastBreak = LiteralText.rfind('\n');
791  if (LastBreak != StringRef::npos) {
792  BacktickToken->IsMultiline = true;
793  unsigned StartColumn = 0; // The template tail spans the entire line.
794  BacktickToken->LastLineColumnWidth =
795  encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
796  StartColumn, Style.TabWidth, Encoding);
797  }
798 
799  SourceLocation loc = Offset < Lex->getBuffer().end()
800  ? Lex->getSourceLocation(Offset + 1)
801  : SourceMgr.getLocForEndOfFile(ID);
802  resetLexer(SourceMgr.getFileOffset(loc));
803 }
804 
805 void FormatTokenLexer::tryParsePythonComment() {
806  FormatToken *HashToken = Tokens.back();
807  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
808  return;
809  // Turn the remainder of this line into a comment.
810  const char *CommentBegin =
811  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
812  size_t From = CommentBegin - Lex->getBuffer().begin();
813  size_t To = Lex->getBuffer().find_first_of('\n', From);
814  if (To == StringRef::npos)
815  To = Lex->getBuffer().size();
816  size_t Len = To - From;
817  HashToken->setType(TT_LineComment);
818  HashToken->Tok.setKind(tok::comment);
819  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
820  SourceLocation Loc = To < Lex->getBuffer().size()
821  ? Lex->getSourceLocation(CommentBegin + Len)
822  : SourceMgr.getLocForEndOfFile(ID);
823  resetLexer(SourceMgr.getFileOffset(Loc));
824 }
825 
826 bool FormatTokenLexer::tryMerge_TMacro() {
827  if (Tokens.size() < 4)
828  return false;
829  FormatToken *Last = Tokens.back();
830  if (!Last->is(tok::r_paren))
831  return false;
832 
833  FormatToken *String = Tokens[Tokens.size() - 2];
834  if (!String->is(tok::string_literal) || String->IsMultiline)
835  return false;
836 
837  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
838  return false;
839 
840  FormatToken *Macro = Tokens[Tokens.size() - 4];
841  if (Macro->TokenText != "_T")
842  return false;
843 
844  const char *Start = Macro->TokenText.data();
845  const char *End = Last->TokenText.data() + Last->TokenText.size();
846  String->TokenText = StringRef(Start, End - Start);
847  String->IsFirst = Macro->IsFirst;
848  String->LastNewlineOffset = Macro->LastNewlineOffset;
849  String->WhitespaceRange = Macro->WhitespaceRange;
850  String->OriginalColumn = Macro->OriginalColumn;
851  String->ColumnWidth = encoding::columnWidthWithTabs(
852  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
853  String->NewlinesBefore = Macro->NewlinesBefore;
854  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
855 
856  Tokens.pop_back();
857  Tokens.pop_back();
858  Tokens.pop_back();
859  Tokens.back() = String;
860  if (FirstInLineIndex >= Tokens.size())
861  FirstInLineIndex = Tokens.size() - 1;
862  return true;
863 }
864 
865 bool FormatTokenLexer::tryMergeConflictMarkers() {
866  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
867  return false;
868 
869  // Conflict lines look like:
870  // <marker> <text from the vcs>
871  // For example:
872  // >>>>>>> /file/in/file/system at revision 1234
873  //
874  // We merge all tokens in a line that starts with a conflict marker
875  // into a single token with a special token type that the unwrapped line
876  // parser will use to correctly rebuild the underlying code.
877 
878  FileID ID;
879  // Get the position of the first token in the line.
880  unsigned FirstInLineOffset;
881  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
882  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
883  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
884  // Calculate the offset of the start of the current line.
885  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
886  if (LineOffset == StringRef::npos)
887  LineOffset = 0;
888  else
889  ++LineOffset;
890 
891  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
892  StringRef LineStart;
893  if (FirstSpace == StringRef::npos)
894  LineStart = Buffer.substr(LineOffset);
895  else
896  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
897 
898  TokenType Type = TT_Unknown;
899  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
900  Type = TT_ConflictStart;
901  } else if (LineStart == "|||||||" || LineStart == "=======" ||
902  LineStart == "====") {
903  Type = TT_ConflictAlternative;
904  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
905  Type = TT_ConflictEnd;
906  }
907 
908  if (Type != TT_Unknown) {
909  FormatToken *Next = Tokens.back();
910 
911  Tokens.resize(FirstInLineIndex + 1);
912  // We do not need to build a complete token here, as we will skip it
913  // during parsing anyway (as we must not touch whitespace around conflict
914  // markers).
915  Tokens.back()->setType(Type);
916  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
917 
918  Tokens.push_back(Next);
919  return true;
920  }
921 
922  return false;
923 }
924 
925 FormatToken *FormatTokenLexer::getStashedToken() {
926  // Create a synthesized second '>' or '<' token.
927  Token Tok = FormatTok->Tok;
928  StringRef TokenText = FormatTok->TokenText;
929 
930  unsigned OriginalColumn = FormatTok->OriginalColumn;
931  FormatTok = new (Allocator.Allocate()) FormatToken;
932  FormatTok->Tok = Tok;
933  SourceLocation TokLocation =
934  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
935  FormatTok->Tok.setLocation(TokLocation);
936  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
937  FormatTok->TokenText = TokenText;
938  FormatTok->ColumnWidth = 1;
939  FormatTok->OriginalColumn = OriginalColumn + 1;
940 
941  return FormatTok;
942 }
943 
944 /// Truncate the current token to the new length and make the lexer continue
945 /// from the end of the truncated token. Used for other languages that have
946 /// different token boundaries, like JavaScript in which a comment ends at a
947 /// line break regardless of whether the line break follows a backslash. Also
948 /// used to set the lexer to the end of whitespace if the lexer regards
949 /// whitespace and an unrecognized symbol as one token.
950 void FormatTokenLexer::truncateToken(size_t NewLen) {
951  assert(NewLen <= FormatTok->TokenText.size());
952  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
953  Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
954  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
956  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
957  Encoding);
958  FormatTok->Tok.setLength(NewLen);
959 }
960 
961 /// Count the length of leading whitespace in a token.
962 static size_t countLeadingWhitespace(StringRef Text) {
963  // Basically counting the length matched by this regex.
964  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
965  // Directly using the regex turned out to be slow. With the regex
966  // version formatting all files in this directory took about 1.25
967  // seconds. This version took about 0.5 seconds.
968  const unsigned char *const Begin = Text.bytes_begin();
969  const unsigned char *const End = Text.bytes_end();
970  const unsigned char *Cur = Begin;
971  while (Cur < End) {
972  if (isspace(Cur[0])) {
973  ++Cur;
974  } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
975  // A '\' followed by a newline always escapes the newline, regardless
976  // of whether there is another '\' before it.
977  // The source has a null byte at the end. So the end of the entire input
978  // isn't reached yet. Also the lexer doesn't break apart an escaped
979  // newline.
980  assert(End - Cur >= 2);
981  Cur += 2;
982  } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
983  (Cur[3] == '\n' || Cur[3] == '\r')) {
984  // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
985  // characters are quoted individually in this comment because if we write
986  // them together some compilers warn that we have a trigraph in the code.
987  assert(End - Cur >= 4);
988  Cur += 4;
989  } else {
990  break;
991  }
992  }
993  return Cur - Begin;
994 }
995 
996 FormatToken *FormatTokenLexer::getNextToken() {
997  if (StateStack.top() == LexerState::TOKEN_STASHED) {
998  StateStack.pop();
999  return getStashedToken();
1000  }
1001 
1002  FormatTok = new (Allocator.Allocate()) FormatToken;
1003  readRawToken(*FormatTok);
1004  SourceLocation WhitespaceStart =
1005  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1006  FormatTok->IsFirst = IsFirstToken;
1007  IsFirstToken = false;
1008 
1009  // Consume and record whitespace until we find a significant token.
1010  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1011  // followed by a symbol such as backtick. Those symbols may be
1012  // significant in other languages.
1013  unsigned WhitespaceLength = TrailingWhitespace;
1014  while (FormatTok->isNot(tok::eof)) {
1015  auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1016  if (LeadingWhitespace == 0)
1017  break;
1018  if (LeadingWhitespace < FormatTok->TokenText.size())
1019  truncateToken(LeadingWhitespace);
1020  StringRef Text = FormatTok->TokenText;
1021  bool InEscape = false;
1022  for (int i = 0, e = Text.size(); i != e; ++i) {
1023  switch (Text[i]) {
1024  case '\r':
1025  // If this is a CRLF sequence, break here and the LF will be handled on
1026  // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1027  // the same as a single LF.
1028  if (i + 1 < e && Text[i + 1] == '\n')
1029  break;
1030  [[fallthrough]];
1031  case '\n':
1032  ++FormatTok->NewlinesBefore;
1033  if (!InEscape)
1034  FormatTok->HasUnescapedNewline = true;
1035  else
1036  InEscape = false;
1037  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1038  Column = 0;
1039  break;
1040  case '\f':
1041  case '\v':
1042  Column = 0;
1043  break;
1044  case ' ':
1045  ++Column;
1046  break;
1047  case '\t':
1048  Column +=
1049  Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1050  break;
1051  case '\\':
1052  case '?':
1053  case '/':
1054  // The text was entirely whitespace when this loop was entered. Thus
1055  // this has to be an escape sequence.
1056  assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1057  Text.substr(i, 4) == "\?\?/\r" ||
1058  Text.substr(i, 4) == "\?\?/\n" ||
1059  (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1060  Text.substr(i - 1, 4) == "\?\?/\n")) ||
1061  (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1062  Text.substr(i - 2, 4) == "\?\?/\n")));
1063  InEscape = true;
1064  break;
1065  default:
1066  // This shouldn't happen.
1067  assert(false);
1068  break;
1069  }
1070  }
1071  WhitespaceLength += Text.size();
1072  readRawToken(*FormatTok);
1073  }
1074 
1075  if (FormatTok->is(tok::unknown))
1076  FormatTok->setType(TT_ImplicitStringLiteral);
1077 
1078  // JavaScript and Java do not allow to escape the end of the line with a
1079  // backslash. Backslashes are syntax errors in plain source, but can occur in
1080  // comments. When a single line comment ends with a \, it'll cause the next
1081  // line of code to be lexed as a comment, breaking formatting. The code below
1082  // finds comments that contain a backslash followed by a line break, truncates
1083  // the comment token at the backslash, and resets the lexer to restart behind
1084  // the backslash.
1085  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1086  FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1087  size_t BackslashPos = FormatTok->TokenText.find('\\');
1088  while (BackslashPos != StringRef::npos) {
1089  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1090  FormatTok->TokenText[BackslashPos + 1] == '\n') {
1091  truncateToken(BackslashPos + 1);
1092  break;
1093  }
1094  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1095  }
1096  }
1097 
1098  if (Style.isVerilog()) {
1099  static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1100  SmallVector<StringRef, 1> Matches;
1101  // Verilog uses the backtick instead of the hash for preprocessor stuff.
1102  // And it uses the hash for delays and parameter lists. In order to continue
1103  // using `tok::hash` in other places, the backtick gets marked as the hash
1104  // here. And in order to tell the backtick and hash apart for
1105  // Verilog-specific stuff, the hash becomes an identifier.
1106  if (FormatTok->is(tok::numeric_constant)) {
1107  // In Verilog the quote is not part of a number.
1108  auto Quote = FormatTok->TokenText.find('\'');
1109  if (Quote != StringRef::npos)
1110  truncateToken(Quote);
1111  } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1112  FormatTok->Tok.setKind(tok::raw_identifier);
1113  } else if (FormatTok->is(tok::raw_identifier)) {
1114  if (FormatTok->TokenText == "`") {
1115  FormatTok->Tok.setIdentifierInfo(nullptr);
1116  FormatTok->Tok.setKind(tok::hash);
1117  } else if (FormatTok->TokenText == "``") {
1118  FormatTok->Tok.setIdentifierInfo(nullptr);
1119  FormatTok->Tok.setKind(tok::hashhash);
1120  } else if (Tokens.size() > 0 &&
1121  Tokens.back()->is(Keywords.kw_apostrophe) &&
1122  NumberBase.match(FormatTok->TokenText, &Matches)) {
1123  // In Verilog in a based number literal like `'b10`, there may be
1124  // whitespace between `'b` and `10`. Therefore we handle the base and
1125  // the rest of the number literal as two tokens. But if there is no
1126  // space in the input code, we need to manually separate the two parts.
1127  truncateToken(Matches[0].size());
1128  FormatTok->setFinalizedType(TT_VerilogNumberBase);
1129  }
1130  }
1131  }
1132 
1134  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1135 
1136  FormatTok->OriginalColumn = Column;
1137 
1138  TrailingWhitespace = 0;
1139  if (FormatTok->is(tok::comment)) {
1140  // FIXME: Add the trimmed whitespace to Column.
1141  StringRef UntrimmedText = FormatTok->TokenText;
1142  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1143  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1144  } else if (FormatTok->is(tok::raw_identifier)) {
1145  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1146  FormatTok->Tok.setIdentifierInfo(&Info);
1147  FormatTok->Tok.setKind(Info.getTokenID());
1148  if (Style.Language == FormatStyle::LK_Java &&
1149  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1150  tok::kw_operator)) {
1151  FormatTok->Tok.setKind(tok::identifier);
1152  FormatTok->Tok.setIdentifierInfo(nullptr);
1153  } else if (Style.isJavaScript() &&
1154  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1155  tok::kw_operator)) {
1156  FormatTok->Tok.setKind(tok::identifier);
1157  FormatTok->Tok.setIdentifierInfo(nullptr);
1158  }
1159  } else if (FormatTok->is(tok::greatergreater)) {
1160  FormatTok->Tok.setKind(tok::greater);
1161  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1162  ++Column;
1163  StateStack.push(LexerState::TOKEN_STASHED);
1164  } else if (FormatTok->is(tok::lessless)) {
1165  FormatTok->Tok.setKind(tok::less);
1166  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1167  ++Column;
1168  StateStack.push(LexerState::TOKEN_STASHED);
1169  }
1170 
1171  if (Style.isVerilog() && Tokens.size() > 0 &&
1172  Tokens.back()->is(TT_VerilogNumberBase) &&
1173  FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1174  // Mark the number following a base like `'h?a0` as a number.
1175  FormatTok->Tok.setKind(tok::numeric_constant);
1176  }
1177 
1178  // Now FormatTok is the next non-whitespace token.
1179 
1180  StringRef Text = FormatTok->TokenText;
1181  size_t FirstNewlinePos = Text.find('\n');
1182  if (FirstNewlinePos == StringRef::npos) {
1183  // FIXME: ColumnWidth actually depends on the start column, we need to
1184  // take this into account when the token is moved.
1185  FormatTok->ColumnWidth =
1186  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1187  Column += FormatTok->ColumnWidth;
1188  } else {
1189  FormatTok->IsMultiline = true;
1190  // FIXME: ColumnWidth actually depends on the start column, we need to
1191  // take this into account when the token is moved.
1193  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1194 
1195  // The last line of the token always starts in column 0.
1196  // Thus, the length can be precomputed even in the presence of tabs.
1198  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1199  Column = FormatTok->LastLineColumnWidth;
1200  }
1201 
1202  if (Style.isCpp()) {
1203  auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1204  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1205  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1206  tok::pp_define) &&
1207  it != Macros.end()) {
1208  FormatTok->setType(it->second);
1209  if (it->second == TT_IfMacro) {
1210  // The lexer token currently has type tok::kw_unknown. However, for this
1211  // substitution to be treated correctly in the TokenAnnotator, faking
1212  // the tok value seems to be needed. Not sure if there's a more elegant
1213  // way.
1214  FormatTok->Tok.setKind(tok::kw_if);
1215  }
1216  } else if (FormatTok->is(tok::identifier)) {
1217  if (MacroBlockBeginRegex.match(Text))
1218  FormatTok->setType(TT_MacroBlockBegin);
1219  else if (MacroBlockEndRegex.match(Text))
1220  FormatTok->setType(TT_MacroBlockEnd);
1221  }
1222  }
1223 
1224  return FormatTok;
1225 }
1226 
1227 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1228  // In Verilog the quote is not a character literal.
1229  //
1230  // Make the backtick and double backtick identifiers to match against them
1231  // more easily.
1232  //
1233  // In Verilog an escaped identifier starts with backslash and ends with
1234  // whitespace. Unless that whitespace is an escaped newline. A backslash can
1235  // also begin an escaped newline outside of an escaped identifier. We check
1236  // for that outside of the Regex since we can't use negative lookhead
1237  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1238  // identifier may have a length of 0 according to Section A.9.3.
1239  // FIXME: If there is an escaped newline in the middle of an escaped
1240  // identifier, allow for pasting the two lines together, But escaped
1241  // identifiers usually occur only in generated code anyway.
1242  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1243  "(\r?\n|\r)|[^[:space:]])*)");
1244 
1245  SmallVector<StringRef, 4> Matches;
1246  const char *Start = Lex->getBufferLocation();
1247  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1248  &Matches)) {
1249  return false;
1250  }
1251  // There is a null byte at the end of the buffer, so we don't have to check
1252  // Start[1] is within the buffer.
1253  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1254  return false;
1255  size_t Len = Matches[0].size();
1256 
1257  // The kind has to be an identifier so we can match it against those defined
1258  // in Keywords. The kind has to be set before the length because the setLength
1259  // function checks that the kind is not an annotation.
1260  Tok.setKind(tok::raw_identifier);
1261  Tok.setLength(Len);
1262  Tok.setLocation(Lex->getSourceLocation(Start, Len));
1263  Tok.setRawIdentifierData(Start);
1264  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1265  return true;
1266 }
1267 
1268 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1269  // For Verilog, first see if there is a special token, and fall back to the
1270  // normal lexer if there isn't one.
1271  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1272  Lex->LexFromRawLexer(Tok.Tok);
1273  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1274  Tok.Tok.getLength());
1275  // For formatting, treat unterminated string literals like normal string
1276  // literals.
1277  if (Tok.is(tok::unknown)) {
1278  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1279  Tok.Tok.setKind(tok::string_literal);
1280  Tok.IsUnterminatedLiteral = true;
1281  } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1282  Tok.Tok.setKind(tok::string_literal);
1283  }
1284  }
1285 
1286  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
1287  Style.Language == FormatStyle::LK_TextProto) &&
1288  Tok.is(tok::char_constant)) {
1289  Tok.Tok.setKind(tok::string_literal);
1290  }
1291 
1292  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1293  Tok.TokenText == "/* clang-format on */")) {
1294  FormattingDisabled = false;
1295  }
1296 
1297  Tok.Finalized = FormattingDisabled;
1298 
1299  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1300  Tok.TokenText == "/* clang-format off */")) {
1301  FormattingDisabled = true;
1302  }
1303 }
1304 
1305 void FormatTokenLexer::resetLexer(unsigned Offset) {
1306  StringRef Buffer = SourceMgr.getBufferData(ID);
1307  LangOpts = getFormattingLangOpts(Style);
1308  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1309  Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1310  Lex->SetKeepWhitespaceMode(true);
1311  TrailingWhitespace = 0;
1312 }
1313 
1314 } // namespace format
1315 } // namespace clang
clang::prec::Comma
@ Comma
Definition: OperatorPrecedence.h:28
clang::format::FormatTokenLexer::lex
ArrayRef< FormatToken * > lex()
Definition: FormatTokenLexer.cpp:76
clang::format::TEMPLATE_STRING
@ TEMPLATE_STRING
Definition: FormatTokenLexer.h:35
clang::SourceRange
A trivial tuple used to represent a source range.
Definition: SourceLocation.h:210
string
string(SUBSTRING ${CMAKE_CURRENT_BINARY_DIR} 0 ${PATH_LIB_START} PATH_HEAD) string(SUBSTRING $
Definition: CMakeLists.txt:22
clang::SourceManager::getFileOffset
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
Definition: SourceManager.h:1292
clang::format::AdditionalKeywords::kw_instanceof
IdentifierInfo * kw_instanceof
Definition: FormatToken.h:1344
llvm::SmallVector
Definition: LLVM.h:38
clang::IdentifierTable::get
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Definition: IdentifierTable.h:597
clang::SourceLocation::getLocWithOffset
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
Definition: SourceLocation.h:134
clang::Token::getIdentifierInfo
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:178
clang::format::NORMAL
@ NORMAL
Definition: FormatTokenLexer.h:34
clang::format::FormatStyle::StatementMacros
std::vector< std::string > StatementMacros
A vector of macros that should be interpreted as complete statements.
Definition: Format.h:2216
clang::format::FormatStyle
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:54
clang::format::FormatToken::TokenText
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:258
clang::format::FormatToken::IsMultiline
unsigned IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:274
clang::format::FirstNewlinePos
size_t FirstNewlinePos
Definition: FormatTokenLexer.cpp:1181
clang::ComparisonCategoryType::First
@ First
clang::format::FormatStyle::isVerilog
bool isVerilog() const
Definition: Format.h:2619
clang::format::FormatToken::isNot
bool isNot(T Kind) const
Definition: FormatToken.h:544
clang::prec::Assignment
@ Assignment
Definition: OperatorPrecedence.h:29
SourceManager.h
clang::Token::setIdentifierInfo
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:187
clang::format::FormatToken::OriginalColumn
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:426
clang::tooling::X
static ToolExecutorPluginRegistry::Add< AllTUsToolExecutorPlugin > X("all-TUs", "Runs FrontendActions on all TUs in the compilation database. " "Tool results are stored in memory.")
clang::index::SymbolKind::Macro
@ Macro
Identifier
StringRef Identifier
Definition: Format.cpp:2596
clang::IdentifierInfo::getTokenID
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
Definition: IdentifierTable.h:262
Format.h
clang::format::FormatToken::setType
void setType(TokenType T)
Definition: FormatToken.h:365
clang::Token
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
clang::format::FormatToken::IsFirst
unsigned IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:277
End
SourceLocation End
Definition: USRLocFinder.cpp:167
clang::SourceManager
This class handles loading and caching of source files into memory.
Definition: SourceManager.h:627
clang::format::FormatToken::NewlinesBefore
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:391
clang::Lexer
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
Offset
unsigned Offset
Definition: Format.cpp:2590
clang::format::FormatTokenLexer::FormatTokenLexer
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
Definition: FormatTokenLexer.cpp:25
clang::format::lexCSharpString
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
Definition: FormatTokenLexer.cpp:617
clang::format::AdditionalKeywords::kw_apostrophe
IdentifierInfo * kw_apostrophe
Definition: FormatToken.h:1540
clang::Token::setRawIdentifierData
void setRawIdentifierData(const char *Ptr)
Definition: Token.h:208
clang::format::OriginalColumn
FormatTok OriginalColumn
Definition: FormatTokenLexer.cpp:1136
clang::format::FormatStyle::IfMacros
std::vector< std::string > IfMacros
A vector of macros that should be interpreted as conditionals instead of as function calls.
Definition: Format.h:2188
clang::format::FormatToken::Tok
Token Tok
The Token.
Definition: FormatToken.h:252
clang::format::AdditionalKeywords::isCSharpKeyword
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is a anything else.
Definition: FormatToken.h:1621
clang::format::encoding::Encoding
Encoding
Definition: Encoding.h:27
clang::format::FormatStyle::LK_TextProto
@ LK_TextProto
Should be used for Protocol Buffer messages in text format (https://developers.google....
Definition: Format.h:2609
clang::format::FormatStyle::LK_Proto
@ LK_Proto
Should be used for Protocol Buffers (https://developers.google.com/protocol-buffers/).
Definition: Format.h:2604
clang::dependency_directives_scan::pp_define
@ pp_define
Definition: DependencyDirectivesScanner.h:63
clang::format::AdditionalKeywords::kw_in
IdentifierInfo * kw_in
Definition: FormatToken.h:1306
clang::format::FormatStyle::NamespaceMacros
std::vector< std::string > NamespaceMacros
A vector of macros which are used to open namespace blocks.
Definition: Format.h:2229
clang::format::getFormattingLangOpts
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:3421
clang::format::TrailingWhitespace
TrailingWhitespace
Definition: FormatTokenLexer.cpp:1138
clang::SourceManager::getLocForEndOfFile
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Definition: SourceManager.h:1131
clang::format::FormatStyle::MacroBlockEnd
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:2689
clang::format::FormatStyle::WhitespaceSensitiveMacros
std::vector< std::string > WhitespaceSensitiveMacros
A vector of macros which are whitespace-sensitive and should not be touched.
Definition: Format.h:2246
clang::format::FormatStyle::TabWidth
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:3838
clang::SourceManager::getDecomposedLoc
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
Definition: SourceManager.h:1241
clang::format::FormatStyle::StatementAttributeLikeMacros
std::vector< std::string > StatementAttributeLikeMacros
Macros which are ignored in front of a statement, as if they were an attribute.
Definition: Format.h:3834
clang::format::FormatStyle::isJavaScript
bool isJavaScript() const
Definition: Format.h:2618
clang::SourceManager::getLocForStartOfFile
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file.
Definition: SourceManager.h:1123
clang::format::Column
Column
Definition: FormatTokenLexer.cpp:1199
clang::format::Text
StringRef Text
Definition: FormatTokenLexer.cpp:1180
SourceLocation.h
clang::tok::TokenKind
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
clang::format::FormatStyle::isCSharp
bool isCSharp() const
Definition: Format.h:2616
clang::format::FormatStyle::Language
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:2623
clang::format::FormatStyle::LK_Java
@ LK_Java
Should be used for Java.
Definition: Format.h:2595
clang::format::FormatTok
return FormatTok
Definition: FormatTokenLexer.cpp:1224
Begin
SourceLocation Begin
Definition: USRLocFinder.cpp:165
llvm::ArrayRef
Definition: LLVM.h:34
clang::SourceManager::getCharacterData
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
Definition: SourceManager.cpp:1155
clang::format::FormatToken::setFinalizedType
void setFinalizedType(TokenType T)
Sets the type and also the finalized flag.
Definition: FormatToken.h:374
clang::format::FormatToken::is
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:525
clang::format::FormatToken::isOneOf
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:537
clang::ComparisonCategoryResult::Equal
@ Equal
clang::SourceManager::getBufferData
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
Definition: SourceManager.cpp:733
clang::format::FormatToken::LastLineColumnWidth
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:404
clang::IdentifierInfo
One of these records is kept for each identifier that is lexed.
Definition: IdentifierTable.h:85
clang::format::FormatStyle::MacroBlockBegin
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:2685
clang::Token::getLocation
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:125
clang::Token::setLocation
void setLocation(SourceLocation L)
Definition: Token.h:133
FormatTokenLexer.h
clang::Builtin::ID
ID
Definition: Builtins.h:52
clang::format::FormatStyle::AttributeMacros
std::vector< std::string > AttributeMacros
A vector of strings that should be interpreted as attributes/qualifiers instead of identifiers.
Definition: Format.h:809
clang
Definition: CalledOnceCheck.h:17
clang::ComparisonCategoryType::Last
@ Last
clang::FileID
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Definition: SourceLocation.h:38
clang::IdentifierTable
Implements an efficient mapping from strings to IdentifierInfo nodes.
Definition: IdentifierTable.h:564
clang::Token::setLength
void setLength(unsigned Len)
Definition: Token.h:134
clang::format::FormatStyle::isCpp
bool isCpp() const
Definition: Format.h:2615
clang::format::FormatToken::LastNewlineOffset
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart).
Definition: FormatToken.h:395
clang::Token::setKind
void setKind(tok::TokenKind K)
Definition: Token.h:93
clang::format::FormatToken::ColumnWidth
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:400
clang::Token::isOneOf
bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const
Definition: Token.h:99
clang::format::encoding::columnWidthWithTabs
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
FormatToken.h
clang::format::FormatStyle::TypenameMacros
std::vector< std::string > TypenameMacros
A vector of macros that should be interpreted as type declarations instead of as function calls.
Definition: Format.h:2205
clang::format::FormatStyle::ForEachMacros
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls.
Definition: Format.h:2167
clang::comments::tok::eof
@ eof
Definition: CommentLexer.h:33
true
#define true
Definition: stdbool.h:21
clang::format::FormatToken::HasUnescapedNewline
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:271
clang::format::TokenType
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:155
clang::format::FormatToken::WhitespaceRange
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:267
clang::SourceManager::getBufferOrFake
llvm::MemoryBufferRef getBufferOrFake(FileID FID, SourceLocation Loc=SourceLocation()) const
Return the buffer for the specified FileID.
Definition: SourceManager.h:1028
Type
MatchType Type
Definition: ASTMatchFinder.cpp:71
clang::format::TOKEN_STASHED
@ TOKEN_STASHED
Definition: FormatTokenLexer.h:36