clang 23.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
37 MacroBlockBeginRegex(Style.MacroBlockBegin),
38 MacroBlockEndRegex(Style.MacroBlockEnd) {
39 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
40 Lex->SetKeepWhitespaceMode(true);
41
42 for (const std::string &ForEachMacro : Style.ForEachMacros) {
43 auto Identifier = &IdentTable.get(ForEachMacro);
44 Macros.insert({Identifier, TT_ForEachMacro});
45 }
46 for (const std::string &IfMacro : Style.IfMacros) {
47 auto Identifier = &IdentTable.get(IfMacro);
48 Macros.insert({Identifier, TT_IfMacro});
49 }
50 for (const std::string &AttributeMacro : Style.AttributeMacros) {
51 auto Identifier = &IdentTable.get(AttributeMacro);
52 Macros.insert({Identifier, TT_AttributeMacro});
53 }
54 for (const std::string &StatementMacro : Style.StatementMacros) {
55 auto Identifier = &IdentTable.get(StatementMacro);
56 Macros.insert({Identifier, TT_StatementMacro});
57 }
58 for (const std::string &TypenameMacro : Style.TypenameMacros) {
59 auto Identifier = &IdentTable.get(TypenameMacro);
60 Macros.insert({Identifier, TT_TypenameMacro});
61 }
62 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
63 auto Identifier = &IdentTable.get(NamespaceMacro);
64 Macros.insert({Identifier, TT_NamespaceMacro});
65 }
66 for (const std::string &WhitespaceSensitiveMacro :
67 Style.WhitespaceSensitiveMacros) {
68 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
69 Macros.insert({Identifier, TT_UntouchableMacroFunc});
70 }
71 for (const std::string &StatementAttributeLikeMacro :
72 Style.StatementAttributeLikeMacros) {
73 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
74 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
75 }
76
77 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
78 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
79 for (const auto &TemplateName : Style.TemplateNames)
80 TemplateNames.insert(&IdentTable.get(TemplateName));
81 for (const auto &TypeName : Style.TypeNames)
82 TypeNames.insert(&IdentTable.get(TypeName));
83 for (const auto &VariableTemplate : Style.VariableTemplates)
84 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
85}
86
88 assert(Tokens.empty());
89 assert(FirstInLineIndex == 0);
90 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
91 do {
92 Tokens.push_back(getNextToken());
93 auto &Tok = *Tokens.back();
94 const auto NewlinesBefore = Tok.NewlinesBefore;
95 switch (FormatOff) {
96 case FO_NextLine:
97 if (NewlinesBefore > 1) {
98 FormatOff = FO_None;
99 } else {
100 Tok.Finalized = true;
101 FormatOff = FO_CurrentLine;
102 }
103 break;
104 case FO_CurrentLine:
105 if (NewlinesBefore == 0) {
106 Tok.Finalized = true;
107 break;
108 }
109 FormatOff = FO_None;
110 [[fallthrough]];
111 default:
112 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
113 if (Tok.is(tok::comment) &&
114 (NewlinesBefore > 0 || Tokens.size() == 1)) {
115 Tok.Finalized = true;
116 FormatOff = FO_NextLine;
117 } else {
118 for (auto *Token : reverse(Tokens)) {
119 Token->Finalized = true;
120 if (Token->NewlinesBefore > 0)
121 break;
122 }
123 FormatOff = FO_CurrentLine;
124 }
125 }
126 }
127 if (Style.isJavaScript()) {
128 tryParseJSRegexLiteral();
129 handleTemplateStrings();
130 } else if (Style.isTextProto()) {
131 tryParsePythonComment();
132 }
133 tryMergePreviousTokens();
134 if (Style.isCSharp()) {
135 // This needs to come after tokens have been merged so that C#
136 // string literals are correctly identified.
137 handleCSharpVerbatimAndInterpolatedStrings();
138 } else if (Style.isTableGen()) {
139 handleTableGenMultilineString();
140 handleTableGenNumericLikeIdentifier();
141 }
142 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
143 FirstInLineIndex = Tokens.size() - 1;
144 } while (Tokens.back()->isNot(tok::eof));
145 if (Style.InsertNewlineAtEOF) {
146 auto &TokEOF = *Tokens.back();
147 if (TokEOF.NewlinesBefore == 0) {
148 TokEOF.NewlinesBefore = 1;
149 TokEOF.OriginalColumn = 0;
150 }
151 }
152 return Tokens;
153}
154
// Tries to fold the most recently lexed tokens into a single logical token
// (or to retype them), dispatching on the configured language. Each
// tryMerge*/tryTransform* helper returns true when it consumed the token
// sequence; the first match wins and no further merging is attempted for
// this token.
void FormatTokenLexer::tryMergePreviousTokens() {
  // Language-independent merges first.
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;
  if (Style.isCpp() && tryTransformTryUsageForC())
    return;

  if ((Style.Language == FormatStyle::LK_Cpp ||
       Style.Language == FormatStyle::LK_ObjC) &&
      tryMergeUserDefinedLiteral()) {
    return;
  }

  // Operators shared by JavaScript and C#: =>, ??, ?., ??=.
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;

    // C#-only merges: @keyword, @"..."/$"...", foreach, ?[.
    if (Style.isCSharp()) {
      static const tok::TokenKind CSharpNullConditionalLSquare[] = {
          tok::question, tok::l_square};

      if (tryMergeCSharpKeywordVariables())
        return;
      if (tryMergeCSharpStringLiteral())
        return;
      if (tryTransformCSharpForEach())
        return;
      if (tryMergeTokens(CSharpNullConditionalLSquare,
                         TT_CSharpNullConditionalLSquare)) {
        // Treat like a regular "[" operator.
        Tokens.back()->Tok.setKind(tok::l_square);
        return;
      }
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  } else if (Style.isJava()) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  } else if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    // Compound assignment-like operators get assignment precedence.
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
    // Merge a `#` + (`-` or `=`) + `#` sequence into one binary operator.
    if (Tokens.size() >= 3 &&
        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
        tryMergeTokens(3, TT_BinaryOperator)) {
      Tokens.back()->setFinalizedType(TT_BinaryOperator);
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  } else if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if". That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
376
377bool FormatTokenLexer::tryMergeNSStringLiteral() {
378 if (Tokens.size() < 2)
379 return false;
380 auto &At = *(Tokens.end() - 2);
381 auto &String = *(Tokens.end() - 1);
382 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
383 return false;
384 At->Tok.setKind(tok::string_literal);
385 At->TokenText = StringRef(At->TokenText.begin(),
386 String->TokenText.end() - At->TokenText.begin());
387 At->ColumnWidth += String->ColumnWidth;
388 At->setType(TT_ObjCStringLiteral);
389 Tokens.erase(Tokens.end() - 1);
390 return true;
391}
392
// Merges #identifier into a single identifier with the text #identifier
// but the token tok::identifier, typed TT_JsPrivateIdentifier.
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
  if (Tokens.size() < 2)
    return false;
  auto &Hash = *(Tokens.end() - 2);
  auto &Identifier = *(Tokens.end() - 1);
  if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
    return false;
  // Turn the '#' token into the merged identifier and widen its text to
  // cover the following identifier as well, then drop the identifier token.
  Hash->Tok.setKind(tok::identifier);
  Hash->TokenText =
      StringRef(Hash->TokenText.begin(),
                Identifier->TokenText.end() - Hash->TokenText.begin());
  Hash->ColumnWidth += Identifier->ColumnWidth;
  Hash->setType(TT_JsPrivateIdentifier);
  Tokens.erase(Tokens.end() - 1);
  return true;
}
411
// Search for verbatim or interpolated string literals @"ABC" or
// $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral, to
// prevent splitting of @, $ and ".
// Merging of multiline verbatim strings with embedded '"' is handled in
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
  if (Tokens.size() < 2)
    return false;

  // Look for @"aaaaaa" or $"aaaaaa".
  const auto String = *(Tokens.end() - 1);
  if (String->isNot(tok::string_literal))
    return false;

  auto Prefix = *(Tokens.end() - 2);
  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
    return false;

  if (Tokens.size() > 2) {
    const auto Tok = *(Tokens.end() - 3);
    if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
        (Tok->is(tok::at) && Prefix->TokenText == "$")) {
      // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
      // Fold the middle token's width into the first and make the first the
      // new prefix; the text merge below then spans all three.
      Tok->ColumnWidth += Prefix->ColumnWidth;
      Tokens.erase(Tokens.end() - 2);
      Prefix = Tok;
    }
  }

  // Convert back into just a string_literal: the prefix token absorbs the
  // string's text and width, and the string token is dropped.
  Prefix->Tok.setKind(tok::string_literal);
  Prefix->TokenText =
      StringRef(Prefix->TokenText.begin(),
                String->TokenText.end() - Prefix->TokenText.begin());
  Prefix->ColumnWidth += String->ColumnWidth;
  Prefix->setType(TT_CSharpStringLiteral);
  Tokens.erase(Tokens.end() - 1);
  return true;
}
451
// Valid C# attribute targets:
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
// Lookup set of the identifiers that may appear as the target specifier in
// a C# attribute, e.g. the `assembly` in `[assembly: ...]`.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
    "assembly", "module",   "field",  "event", "method",
    "param",    "property", "return", "type",
};
458
459bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
460 if (Tokens.size() < 2)
461 return false;
462 auto &NullishCoalescing = *(Tokens.end() - 2);
463 auto &Equal = *(Tokens.end() - 1);
464 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
465 Equal->isNot(tok::equal)) {
466 return false;
467 }
468 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
469 NullishCoalescing->TokenText =
470 StringRef(NullishCoalescing->TokenText.begin(),
471 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
472 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
473 NullishCoalescing->setType(TT_NullCoalescingEqual);
474 Tokens.erase(Tokens.end() - 1);
475 return true;
476}
477
478bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
479 if (Tokens.size() < 2)
480 return false;
481 const auto At = *(Tokens.end() - 2);
482 if (At->isNot(tok::at))
483 return false;
484 const auto Keyword = *(Tokens.end() - 1);
485 if (Keyword->TokenText == "$")
486 return false;
487 if (!Keywords.isCSharpKeyword(*Keyword))
488 return false;
489
490 At->Tok.setKind(tok::identifier);
491 At->TokenText = StringRef(At->TokenText.begin(),
492 Keyword->TokenText.end() - At->TokenText.begin());
493 At->ColumnWidth += Keyword->ColumnWidth;
494 At->setType(Keyword->getType());
495 Tokens.erase(Tokens.end() - 1);
496 return true;
497}
498
// In C# transform identifier foreach into kw_foreach: the trailing
// identifier token spelled "foreach" is retyped as a `for` keyword
// annotated TT_ForEachMacro so it is formatted like a for loop.
bool FormatTokenLexer::tryTransformCSharpForEach() {
  if (Tokens.empty())
    return false;
  auto &Identifier = *(Tokens.end() - 1);
  if (Identifier->isNot(tok::identifier))
    return false;
  if (Identifier->TokenText != "foreach")
    return false;

  Identifier->setType(TT_ForEachMacro);
  Identifier->Tok.setKind(tok::kw_for);
  return true;
}
513
514bool FormatTokenLexer::tryMergeForEach() {
515 if (Tokens.size() < 2)
516 return false;
517 auto &For = *(Tokens.end() - 2);
518 auto &Each = *(Tokens.end() - 1);
519 if (For->isNot(tok::kw_for))
520 return false;
521 if (Each->isNot(tok::identifier))
522 return false;
523 if (Each->TokenText != "each")
524 return false;
525
526 For->setType(TT_ForEachMacro);
527 For->Tok.setKind(tok::kw_for);
528
529 For->TokenText = StringRef(For->TokenText.begin(),
530 Each->TokenText.end() - For->TokenText.begin());
531 For->ColumnWidth += Each->ColumnWidth;
532 Tokens.erase(Tokens.end() - 1);
533 return true;
534}
535
536bool FormatTokenLexer::tryTransformTryUsageForC() {
537 if (Tokens.size() < 2)
538 return false;
539 auto &Try = *(Tokens.end() - 2);
540 if (Try->isNot(tok::kw_try))
541 return false;
542 auto &Next = *(Tokens.end() - 1);
543 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
544 return false;
545
546 if (Tokens.size() > 2) {
547 auto &At = *(Tokens.end() - 3);
548 if (At->is(tok::at))
549 return false;
550 }
551
552 Try->Tok.setKind(tok::identifier);
553 return true;
554}
555
bool FormatTokenLexer::tryMergeLessLess() {
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
  if (Tokens.size() < 3)
    return false;

  auto First = Tokens.end() - 3;
  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
    return false;

  // Only merge if there currently is no whitespace between the two "<".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // X is the token preceding the first "<", if any; "<<<" never merges.
  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
  if (X && X->is(tok::less))
    return false;

  // Y is the token following the two "<". A third "<" blocks the merge,
  // except after `operator` (i.e. `operator<<` is still merged).
  auto Y = First[2];
  if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
    return false;

  // Rewrite the first "<" into "<<" and drop the second one.
  First[0]->Tok.setKind(tok::lessless);
  First[0]->TokenText = "<<";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 2);
  return true;
}
583
bool FormatTokenLexer::tryMergeGreaterGreater() {
  // Merge kw_operator,greater,greater into kw_operator,greatergreater.
  if (Tokens.size() < 2)
    return false;

  auto First = Tokens.end() - 2;
  if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
    return false;

  // Only merge if there currently is no whitespace between the first two ">".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // Only merge when the token before the ">"s (if any) is `operator`.
  auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
  if (Tok && Tok->isNot(tok::kw_operator))
    return false;

  // Rewrite the first ">" into ">>" and drop the second one.
  First[0]->Tok.setKind(tok::greatergreater);
  First[0]->TokenText = ">>";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 1);
  return true;
}
607
// Merges a literal token whose spelling ends in `_` with an immediately
// following `$` token into a single literal (e.g. a user-defined literal
// whose suffix contains `$` -- NOTE(review): the code only checks for the
// trailing underscore and the `$` spelling; confirm intended grammar.
bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
  if (Tokens.size() < 2)
    return false;

  auto *First = Tokens.end() - 2;
  auto &Suffix = First[1];
  // The `$` must directly follow the literal with no whitespace.
  if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
    return false;

  auto &Literal = First[0];
  if (!Literal->Tok.isLiteral())
    return false;

  auto &Text = Literal->TokenText;
  if (!Text.ends_with("_"))
    return false;

  // Extend the literal's text by one character to absorb the adjacent `$`
  // (adjacency was checked above), then drop the suffix token.
  Text = StringRef(Text.data(), Text.size() + 1);
  ++Literal->ColumnWidth;
  Tokens.erase(&Suffix);
  return true;
}
630
631bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
632 TokenType NewType) {
633 if (Tokens.size() < Kinds.size())
634 return false;
635
636 const auto *First = Tokens.end() - Kinds.size();
637 for (unsigned i = 0; i < Kinds.size(); ++i)
638 if (First[i]->isNot(Kinds[i]))
639 return false;
640
641 return tryMergeTokens(Kinds.size(), NewType);
642}
643
// Merges the last \p Count tokens into a single token of type \p NewType.
// Fails if any pair of adjacent tokens in the run is separated by
// whitespace, since then they were not one contiguous spelling.
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  if (Tokens.size() < Count)
    return false;

  const auto *First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Shrinking the vector does not reallocate, so First stays valid and now
  // points at the last remaining element, which becomes the merged token.
  Tokens.resize(Tokens.size() - Count + 1);
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->setType(NewType);
  return true;
}
665
666bool FormatTokenLexer::tryMergeTokensAny(
668 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
669 return tryMergeTokens(Kinds, NewType);
670 });
671}
672
673// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
674bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
675 // NB: This is not entirely correct, as an r_paren can introduce an operand
676 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
677 // corner case to not matter in practice, though.
678 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
679 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
680 tok::colon, tok::question, tok::tilde) ||
681 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
682 tok::kw_else, tok::kw_void, tok::kw_typeof,
683 Keywords.kw_instanceof, Keywords.kw_in) ||
684 Tok->isPlacementOperator() || Tok->isBinaryOperator();
685}
686
687bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
688 if (!Prev)
689 return true;
690
691 // Regex literals can only follow after prefix unary operators, not after
692 // postfix unary operators. If the '++' is followed by a non-operand
693 // introducing token, the slash here is the operand and not the start of a
694 // regex.
695 // `!` is an unary prefix operator, but also a post-fix operator that casts
696 // away nullability, so the same check applies.
697 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
698 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
699
700 // The previous token must introduce an operand location where regex
701 // literals can occur.
702 if (!precedesOperand(Prev))
703 return false;
704
705 return true;
706}
707
// If the current token is `""` and the next buffer character is another
// `"`, the three quotes open a Java text block; skip the lexer past the
// closing `"""` so the block's contents are not tokenized.
void FormatTokenLexer::tryParseJavaTextBlock() {
  if (FormatTok->TokenText != "\"\"")
    return;

  const auto *S = Lex->getBufferLocation();
  const auto *End = Lex->getBuffer().end();

  // A text block needs a third `"` directly after the `""` token.
  if (S == End || *S != '\"')
    return;

  ++S; // Skip the `"""` that begins a text block.

  // Find the `"""` that ends the text block: three consecutive quotes with
  // no backslash escape in between. Count tracks the current quote run.
  bool Escaped = false;
  for (int Count = 0; Count < 3 && S < End; ++S) {
    if (Escaped) {
      // The character after a backslash never counts toward the quote run.
      Escaped = false;
      continue;
    }
    switch (*S) {
    case '\"':
      ++Count;
      break;
    case '\\':
      Escaped = true;
      [[fallthrough]]; // A backslash also resets the quote run.
    default:
      Count = 0;
    }
  }

  // Resume lexing after the closing quotes (or at the buffer end).
  // Ignore the possibly invalid text block.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
}
742
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token; it decides whether the slash can start
  // a regex (operand position) or must be a division operator.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Continue lexing just past the end of the regex literal.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
801
// Scans the body of a C# string literal starting at \p Begin (just past the
// opening quote) and returns a pointer to the terminating '"', or \p End if
// the literal is unterminated.
//
// Make no effort to format code within an interpolated or verbatim string:
// interpolated strings could contain { } with " characters inside, e.g.
// $"{x ?? "null"}" must stay a single string literal rather than be split
// at the inner quotes. Formatting expressions inside {} would require
// similar work as that done for JavaScript template strings in
// `handleTemplateStrings()`, so interpolation holes are skipped wholesale.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the current character is immediately doubled ("" , {{ , }}).
  const auto NextIsSame = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  int OpenBraces = 0; // Unmatched `{` of interpolation holes.
  while (Begin < End) {
    switch (*Begin) {
    case '\\':
      // Backslash escapes exist only outside verbatim strings.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        if (NextIsSame())
          ++Begin; // `{{` is an escaped brace: skip it.
        else
          ++OpenBraces;
      }
      break;
    case '}':
      if (Interpolated) {
        if (NextIsSame())
          ++Begin; // `}}` is an escaped brace: skip it.
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End; // Unmatched `}`: treat as unterminated.
      }
      break;
    case '"':
      if (OpenBraces > 0)
        break; // A quote inside an interpolation hole does not end it.
      if (Verbatim && NextIsSame()) {
        ++Begin; // `""` is an escaped quote in a verbatim string: skip it.
        break;
      }
      return Begin;
    }
    ++Begin;
  }

  return End;
}
860
// Re-lexes a C# verbatim/interpolated string literal so its token text
// covers the whole (possibly multiline) literal, updates the token's width
// bookkeeping, and resumes the lexer just past the closing quote.
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Classify the literal by its prefix: @ = verbatim, $ = interpolated.
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Rewind to the start of the literal in the buffer, then skip the prefix
  // and opening quote before scanning for the terminating quote.
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  Offset += Verbatim && Interpolated ? 3 : 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  // Include the closing quote in the token text.
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    unsigned StartColumn = 0;
    // LastLineColumnWidth holds the width of the text after the last newline.
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  assert(Offset < End);
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
919
// Expands a merged `[{` token (TT_TableGenMultiLineString) so its text
// covers the whole multiline string up to and including the closing `}]`,
// updates width bookkeeping, and resumes the lexer after the string.
void FormatTokenLexer::handleTableGenMultilineString() {
  FormatToken *MultiLineString = Tokens.back();
  if (MultiLineString->isNot(TT_TableGenMultiLineString))
    return;

  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
  // "}]" is the end of multi line string.
  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
  if (CloseOffset == StringRef::npos)
    return;
  // Take the text including both delimiters.
  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
  MultiLineString->TokenText = Text;
  resetLexer(SourceMgr.getFileOffset(
      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
  auto FirstLineText = Text;
  auto FirstBreak = Text.find('\n');
  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
  if (FirstBreak != StringRef::npos) {
    MultiLineString->IsMultiline = true;
    FirstLineText = Text.substr(0, FirstBreak + 1);
    // LastLineColumnWidth holds the width of the last line.
    auto LastBreak = Text.rfind('\n');
    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
        Style.TabWidth, Encoding);
  }
  // ColumnWidth holds only the width of the first line.
  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
950
951void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
952 FormatToken *Tok = Tokens.back();
953 // TableGen identifiers can begin with digits. Such tokens are lexed as
954 // numeric_constant now.
955 if (Tok->isNot(tok::numeric_constant))
956 return;
957 StringRef Text = Tok->TokenText;
958 // The following check is based on llvm::TGLexer::LexToken.
959 // That lexes the token as a number if any of the following holds:
960 // 1. It starts with '+', '-'.
961 // 2. All the characters are digits.
962 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
963 // 4. The first non-digit character is 'x', and the next is a hex digit.
964 // Note that in the case 3 and 4, if the next character does not exists in
965 // this token, the token is an identifier.
966 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
967 return;
968 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
969 // All the characters are digits
970 if (NonDigitPos == StringRef::npos)
971 return;
972 char FirstNonDigit = Text[NonDigitPos];
973 if (NonDigitPos < Text.size() - 1) {
974 char TheNext = Text[NonDigitPos + 1];
975 // Regarded as a binary number.
976 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
977 return;
978 // Regarded as hex number.
979 if (FirstNonDigit == 'x' && isxdigit(TheNext))
980 return;
981 }
982 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
983 // This is actually an identifier in TableGen.
984 Tok->Tok.setKind(tok::identifier);
985 Tok->Tok.setIdentifierInfo(nullptr);
986 }
987}
988
/// Handles JavaScript template string literals delimited by backticks.
/// Interpolation sections introduced by "${" push a NORMAL state so their
/// contents are lexed as regular code; matching braces pop back into
/// TEMPLATE_STRING state to resume scanning the literal.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // A nested block inside an interpolation; keep lexing normally.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    // Never pop the bottom NORMAL state.
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    // An opening backtick starts a new template string.
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string (or its tail) ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Turn the backtick token into a string literal covering the scanned text.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume normal lexing right after the consumed template text.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
1054
1055void FormatTokenLexer::tryParsePythonComment() {
1056 FormatToken *HashToken = Tokens.back();
1057 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1058 return;
1059 // Turn the remainder of this line into a comment.
1060 const char *CommentBegin =
1061 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1062 size_t From = CommentBegin - Lex->getBuffer().begin();
1063 size_t To = Lex->getBuffer().find_first_of('\n', From);
1064 if (To == StringRef::npos)
1065 To = Lex->getBuffer().size();
1066 size_t Len = To - From;
1067 HashToken->setType(TT_LineComment);
1068 HashToken->Tok.setKind(tok::comment);
1069 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1070 SourceLocation Loc = To < Lex->getBuffer().size()
1071 ? Lex->getSourceLocation(CommentBegin + Len)
1072 : SourceMgr.getLocForEndOfFile(ID);
1073 resetLexer(SourceMgr.getFileOffset(Loc));
1074}
1075
1076bool FormatTokenLexer::tryMerge_TMacro() {
1077 if (Tokens.size() < 4)
1078 return false;
1079 FormatToken *Last = Tokens.back();
1080 if (Last->isNot(tok::r_paren))
1081 return false;
1082
1083 FormatToken *String = Tokens[Tokens.size() - 2];
1084 if (String->isNot(tok::string_literal) || String->IsMultiline)
1085 return false;
1086
1087 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1088 return false;
1089
1090 FormatToken *Macro = Tokens[Tokens.size() - 4];
1091 if (Macro->TokenText != "_T")
1092 return false;
1093
1094 const char *Start = Macro->TokenText.data();
1095 const char *End = Last->TokenText.data() + Last->TokenText.size();
1096 String->TokenText = StringRef(Start, End - Start);
1097 String->IsFirst = Macro->IsFirst;
1098 String->LastNewlineOffset = Macro->LastNewlineOffset;
1099 String->WhitespaceRange = Macro->WhitespaceRange;
1100 String->OriginalColumn = Macro->OriginalColumn;
1101 String->ColumnWidth = encoding::columnWidthWithTabs(
1102 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1103 String->NewlinesBefore = Macro->NewlinesBefore;
1104 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1105
1106 Tokens.pop_back();
1107 Tokens.pop_back();
1108 Tokens.pop_back();
1109 Tokens.back() = String;
1110 if (FirstInLineIndex >= Tokens.size())
1111 FirstInLineIndex = Tokens.size() - 1;
1112 return true;
1113}
1114
1115bool FormatTokenLexer::tryMergeConflictMarkers() {
1116 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1117 return false;
1118
1119 // Conflict lines look like:
1120 // <marker> <text from the vcs>
1121 // For example:
1122 // >>>>>>> /file/in/file/system at revision 1234
1123 //
1124 // We merge all tokens in a line that starts with a conflict marker
1125 // into a single token with a special token type that the unwrapped line
1126 // parser will use to correctly rebuild the underlying code.
1127
1128 FileID ID;
1129 // Get the position of the first token in the line.
1130 unsigned FirstInLineOffset;
1131 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1132 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1133 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1134 // Calculate the offset of the start of the current line.
1135 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1136 if (LineOffset == StringRef::npos)
1137 LineOffset = 0;
1138 else
1139 ++LineOffset;
1140
1141 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1142 StringRef LineStart;
1143 if (FirstSpace == StringRef::npos)
1144 LineStart = Buffer.substr(LineOffset);
1145 else
1146 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1147
1148 TokenType Type = TT_Unknown;
1149 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1150 Type = TT_ConflictStart;
1151 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1152 LineStart == "====") {
1153 Type = TT_ConflictAlternative;
1154 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1155 Type = TT_ConflictEnd;
1156 }
1157
1158 if (Type != TT_Unknown) {
1159 FormatToken *Next = Tokens.back();
1160
1161 Tokens.resize(FirstInLineIndex + 1);
1162 // We do not need to build a complete token here, as we will skip it
1163 // during parsing anyway (as we must not touch whitespace around conflict
1164 // markers).
1165 Tokens.back()->setType(Type);
1166 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1167
1168 Tokens.push_back(Next);
1169 return true;
1170 }
1171
1172 return false;
1173}
1174
1175FormatToken *FormatTokenLexer::getStashedToken() {
1176 // Create a synthesized second '>' or '<' token.
1177 Token Tok = FormatTok->Tok;
1178 StringRef TokenText = FormatTok->TokenText;
1179
1180 unsigned OriginalColumn = FormatTok->OriginalColumn;
1181 FormatTok = new (Allocator.Allocate()) FormatToken;
1182 FormatTok->Tok = Tok;
1183 SourceLocation TokLocation =
1184 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1185 FormatTok->Tok.setLocation(TokLocation);
1186 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1187 FormatTok->TokenText = TokenText;
1188 FormatTok->ColumnWidth = 1;
1189 FormatTok->OriginalColumn = OriginalColumn + 1;
1190
1191 return FormatTok;
1192}
1193
/// Truncate the current token to the new length and make the lexer continue
/// from the end of the truncated token. Used for other languages that have
/// different token boundaries, like JavaScript in which a comment ends at a
/// line break regardless of whether the line break follows a backslash. Also
/// used to set the lexer to the end of whitespace if the lexer regards
/// whitespace and an unrecognized symbol as one token.
void FormatTokenLexer::truncateToken(size_t NewLen) {
  assert(NewLen <= FormatTok->TokenText.size());
  // Restart lexing right after the truncated text. This must happen before
  // TokenText is shortened, because the target offset is computed from the
  // current buffer location and the *original* token length.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
      Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
  // Recompute the display width for the shortened text.
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
      Encoding);
  FormatTok->Tok.setLength(NewLen);
}
1210
/// Count the length of leading whitespace in a token, where an escaped
/// newline (backslash, optional horizontal whitespace, newline) also counts
/// as whitespace.
static size_t countLeadingWhitespace(StringRef Text) {
  // Basically counting the length matched by this regex.
  // "^([\n\r\f\v \t]|\\\\[\n\r])+"
  // Directly using the regex turned out to be slow. With the regex
  // version formatting all files in this directory took about 1.25
  // seconds. This version took about 0.5 seconds.
  const unsigned char *const Begin = Text.bytes_begin();
  const unsigned char *const End = Text.bytes_end();
  const unsigned char *Cur = Begin;
  while (Cur < End) {
    if (isWhitespace(Cur[0])) {
      ++Cur;
    } else if (Cur[0] == '\\') {
      // A backslash followed by optional horizontal whitespaces (P2223R2) and
      // then a newline always escapes the newline.
      // The source has a null byte at the end. So the end of the entire input
      // isn't reached yet. Also the lexer doesn't break apart an escaped
      // newline.
      const auto *Lookahead = Cur + 1;
      while (isHorizontalWhitespace(*Lookahead))
        ++Lookahead;
      // No line splice found; the backslash is a token.
      if (!isVerticalWhitespace(*Lookahead))
        break;
      // Splice found, consume it.
      Cur = Lookahead + 1;
    } else {
      break;
    }
  }
  return Cur - Begin;
}
1244
/// Lexes and returns the next significant token: consumes and records all
/// leading whitespace (updating Column and newline bookkeeping), then applies
/// language-specific normalizations (comment truncation, Verilog/TableGen
/// token fixups, macro classification) before handing the token out.
FormatToken *FormatTokenLexer::getNextToken() {
  // A '>>' or '<<' was split earlier; emit the stashed second half first.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character to maintain Column and the
    // newline counters.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
        if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
            // The form feed is immediately preceded and followed by a newline.
            i > 0 && Text[i - 1] == '\n' &&
            ((i + 1 < e && Text[i + 1] == '\n') ||
             (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
          FormatTok->HasFormFeedBefore = true;
        }
        [[fallthrough]];
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // The code preceding the loop and in the countLeadingWhitespace
        // function guarantees that Text is entirely whitespace, not including
        // comments but including escaped newlines. So the character shows up,
        // then it has to be in an escape sequence.
        assert([&]() -> bool {
          size_t j = i + 1;
          while (j < Text.size() && isHorizontalWhitespace(Text[j]))
            ++j;
          return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
        }());
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // A non-whitespace unknown token is treated as text to be preserved as-is.
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  const bool IsCpp = Style.isCpp();

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if (const auto Text = FormatTok->TokenText;
      Text.starts_with("//") &&
      (IsCpp || Style.isJavaScript() || Style.isJava())) {
    assert(FormatTok->is(tok::comment));
    for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
         Pos = Text.find('\\', Pos)) {
      if (Pos < Text.size() && Text[Pos] == '\n' &&
          (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
        truncateToken(Pos);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    // Words that are C/C++ keywords but plain identifiers in the current
    // language are demoted back to identifiers.
    if (Style.isJava() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isVerilog() && Keywords.isVerilogIdentifier(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    }
  } else if (const bool Greater = FormatTok->is(tok::greatergreater);
             Greater || FormatTok->is(tok::lessless)) {
    // Split '>>'/'<<' into two tokens; the second half is stashed and
    // returned by the next getNextToken() call.
    FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
    tryParseJavaTextBlock();
  }

  if (Style.isVerilog() && !Tokens.empty() &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (IsCpp) {
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    // Configured macros are classified here, unless the previous token is
    // `#define`, in which case the name is a definition, not a use.
    if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
         Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
             tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
        FormatTok->setFinalizedType(TT_FunctionLikeMacro);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
      else if (VariableTemplates.contains(Identifier))
        FormatTok->setFinalizedType(TT_VariableTemplate);
    }
  }

  return FormatTok;
}
1493
/// Verilog-specific raw lexing: recognizes the bare quote, the backtick and
/// double backtick, and backslash-escaped identifiers, which Clang's raw
/// lexer would tokenize differently. Returns true if a token was produced
/// here; false means the caller should fall back to the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  const char *Start = Lex->getBufferLocation();
  size_t Len;
  switch (Start[0]) {
  // In Verilog the quote is not a character literal.
  case '\'':
    Len = 1;
    break;
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  case '`':
    if (Start[1] == '`')
      Len = 2;
    else
      Len = 1;
    break;
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  case '\\':
    // A backslash can also begin an escaped newline outside of an escaped
    // identifier.
    if (Start[1] == '\r' || Start[1] == '\n')
      return false;
    Len = 1;
    // Consume characters until (unescaped) whitespace or the end of buffer.
    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
           Start[Len] != ' ') {
      // There is a null byte at the end of the buffer, so we don't have to
      // check whether the next byte is within the buffer.
      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
          Start[Len + 2] == '\n') {
        Len += 3;
      } else if (Start[Len] == '\\' &&
                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
        Len += 2;
      } else {
        Len += 1;
      }
    }
    break;
  default:
    return false;
  }

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1551
/// Reads one raw token into \p Tok, normalizes a few token kinds for
/// formatting purposes, and applies the current clang-format on/off state.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
    Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.starts_with("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      // JavaScript's empty single-quoted string is not a char constant.
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // The ordering around the Finalized assignment is deliberate: a
  // "clang-format on" comment re-enables formatting before Finalized is set,
  // so the "on" comment itself is formatted; a "clang-format off" comment
  // disables formatting only after Finalized is set, so the "off" comment
  // itself is also formatted.
  if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
    FormattingDisabled = true;
}
1581
1582void FormatTokenLexer::resetLexer(unsigned Offset) {
1583 StringRef Buffer = SourceMgr.getBufferData(ID);
1584 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1585 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1586 Lex->SetKeepWhitespaceMode(true);
1587 TrailingWhitespace = 0;
1588}
1589
1590} // namespace format
1591} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
FormatToken * Next
The next token in the unwrapped line.
bool is(tok::TokenKind Kind) const
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:140
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4594
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4590
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4234
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3704
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5533
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:562
@ Type
The name was classified as a type.
Definition Sema.h:564
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3709
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5523
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5584
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.