clang 22.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
37 MacroBlockBeginRegex(Style.MacroBlockBegin),
38 MacroBlockEndRegex(Style.MacroBlockEnd) {
39 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
40 Lex->SetKeepWhitespaceMode(true);
41
42 for (const std::string &ForEachMacro : Style.ForEachMacros) {
43 auto Identifier = &IdentTable.get(ForEachMacro);
44 Macros.insert({Identifier, TT_ForEachMacro});
45 }
46 for (const std::string &IfMacro : Style.IfMacros) {
47 auto Identifier = &IdentTable.get(IfMacro);
48 Macros.insert({Identifier, TT_IfMacro});
49 }
50 for (const std::string &AttributeMacro : Style.AttributeMacros) {
51 auto Identifier = &IdentTable.get(AttributeMacro);
52 Macros.insert({Identifier, TT_AttributeMacro});
53 }
54 for (const std::string &StatementMacro : Style.StatementMacros) {
55 auto Identifier = &IdentTable.get(StatementMacro);
56 Macros.insert({Identifier, TT_StatementMacro});
57 }
58 for (const std::string &TypenameMacro : Style.TypenameMacros) {
59 auto Identifier = &IdentTable.get(TypenameMacro);
60 Macros.insert({Identifier, TT_TypenameMacro});
61 }
62 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
63 auto Identifier = &IdentTable.get(NamespaceMacro);
64 Macros.insert({Identifier, TT_NamespaceMacro});
65 }
66 for (const std::string &WhitespaceSensitiveMacro :
67 Style.WhitespaceSensitiveMacros) {
68 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
69 Macros.insert({Identifier, TT_UntouchableMacroFunc});
70 }
71 for (const std::string &StatementAttributeLikeMacro :
72 Style.StatementAttributeLikeMacros) {
73 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
74 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
75 }
76
77 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
78 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
79 for (const auto &TemplateName : Style.TemplateNames)
80 TemplateNames.insert(&IdentTable.get(TemplateName));
81 for (const auto &TypeName : Style.TypeNames)
82 TypeNames.insert(&IdentTable.get(TypeName));
83 for (const auto &VariableTemplate : Style.VariableTemplates)
84 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
85}
86
88 assert(Tokens.empty());
89 assert(FirstInLineIndex == 0);
90 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
91 do {
92 Tokens.push_back(getNextToken());
93 auto &Tok = *Tokens.back();
94 const auto NewlinesBefore = Tok.NewlinesBefore;
95 switch (FormatOff) {
96 case FO_NextLine:
97 if (NewlinesBefore > 1) {
98 FormatOff = FO_None;
99 } else {
100 Tok.Finalized = true;
101 FormatOff = FO_CurrentLine;
102 }
103 break;
104 case FO_CurrentLine:
105 if (NewlinesBefore == 0) {
106 Tok.Finalized = true;
107 break;
108 }
109 FormatOff = FO_None;
110 [[fallthrough]];
111 default:
112 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
113 if (Tok.is(tok::comment) &&
114 (NewlinesBefore > 0 || Tokens.size() == 1)) {
115 Tok.Finalized = true;
116 FormatOff = FO_NextLine;
117 } else {
118 for (auto *Token : reverse(Tokens)) {
119 Token->Finalized = true;
120 if (Token->NewlinesBefore > 0)
121 break;
122 }
123 FormatOff = FO_CurrentLine;
124 }
125 }
126 }
127 if (Style.isJavaScript()) {
128 tryParseJSRegexLiteral();
129 handleTemplateStrings();
130 } else if (Style.isTextProto()) {
131 tryParsePythonComment();
132 }
133 tryMergePreviousTokens();
134 if (Style.isCSharp()) {
135 // This needs to come after tokens have been merged so that C#
136 // string literals are correctly identified.
137 handleCSharpVerbatimAndInterpolatedStrings();
138 } else if (Style.isTableGen()) {
139 handleTableGenMultilineString();
140 handleTableGenNumericLikeIdentifier();
141 }
142 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
143 FirstInLineIndex = Tokens.size() - 1;
144 } while (Tokens.back()->isNot(tok::eof));
145 if (Style.InsertNewlineAtEOF) {
146 auto &TokEOF = *Tokens.back();
147 if (TokEOF.NewlinesBefore == 0) {
148 TokEOF.NewlinesBefore = 1;
149 TokEOF.OriginalColumn = 0;
150 }
151 }
152 return Tokens;
153}
154
// Attempts to merge the most recently lexed tokens into a single FormatToken
// when they spell a multi-token construct in the current language (e.g. "<<",
// "=>", "??", Verilog module-path operators, TableGen bang operators). Each
// helper returns true once it has merged or transformed tokens, so at most
// one rule fires per newly lexed token.
void FormatTokenLexer::tryMergePreviousTokens() {
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;
  if (Style.isCpp() && tryTransformTryUsageForC())
    return;

  if ((Style.Language == FormatStyle::LK_Cpp ||
       Style.Language == FormatStyle::LK_ObjC) &&
      tryMergeUserDefinedLiteral()) {
    return;
  }

  // Rules shared by JavaScript and C#.
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;

    if (Style.isCSharp()) {
      static const tok::TokenKind CSharpNullConditionalLSquare[] = {
          tok::question, tok::l_square};

      if (tryMergeCSharpKeywordVariables())
        return;
      if (tryMergeCSharpStringLiteral())
        return;
      if (tryTransformCSharpForEach())
        return;
      if (tryMergeTokens(CSharpNullConditionalLSquare,
                         TT_CSharpNullConditionalLSquare)) {
        // Treat like a regular "[" operator.
        Tokens.back()->Tok.setKind(tok::l_square);
        return;
      }
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  } else if (Style.isJava()) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  } else if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
    if (Tokens.size() >= 3 &&
        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
        tryMergeTokens(3, TT_BinaryOperator)) {
      Tokens.back()->setFinalizedType(TT_BinaryOperator);
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  } else if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if". That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
376
377bool FormatTokenLexer::tryMergeNSStringLiteral() {
378 if (Tokens.size() < 2)
379 return false;
380 auto &At = *(Tokens.end() - 2);
381 auto &String = *(Tokens.end() - 1);
382 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
383 return false;
384 At->Tok.setKind(tok::string_literal);
385 At->TokenText = StringRef(At->TokenText.begin(),
386 String->TokenText.end() - At->TokenText.begin());
387 At->ColumnWidth += String->ColumnWidth;
388 At->setType(TT_ObjCStringLiteral);
389 Tokens.erase(Tokens.end() - 1);
390 return true;
391}
392
393bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
394 // Merges #idenfier into a single identifier with the text #identifier
395 // but the token tok::identifier.
396 if (Tokens.size() < 2)
397 return false;
398 auto &Hash = *(Tokens.end() - 2);
399 auto &Identifier = *(Tokens.end() - 1);
400 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
401 return false;
402 Hash->Tok.setKind(tok::identifier);
403 Hash->TokenText =
404 StringRef(Hash->TokenText.begin(),
405 Identifier->TokenText.end() - Hash->TokenText.begin());
406 Hash->ColumnWidth += Identifier->ColumnWidth;
407 Hash->setType(TT_JsPrivateIdentifier);
408 Tokens.erase(Tokens.end() - 1);
409 return true;
410}
411
412// Search for verbatim or interpolated string literals @"ABC" or
413// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
414// prevent splitting of @, $ and ".
415// Merging of multiline verbatim strings with embedded '"' is handled in
416// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
417bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
418 if (Tokens.size() < 2)
419 return false;
420
421 // Look for @"aaaaaa" or $"aaaaaa".
422 const auto String = *(Tokens.end() - 1);
423 if (String->isNot(tok::string_literal))
424 return false;
425
426 auto Prefix = *(Tokens.end() - 2);
427 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
428 return false;
429
430 if (Tokens.size() > 2) {
431 const auto Tok = *(Tokens.end() - 3);
432 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
433 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
434 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
435 Tok->ColumnWidth += Prefix->ColumnWidth;
436 Tokens.erase(Tokens.end() - 2);
437 Prefix = Tok;
438 }
439 }
440
441 // Convert back into just a string_literal.
442 Prefix->Tok.setKind(tok::string_literal);
443 Prefix->TokenText =
444 StringRef(Prefix->TokenText.begin(),
445 String->TokenText.end() - Prefix->TokenText.begin());
446 Prefix->ColumnWidth += String->ColumnWidth;
447 Prefix->setType(TT_CSharpStringLiteral);
448 Tokens.erase(Tokens.end() - 1);
449 return true;
450}
451
452// Valid C# attribute targets:
453// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
454const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
455 "assembly", "module", "field", "event", "method",
456 "param", "property", "return", "type",
457};
458
459bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
460 if (Tokens.size() < 2)
461 return false;
462 auto &NullishCoalescing = *(Tokens.end() - 2);
463 auto &Equal = *(Tokens.end() - 1);
464 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
465 Equal->isNot(tok::equal)) {
466 return false;
467 }
468 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
469 NullishCoalescing->TokenText =
470 StringRef(NullishCoalescing->TokenText.begin(),
471 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
472 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
473 NullishCoalescing->setType(TT_NullCoalescingEqual);
474 Tokens.erase(Tokens.end() - 1);
475 return true;
476}
477
478bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
479 if (Tokens.size() < 2)
480 return false;
481 const auto At = *(Tokens.end() - 2);
482 if (At->isNot(tok::at))
483 return false;
484 const auto Keyword = *(Tokens.end() - 1);
485 if (Keyword->TokenText == "$")
486 return false;
487 if (!Keywords.isCSharpKeyword(*Keyword))
488 return false;
489
490 At->Tok.setKind(tok::identifier);
491 At->TokenText = StringRef(At->TokenText.begin(),
492 Keyword->TokenText.end() - At->TokenText.begin());
493 At->ColumnWidth += Keyword->ColumnWidth;
494 At->setType(Keyword->getType());
495 Tokens.erase(Tokens.end() - 1);
496 return true;
497}
498
499// In C# transform identifier foreach into kw_foreach
500bool FormatTokenLexer::tryTransformCSharpForEach() {
501 if (Tokens.empty())
502 return false;
503 auto &Identifier = *(Tokens.end() - 1);
504 if (Identifier->isNot(tok::identifier))
505 return false;
506 if (Identifier->TokenText != "foreach")
507 return false;
508
509 Identifier->setType(TT_ForEachMacro);
510 Identifier->Tok.setKind(tok::kw_for);
511 return true;
512}
513
514bool FormatTokenLexer::tryMergeForEach() {
515 if (Tokens.size() < 2)
516 return false;
517 auto &For = *(Tokens.end() - 2);
518 auto &Each = *(Tokens.end() - 1);
519 if (For->isNot(tok::kw_for))
520 return false;
521 if (Each->isNot(tok::identifier))
522 return false;
523 if (Each->TokenText != "each")
524 return false;
525
526 For->setType(TT_ForEachMacro);
527 For->Tok.setKind(tok::kw_for);
528
529 For->TokenText = StringRef(For->TokenText.begin(),
530 Each->TokenText.end() - For->TokenText.begin());
531 For->ColumnWidth += Each->ColumnWidth;
532 Tokens.erase(Tokens.end() - 1);
533 return true;
534}
535
536bool FormatTokenLexer::tryTransformTryUsageForC() {
537 if (Tokens.size() < 2)
538 return false;
539 auto &Try = *(Tokens.end() - 2);
540 if (Try->isNot(tok::kw_try))
541 return false;
542 auto &Next = *(Tokens.end() - 1);
543 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
544 return false;
545
546 if (Tokens.size() > 2) {
547 auto &At = *(Tokens.end() - 3);
548 if (At->is(tok::at))
549 return false;
550 }
551
552 Try->Tok.setKind(tok::identifier);
553 return true;
554}
555
556bool FormatTokenLexer::tryMergeLessLess() {
557 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
558 if (Tokens.size() < 3)
559 return false;
560
561 auto First = Tokens.end() - 3;
562 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
563 return false;
564
565 // Only merge if there currently is no whitespace between the two "<".
566 if (First[1]->hasWhitespaceBefore())
567 return false;
568
569 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
570 if (X && X->is(tok::less))
571 return false;
572
573 auto Y = First[2];
574 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
575 return false;
576
577 First[0]->Tok.setKind(tok::lessless);
578 First[0]->TokenText = "<<";
579 First[0]->ColumnWidth += 1;
580 Tokens.erase(Tokens.end() - 2);
581 return true;
582}
583
584bool FormatTokenLexer::tryMergeGreaterGreater() {
585 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
586 if (Tokens.size() < 2)
587 return false;
588
589 auto First = Tokens.end() - 2;
590 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
591 return false;
592
593 // Only merge if there currently is no whitespace between the first two ">".
594 if (First[1]->hasWhitespaceBefore())
595 return false;
596
597 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
598 if (Tok && Tok->isNot(tok::kw_operator))
599 return false;
600
601 First[0]->Tok.setKind(tok::greatergreater);
602 First[0]->TokenText = ">>";
603 First[0]->ColumnWidth += 1;
604 Tokens.erase(Tokens.end() - 1);
605 return true;
606}
607
608bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
609 if (Tokens.size() < 2)
610 return false;
611
612 auto *First = Tokens.end() - 2;
613 auto &Suffix = First[1];
614 if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
615 return false;
616
617 auto &Literal = First[0];
618 if (!Literal->Tok.isLiteral())
619 return false;
620
621 auto &Text = Literal->TokenText;
622 if (!Text.ends_with("_"))
623 return false;
624
625 Text = StringRef(Text.data(), Text.size() + 1);
626 ++Literal->ColumnWidth;
627 Tokens.erase(&Suffix);
628 return true;
629}
630
631bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
632 TokenType NewType) {
633 if (Tokens.size() < Kinds.size())
634 return false;
635
636 const auto *First = Tokens.end() - Kinds.size();
637 for (unsigned i = 0; i < Kinds.size(); ++i)
638 if (First[i]->isNot(Kinds[i]))
639 return false;
640
641 return tryMergeTokens(Kinds.size(), NewType);
642}
643
644bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
645 if (Tokens.size() < Count)
646 return false;
647
648 const auto *First = Tokens.end() - Count;
649 unsigned AddLength = 0;
650 for (size_t i = 1; i < Count; ++i) {
651 // If there is whitespace separating the token and the previous one,
652 // they should not be merged.
653 if (First[i]->hasWhitespaceBefore())
654 return false;
655 AddLength += First[i]->TokenText.size();
656 }
657
658 Tokens.resize(Tokens.size() - Count + 1);
659 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
660 First[0]->TokenText.size() + AddLength);
661 First[0]->ColumnWidth += AddLength;
662 First[0]->setType(NewType);
663 return true;
664}
665
666bool FormatTokenLexer::tryMergeTokensAny(
668 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
669 return tryMergeTokens(Kinds, NewType);
670 });
671}
672
673// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
674bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
675 // NB: This is not entirely correct, as an r_paren can introduce an operand
676 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
677 // corner case to not matter in practice, though.
678 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
679 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
680 tok::colon, tok::question, tok::tilde) ||
681 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
682 tok::kw_else, tok::kw_void, tok::kw_typeof,
683 Keywords.kw_instanceof, Keywords.kw_in) ||
684 Tok->isPlacementOperator() || Tok->isBinaryOperator();
685}
686
687bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
688 if (!Prev)
689 return true;
690
691 // Regex literals can only follow after prefix unary operators, not after
692 // postfix unary operators. If the '++' is followed by a non-operand
693 // introducing token, the slash here is the operand and not the start of a
694 // regex.
695 // `!` is an unary prefix operator, but also a post-fix operator that casts
696 // away nullability, so the same check applies.
697 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
698 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
699
700 // The previous token must introduce an operand location where regex
701 // literals can occur.
702 if (!precedesOperand(Prev))
703 return false;
704
705 return true;
706}
707
708void FormatTokenLexer::tryParseJavaTextBlock() {
709 if (FormatTok->TokenText != "\"\"")
710 return;
711
712 const auto *S = Lex->getBufferLocation();
713 const auto *End = Lex->getBuffer().end();
714
715 if (S == End || *S != '\"')
716 return;
717
718 ++S; // Skip the `"""` that begins a text block.
719
720 // Find the `"""` that ends the text block.
721 for (int Count = 0; Count < 3 && S < End; ++S) {
722 switch (*S) {
723 case '\\':
724 Count = -1;
725 break;
726 case '\"':
727 ++Count;
728 break;
729 default:
730 Count = 0;
731 }
732 }
733
734 // Ignore the possibly invalid text block.
735 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
736}
737
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token; it decides whether the slash can begin
  // a regex literal at this position.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  // Rewrite the current token to span the whole literal.
  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Resume normal lexing after the text consumed by hand above.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
796
// Scans the body of a C# string literal and returns a pointer to the
// character that terminates it — the closing '"' (or, for an interpolated
// string, End when an unescaped '}' closes an interpolation prematurely).
// Returns End when the literal is unterminated.
//
// Make no effort to format code within an interpolated or verbatim string.
//
// Interpolated strings could contain { } with " characters inside.
//   $"{x ?? "null"}"
// should not be split into $"{x ?? ", null, "}" but should be treated as a
// single string-literal. We opt not to try and format expressions inside {}
// within a C# interpolated string; that would require similar work as done
// for JavaScript template strings in `handleTemplateStrings()`.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the character after `Begin` repeats `*Begin` (escape pairs).
  const auto NextIsSame = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  int OpenBraces = 0;
  for (; Begin < End; ++Begin) {
    const char C = *Begin;
    if (C == '\\') {
      // Backslash escapes only apply outside verbatim strings.
      if (!Verbatim)
        ++Begin;
    } else if (C == '{') {
      if (Interpolated) {
        // {{ inside an interpolated string is escaped, so skip it.
        if (NextIsSame())
          ++Begin;
        else
          ++OpenBraces;
      }
    } else if (C == '}') {
      if (Interpolated) {
        // }} inside an interpolated string is escaped, so skip it.
        if (NextIsSame())
          ++Begin;
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End;
      }
    } else if (C == '"') {
      // Quotes inside an open interpolation hole do not end the string.
      if (OpenBraces > 0)
        continue;
      // "" within a verbatim string is an escaped double quote: skip it.
      if (Verbatim && NextIsSame()) {
        ++Begin;
        continue;
      }
      return Begin;
    }
  }

  return End;
}
855
// Re-lexes the trailing TT_CSharpStringLiteral so that verbatim (@"...") and
// interpolated ($"...") strings — which may span lines and contain embedded
// quotes or `{...}` holes — end up as a single token covering the whole
// literal, with first-/last-line column widths recorded for layout.
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Classify the literal by its prefix: @, $, or both in either order.
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Scan from just past the prefix and opening quote (2 or 3 characters).
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  Offset += Verbatim && Interpolated ? 3 : 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  // Include the closing quote in the token's text.
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    unsigned StartColumn = 0;
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  assert(Offset < End);
  // Resume normal lexing right after the closing quote.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
914
915void FormatTokenLexer::handleTableGenMultilineString() {
916 FormatToken *MultiLineString = Tokens.back();
917 if (MultiLineString->isNot(TT_TableGenMultiLineString))
918 return;
919
920 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
921 // "}]" is the end of multi line string.
922 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
923 if (CloseOffset == StringRef::npos)
924 return;
925 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
926 MultiLineString->TokenText = Text;
927 resetLexer(SourceMgr.getFileOffset(
928 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
929 auto FirstLineText = Text;
930 auto FirstBreak = Text.find('\n');
931 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
932 if (FirstBreak != StringRef::npos) {
933 MultiLineString->IsMultiline = true;
934 FirstLineText = Text.substr(0, FirstBreak + 1);
935 // LastLineColumnWidth holds the width of the last line.
936 auto LastBreak = Text.rfind('\n');
937 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
938 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
939 Style.TabWidth, Encoding);
940 }
941 // ColumnWidth holds only the width of the first line.
942 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
943 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
944}
945
946void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
947 FormatToken *Tok = Tokens.back();
948 // TableGen identifiers can begin with digits. Such tokens are lexed as
949 // numeric_constant now.
950 if (Tok->isNot(tok::numeric_constant))
951 return;
952 StringRef Text = Tok->TokenText;
953 // The following check is based on llvm::TGLexer::LexToken.
954 // That lexes the token as a number if any of the following holds:
955 // 1. It starts with '+', '-'.
956 // 2. All the characters are digits.
957 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
958 // 4. The first non-digit character is 'x', and the next is a hex digit.
959 // Note that in the case 3 and 4, if the next character does not exists in
960 // this token, the token is an identifier.
961 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
962 return;
963 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
964 // All the characters are digits
965 if (NonDigitPos == StringRef::npos)
966 return;
967 char FirstNonDigit = Text[NonDigitPos];
968 if (NonDigitPos < Text.size() - 1) {
969 char TheNext = Text[NonDigitPos + 1];
970 // Regarded as a binary number.
971 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
972 return;
973 // Regarded as hex number.
974 if (FirstNonDigit == 'x' && isxdigit(TheNext))
975 return;
976 }
977 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
978 // This is actually an identifier in TableGen.
979 Tok->Tok.setKind(tok::identifier);
980 Tok->Tok.setIdentifierInfo(nullptr);
981 }
982}
983
void FormatTokenLexer::handleTemplateStrings() {
  // Lexes backtick-delimited template strings with `${...}` interpolations
  // (as in JavaScript) by scanning the buffer manually, since the raw lexer
  // does not understand them. StateStack tracks nesting of template strings
  // and interpolation scopes.
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // A '{' inside an interpolation opens an ordinary brace scope.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fall through and continue lexing the
    // remainder of the template string below.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    // A backtick opens a new template string.
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Re-label the token as a string literal covering everything from the
  // opening backtick to the closing backtick or the start of an interpolation.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Make the lexer continue right after the region scanned above.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
1049
1050void FormatTokenLexer::tryParsePythonComment() {
1051 FormatToken *HashToken = Tokens.back();
1052 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1053 return;
1054 // Turn the remainder of this line into a comment.
1055 const char *CommentBegin =
1056 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1057 size_t From = CommentBegin - Lex->getBuffer().begin();
1058 size_t To = Lex->getBuffer().find_first_of('\n', From);
1059 if (To == StringRef::npos)
1060 To = Lex->getBuffer().size();
1061 size_t Len = To - From;
1062 HashToken->setType(TT_LineComment);
1063 HashToken->Tok.setKind(tok::comment);
1064 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1065 SourceLocation Loc = To < Lex->getBuffer().size()
1066 ? Lex->getSourceLocation(CommentBegin + Len)
1067 : SourceMgr.getLocForEndOfFile(ID);
1068 resetLexer(SourceMgr.getFileOffset(Loc));
1069}
1070
1071bool FormatTokenLexer::tryMerge_TMacro() {
1072 if (Tokens.size() < 4)
1073 return false;
1074 FormatToken *Last = Tokens.back();
1075 if (Last->isNot(tok::r_paren))
1076 return false;
1077
1078 FormatToken *String = Tokens[Tokens.size() - 2];
1079 if (String->isNot(tok::string_literal) || String->IsMultiline)
1080 return false;
1081
1082 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1083 return false;
1084
1085 FormatToken *Macro = Tokens[Tokens.size() - 4];
1086 if (Macro->TokenText != "_T")
1087 return false;
1088
1089 const char *Start = Macro->TokenText.data();
1090 const char *End = Last->TokenText.data() + Last->TokenText.size();
1091 String->TokenText = StringRef(Start, End - Start);
1092 String->IsFirst = Macro->IsFirst;
1093 String->LastNewlineOffset = Macro->LastNewlineOffset;
1094 String->WhitespaceRange = Macro->WhitespaceRange;
1095 String->OriginalColumn = Macro->OriginalColumn;
1096 String->ColumnWidth = encoding::columnWidthWithTabs(
1097 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1098 String->NewlinesBefore = Macro->NewlinesBefore;
1099 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1100
1101 Tokens.pop_back();
1102 Tokens.pop_back();
1103 Tokens.pop_back();
1104 Tokens.back() = String;
1105 if (FirstInLineIndex >= Tokens.size())
1106 FirstInLineIndex = Tokens.size() - 1;
1107 return true;
1108}
1109
1110bool FormatTokenLexer::tryMergeConflictMarkers() {
1111 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1112 return false;
1113
1114 // Conflict lines look like:
1115 // <marker> <text from the vcs>
1116 // For example:
1117 // >>>>>>> /file/in/file/system at revision 1234
1118 //
1119 // We merge all tokens in a line that starts with a conflict marker
1120 // into a single token with a special token type that the unwrapped line
1121 // parser will use to correctly rebuild the underlying code.
1122
1123 FileID ID;
1124 // Get the position of the first token in the line.
1125 unsigned FirstInLineOffset;
1126 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1127 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1128 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1129 // Calculate the offset of the start of the current line.
1130 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1131 if (LineOffset == StringRef::npos)
1132 LineOffset = 0;
1133 else
1134 ++LineOffset;
1135
1136 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1137 StringRef LineStart;
1138 if (FirstSpace == StringRef::npos)
1139 LineStart = Buffer.substr(LineOffset);
1140 else
1141 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1142
1143 TokenType Type = TT_Unknown;
1144 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1145 Type = TT_ConflictStart;
1146 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1147 LineStart == "====") {
1148 Type = TT_ConflictAlternative;
1149 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1150 Type = TT_ConflictEnd;
1151 }
1152
1153 if (Type != TT_Unknown) {
1154 FormatToken *Next = Tokens.back();
1155
1156 Tokens.resize(FirstInLineIndex + 1);
1157 // We do not need to build a complete token here, as we will skip it
1158 // during parsing anyway (as we must not touch whitespace around conflict
1159 // markers).
1160 Tokens.back()->setType(Type);
1161 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1162
1163 Tokens.push_back(Next);
1164 return true;
1165 }
1166
1167 return false;
1168}
1169
1170FormatToken *FormatTokenLexer::getStashedToken() {
1171 // Create a synthesized second '>' or '<' token.
1172 Token Tok = FormatTok->Tok;
1173 StringRef TokenText = FormatTok->TokenText;
1174
1175 unsigned OriginalColumn = FormatTok->OriginalColumn;
1176 FormatTok = new (Allocator.Allocate()) FormatToken;
1177 FormatTok->Tok = Tok;
1178 SourceLocation TokLocation =
1179 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1180 FormatTok->Tok.setLocation(TokLocation);
1181 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1182 FormatTok->TokenText = TokenText;
1183 FormatTok->ColumnWidth = 1;
1184 FormatTok->OriginalColumn = OriginalColumn + 1;
1185
1186 return FormatTok;
1187}
1188
1189/// Truncate the current token to the new length and make the lexer continue
1190/// from the end of the truncated token. Used for other languages that have
1191/// different token boundaries, like JavaScript in which a comment ends at a
1192/// line break regardless of whether the line break follows a backslash. Also
1193/// used to set the lexer to the end of whitespace if the lexer regards
1194/// whitespace and an unrecognized symbol as one token.
1195void FormatTokenLexer::truncateToken(size_t NewLen) {
1196 assert(NewLen <= FormatTok->TokenText.size());
1197 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1198 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1199 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1200 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1201 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1202 Encoding);
1203 FormatTok->Tok.setLength(NewLen);
1204}
1205
1206/// Count the length of leading whitespace in a token.
1207static size_t countLeadingWhitespace(StringRef Text) {
1208 // Basically counting the length matched by this regex.
1209 // "^([\n\r\f\v \t]|\\\\[\n\r])+"
1210 // Directly using the regex turned out to be slow. With the regex
1211 // version formatting all files in this directory took about 1.25
1212 // seconds. This version took about 0.5 seconds.
1213 const unsigned char *const Begin = Text.bytes_begin();
1214 const unsigned char *const End = Text.bytes_end();
1215 const unsigned char *Cur = Begin;
1216 while (Cur < End) {
1217 if (isWhitespace(Cur[0])) {
1218 ++Cur;
1219 } else if (Cur[0] == '\\') {
1220 // A backslash followed by optional horizontal whitespaces (P22232R2) and
1221 // then a newline always escapes the newline.
1222 // The source has a null byte at the end. So the end of the entire input
1223 // isn't reached yet. Also the lexer doesn't break apart an escaped
1224 // newline.
1225 const auto *Lookahead = Cur + 1;
1226 while (isHorizontalWhitespace(*Lookahead))
1227 ++Lookahead;
1228 // No line splice found; the backslash is a token.
1229 if (!isVerticalWhitespace(*Lookahead))
1230 break;
1231 // Splice found, consume it.
1232 Cur = Lookahead + 1;
1233 } else {
1234 break;
1235 }
1236 }
1237 return Cur - Begin;
1238}
1239
FormatToken *FormatTokenLexer::getNextToken() {
  // A previously split '>>' or '<<' left its second half stashed; emit it.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // The whitespace preceding this token starts where the previous token's
  // trailing whitespace (trimmed off a comment) began.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace character by character, maintaining newline counts,
    // the current column, and escaped-newline state.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
        if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
            // The form feed is immediately preceded and followed by a newline.
            i > 0 && Text[i - 1] == '\n' &&
            ((i + 1 < e && Text[i + 1] == '\n') ||
             (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
          FormatTok->HasFormFeedBefore = true;
        }
        [[fallthrough]];
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (no-op guard for TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // The code preceding the loop and in the countLeadingWhitespace
        // function guarantees that Text is entirely whitespace, not including
        // comments but including escaped newlines. So the character shows up,
        // then it has to be in an escape sequence.
        assert([&]() -> bool {
          size_t j = i + 1;
          while (j < Text.size() && isHorizontalWhitespace(Text[j]))
            ++j;
          return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
        }());
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // Tokens the raw lexer could not classify are treated as implicit string
  // literal content.
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  const bool IsCpp = Style.isCpp();

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if (const auto Text = FormatTok->TokenText;
      Text.starts_with("//") &&
      (IsCpp || Style.isJavaScript() || Style.isJava())) {
    assert(FormatTok->is(tok::comment));
    for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
         Pos = Text.find('\\', Pos)) {
      // In C++ only truncate if the continuation line is itself a comment;
      // otherwise the backslash-newline is a legitimate line splice.
      if (Pos < Text.size() && Text[Pos] == '\n' &&
          (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
        truncateToken(Pos);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  // Trailing whitespace trimmed off a comment here is prepended to the next
  // token's whitespace range at the top of the next call.
  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the identifier against the identifier table; keywords of other
    // languages that are not keywords in the current one become identifiers.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.isJava() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    }
  } else if (const bool Greater = FormatTok->is(tok::greatergreater);
             Greater || FormatTok->is(tok::lessless)) {
    // Split '>>'/'<<' into two tokens; the second half is produced by
    // getStashedToken on the next call.
    FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
    tryParseJavaTextBlock();
  }

  if (Style.isVerilog() && !Tokens.empty() &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (IsCpp) {
    // Classify identifiers that match style-configured macro and name lists.
    // Skip macro classification right after "#define" so the macro being
    // defined is not itself re-typed.
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
         Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
             tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
        FormatTok->setFinalizedType(TT_FunctionLikeMacro);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
      else if (VariableTemplates.contains(Identifier))
        FormatTok->setFinalizedType(TT_VariableTemplate);
    }
  }

  return FormatTok;
}
1486
// Lexes tokens that exist only in Verilog (quote, backtick, escaped
// identifier) directly from the buffer. Returns true and fills \p Tok when
// such a token starts at the current position; returns false to fall back to
// the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  const char *Start = Lex->getBufferLocation();
  size_t Len;
  switch (Start[0]) {
  // In Verilog the quote is not a character literal.
  case '\'':
    Len = 1;
    break;
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  case '`':
    if (Start[1] == '`')
      Len = 2;
    else
      Len = 1;
    break;
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  case '\\':
    // A backslash can also begin an escaped newline outside of an escaped
    // identifier.
    if (Start[1] == '\r' || Start[1] == '\n')
      return false;
    Len = 1;
    // Consume characters until unescaped whitespace ends the identifier.
    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
           Start[Len] != ' ') {
      // There is a null byte at the end of the buffer, so we don't have to
      // check whether the next byte is within the buffer.
      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
          Start[Len + 2] == '\n') {
        // Escaped CRLF newline inside the identifier.
        Len += 3;
      } else if (Start[Len] == '\\' &&
                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
        // Escaped single-character newline inside the identifier.
        Len += 2;
      } else {
        Len += 1;
      }
    }
    break;
  default:
    return false;
  }

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  // Advance the underlying lexer past the manually lexed token.
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1544
// Reads the next raw token into \p Tok, applying language-specific kind
// fix-ups and tracking the clang-format on/off state.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  // For Verilog, first see if there is a special token, and fall back to the
  // normal lexer if there isn't one.
  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
    Lex->LexFromRawLexer(Tok.Tok);
  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                            Tok.Tok.getLength());
  // For formatting, treat unterminated string literals like normal string
  // literals.
  if (Tok.is(tok::unknown)) {
    if (Tok.TokenText.starts_with("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (Style.isJavaScript() && Tok.TokenText == "''") {
      // An empty single-quoted string, which the C++ lexer cannot parse.
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  // These languages use single quotes for strings, not character constants.
  if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // Re-enable formatting before assigning Finalized, so a "clang-format on"
  // comment is itself formatted; conversely, disable only after the
  // assignment, so a "clang-format off" comment is also formatted while the
  // tokens after it are finalized (left untouched).
  if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
    FormattingDisabled = true;
}
1574
1575void FormatTokenLexer::resetLexer(unsigned Offset) {
1576 StringRef Buffer = SourceMgr.getBufferData(ID);
1577 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1578 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1579 Lex->SetKeepWhitespaceMode(true);
1580 TrailingWhitespace = 0;
1581}
1582
1583} // namespace format
1584} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
FormatToken * Next
The next token in the unwrapped line.
bool is(tok::TokenKind Kind) const
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:134
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4586
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4582
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4226
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3648
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5467
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:560
@ Type
The name was classified as a type.
Definition Sema.h:562
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3653
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5457
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5518
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.