clang 23.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
37 MacroBlockBeginRegex(Style.MacroBlockBegin),
38 MacroBlockEndRegex(Style.MacroBlockEnd) {
39 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
40 Lex->SetKeepWhitespaceMode(true);
41
42 for (const std::string &ForEachMacro : Style.ForEachMacros) {
43 auto Identifier = &IdentTable.get(ForEachMacro);
44 Macros.insert({Identifier, TT_ForEachMacro});
45 }
46 for (const std::string &IfMacro : Style.IfMacros) {
47 auto Identifier = &IdentTable.get(IfMacro);
48 Macros.insert({Identifier, TT_IfMacro});
49 }
50 for (const std::string &AttributeMacro : Style.AttributeMacros) {
51 auto Identifier = &IdentTable.get(AttributeMacro);
52 Macros.insert({Identifier, TT_AttributeMacro});
53 }
54 for (const std::string &StatementMacro : Style.StatementMacros) {
55 auto Identifier = &IdentTable.get(StatementMacro);
56 Macros.insert({Identifier, TT_StatementMacro});
57 }
58 for (const std::string &TypenameMacro : Style.TypenameMacros) {
59 auto Identifier = &IdentTable.get(TypenameMacro);
60 Macros.insert({Identifier, TT_TypenameMacro});
61 }
62 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
63 auto Identifier = &IdentTable.get(NamespaceMacro);
64 Macros.insert({Identifier, TT_NamespaceMacro});
65 }
66 for (const std::string &WhitespaceSensitiveMacro :
67 Style.WhitespaceSensitiveMacros) {
68 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
69 Macros.insert({Identifier, TT_UntouchableMacroFunc});
70 }
71 for (const std::string &StatementAttributeLikeMacro :
72 Style.StatementAttributeLikeMacros) {
73 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
74 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
75 }
76
77 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
78 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
79 for (const auto &TemplateName : Style.TemplateNames)
80 TemplateNames.insert(&IdentTable.get(TemplateName));
81 for (const auto &TypeName : Style.TypeNames)
82 TypeNames.insert(&IdentTable.get(TypeName));
83 for (const auto &VariableTemplate : Style.VariableTemplates)
84 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
85}
86
88 assert(Tokens.empty());
89 assert(FirstInLineIndex == 0);
90 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
91 do {
92 Tokens.push_back(getNextToken());
93 auto &Tok = *Tokens.back();
94 const auto NewlinesBefore = Tok.NewlinesBefore;
95 switch (FormatOff) {
96 case FO_NextLine:
97 if (NewlinesBefore > 1) {
98 FormatOff = FO_None;
99 } else {
100 Tok.Finalized = true;
101 FormatOff = FO_CurrentLine;
102 }
103 break;
104 case FO_CurrentLine:
105 if (NewlinesBefore == 0) {
106 Tok.Finalized = true;
107 break;
108 }
109 FormatOff = FO_None;
110 [[fallthrough]];
111 default:
112 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
113 if (Tok.is(tok::comment) &&
114 (NewlinesBefore > 0 || Tokens.size() == 1)) {
115 Tok.Finalized = true;
116 FormatOff = FO_NextLine;
117 } else {
118 for (auto *Token : reverse(Tokens)) {
119 Token->Finalized = true;
120 if (Token->NewlinesBefore > 0)
121 break;
122 }
123 FormatOff = FO_CurrentLine;
124 }
125 }
126 }
127 if (Style.isJavaScript()) {
128 tryParseJSRegexLiteral();
129 handleTemplateStrings();
130 } else if (Style.isTextProto()) {
131 tryParsePythonComment();
132 }
133 tryMergePreviousTokens();
134 if (Style.isCSharp()) {
135 // This needs to come after tokens have been merged so that C#
136 // string literals are correctly identified.
137 handleCSharpVerbatimAndInterpolatedStrings();
138 } else if (Style.isTableGen()) {
139 handleTableGenMultilineString();
140 handleTableGenNumericLikeIdentifier();
141 }
142 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
143 FirstInLineIndex = Tokens.size() - 1;
144 } while (Tokens.back()->isNot(tok::eof));
145 if (Style.InsertNewlineAtEOF) {
146 auto &TokEOF = *Tokens.back();
147 if (TokEOF.NewlinesBefore == 0) {
148 TokEOF.NewlinesBefore = 1;
149 TokEOF.OriginalColumn = 0;
150 }
151 }
152 return Tokens;
153}
154
// Dispatches all token-merging heuristics after a new token has been lexed.
// Order matters: the first merges run for every language; later groups are
// gated on the active language/style. Each helper merges (or retypes) the
// trailing tokens of `Tokens` and returns true on success.
void FormatTokenLexer::tryMergePreviousTokens() {
  // Language-independent merges.
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;

  // User-defined-literal '$' suffixes are only merged for C++ and Obj-C.
  if ((Style.Language == FormatStyle::LK_Cpp ||
       Style.Language == FormatStyle::LK_ObjC) &&
      tryMergeUserDefinedLiteral()) {
    return;
  }

  // Operators shared by JavaScript and C#: "=>", "??", "?." and "??=".
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;

    // C#-only constructs: @keyword variables, string-literal prefixes,
    // `foreach`, and the null-conditional indexer "?[".
    if (Style.isCSharp()) {
      static const tok::TokenKind CSharpNullConditionalLSquare[] = {
          tok::question, tok::l_square};

      if (tryMergeCSharpKeywordVariables())
        return;
      if (tryMergeCSharpStringLiteral())
        return;
      if (tryTransformCSharpForEach())
        return;
      if (tryMergeTokens(CSharpNullConditionalLSquare,
                         TT_CSharpNullConditionalLSquare)) {
        // Treat like a regular "[" operator.
        Tokens.back()->Tok.setKind(tok::l_square);
        return;
      }
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  } else if (Style.isJava()) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  } else if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    // Compound assignment-like operators; force assignment precedence.
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
    // A `#` `-`/`=` `#` sequence merges into a single three-token operator.
    if (Tokens.size() >= 3 &&
        Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
        Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
        Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
        tryMergeTokens(3, TT_BinaryOperator)) {
      Tokens.back()->setFinalizedType(TT_BinaryOperator);
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  } else if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if". That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
374
375bool FormatTokenLexer::tryMergeNSStringLiteral() {
376 if (Tokens.size() < 2)
377 return false;
378 auto &At = *(Tokens.end() - 2);
379 auto &String = *(Tokens.end() - 1);
380 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
381 return false;
382 At->Tok.setKind(tok::string_literal);
383 At->TokenText = StringRef(At->TokenText.begin(),
384 String->TokenText.end() - At->TokenText.begin());
385 At->ColumnWidth += String->ColumnWidth;
386 At->setType(TT_ObjCStringLiteral);
387 Tokens.erase(Tokens.end() - 1);
388 return true;
389}
390
391bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
392 // Merges #idenfier into a single identifier with the text #identifier
393 // but the token tok::identifier.
394 if (Tokens.size() < 2)
395 return false;
396 auto &Hash = *(Tokens.end() - 2);
397 auto &Identifier = *(Tokens.end() - 1);
398 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
399 return false;
400 Hash->Tok.setKind(tok::identifier);
401 Hash->TokenText =
402 StringRef(Hash->TokenText.begin(),
403 Identifier->TokenText.end() - Hash->TokenText.begin());
404 Hash->ColumnWidth += Identifier->ColumnWidth;
405 Hash->setType(TT_JsPrivateIdentifier);
406 Tokens.erase(Tokens.end() - 1);
407 return true;
408}
409
// Search for verbatim or interpolated string literals @"ABC" or
// $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral to
// prevent splitting of @, $ and ".
// Merging of multiline verbatim strings with embedded '"' is handled in
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
  if (Tokens.size() < 2)
    return false;

  // Look for @"aaaaaa" or $"aaaaaa".
  const auto String = *(Tokens.end() - 1);
  if (String->isNot(tok::string_literal))
    return false;

  // The token directly before the string must be '@' or a '$'.
  auto Prefix = *(Tokens.end() - 2);
  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
    return false;

  if (Tokens.size() > 2) {
    const auto Tok = *(Tokens.end() - 3);
    if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
        (Tok->is(tok::at) && Prefix->TokenText == "$")) {
      // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
      Tok->ColumnWidth += Prefix->ColumnWidth;
      Tokens.erase(Tokens.end() - 2);
      Prefix = Tok;
    }
  }

  // Convert back into just a string_literal.
  // The merged token's text spans from the first prefix character through the
  // end of the original string literal; the trailing token is dropped.
  Prefix->Tok.setKind(tok::string_literal);
  Prefix->TokenText =
      StringRef(Prefix->TokenText.begin(),
                String->TokenText.end() - Prefix->TokenText.begin());
  Prefix->ColumnWidth += String->ColumnWidth;
  Prefix->setType(TT_CSharpStringLiteral);
  Tokens.erase(Tokens.end() - 1);
  return true;
}
449
450// Valid C# attribute targets:
451// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
452const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
453 "assembly", "module", "field", "event", "method",
454 "param", "property", "return", "type",
455};
456
457bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
458 if (Tokens.size() < 2)
459 return false;
460 auto &NullishCoalescing = *(Tokens.end() - 2);
461 auto &Equal = *(Tokens.end() - 1);
462 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
463 Equal->isNot(tok::equal)) {
464 return false;
465 }
466 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
467 NullishCoalescing->TokenText =
468 StringRef(NullishCoalescing->TokenText.begin(),
469 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
470 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
471 NullishCoalescing->setType(TT_NullCoalescingEqual);
472 Tokens.erase(Tokens.end() - 1);
473 return true;
474}
475
476bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
477 if (Tokens.size() < 2)
478 return false;
479 const auto At = *(Tokens.end() - 2);
480 if (At->isNot(tok::at))
481 return false;
482 const auto Keyword = *(Tokens.end() - 1);
483 if (Keyword->TokenText == "$")
484 return false;
485 if (!Keywords.isCSharpKeyword(*Keyword))
486 return false;
487
488 At->Tok.setKind(tok::identifier);
489 At->TokenText = StringRef(At->TokenText.begin(),
490 Keyword->TokenText.end() - At->TokenText.begin());
491 At->ColumnWidth += Keyword->ColumnWidth;
492 At->setType(Keyword->getType());
493 Tokens.erase(Tokens.end() - 1);
494 return true;
495}
496
497// In C# transform identifier foreach into kw_foreach
498bool FormatTokenLexer::tryTransformCSharpForEach() {
499 if (Tokens.empty())
500 return false;
501 auto &Identifier = *(Tokens.end() - 1);
502 if (Identifier->isNot(tok::identifier))
503 return false;
504 if (Identifier->TokenText != "foreach")
505 return false;
506
507 Identifier->setType(TT_ForEachMacro);
508 Identifier->Tok.setKind(tok::kw_for);
509 return true;
510}
511
512bool FormatTokenLexer::tryMergeForEach() {
513 if (Tokens.size() < 2)
514 return false;
515 auto &For = *(Tokens.end() - 2);
516 auto &Each = *(Tokens.end() - 1);
517 if (For->isNot(tok::kw_for))
518 return false;
519 if (Each->isNot(tok::identifier))
520 return false;
521 if (Each->TokenText != "each")
522 return false;
523
524 For->setType(TT_ForEachMacro);
525 For->Tok.setKind(tok::kw_for);
526
527 For->TokenText = StringRef(For->TokenText.begin(),
528 Each->TokenText.end() - For->TokenText.begin());
529 For->ColumnWidth += Each->ColumnWidth;
530 Tokens.erase(Tokens.end() - 1);
531 return true;
532}
533
bool FormatTokenLexer::tryMergeLessLess() {
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
  if (Tokens.size() < 3)
    return false;

  auto First = Tokens.end() - 3;
  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
    return false;

  // Only merge if there currently is no whitespace between the two "<".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // X is the token before the first '<' (null if the '<' starts the stream).
  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
  if (X && X->is(tok::less))
    return false;

  // Y is the token after the second '<'. Refuse "<<<" unless X is the
  // `operator` keyword (e.g. "operator<<" followed by '<').
  auto Y = First[2];
  if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
    return false;

  // Rewrite the first '<' in place to "<<" and drop the second '<'.
  First[0]->Tok.setKind(tok::lessless);
  First[0]->TokenText = "<<";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 2);
  return true;
}
561
bool FormatTokenLexer::tryMergeGreaterGreater() {
  // Merge kw_operator,greater,greater into kw_operator,greatergreater.
  if (Tokens.size() < 2)
    return false;

  auto First = Tokens.end() - 2;
  if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
    return false;

  // Only merge if there currently is no whitespace between the first two ">".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // The token before the pair (if any) must be the `operator` keyword;
  // otherwise the two '>' could be closing nested template argument lists.
  auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
  if (Tok && Tok->isNot(tok::kw_operator))
    return false;

  // Rewrite the first '>' in place to ">>" and drop the second '>'.
  First[0]->Tok.setKind(tok::greatergreater);
  First[0]->TokenText = ">>";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 1);
  return true;
}
585
bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
  if (Tokens.size() < 2)
    return false;

  auto *First = Tokens.end() - 2;
  auto &Suffix = First[1];
  // Only a '$' token glued directly (no whitespace) to the previous token is
  // considered part of the literal's suffix.
  if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
    return false;

  auto &Literal = First[0];
  if (!Literal->Tok.isLiteral())
    return false;

  auto &Text = Literal->TokenText;
  // Only merge when the literal's spelling already ends in '_', i.e. it
  // carries the start of a user-defined-literal suffix.
  if (!Text.ends_with("_"))
    return false;

  // Extend the literal's text by one character to absorb the adjacent '$'
  // from the underlying buffer, then drop the separate '$' token.
  Text = StringRef(Text.data(), Text.size() + 1);
  ++Literal->ColumnWidth;
  Tokens.erase(&Suffix);
  return true;
}
608
609bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
610 TokenType NewType) {
611 if (Tokens.size() < Kinds.size())
612 return false;
613
614 const auto *First = Tokens.end() - Kinds.size();
615 for (unsigned i = 0; i < Kinds.size(); ++i)
616 if (First[i]->isNot(Kinds[i]))
617 return false;
618
619 return tryMergeTokens(Kinds.size(), NewType);
620}
621
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  if (Tokens.size() < Count)
    return false;

  const auto *First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Drop the trailing Count - 1 tokens and widen the first token's text to
  // cover all of them. (Shrinking the container does not reallocate, so
  // First remains valid after the resize.)
  Tokens.resize(Tokens.size() - Count + 1);
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->setType(NewType);
  return true;
}
643
644bool FormatTokenLexer::tryMergeTokensAny(
646 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
647 return tryMergeTokens(Kinds, NewType);
648 });
649}
650
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
// Used to decide whether a '/' after \p Tok starts a regex literal rather
// than a division operator.
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
  // NB: This is not entirely correct, as an r_paren can introduce an operand
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
  // corner case to not matter in practice, though.
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
                      tok::colon, tok::question, tok::tilde) ||
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
                      tok::kw_else, tok::kw_void, tok::kw_typeof,
                      Keywords.kw_instanceof, Keywords.kw_in) ||
         Tok->isPlacementOperator() || Tok->isBinaryOperator();
}
664
665bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
666 if (!Prev)
667 return true;
668
669 // Regex literals can only follow after prefix unary operators, not after
670 // postfix unary operators. If the '++' is followed by a non-operand
671 // introducing token, the slash here is the operand and not the start of a
672 // regex.
673 // `!` is an unary prefix operator, but also a post-fix operator that casts
674 // away nullability, so the same check applies.
675 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
676 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
677
678 // The previous token must introduce an operand location where regex
679 // literals can occur.
680 if (!precedesOperand(Prev))
681 return false;
682
683 return true;
684}
685
// Handles a Java text block ("""..."""), which the regular lexer splits up.
// Invoked when the current token was lexed as the spelling `""`; if the next
// buffer character is a third '"', the quotes open a text block, so scan
// forward to the closing `"""` (honoring backslash escapes) and restart the
// lexer after it.
void FormatTokenLexer::tryParseJavaTextBlock() {
  if (FormatTok->TokenText != "\"\"")
    return;

  const auto *S = Lex->getBufferLocation();
  const auto *End = Lex->getBuffer().end();

  if (S == End || *S != '\"')
    return;

  ++S; // Skip the `"""` that begins a text block.

  // Find the `"""` that ends the text block: count consecutive unescaped
  // quotes until three in a row are seen (or the buffer ends).
  bool Escaped = false;
  for (int Count = 0; Count < 3 && S < End; ++S) {
    if (Escaped) {
      // This character is consumed by the preceding backslash.
      Escaped = false;
      continue;
    }
    switch (*S) {
    case '\"':
      ++Count;
      break;
    case '\\':
      Escaped = true;
      [[fallthrough]];
    default:
      // Any non-quote character breaks the run of consecutive quotes.
      Count = 0;
    }
  }

  // Ignore the possibly invalid text block.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
}
720
721// Tries to parse a JavaScript Regex literal starting at the current token,
722// if that begins with a slash and is in a location where JavaScript allows
723// regex literals. Changes the current token to a regex literal and updates
724// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token; it decides whether a regex may start.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Resume the regular lexer after the closing '/'.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
779
// Scans forward from \p Begin (just past the opening quote of a C# string
// literal) for the terminating '"', honoring the escaping rules selected by
// \p Verbatim and \p Interpolated. Returns a pointer to the closing quote,
// or \p End when the literal is unterminated (or an interpolated literal is
// closed by a stray '}').
//
// No attempt is made to format code inside an interpolated or verbatim
// string: $"{x ?? "null"}" must stay one string literal, so quotes inside
// unmatched '{'...'}' are skipped rather than treated as terminators.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the next character exists and repeats the current one.
  const auto NextRepeats = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  int OpenBraces = 0; // depth of unmatched '{' in an interpolated string
  while (Begin < End) {
    const char C = *Begin;
    if (C == '\\') {
      // Backslash escapes only apply outside verbatim strings.
      if (!Verbatim)
        ++Begin;
    } else if (C == '{') {
      if (Interpolated) {
        // "{{" is an escaped brace; a single '{' opens an interpolation hole.
        if (NextRepeats())
          ++Begin;
        else
          ++OpenBraces;
      }
    } else if (C == '}') {
      if (Interpolated) {
        // "}}" is an escaped brace; an unmatched '}' ends the literal early.
        if (NextRepeats())
          ++Begin;
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End;
      }
    } else if (C == '"') {
      if (OpenBraces == 0) {
        // "" inside a verbatim string is an escaped double quote.
        if (Verbatim && NextRepeats())
          ++Begin;
        else
          return Begin;
      }
    }
    ++Begin;
  }

  return End;
}
838
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Decode the prefix to determine which escaping rules apply.
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Re-scan the literal directly in the raw buffer, starting just past the
  // prefix ('$@'/'@$' is 2 chars, '@'/'$' is 1) and the opening quote.
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  Offset += Verbatim && Interpolated ? 3 : 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  // Extend the token's text to include everything through the closing quote.
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    // The last line starts at column 0 in the buffer.
    unsigned StartColumn = 0;
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  assert(Offset < End);
  // Resume the regular lexer after the closing quote.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
897
void FormatTokenLexer::handleTableGenMultilineString() {
  FormatToken *MultiLineString = Tokens.back();
  if (MultiLineString->isNot(TT_TableGenMultiLineString))
    return;

  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
  // "}]" is the end of multi line string.
  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
  if (CloseOffset == StringRef::npos)
    return;
  // The token text covers everything from "[{" through "}]".
  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
  MultiLineString->TokenText = Text;
  // Resume the regular lexer right after the closing "}]".
  resetLexer(SourceMgr.getFileOffset(
      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
  auto FirstLineText = Text;
  auto FirstBreak = Text.find('\n');
  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
  if (FirstBreak != StringRef::npos) {
    MultiLineString->IsMultiline = true;
    FirstLineText = Text.substr(0, FirstBreak + 1);
    // LastLineColumnWidth holds the width of the last line.
    auto LastBreak = Text.rfind('\n');
    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
        Style.TabWidth, Encoding);
  }
  // ColumnWidth holds only the width of the first line.
  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
928
929void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
930 FormatToken *Tok = Tokens.back();
931 // TableGen identifiers can begin with digits. Such tokens are lexed as
932 // numeric_constant now.
933 if (Tok->isNot(tok::numeric_constant))
934 return;
935 StringRef Text = Tok->TokenText;
936 // The following check is based on llvm::TGLexer::LexToken.
937 // That lexes the token as a number if any of the following holds:
938 // 1. It starts with '+', '-'.
939 // 2. All the characters are digits.
940 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
941 // 4. The first non-digit character is 'x', and the next is a hex digit.
942 // Note that in the case 3 and 4, if the next character does not exists in
943 // this token, the token is an identifier.
944 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
945 return;
946 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
947 // All the characters are digits
948 if (NonDigitPos == StringRef::npos)
949 return;
950 char FirstNonDigit = Text[NonDigitPos];
951 if (NonDigitPos < Text.size() - 1) {
952 char TheNext = Text[NonDigitPos + 1];
953 // Regarded as a binary number.
954 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
955 return;
956 // Regarded as hex number.
957 if (FirstNonDigit == 'x' && isxdigit(TheNext))
958 return;
959 }
960 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
961 // This is actually an identifier in TableGen.
962 Tok->Tok.setKind(tok::identifier);
963 Tok->Tok.setIdentifierInfo(nullptr);
964 }
965}
966
void FormatTokenLexer::handleTemplateStrings() {
  // Recognizes JavaScript template strings (`...${expr}...`) and merges each
  // literal portion into a single string_literal token. Nesting of
  // interpolated expressions is tracked via StateStack.
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // A brace inside an interpolated expression opens a regular scope.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fall through and continue lexing the
    // remainder of the template string.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // The closing backtick terminates the template string.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Turn the consumed range into one string_literal token.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume normal lexing just past the consumed literal text.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
1032
1033void FormatTokenLexer::tryParsePythonComment() {
1034 FormatToken *HashToken = Tokens.back();
1035 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1036 return;
1037 // Turn the remainder of this line into a comment.
1038 const char *CommentBegin =
1039 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1040 size_t From = CommentBegin - Lex->getBuffer().begin();
1041 size_t To = Lex->getBuffer().find_first_of('\n', From);
1042 if (To == StringRef::npos)
1043 To = Lex->getBuffer().size();
1044 size_t Len = To - From;
1045 HashToken->setType(TT_LineComment);
1046 HashToken->Tok.setKind(tok::comment);
1047 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1048 SourceLocation Loc = To < Lex->getBuffer().size()
1049 ? Lex->getSourceLocation(CommentBegin + Len)
1050 : SourceMgr.getLocForEndOfFile(ID);
1051 resetLexer(SourceMgr.getFileOffset(Loc));
1052}
1053
1054bool FormatTokenLexer::tryMerge_TMacro() {
1055 if (Tokens.size() < 4)
1056 return false;
1057 FormatToken *Last = Tokens.back();
1058 if (Last->isNot(tok::r_paren))
1059 return false;
1060
1061 FormatToken *String = Tokens[Tokens.size() - 2];
1062 if (String->isNot(tok::string_literal) || String->IsMultiline)
1063 return false;
1064
1065 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1066 return false;
1067
1068 FormatToken *Macro = Tokens[Tokens.size() - 4];
1069 if (Macro->TokenText != "_T")
1070 return false;
1071
1072 const char *Start = Macro->TokenText.data();
1073 const char *End = Last->TokenText.data() + Last->TokenText.size();
1074 String->TokenText = StringRef(Start, End - Start);
1075 String->IsFirst = Macro->IsFirst;
1076 String->LastNewlineOffset = Macro->LastNewlineOffset;
1077 String->WhitespaceRange = Macro->WhitespaceRange;
1078 String->OriginalColumn = Macro->OriginalColumn;
1079 String->ColumnWidth = encoding::columnWidthWithTabs(
1080 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1081 String->NewlinesBefore = Macro->NewlinesBefore;
1082 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1083
1084 Tokens.pop_back();
1085 Tokens.pop_back();
1086 Tokens.pop_back();
1087 Tokens.back() = String;
1088 if (FirstInLineIndex >= Tokens.size())
1089 FirstInLineIndex = Tokens.size() - 1;
1090 return true;
1091}
1092
1093bool FormatTokenLexer::tryMergeConflictMarkers() {
1094 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1095 return false;
1096
1097 // Conflict lines look like:
1098 // <marker> <text from the vcs>
1099 // For example:
1100 // >>>>>>> /file/in/file/system at revision 1234
1101 //
1102 // We merge all tokens in a line that starts with a conflict marker
1103 // into a single token with a special token type that the unwrapped line
1104 // parser will use to correctly rebuild the underlying code.
1105
1106 FileID ID;
1107 // Get the position of the first token in the line.
1108 unsigned FirstInLineOffset;
1109 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1110 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1111 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1112 // Calculate the offset of the start of the current line.
1113 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1114 if (LineOffset == StringRef::npos)
1115 LineOffset = 0;
1116 else
1117 ++LineOffset;
1118
1119 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1120 StringRef LineStart;
1121 if (FirstSpace == StringRef::npos)
1122 LineStart = Buffer.substr(LineOffset);
1123 else
1124 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1125
1126 TokenType Type = TT_Unknown;
1127 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1128 Type = TT_ConflictStart;
1129 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1130 LineStart == "====") {
1131 Type = TT_ConflictAlternative;
1132 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1133 Type = TT_ConflictEnd;
1134 }
1135
1136 if (Type != TT_Unknown) {
1137 FormatToken *Next = Tokens.back();
1138
1139 Tokens.resize(FirstInLineIndex + 1);
1140 // We do not need to build a complete token here, as we will skip it
1141 // during parsing anyway (as we must not touch whitespace around conflict
1142 // markers).
1143 Tokens.back()->setType(Type);
1144 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1145
1146 Tokens.push_back(Next);
1147 return true;
1148 }
1149
1150 return false;
1151}
1152
1153FormatToken *FormatTokenLexer::getStashedToken() {
1154 // Create a synthesized second '>' or '<' token.
1155 Token Tok = FormatTok->Tok;
1156 StringRef TokenText = FormatTok->TokenText;
1157
1158 unsigned OriginalColumn = FormatTok->OriginalColumn;
1159 FormatTok = new (Allocator.Allocate()) FormatToken;
1160 FormatTok->Tok = Tok;
1161 SourceLocation TokLocation =
1162 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1163 FormatTok->Tok.setLocation(TokLocation);
1164 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1165 FormatTok->TokenText = TokenText;
1166 FormatTok->ColumnWidth = 1;
1167 FormatTok->OriginalColumn = OriginalColumn + 1;
1168
1169 return FormatTok;
1170}
1171
/// Truncate the current token to the new length and make the lexer continue
/// from the end of the truncated token. Used for other languages that have
/// different token boundaries, like JavaScript in which a comment ends at a
/// line break regardless of whether the line break follows a backslash. Also
/// used to set the lexer to the end of whitespace if the lexer regards
/// whitespace and an unrecognized symbol as one token.
void FormatTokenLexer::truncateToken(size_t NewLen) {
  assert(NewLen <= FormatTok->TokenText.size());
  // Rewind the lexer to just past the truncated token. This must happen
  // before TokenText is shortened, because the target offset is computed
  // from the token's original length.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
      Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
  // The display width changes along with the text.
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
      Encoding);
  FormatTok->Tok.setLength(NewLen);
}
1188
/// Count the length of leading whitespace in a token.
static size_t countLeadingWhitespace(StringRef Text) {
  // Basically counting the length matched by this regex.
  // "^([\n\r\f\v \t]|\\\\[\n\r])+"
  // Directly using the regex turned out to be slow. With the regex
  // version formatting all files in this directory took about 1.25
  // seconds. This version took about 0.5 seconds.
  const unsigned char *const Begin = Text.bytes_begin();
  const unsigned char *const End = Text.bytes_end();
  const unsigned char *Cur = Begin;
  while (Cur < End) {
    if (isWhitespace(Cur[0])) {
      ++Cur;
    } else if (Cur[0] == '\\') {
      // A backslash followed by optional horizontal whitespaces (P2223R2) and
      // then a newline always escapes the newline.
      // The source has a null byte at the end. So the end of the entire input
      // isn't reached yet. Also the lexer doesn't break apart an escaped
      // newline.
      const auto *Lookahead = Cur + 1;
      while (isHorizontalWhitespace(*Lookahead))
        ++Lookahead;
      // No line splice found; the backslash is a token.
      if (!isVerticalWhitespace(*Lookahead))
        break;
      // Splice found, consume it.
      Cur = Lookahead + 1;
    } else {
      // A non-whitespace, non-backslash byte: the significant token begins.
      break;
    }
  }
  return Cur - Begin;
}
1222
FormatToken *FormatTokenLexer::getNextToken() {
  // Returns the next significant token, consuming and recording all leading
  // whitespace, and applying language-specific token adjustments.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    // Emit the synthesized second half of a previously split ">>" or "<<".
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    // Walk the whitespace run, updating Column and newline bookkeeping.
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
        if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
            // The form feed is immediately preceded and followed by a newline.
            i > 0 && Text[i - 1] == '\n' &&
            ((i + 1 < e && Text[i + 1] == '\n') ||
             (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
          FormatTok->HasFormFeedBefore = true;
        }
        [[fallthrough]];
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // The code preceding the loop and in the countLeadingWhitespace
        // function guarantees that Text is entirely whitespace, not including
        // comments but including escaped newlines. So the character shows up,
        // then it has to be in an escape sequence.
        assert([&]() -> bool {
          size_t j = i + 1;
          while (j < Text.size() && isHorizontalWhitespace(Text[j]))
            ++j;
          return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
        }());
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // A token that remains unknown after whitespace stripping is treated as
  // literal text.
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  const bool IsCpp = Style.isCpp();

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if (const auto Text = FormatTok->TokenText;
      Text.starts_with("//") &&
      (IsCpp || Style.isJavaScript() || Style.isJava())) {
    assert(FormatTok->is(tok::comment));
    for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
         Pos = Text.find('\\', Pos)) {
      if (Pos < Text.size() && Text[Pos] == '\n' &&
          (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
        truncateToken(Pos);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  // Record the whole whitespace run preceding this token.
  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the raw identifier to a keyword or identifier kind, with
    // language-specific exceptions for names that are not keywords there.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.isJava() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isVerilog() && Keywords.isVerilogIdentifier(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    }
  } else if (const bool Greater = FormatTok->is(tok::greatergreater);
             Greater || FormatTok->is(tok::lessless)) {
    // Split ">>"/"<<" into two tokens; the second half is stashed and emitted
    // on the next getNextToken() call.
    FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
    tryParseJavaTextBlock();
  }

  if (Style.isVerilog() && !Tokens.empty() &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (IsCpp) {
    // Classify configured macros (ForEachMacros, IfMacros, ...) unless this
    // identifier is the subject of a #define.
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
         Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
             tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
        FormatTok->setFinalizedType(TT_FunctionLikeMacro);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
      else if (VariableTemplates.contains(Identifier))
        FormatTok->setFinalizedType(TT_VariableTemplate);
    }
  }

  return FormatTok;
}
1471
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // Lexes tokens that are special in Verilog (quote, backtick(s), escaped
  // identifiers) which the normal raw lexer would mis-tokenize. Returns false
  // when the character at the current position needs no special handling.
  const char *Start = Lex->getBufferLocation();
  size_t Len;
  switch (Start[0]) {
  // In Verilog the quote is not a character literal.
  case '\'':
    Len = 1;
    break;
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  case '`':
    if (Start[1] == '`')
      Len = 2;
    else
      Len = 1;
    break;
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  case '\\':
    // A backslash can also begin an escaped newline outside of an escaped
    // identifier.
    if (Start[1] == '\r' || Start[1] == '\n')
      return false;
    Len = 1;
    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
           Start[Len] != ' ') {
      // There is a null byte at the end of the buffer, so we don't have to
      // check whether the next byte is within the buffer.
      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
          Start[Len + 2] == '\n') {
        // An escaped CRLF keeps the identifier going.
        Len += 3;
      } else if (Start[Len] == '\\' &&
                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
        // An escaped CR or LF keeps the identifier going.
        Len += 2;
      } else {
        Len += 1;
      }
    }
    break;
  default:
    return false;
  }

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1529
1530void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1531 // For Verilog, first see if there is a special token, and fall back to the
1532 // normal lexer if there isn't one.
1533 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1534 Lex->LexFromRawLexer(Tok.Tok);
1535 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1536 Tok.Tok.getLength());
1537 // For formatting, treat unterminated string literals like normal string
1538 // literals.
1539 if (Tok.is(tok::unknown)) {
1540 if (Tok.TokenText.starts_with("\"")) {
1541 Tok.Tok.setKind(tok::string_literal);
1542 Tok.IsUnterminatedLiteral = true;
1543 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1544 Tok.Tok.setKind(tok::string_literal);
1545 }
1546 }
1547
1548 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1549 Tok.Tok.setKind(tok::string_literal);
1550
1551 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1552 FormattingDisabled = false;
1553
1554 Tok.Finalized = FormattingDisabled;
1555
1556 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1557 FormattingDisabled = true;
1558}
1559
1560void FormatTokenLexer::resetLexer(unsigned Offset) {
1561 StringRef Buffer = SourceMgr.getBufferData(ID);
1562 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1563 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1564 Lex->SetKeepWhitespaceMode(true);
1565 TrailingWhitespace = 0;
1566}
1567
1568} // namespace format
1569} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
FormatToken * Next
The next token in the unwrapped line.
bool is(tok::TokenKind Kind) const
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:142
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4757
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4753
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4361
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3934
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5763
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:562
@ Type
The name was classified as a type.
Definition Sema.h:564
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3939
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5753
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5814
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.