clang 22.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements FormatTokenLexer, which tokenizes a source file
/// into a FormatToken stream suitable for ClangFormat.
///
//===----------------------------------------------------------------------===//

15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
    const SourceManager &SourceMgr, FileID ID, unsigned Column,
    const FormatStyle &Style, encoding::Encoding Encoding,
    llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
    IdentifierTable &IdentTable)
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
      Column(Column), TrailingWhitespace(0),
      LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
      Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
      Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
      FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
      MacroBlockBeginRegex(Style.MacroBlockBegin),
      MacroBlockEndRegex(Style.MacroBlockEnd) {
  // Build the underlying Clang lexer over the file's buffer. Whitespace is
  // kept so that columns and newlines can be tracked per token.
  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
  Lex->SetKeepWhitespaceMode(true);

  // Intern every style-configured macro name once and record the TokenType
  // it should be annotated with when encountered in the token stream.
  for (const std::string &ForEachMacro : Style.ForEachMacros) {
    auto Identifier = &IdentTable.get(ForEachMacro);
    Macros.insert({Identifier, TT_ForEachMacro});
  }
  for (const std::string &IfMacro : Style.IfMacros) {
    auto Identifier = &IdentTable.get(IfMacro);
    Macros.insert({Identifier, TT_IfMacro});
  }
  for (const std::string &AttributeMacro : Style.AttributeMacros) {
    auto Identifier = &IdentTable.get(AttributeMacro);
    Macros.insert({Identifier, TT_AttributeMacro});
  }
  for (const std::string &StatementMacro : Style.StatementMacros) {
    auto Identifier = &IdentTable.get(StatementMacro);
    Macros.insert({Identifier, TT_StatementMacro});
  }
  for (const std::string &TypenameMacro : Style.TypenameMacros) {
    auto Identifier = &IdentTable.get(TypenameMacro);
    Macros.insert({Identifier, TT_TypenameMacro});
  }
  for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
    auto Identifier = &IdentTable.get(NamespaceMacro);
    Macros.insert({Identifier, TT_NamespaceMacro});
  }
  for (const std::string &WhitespaceSensitiveMacro :
       Style.WhitespaceSensitiveMacros) {
    auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
    Macros.insert({Identifier, TT_UntouchableMacroFunc});
  }
  for (const std::string &StatementAttributeLikeMacro :
       Style.StatementAttributeLikeMacros) {
    auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
    Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
  }

  // These sets only record the interned IdentifierInfo pointers; membership
  // is checked elsewhere, so interning once here makes lookups pointer
  // comparisons.
  for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
    MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
  for (const auto &TemplateName : Style.TemplateNames)
    TemplateNames.insert(&IdentTable.get(TemplateName));
  for (const auto &TypeName : Style.TypeNames)
    TypeNames.insert(&IdentTable.get(TypeName));
  for (const auto &VariableTemplate : Style.VariableTemplates)
    VariableTemplates.insert(&IdentTable.get(VariableTemplate));
}
86
  assert(Tokens.empty());
  assert(FirstInLineIndex == 0);
  // State machine for Style.OneLineFormatOffRegex ("format this one line
  // as-is" markers):
  //   FO_NextLine    - a matching comment stood on its own line; the
  //                    following line is finalized too.
  //   FO_CurrentLine - finalize tokens until the next line break.
  enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
  do {
    Tokens.push_back(getNextToken());
    auto &Tok = *Tokens.back();
    const auto NewlinesBefore = Tok.NewlinesBefore;
    switch (FormatOff) {
    case FO_NextLine:
      if (NewlinesBefore > 1) {
        // A blank line intervened; the protected "next line" never came.
        FormatOff = FO_None;
      } else {
        Tok.Finalized = true;
        FormatOff = FO_CurrentLine;
      }
      break;
    case FO_CurrentLine:
      if (NewlinesBefore == 0) {
        // Still on the protected line; keep finalizing.
        Tok.Finalized = true;
        break;
      }
      // Reached the next line: the region ends, but this token may itself
      // start a new one, so fall through to the matching logic.
      FormatOff = FO_None;
      [[fallthrough]];
    default:
      if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
        if (Tok.is(tok::comment) &&
            (NewlinesBefore > 0 || Tokens.size() == 1)) {
          // Marker comment on its own line: protect the line after it.
          Tok.Finalized = true;
          FormatOff = FO_NextLine;
        } else {
          // Marker appears mid-line: finalize the whole current line,
          // walking backwards until the line start.
          for (auto *Token : reverse(Tokens)) {
            Token->Finalized = true;
            if (Token->NewlinesBefore > 0)
              break;
          }
          FormatOff = FO_CurrentLine;
        }
      }
    }
    // Language-specific re-lexing of the token that was just produced.
    if (Style.isJavaScript()) {
      tryParseJSRegexLiteral();
      handleTemplateStrings();
    } else if (Style.isTextProto()) {
      tryParsePythonComment();
    }
    tryMergePreviousTokens();
    if (Style.isCSharp()) {
      // This needs to come after tokens have been merged so that C#
      // string literals are correctly identified.
      handleCSharpVerbatimAndInterpolatedStrings();
    } else if (Style.isTableGen()) {
      handleTableGenMultilineString();
      handleTableGenNumericLikeIdentifier();
    }
    // Remember where the current line starts; the merge helpers use it.
    if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
      FirstInLineIndex = Tokens.size() - 1;
  } while (Tokens.back()->isNot(tok::eof));
  if (Style.InsertNewlineAtEOF) {
    auto &TokEOF = *Tokens.back();
    if (TokEOF.NewlinesBefore == 0) {
      TokEOF.NewlinesBefore = 1;
      TokEOF.OriginalColumn = 0;
    }
  }
  return Tokens;
}
154
// Folds the most recently lexed tokens into one logical token where Clang's
// C++ lexer split what the target language treats as a single operator or
// literal. Called after every token is appended to Tokens; each helper
// returns true if it merged, in which case no further merging is attempted.
void FormatTokenLexer::tryMergePreviousTokens() {
  // Language-independent merges first.
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;
  if (Style.isCpp() && tryTransformTryUsageForC())
    return;

  if ((Style.Language == FormatStyle::LK_Cpp ||
       Style.Language == FormatStyle::LK_ObjC) &&
      tryMergeUserDefinedLiteral()) {
    return;
  }

  // Operators shared by JavaScript and C#.
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;

    if (Style.isCSharp()) {
      static const tok::TokenKind CSharpNullConditionalLSquare[] = {
          tok::question, tok::l_square};

      if (tryMergeCSharpKeywordVariables())
        return;
      if (tryMergeCSharpStringLiteral())
        return;
      if (tryTransformCSharpForEach())
        return;
      if (tryMergeTokens(CSharpNullConditionalLSquare,
                         TT_CSharpNullConditionalLSquare)) {
        // Treat like a regular "[" operator.
        Tokens.back()->Tok.setKind(tok::l_square);
        return;
      }
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  } else if (Style.isJava()) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  } else if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    // Assignment-like operators get assignment precedence forced.
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow},
                           {tok::hash, tok::minus, tok::hash},
                           {tok::hash, tok::equal, tok::hash}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  } else if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if". That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
369
370bool FormatTokenLexer::tryMergeNSStringLiteral() {
371 if (Tokens.size() < 2)
372 return false;
373 auto &At = *(Tokens.end() - 2);
374 auto &String = *(Tokens.end() - 1);
375 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
376 return false;
377 At->Tok.setKind(tok::string_literal);
378 At->TokenText = StringRef(At->TokenText.begin(),
379 String->TokenText.end() - At->TokenText.begin());
380 At->ColumnWidth += String->ColumnWidth;
381 At->setType(TT_ObjCStringLiteral);
382 Tokens.erase(Tokens.end() - 1);
383 return true;
384}
385
386bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
387 // Merges #idenfier into a single identifier with the text #identifier
388 // but the token tok::identifier.
389 if (Tokens.size() < 2)
390 return false;
391 auto &Hash = *(Tokens.end() - 2);
392 auto &Identifier = *(Tokens.end() - 1);
393 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
394 return false;
395 Hash->Tok.setKind(tok::identifier);
396 Hash->TokenText =
397 StringRef(Hash->TokenText.begin(),
398 Identifier->TokenText.end() - Hash->TokenText.begin());
399 Hash->ColumnWidth += Identifier->ColumnWidth;
400 Hash->setType(TT_JsPrivateIdentifier);
401 Tokens.erase(Tokens.end() - 1);
402 return true;
403}
404
// Search for verbatim or interpolated string literals @"ABC" or
// $"aaaaa{abc}aaaaa" and mark the token as TT_CSharpStringLiteral, to
// prevent splitting of @, $ and ".
// Merging of multiline verbatim strings with embedded '"' is handled in
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
  if (Tokens.size() < 2)
    return false;

  // Look for @"aaaaaa" or $"aaaaaa".
  const auto String = *(Tokens.end() - 1);
  if (String->isNot(tok::string_literal))
    return false;

  // The prefix must be '@' or '$' directly before the literal.
  auto Prefix = *(Tokens.end() - 2);
  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
    return false;

  if (Tokens.size() > 2) {
    const auto Tok = *(Tokens.end() - 3);
    if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
        (Tok->is(tok::at) && Prefix->TokenText == "$")) {
      // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
      Tok->ColumnWidth += Prefix->ColumnWidth;
      Tokens.erase(Tokens.end() - 2);
      Prefix = Tok;
    }
  }

  // Convert back into just a string_literal whose text spans from the prefix
  // through the end of the literal.
  Prefix->Tok.setKind(tok::string_literal);
  Prefix->TokenText =
      StringRef(Prefix->TokenText.begin(),
                String->TokenText.end() - Prefix->TokenText.begin());
  Prefix->ColumnWidth += String->ColumnWidth;
  Prefix->setType(TT_CSharpStringLiteral);
  Tokens.erase(Tokens.end() - 1);
  return true;
}
444
// Valid C# attribute targets:
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
// NOTE(review): keep this set in sync with the list documented above.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
    "assembly", "module", "field", "event", "method",
    "param", "property", "return", "type",
};
451
452bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
453 if (Tokens.size() < 2)
454 return false;
455 auto &NullishCoalescing = *(Tokens.end() - 2);
456 auto &Equal = *(Tokens.end() - 1);
457 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
458 Equal->isNot(tok::equal)) {
459 return false;
460 }
461 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
462 NullishCoalescing->TokenText =
463 StringRef(NullishCoalescing->TokenText.begin(),
464 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
465 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
466 NullishCoalescing->setType(TT_NullCoalescingEqual);
467 Tokens.erase(Tokens.end() - 1);
468 return true;
469}
470
471bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
472 if (Tokens.size() < 2)
473 return false;
474 const auto At = *(Tokens.end() - 2);
475 if (At->isNot(tok::at))
476 return false;
477 const auto Keyword = *(Tokens.end() - 1);
478 if (Keyword->TokenText == "$")
479 return false;
480 if (!Keywords.isCSharpKeyword(*Keyword))
481 return false;
482
483 At->Tok.setKind(tok::identifier);
484 At->TokenText = StringRef(At->TokenText.begin(),
485 Keyword->TokenText.end() - At->TokenText.begin());
486 At->ColumnWidth += Keyword->ColumnWidth;
487 At->setType(Keyword->getType());
488 Tokens.erase(Tokens.end() - 1);
489 return true;
490}
491
492// In C# transform identifier foreach into kw_foreach
493bool FormatTokenLexer::tryTransformCSharpForEach() {
494 if (Tokens.empty())
495 return false;
496 auto &Identifier = *(Tokens.end() - 1);
497 if (Identifier->isNot(tok::identifier))
498 return false;
499 if (Identifier->TokenText != "foreach")
500 return false;
501
502 Identifier->setType(TT_ForEachMacro);
503 Identifier->Tok.setKind(tok::kw_for);
504 return true;
505}
506
507bool FormatTokenLexer::tryMergeForEach() {
508 if (Tokens.size() < 2)
509 return false;
510 auto &For = *(Tokens.end() - 2);
511 auto &Each = *(Tokens.end() - 1);
512 if (For->isNot(tok::kw_for))
513 return false;
514 if (Each->isNot(tok::identifier))
515 return false;
516 if (Each->TokenText != "each")
517 return false;
518
519 For->setType(TT_ForEachMacro);
520 For->Tok.setKind(tok::kw_for);
521
522 For->TokenText = StringRef(For->TokenText.begin(),
523 Each->TokenText.end() - For->TokenText.begin());
524 For->ColumnWidth += Each->ColumnWidth;
525 Tokens.erase(Tokens.end() - 1);
526 return true;
527}
528
529bool FormatTokenLexer::tryTransformTryUsageForC() {
530 if (Tokens.size() < 2)
531 return false;
532 auto &Try = *(Tokens.end() - 2);
533 if (Try->isNot(tok::kw_try))
534 return false;
535 auto &Next = *(Tokens.end() - 1);
536 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
537 return false;
538
539 if (Tokens.size() > 2) {
540 auto &At = *(Tokens.end() - 3);
541 if (At->is(tok::at))
542 return false;
543 }
544
545 Try->Tok.setKind(tok::identifier);
546 return true;
547}
548
549bool FormatTokenLexer::tryMergeLessLess() {
550 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
551 if (Tokens.size() < 3)
552 return false;
553
554 auto First = Tokens.end() - 3;
555 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
556 return false;
557
558 // Only merge if there currently is no whitespace between the two "<".
559 if (First[1]->hasWhitespaceBefore())
560 return false;
561
562 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
563 if (X && X->is(tok::less))
564 return false;
565
566 auto Y = First[2];
567 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
568 return false;
569
570 First[0]->Tok.setKind(tok::lessless);
571 First[0]->TokenText = "<<";
572 First[0]->ColumnWidth += 1;
573 Tokens.erase(Tokens.end() - 2);
574 return true;
575}
576
577bool FormatTokenLexer::tryMergeGreaterGreater() {
578 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
579 if (Tokens.size() < 2)
580 return false;
581
582 auto First = Tokens.end() - 2;
583 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
584 return false;
585
586 // Only merge if there currently is no whitespace between the first two ">".
587 if (First[1]->hasWhitespaceBefore())
588 return false;
589
590 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
591 if (Tok && Tok->isNot(tok::kw_operator))
592 return false;
593
594 First[0]->Tok.setKind(tok::greatergreater);
595 First[0]->TokenText = ">>";
596 First[0]->ColumnWidth += 1;
597 Tokens.erase(Tokens.end() - 1);
598 return true;
599}
600
601bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
602 if (Tokens.size() < 2)
603 return false;
604
605 auto *First = Tokens.end() - 2;
606 auto &Suffix = First[1];
607 if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
608 return false;
609
610 auto &Literal = First[0];
611 if (!Literal->Tok.isLiteral())
612 return false;
613
614 auto &Text = Literal->TokenText;
615 if (!Text.ends_with("_"))
616 return false;
617
618 Text = StringRef(Text.data(), Text.size() + 1);
619 ++Literal->ColumnWidth;
620 Tokens.erase(&Suffix);
621 return true;
622}
623
624bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
625 TokenType NewType) {
626 if (Tokens.size() < Kinds.size())
627 return false;
628
629 const auto *First = Tokens.end() - Kinds.size();
630 for (unsigned i = 0; i < Kinds.size(); ++i)
631 if (First[i]->isNot(Kinds[i]))
632 return false;
633
634 return tryMergeTokens(Kinds.size(), NewType);
635}
636
// Merges the trailing Count tokens into the first of them, provided no
// whitespace separates any pair of neighbors. Returns true on success.
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  if (Tokens.size() < Count)
    return false;

  const auto *First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Drop the merged-away tokens. Shrinking does not reallocate, so First
  // still points at the surviving head token.
  Tokens.resize(Tokens.size() - Count + 1);
  // The tokens are adjacent in the source buffer (no whitespace between
  // them), so widening the head's text covers all of them.
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->setType(NewType);
  return true;
}
658
// Returns true if the trailing tokens match any one of the candidate kind
// sequences and could be merged into a single token of type NewType.
bool FormatTokenLexer::tryMergeTokensAny(
  return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
    return tryMergeTokens(Kinds, NewType);
  });
}
665
666// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
667bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
668 // NB: This is not entirely correct, as an r_paren can introduce an operand
669 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
670 // corner case to not matter in practice, though.
671 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
672 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
673 tok::colon, tok::question, tok::tilde) ||
674 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
675 tok::kw_else, tok::kw_void, tok::kw_typeof,
676 Keywords.kw_instanceof, Keywords.kw_in) ||
677 Tok->isPlacementOperator() || Tok->isBinaryOperator();
678}
679
680bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
681 if (!Prev)
682 return true;
683
684 // Regex literals can only follow after prefix unary operators, not after
685 // postfix unary operators. If the '++' is followed by a non-operand
686 // introducing token, the slash here is the operand and not the start of a
687 // regex.
688 // `!` is an unary prefix operator, but also a post-fix operator that casts
689 // away nullability, so the same check applies.
690 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
691 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
692
693 // The previous token must introduce an operand location where regex
694 // literals can occur.
695 if (!precedesOperand(Prev))
696 return false;
697
698 return true;
699}
700
// Skips over a Java text block. The lexer produces an empty string literal
// `""` when it sees the first two quotes of the `"""` delimiter; if a third
// quote follows immediately, this is a text block and we scan the raw buffer
// for its closing `"""`, then resume lexing after it.
void FormatTokenLexer::tryParseJavaTextBlock() {
  if (FormatTok->TokenText != "\"\"")
    return;

  const auto *S = Lex->getBufferLocation();
  const auto *End = Lex->getBuffer().end();

  if (S == End || *S != '\"')
    return;

  ++S; // Skip the `"""` that begins a text block.

  // Find the `"""` that ends the text block.
  // Count tracks the current run of consecutive quotes; a backslash sets it
  // to -1 so the escaped character resets the run to zero (an escaped quote
  // therefore does not count towards the terminator).
  for (int Count = 0; Count < 3 && S < End; ++S) {
    switch (*S) {
    case '\\':
      Count = -1;
      break;
    case '\"':
      ++Count;
      break;
    default:
      Count = 0;
    }
  }

  // S now points one past the closing delimiter (or at End if unterminated).
  // Ignore the possibly invalid text block.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
}
730
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token; only it determines whether a regex
  // literal may start here.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Resume regular lexing after the regex (Offset is one past the closing
  // '/', or at the buffer end if unterminated).
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
789
// Scans [Begin, End) for the '"' that terminates a C# verbatim and/or
// interpolated string literal. Returns a pointer to the terminating quote,
// or End if the literal is unterminated (or malformed).
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the character after *Begin exists and repeats *Begin.
  auto NextRepeats = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  // Look for a terminating '"' in the current file buffer.
  // Make no effort to format code within an interpolated or verbatim string.
  //
  // Interpolated strings could contain { } with " characters inside.
  // $"{x ?? "null"}"
  // should not be split into $"{x ?? ", null, "}" but should be treated as a
  // single string-literal.
  //
  // We opt not to try and format expressions inside {} within a C#
  // interpolated string. Formatting expressions within an interpolated string
  // would require similar work as that done for JavaScript template strings
  // in `handleTemplateStrings()`.
  int OpenBraces = 0;
  while (Begin < End) {
    switch (*Begin) {
    case '\\':
      // In non-verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        if (NextRepeats())
          ++Begin; // "{{" is an escaped brace, not an interpolation hole.
        else
          ++OpenBraces;
      }
      break;
    case '}':
      if (Interpolated) {
        if (NextRepeats())
          ++Begin; // "}}" is an escaped brace.
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End; // Unbalanced '}': treat the literal as unterminated.
      }
      break;
    case '"':
      if (OpenBraces > 0)
        break; // Quotes inside an interpolation hole do not terminate.
      if (Verbatim && NextRepeats()) {
        // "" within a verbatim string is an escaped double quote: skip it.
        ++Begin;
        break;
      }
      return Begin;
    }
    ++Begin;
  }

  return End;
}
848
// Re-lexes a C# verbatim/interpolated string literal from the raw buffer:
// the regular lexer stops at the first '"', which is too early for these
// forms, so the token's text is extended to the true terminator and the
// lexer is reset past it.
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Classify by prefix: '@' marks verbatim, '$' marks interpolated; both
  // may be combined in either order.
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Scan the raw buffer from just after the prefix (2 or 3 characters
  // including the opening quote) for the real terminator.
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  Offset += Verbatim && Interpolated ? 3 : 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  // Extend the token's text through the closing quote.
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    unsigned StartColumn = 0;
    // LastLineColumnWidth holds the width of the final line only.
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  assert(Offset < End);
  // Resume regular lexing just past the closing quote.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
907
// Extends a TableGen multi-line string token ("[{ ... }]") to its closing
// delimiter by scanning the raw buffer, then resets the lexer past it and
// fixes up the token's width bookkeeping.
void FormatTokenLexer::handleTableGenMultilineString() {
  FormatToken *MultiLineString = Tokens.back();
  if (MultiLineString->isNot(TT_TableGenMultiLineString))
    return;

  // The token so far covers only the "[{" opener.
  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
  // "}]" is the end of multi line string.
  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
  if (CloseOffset == StringRef::npos)
    return;
  // Include both delimiters in the token's text.
  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
  MultiLineString->TokenText = Text;
  // Resume regular lexing immediately after the "}]".
  resetLexer(SourceMgr.getFileOffset(
      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
  auto FirstLineText = Text;
  auto FirstBreak = Text.find('\n');
  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
  if (FirstBreak != StringRef::npos) {
    MultiLineString->IsMultiline = true;
    FirstLineText = Text.substr(0, FirstBreak + 1);
    // LastLineColumnWidth holds the width of the last line.
    auto LastBreak = Text.rfind('\n');
    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
        Style.TabWidth, Encoding);
  }
  // ColumnWidth holds only the width of the first line.
  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
938
939void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
940 FormatToken *Tok = Tokens.back();
941 // TableGen identifiers can begin with digits. Such tokens are lexed as
942 // numeric_constant now.
943 if (Tok->isNot(tok::numeric_constant))
944 return;
945 StringRef Text = Tok->TokenText;
946 // The following check is based on llvm::TGLexer::LexToken.
947 // That lexes the token as a number if any of the following holds:
948 // 1. It starts with '+', '-'.
949 // 2. All the characters are digits.
950 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
951 // 4. The first non-digit character is 'x', and the next is a hex digit.
952 // Note that in the case 3 and 4, if the next character does not exists in
953 // this token, the token is an identifier.
954 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
955 return;
956 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
957 // All the characters are digits
958 if (NonDigitPos == StringRef::npos)
959 return;
960 char FirstNonDigit = Text[NonDigitPos];
961 if (NonDigitPos < Text.size() - 1) {
962 char TheNext = Text[NonDigitPos + 1];
963 // Regarded as a binary number.
964 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
965 return;
966 // Regarded as hex number.
967 if (FirstNonDigit == 'x' && isxdigit(TheNext))
968 return;
969 }
970 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
971 // This is actually an identifier in TableGen.
972 Tok->Tok.setKind(tok::identifier);
973 Tok->Tok.setIdentifierInfo(nullptr);
974 }
975}
976
// Handles JavaScript template string literals (`...${expr}...`). Maintains
// StateStack so that expression interpolations nested inside the template
// are lexed normally, and merges the template-string text (up to a closing
// backtick or an interpolation start) into a single string_literal token.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  // An '{' while lexing inside a template string starts a nested scope that
  // is lexed normally.
  if (BacktickToken->is(tok::l_brace)) {
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    // Never pop the outermost NORMAL state.
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fall through and continue lexing the
    // remainder of the template string literal below.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    // A backtick opens a new template string.
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template string.
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  // Turn the scanned-over text (from the backtick up to Offset) into a
  // single string_literal token.
  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume the real lexer right after the consumed text.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
1042
1043void FormatTokenLexer::tryParsePythonComment() {
1044 FormatToken *HashToken = Tokens.back();
1045 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1046 return;
1047 // Turn the remainder of this line into a comment.
1048 const char *CommentBegin =
1049 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1050 size_t From = CommentBegin - Lex->getBuffer().begin();
1051 size_t To = Lex->getBuffer().find_first_of('\n', From);
1052 if (To == StringRef::npos)
1053 To = Lex->getBuffer().size();
1054 size_t Len = To - From;
1055 HashToken->setType(TT_LineComment);
1056 HashToken->Tok.setKind(tok::comment);
1057 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1058 SourceLocation Loc = To < Lex->getBuffer().size()
1059 ? Lex->getSourceLocation(CommentBegin + Len)
1060 : SourceMgr.getLocForEndOfFile(ID);
1061 resetLexer(SourceMgr.getFileOffset(Loc));
1062}
1063
1064bool FormatTokenLexer::tryMerge_TMacro() {
1065 if (Tokens.size() < 4)
1066 return false;
1067 FormatToken *Last = Tokens.back();
1068 if (Last->isNot(tok::r_paren))
1069 return false;
1070
1071 FormatToken *String = Tokens[Tokens.size() - 2];
1072 if (String->isNot(tok::string_literal) || String->IsMultiline)
1073 return false;
1074
1075 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1076 return false;
1077
1078 FormatToken *Macro = Tokens[Tokens.size() - 4];
1079 if (Macro->TokenText != "_T")
1080 return false;
1081
1082 const char *Start = Macro->TokenText.data();
1083 const char *End = Last->TokenText.data() + Last->TokenText.size();
1084 String->TokenText = StringRef(Start, End - Start);
1085 String->IsFirst = Macro->IsFirst;
1086 String->LastNewlineOffset = Macro->LastNewlineOffset;
1087 String->WhitespaceRange = Macro->WhitespaceRange;
1088 String->OriginalColumn = Macro->OriginalColumn;
1089 String->ColumnWidth = encoding::columnWidthWithTabs(
1090 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1091 String->NewlinesBefore = Macro->NewlinesBefore;
1092 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1093
1094 Tokens.pop_back();
1095 Tokens.pop_back();
1096 Tokens.pop_back();
1097 Tokens.back() = String;
1098 if (FirstInLineIndex >= Tokens.size())
1099 FirstInLineIndex = Tokens.size() - 1;
1100 return true;
1101}
1102
1103bool FormatTokenLexer::tryMergeConflictMarkers() {
1104 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1105 return false;
1106
1107 // Conflict lines look like:
1108 // <marker> <text from the vcs>
1109 // For example:
1110 // >>>>>>> /file/in/file/system at revision 1234
1111 //
1112 // We merge all tokens in a line that starts with a conflict marker
1113 // into a single token with a special token type that the unwrapped line
1114 // parser will use to correctly rebuild the underlying code.
1115
1116 FileID ID;
1117 // Get the position of the first token in the line.
1118 unsigned FirstInLineOffset;
1119 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1120 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1121 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1122 // Calculate the offset of the start of the current line.
1123 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1124 if (LineOffset == StringRef::npos)
1125 LineOffset = 0;
1126 else
1127 ++LineOffset;
1128
1129 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1130 StringRef LineStart;
1131 if (FirstSpace == StringRef::npos)
1132 LineStart = Buffer.substr(LineOffset);
1133 else
1134 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1135
1136 TokenType Type = TT_Unknown;
1137 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1138 Type = TT_ConflictStart;
1139 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1140 LineStart == "====") {
1141 Type = TT_ConflictAlternative;
1142 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1143 Type = TT_ConflictEnd;
1144 }
1145
1146 if (Type != TT_Unknown) {
1147 FormatToken *Next = Tokens.back();
1148
1149 Tokens.resize(FirstInLineIndex + 1);
1150 // We do not need to build a complete token here, as we will skip it
1151 // during parsing anyway (as we must not touch whitespace around conflict
1152 // markers).
1153 Tokens.back()->setType(Type);
1154 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1155
1156 Tokens.push_back(Next);
1157 return true;
1158 }
1159
1160 return false;
1161}
1162
1163FormatToken *FormatTokenLexer::getStashedToken() {
1164 // Create a synthesized second '>' or '<' token.
1165 Token Tok = FormatTok->Tok;
1166 StringRef TokenText = FormatTok->TokenText;
1167
1168 unsigned OriginalColumn = FormatTok->OriginalColumn;
1169 FormatTok = new (Allocator.Allocate()) FormatToken;
1170 FormatTok->Tok = Tok;
1171 SourceLocation TokLocation =
1172 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1173 FormatTok->Tok.setLocation(TokLocation);
1174 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1175 FormatTok->TokenText = TokenText;
1176 FormatTok->ColumnWidth = 1;
1177 FormatTok->OriginalColumn = OriginalColumn + 1;
1178
1179 return FormatTok;
1180}
1181
1182/// Truncate the current token to the new length and make the lexer continue
1183/// from the end of the truncated token. Used for other languages that have
1184/// different token boundaries, like JavaScript in which a comment ends at a
1185/// line break regardless of whether the line break follows a backslash. Also
1186/// used to set the lexer to the end of whitespace if the lexer regards
1187/// whitespace and an unrecognized symbol as one token.
1188void FormatTokenLexer::truncateToken(size_t NewLen) {
1189 assert(NewLen <= FormatTok->TokenText.size());
1190 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1191 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1192 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1193 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1194 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1195 Encoding);
1196 FormatTok->Tok.setLength(NewLen);
1197}
1198
1199/// Count the length of leading whitespace in a token.
1200static size_t countLeadingWhitespace(StringRef Text) {
1201 // Basically counting the length matched by this regex.
1202 // "^([\n\r\f\v \t]|\\\\[\n\r])+"
1203 // Directly using the regex turned out to be slow. With the regex
1204 // version formatting all files in this directory took about 1.25
1205 // seconds. This version took about 0.5 seconds.
1206 const unsigned char *const Begin = Text.bytes_begin();
1207 const unsigned char *const End = Text.bytes_end();
1208 const unsigned char *Cur = Begin;
1209 while (Cur < End) {
1210 if (isWhitespace(Cur[0])) {
1211 ++Cur;
1212 } else if (Cur[0] == '\\') {
1213 // A backslash followed by optional horizontal whitespaces (P22232R2) and
1214 // then a newline always escapes the newline.
1215 // The source has a null byte at the end. So the end of the entire input
1216 // isn't reached yet. Also the lexer doesn't break apart an escaped
1217 // newline.
1218 const auto *Lookahead = Cur + 1;
1219 while (isHorizontalWhitespace(*Lookahead))
1220 ++Lookahead;
1221 // No line splice found; the backslash is a token.
1222 if (!isVerticalWhitespace(*Lookahead))
1223 break;
1224 // Splice found, consume it.
1225 Cur = Lookahead + 1;
1226 } else {
1227 break;
1228 }
1229 }
1230 return Cur - Begin;
1231}
1232
// Lexes and returns the next significant token: consumes and records all
// leading whitespace (updating newline counts, column tracking, and form-feed
// state), applies language-specific token adjustments (Verilog, JavaScript,
// Java, TableGen), computes the token's column geometry, and classifies
// identifiers that match the configured macro lists.
FormatToken *FormatTokenLexer::getNextToken() {
  // A stashed state means the previous "<<"/">>" token was split in two;
  // return the synthesized second half before lexing anything new.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace trimmed off the previous token (TrailingWhitespace) belongs to
  // this token's whitespace range.
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    // Keep only the whitespace prefix as the current token; the rest is
    // re-lexed after the loop body records the whitespace.
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
        if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
            // The form feed is immediately preceded and followed by a newline.
            i > 0 && Text[i - 1] == '\n' &&
            ((i + 1 < e && Text[i + 1] == '\n') ||
             (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
          FormatTok->HasFormFeedBefore = true;
        }
        [[fallthrough]];
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // The code preceding the loop and in the countLeadingWhitespace
        // function guarantees that Text is entirely whitespace, not including
        // comments but including escaped newlines. So if the character shows
        // up, it has to be in an escape sequence.
        assert([&]() -> bool {
          size_t j = i + 1;
          while (j < Text.size() && isHorizontalWhitespace(Text[j]))
            ++j;
          return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
        }());
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  // Tokens the raw lexer could not classify are treated as implicit string
  // literals (relevant e.g. for protos and text formats).
  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  const bool IsCpp = Style.isCpp();

  // JavaScript and Java do not allow to escape the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if (const auto Text = FormatTok->TokenText;
      Text.starts_with("//") &&
      (IsCpp || Style.isJavaScript() || Style.isJava())) {
    assert(FormatTok->is(tok::comment));
    for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
         Pos = Text.find('\\', Pos)) {
      if (Pos < Text.size() && Text[Pos] == '\n' &&
          (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
        truncateToken(Pos);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    // Trailing whitespace inside a comment is stripped off the token and
    // accounted for on the next call.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    // Resolve the identifier against the keyword table; demote C++ keywords
    // that are plain identifiers in the current language.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    if (Style.isJava() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    }
  } else if (const bool Greater = FormatTok->is(tok::greatergreater);
             Greater || FormatTok->is(tok::lessless)) {
    // Split ">>" / "<<" into two tokens; the second half is stashed and
    // returned by the next getNextToken() call.
    FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
    tryParseJavaTextBlock();
  }

  if (Style.isVerilog() && !Tokens.empty() &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (IsCpp) {
    // Classify identifiers against the user-configured macro/name lists,
    // except right after `#define`, where the name being defined must not be
    // treated as a macro use.
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
         Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
             tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
        FormatTok->setFinalizedType(TT_FunctionLikeMacro);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
      else if (VariableTemplates.contains(Identifier))
        FormatTok->setFinalizedType(TT_VariableTemplate);
    }
  }

  return FormatTok;
}
1479
// Lexes Verilog constructs that the C raw lexer mishandles: the apostrophe
// (not a character literal in Verilog), backtick / double backtick (Verilog's
// preprocessor introducers), and backslash-escaped identifiers. Returns true
// and advances the lexer past the token if one was produced; returns false
// to fall back to the normal raw lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  const char *Start = Lex->getBufferLocation();
  size_t Len;
  switch (Start[0]) {
  // In Verilog the quote is not a character literal.
  case '\'':
    Len = 1;
    break;
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  case '`':
    if (Start[1] == '`')
      Len = 2;
    else
      Len = 1;
    break;
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  case '\\':
    // A backslash can also begin an escaped newline outside of an escaped
    // identifier.
    if (Start[1] == '\r' || Start[1] == '\n')
      return false;
    Len = 1;
    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
           Start[Len] != ' ') {
      // There is a null byte at the end of the buffer, so we don't have to
      // check whether the next byte is within the buffer.
      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
          Start[Len + 2] == '\n') {
        // Consume an escaped CRLF inside the identifier.
        Len += 3;
      } else if (Start[Len] == '\\' &&
                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
        // Consume an escaped CR or LF inside the identifier.
        Len += 2;
      } else {
        Len += 1;
      }
    }
    break;
  default:
    return false;
  }

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1537
1538void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1539 // For Verilog, first see if there is a special token, and fall back to the
1540 // normal lexer if there isn't one.
1541 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1542 Lex->LexFromRawLexer(Tok.Tok);
1543 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1544 Tok.Tok.getLength());
1545 // For formatting, treat unterminated string literals like normal string
1546 // literals.
1547 if (Tok.is(tok::unknown)) {
1548 if (Tok.TokenText.starts_with("\"")) {
1549 Tok.Tok.setKind(tok::string_literal);
1550 Tok.IsUnterminatedLiteral = true;
1551 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1552 Tok.Tok.setKind(tok::string_literal);
1553 }
1554 }
1555
1556 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1557 Tok.Tok.setKind(tok::string_literal);
1558
1559 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1560 FormattingDisabled = false;
1561
1562 Tok.Finalized = FormattingDisabled;
1563
1564 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1565 FormattingDisabled = true;
1566}
1567
// Recreates the raw lexer positioned at the given byte offset within the
// file, discarding any whitespace state carried over from the previous
// position. Used whenever a handler has manually consumed buffer text.
void FormatTokenLexer::resetLexer(unsigned Offset) {
  StringRef Buffer = SourceMgr.getBufferData(ID);
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
                      Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
  Lex->SetKeepWhitespaceMode(true);
  // Whitespace trimmed before the jump no longer precedes the next token.
  TrailingWhitespace = 0;
}
1575
1576} // namespace format
1577} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
FormatToken * Next
The next token in the unwrapped line.
bool is(tok::TokenKind Kind) const
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:134
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4499
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4495
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4139
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3545
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5364
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:560
@ Type
The name was classified as a type.
Definition Sema.h:562
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3550
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5354
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5415
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.