FormatTokenLexer.cpp
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36 MacroBlockEndRegex(Style.MacroBlockEnd) {
37 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38 Lex->SetKeepWhitespaceMode(true);
39
40 for (const std::string &ForEachMacro : Style.ForEachMacros) {
41 auto Identifier = &IdentTable.get(ForEachMacro);
42 Macros.insert({Identifier, TT_ForEachMacro});
43 }
44 for (const std::string &IfMacro : Style.IfMacros) {
45 auto Identifier = &IdentTable.get(IfMacro);
46 Macros.insert({Identifier, TT_IfMacro});
47 }
48 for (const std::string &AttributeMacro : Style.AttributeMacros) {
49 auto Identifier = &IdentTable.get(AttributeMacro);
50 Macros.insert({Identifier, TT_AttributeMacro});
51 }
52 for (const std::string &StatementMacro : Style.StatementMacros) {
53 auto Identifier = &IdentTable.get(StatementMacro);
54 Macros.insert({Identifier, TT_StatementMacro});
55 }
56 for (const std::string &TypenameMacro : Style.TypenameMacros) {
57 auto Identifier = &IdentTable.get(TypenameMacro);
58 Macros.insert({Identifier, TT_TypenameMacro});
59 }
60 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61 auto Identifier = &IdentTable.get(NamespaceMacro);
62 Macros.insert({Identifier, TT_NamespaceMacro});
63 }
64 for (const std::string &WhitespaceSensitiveMacro :
65 Style.WhitespaceSensitiveMacros) {
66 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67 Macros.insert({Identifier, TT_UntouchableMacroFunc});
68 }
69 for (const std::string &StatementAttributeLikeMacro :
70 Style.StatementAttributeLikeMacros) {
71 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73 }
74
75 for (const auto &TemplateName : Style.TemplateNames)
76 TemplateNames.insert(&IdentTable.get(TemplateName));
77 for (const auto &TypeName : Style.TypeNames)
78 TypeNames.insert(&IdentTable.get(TypeName));
79 for (const auto &VariableTemplate : Style.VariableTemplates)
80 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
81}
82
83ArrayRef<FormatToken *> FormatTokenLexer::lex() {
84 assert(Tokens.empty());
85 assert(FirstInLineIndex == 0);
86 do {
87 Tokens.push_back(getNextToken());
88 if (Style.isJavaScript()) {
89 tryParseJSRegexLiteral();
90 handleTemplateStrings();
91 }
92 if (Style.Language == FormatStyle::LK_TextProto)
93 tryParsePythonComment();
94 tryMergePreviousTokens();
95 if (Style.isCSharp()) {
96 // This needs to come after tokens have been merged so that C#
97 // string literals are correctly identified.
98 handleCSharpVerbatimAndInterpolatedStrings();
99 }
100 if (Style.isTableGen()) {
101 handleTableGenMultilineString();
102 handleTableGenNumericLikeIdentifier();
103 }
104 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
105 FirstInLineIndex = Tokens.size() - 1;
106 } while (Tokens.back()->isNot(tok::eof));
107 if (Style.InsertNewlineAtEOF) {
108 auto &TokEOF = *Tokens.back();
109 if (TokEOF.NewlinesBefore == 0) {
110 TokEOF.NewlinesBefore = 1;
111 TokEOF.OriginalColumn = 0;
112 }
113 }
114 return Tokens;
115}
116
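// Tries to combine the most recently lexed tokens into a single FormatToken
// when a language uses multi-character operators that the raw lexer splits.
// For example, in JavaScript the token pair '?' '?' below becomes one token
// of type TT_NullCoalescingOperator, and '=' '>' becomes a TT_FatArrow token.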
117void FormatTokenLexer::tryMergePreviousTokens() {
118 if (tryMerge_TMacro())
119 return;
120 if (tryMergeConflictMarkers())
121 return;
122 if (tryMergeLessLess())
123 return;
124 if (tryMergeGreaterGreater())
125 return;
126 if (tryMergeForEach())
127 return;
128 if (Style.isCpp() && tryTransformTryUsageForC())
129 return;
130
131 if (Style.isJavaScript() || Style.isCSharp()) {
132 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
133 tok::question};
134 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
135 tok::period};
136 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
137
138 if (tryMergeTokens(FatArrow, TT_FatArrow))
139 return;
140 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
141 // Treat like the "||" operator (as opposed to the ternary ?).
142 Tokens.back()->Tok.setKind(tok::pipepipe);
143 return;
144 }
145 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
146 // Treat like a regular "." access.
147 Tokens.back()->Tok.setKind(tok::period);
148 return;
149 }
150 if (tryMergeNullishCoalescingEqual())
151 return;
152 }
153
154 if (Style.isCSharp()) {
155 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
156 tok::question, tok::l_square};
157
158 if (tryMergeCSharpKeywordVariables())
159 return;
160 if (tryMergeCSharpStringLiteral())
161 return;
162 if (tryTransformCSharpForEach())
163 return;
164 if (tryMergeTokens(CSharpNullConditionalLSquare,
165 TT_CSharpNullConditionalLSquare)) {
166 // Treat like a regular "[" operator.
167 Tokens.back()->Tok.setKind(tok::l_square);
168 return;
169 }
170 }
171
172 if (tryMergeNSStringLiteral())
173 return;
174
175 if (Style.isJavaScript()) {
176 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
177 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
178 tok::equal};
179 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
180 tok::greaterequal};
181 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
182 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
183 tok::starequal};
184 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
185 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
186
187 // FIXME: Investigate what token type gives the correct operator priority.
188 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
189 return;
190 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
191 return;
192 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
193 return;
194 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
195 return;
196 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
197 Tokens.back()->Tok.setKind(tok::starequal);
198 return;
199 }
200 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
201 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
202 // Treat like the "=" assignment operator.
203 Tokens.back()->Tok.setKind(tok::equal);
204 return;
205 }
206 if (tryMergeJSPrivateIdentifier())
207 return;
208 }
209
210 if (Style.Language == FormatStyle::LK_Java) {
211 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
212 tok::greater, tok::greater, tok::greaterequal};
213 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
214 return;
215 }
216
217 if (Style.isVerilog()) {
218 // Merge the number following a base like `'h?a0`.
219 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
220 Tokens.end()[-2]->is(tok::numeric_constant) &&
221 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
222 tok::question) &&
223 tryMergeTokens(2, TT_Unknown)) {
224 return;
225 }
226 // Part select.
227 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
228 TT_BitFieldColon)) {
229 return;
230 }
231 // Xnor. The combined token is treated as a caret which can also be either a
232 // unary or binary operator. The actual type is determined in
233 // TokenAnnotator. We also check the token length so we know it is not
234 // already a merged token.
235 if (Tokens.back()->TokenText.size() == 1 &&
236 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
237 TT_BinaryOperator)) {
238 Tokens.back()->Tok.setKind(tok::caret);
239 return;
240 }
241 // Signed shift and distribution weight.
242 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
243 Tokens.back()->Tok.setKind(tok::lessless);
244 return;
245 }
246 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
247 Tokens.back()->Tok.setKind(tok::greatergreater);
248 return;
249 }
250 if (tryMergeTokensAny({{tok::lessless, tok::equal},
251 {tok::lessless, tok::lessequal},
252 {tok::greatergreater, tok::equal},
253 {tok::greatergreater, tok::greaterequal},
254 {tok::colon, tok::equal},
255 {tok::colon, tok::slash}},
256 TT_BinaryOperator)) {
257 Tokens.back()->ForcedPrecedence = prec::Assignment;
258 return;
259 }
260 // Exponentiation, signed shift, case equality, and wildcard equality.
261 if (tryMergeTokensAny({{tok::star, tok::star},
262 {tok::lessless, tok::less},
263 {tok::greatergreater, tok::greater},
264 {tok::exclaimequal, tok::equal},
265 {tok::exclaimequal, tok::question},
266 {tok::equalequal, tok::equal},
267 {tok::equalequal, tok::question}},
268 TT_BinaryOperator)) {
269 return;
270 }
271 // Module paths in specify blocks and the implication and boolean equality
272 // operators.
273 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
274 {tok::plus, tok::star, tok::greater},
275 {tok::minusequal, tok::greater},
276 {tok::minus, tok::star, tok::greater},
277 {tok::less, tok::arrow},
278 {tok::equal, tok::greater},
279 {tok::star, tok::greater},
280 {tok::pipeequal, tok::greater},
281 {tok::pipe, tok::arrow},
282 {tok::hash, tok::minus, tok::hash},
283 {tok::hash, tok::equal, tok::hash}},
284 TT_BinaryOperator) ||
285 Tokens.back()->is(tok::arrow)) {
286 Tokens.back()->ForcedPrecedence = prec::Comma;
287 return;
288 }
289 }
290 if (Style.isTableGen()) {
291 // A TableGen multi-line string starts with [{.
292 if (tryMergeTokens({tok::l_square, tok::l_brace},
293 TT_TableGenMultiLineString)) {
294 // Set the type again, finalized this time, so it is never re-annotated as another type.
295 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
296 Tokens.back()->Tok.setKind(tok::string_literal);
297 return;
298 }
299 // TableGen's bang operator is of the form !<name>.
300 // !cond is a special case with specific syntax.
301 if (tryMergeTokens({tok::exclaim, tok::identifier},
302 TT_TableGenBangOperator)) {
303 Tokens.back()->Tok.setKind(tok::identifier);
304 Tokens.back()->Tok.setIdentifierInfo(nullptr);
305 if (Tokens.back()->TokenText == "!cond")
306 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
307 else
308 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
309 return;
310 }
311 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
312 // Here, "! if" becomes "!if". That is, ! captures the if even when a space
313 // separates them. This is the only such case in TableGen's syntax.
314 Tokens.back()->Tok.setKind(tok::identifier);
315 Tokens.back()->Tok.setIdentifierInfo(nullptr);
316 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
317 return;
318 }
319 // A + or - followed by a number is a literal, not a unary operator.
320 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
321 Tokens.back()->Tok.setKind(tok::numeric_constant);
322 return;
323 }
324 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
325 Tokens.back()->Tok.setKind(tok::numeric_constant);
326 return;
327 }
328 }
329}
330
331bool FormatTokenLexer::tryMergeNSStringLiteral() {
332 if (Tokens.size() < 2)
333 return false;
334 auto &At = *(Tokens.end() - 2);
335 auto &String = *(Tokens.end() - 1);
336 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
337 return false;
338 At->Tok.setKind(tok::string_literal);
339 At->TokenText = StringRef(At->TokenText.begin(),
340 String->TokenText.end() - At->TokenText.begin());
341 At->ColumnWidth += String->ColumnWidth;
342 At->setType(TT_ObjCStringLiteral);
343 Tokens.erase(Tokens.end() - 1);
344 return true;
345}
346
347bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
348 // Merges '#' and the following identifier into a single identifier token
349 // with the text #identifier but the token kind tok::identifier.
350 if (Tokens.size() < 2)
351 return false;
352 auto &Hash = *(Tokens.end() - 2);
353 auto &Identifier = *(Tokens.end() - 1);
354 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
355 return false;
356 Hash->Tok.setKind(tok::identifier);
357 Hash->TokenText =
358 StringRef(Hash->TokenText.begin(),
359 Identifier->TokenText.end() - Hash->TokenText.begin());
360 Hash->ColumnWidth += Identifier->ColumnWidth;
361 Hash->setType(TT_JsPrivateIdentifier);
362 Tokens.erase(Tokens.end() - 1);
363 return true;
364}
365
366// Search for verbatim or interpolated string literals @"ABC" or
367// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
368// prevent splitting of @, $ and ".
369// Merging of multiline verbatim strings with embedded '"' is handled in
370// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
371bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
372 if (Tokens.size() < 2)
373 return false;
374
375 // Look for @"aaaaaa" or $"aaaaaa".
376 const auto String = *(Tokens.end() - 1);
377 if (String->isNot(tok::string_literal))
378 return false;
379
380 auto Prefix = *(Tokens.end() - 2);
381 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
382 return false;
383
384 if (Tokens.size() > 2) {
385 const auto Tok = *(Tokens.end() - 3);
386 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
387 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
388 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
389 Tok->ColumnWidth += Prefix->ColumnWidth;
390 Tokens.erase(Tokens.end() - 2);
391 Prefix = Tok;
392 }
393 }
394
395 // Convert back into just a string_literal.
396 Prefix->Tok.setKind(tok::string_literal);
397 Prefix->TokenText =
398 StringRef(Prefix->TokenText.begin(),
399 String->TokenText.end() - Prefix->TokenText.begin());
400 Prefix->ColumnWidth += String->ColumnWidth;
401 Prefix->setType(TT_CSharpStringLiteral);
402 Tokens.erase(Tokens.end() - 1);
403 return true;
404}
405
406// Valid C# attribute targets:
407// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
408const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
409 "assembly", "module", "field", "event", "method",
410 "param", "property", "return", "type",
411};
412
413bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
414 if (Tokens.size() < 2)
415 return false;
416 auto &NullishCoalescing = *(Tokens.end() - 2);
417 auto &Equal = *(Tokens.end() - 1);
418 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
419 Equal->isNot(tok::equal)) {
420 return false;
421 }
422 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
423 NullishCoalescing->TokenText =
424 StringRef(NullishCoalescing->TokenText.begin(),
425 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
426 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
427 NullishCoalescing->setType(TT_NullCoalescingEqual);
428 Tokens.erase(Tokens.end() - 1);
429 return true;
430}
431
432bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
433 if (Tokens.size() < 2)
434 return false;
435 const auto At = *(Tokens.end() - 2);
436 if (At->isNot(tok::at))
437 return false;
438 const auto Keyword = *(Tokens.end() - 1);
439 if (Keyword->TokenText == "$")
440 return false;
441 if (!Keywords.isCSharpKeyword(*Keyword))
442 return false;
443
444 At->Tok.setKind(tok::identifier);
445 At->TokenText = StringRef(At->TokenText.begin(),
446 Keyword->TokenText.end() - At->TokenText.begin());
447 At->ColumnWidth += Keyword->ColumnWidth;
448 At->setType(Keyword->getType());
449 Tokens.erase(Tokens.end() - 1);
450 return true;
451}
452
453// In C#, transform the identifier foreach into kw_for typed TT_ForEachMacro.
454bool FormatTokenLexer::tryTransformCSharpForEach() {
455 if (Tokens.size() < 1)
456 return false;
457 auto &Identifier = *(Tokens.end() - 1);
458 if (Identifier->isNot(tok::identifier))
459 return false;
460 if (Identifier->TokenText != "foreach")
461 return false;
462
463 Identifier->setType(TT_ForEachMacro);
464 Identifier->Tok.setKind(tok::kw_for);
465 return true;
466}
467
468bool FormatTokenLexer::tryMergeForEach() {
469 if (Tokens.size() < 2)
470 return false;
471 auto &For = *(Tokens.end() - 2);
472 auto &Each = *(Tokens.end() - 1);
473 if (For->isNot(tok::kw_for))
474 return false;
475 if (Each->isNot(tok::identifier))
476 return false;
477 if (Each->TokenText != "each")
478 return false;
479
480 For->setType(TT_ForEachMacro);
481 For->Tok.setKind(tok::kw_for);
482
483 For->TokenText = StringRef(For->TokenText.begin(),
484 Each->TokenText.end() - For->TokenText.begin());
485 For->ColumnWidth += Each->ColumnWidth;
486 Tokens.erase(Tokens.end() - 1);
487 return true;
488}
489
490bool FormatTokenLexer::tryTransformTryUsageForC() {
491 if (Tokens.size() < 2)
492 return false;
493 auto &Try = *(Tokens.end() - 2);
494 if (Try->isNot(tok::kw_try))
495 return false;
496 auto &Next = *(Tokens.end() - 1);
497 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
498 return false;
499
500 if (Tokens.size() > 2) {
501 auto &At = *(Tokens.end() - 3);
502 if (At->is(tok::at))
503 return false;
504 }
505
506 Try->Tok.setKind(tok::identifier);
507 return true;
508}
509
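// Note: getNextToken() deliberately splits "<<" and ">>" into two stashed '<'
// or '>' tokens (see the tok::lessless / tok::greatergreater handling there)
// so that nested template closers stay separate; the two helpers below rejoin
// them into "<<" or ">>" when the surrounding tokens rule out templates.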
510bool FormatTokenLexer::tryMergeLessLess() {
511 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
512 if (Tokens.size() < 3)
513 return false;
514
515 auto First = Tokens.end() - 3;
516 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
517 return false;
518
519 // Only merge if there currently is no whitespace between the two "<".
520 if (First[1]->hasWhitespaceBefore())
521 return false;
522
523 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
524 if (X && X->is(tok::less))
525 return false;
526
527 auto Y = First[2];
528 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
529 return false;
530
531 First[0]->Tok.setKind(tok::lessless);
532 First[0]->TokenText = "<<";
533 First[0]->ColumnWidth += 1;
534 Tokens.erase(Tokens.end() - 2);
535 return true;
536}
537
538bool FormatTokenLexer::tryMergeGreaterGreater() {
539 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
540 if (Tokens.size() < 2)
541 return false;
542
543 auto First = Tokens.end() - 2;
544 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
545 return false;
546
547 // Only merge if there currently is no whitespace between the first two ">".
548 if (First[1]->hasWhitespaceBefore())
549 return false;
550
551 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
552 if (Tok && Tok->isNot(tok::kw_operator))
553 return false;
554
555 First[0]->Tok.setKind(tok::greatergreater);
556 First[0]->TokenText = ">>";
557 First[0]->ColumnWidth += 1;
558 Tokens.erase(Tokens.end() - 1);
559 return true;
560}
561
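// The two overloads below merge the last few tokens into the first of them:
// the kinds must match and no whitespace may separate them; the surviving
// token's text is extended over the merged tokens and it is given NewType.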
562bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
563 TokenType NewType) {
564 if (Tokens.size() < Kinds.size())
565 return false;
566
567 const auto *First = Tokens.end() - Kinds.size();
568 for (unsigned i = 0; i < Kinds.size(); ++i)
569 if (First[i]->isNot(Kinds[i]))
570 return false;
571
572 return tryMergeTokens(Kinds.size(), NewType);
573}
574
575bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
576 if (Tokens.size() < Count)
577 return false;
578
579 const auto *First = Tokens.end() - Count;
580 unsigned AddLength = 0;
581 for (size_t i = 1; i < Count; ++i) {
582 // If there is whitespace separating the token and the previous one,
583 // they should not be merged.
584 if (First[i]->hasWhitespaceBefore())
585 return false;
586 AddLength += First[i]->TokenText.size();
587 }
588
589 Tokens.resize(Tokens.size() - Count + 1);
590 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
591 First[0]->TokenText.size() + AddLength);
592 First[0]->ColumnWidth += AddLength;
593 First[0]->setType(NewType);
594 return true;
595}
596
597bool FormatTokenLexer::tryMergeTokensAny(
598 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
599 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
600 return tryMergeTokens(Kinds, NewType);
601 });
602}
603
604// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
605bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
606 // NB: This is not entirely correct, as an r_paren can introduce an operand
607 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
608 // corner case to not matter in practice, though.
609 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
610 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
611 tok::colon, tok::question, tok::tilde) ||
612 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
613 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
614 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
615 Tok->isBinaryOperator();
616}
617
618bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
619 if (!Prev)
620 return true;
621
622 // Regex literals can only follow after prefix unary operators, not after
623 // postfix unary operators. If the '++' is followed by a non-operand
624 // introducing token, the slash here is the operand and not the start of a
625 // regex.
626 // `!` is a unary prefix operator, but also a postfix operator that casts
627 // away nullability, so the same check applies.
628 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
629 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
630
631 // The previous token must introduce an operand location where regex
632 // literals can occur.
633 if (!precedesOperand(Prev))
634 return false;
635
636 return true;
637}
638
639// Tries to parse a JavaScript Regex literal starting at the current token,
640// if that begins with a slash and is in a location where JavaScript allows
641// regex literals. Changes the current token to a regex literal and updates
642// its text if successful.
643void FormatTokenLexer::tryParseJSRegexLiteral() {
644 FormatToken *RegexToken = Tokens.back();
645 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
646 return;
647
648 FormatToken *Prev = nullptr;
649 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
650 // NB: Because previous pointers are not initialized yet, this cannot use
651 // Token.getPreviousNonComment.
652 if (FT->isNot(tok::comment)) {
653 Prev = FT;
654 break;
655 }
656 }
657
658 if (!canPrecedeRegexLiteral(Prev))
659 return;
660
661 // 'Manually' lex ahead in the current file buffer.
662 const char *Offset = Lex->getBufferLocation();
663 const char *RegexBegin = Offset - RegexToken->TokenText.size();
664 StringRef Buffer = Lex->getBuffer();
665 bool InCharacterClass = false;
666 bool HaveClosingSlash = false;
667 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
668 // Regular expressions are terminated with a '/', which can only be
669 // escaped using '\' or a character class between '[' and ']'.
670 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
671 switch (*Offset) {
672 case '\\':
673 // Skip the escaped character.
674 ++Offset;
675 break;
676 case '[':
677 InCharacterClass = true;
678 break;
679 case ']':
680 InCharacterClass = false;
681 break;
682 case '/':
683 if (!InCharacterClass)
684 HaveClosingSlash = true;
685 break;
686 }
687 }
688
689 RegexToken->setType(TT_RegexLiteral);
690 // Treat regex literals like other string_literals.
691 RegexToken->Tok.setKind(tok::string_literal);
692 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
693 RegexToken->ColumnWidth = RegexToken->TokenText.size();
694
695 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
696}
697
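// Scans forward from Begin for the '"' that terminates a C# verbatim or
// interpolated string, honoring "" and {{ / }} escapes as well as quotes
// inside interpolation braces, and returns a pointer to that quote (or End
// if the literal is unterminated).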
698static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
699 bool Interpolated) {
700 auto Repeated = [&Begin, End]() {
701 return Begin + 1 < End && Begin[1] == Begin[0];
702 };
703
704 // Look for a terminating '"' in the current file buffer.
705 // Make no effort to format code within an interpolated or verbatim string.
706 //
707 // Interpolated strings could contain { } with " characters inside.
708 // $"{x ?? "null"}"
709 // should not be split into $"{x ?? ", null, "}" but should be treated as a
710 // single string-literal.
711 //
712 // We opt not to try and format expressions inside {} within a C#
713 // interpolated string. Formatting expressions within an interpolated string
714 // would require similar work as that done for JavaScript template strings
715 // in `handleTemplateStrings()`.
716 for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
717 switch (*Begin) {
718 case '\\':
719 if (!Verbatim)
720 ++Begin;
721 break;
722 case '{':
723 if (Interpolated) {
724 // {{ inside an interpolated string is escaped, so skip it.
725 if (Repeated())
726 ++Begin;
727 else
728 ++UnmatchedOpeningBraceCount;
729 }
730 break;
731 case '}':
732 if (Interpolated) {
733 // }} inside an interpolated string is escaped, so skip it.
734 if (Repeated())
735 ++Begin;
736 else if (UnmatchedOpeningBraceCount > 0)
737 --UnmatchedOpeningBraceCount;
738 else
739 return End;
740 }
741 break;
742 case '"':
743 if (UnmatchedOpeningBraceCount > 0)
744 break;
745 // "" within a verbatim string is an escaped double quote: skip it.
746 if (Verbatim && Repeated()) {
747 ++Begin;
748 break;
749 }
750 return Begin;
751 }
752 }
753
754 return End;
755}
756
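// Extends a token already typed TT_CSharpStringLiteral (@"...", $"..." or
// $@"...") to its real terminating quote using lexCSharpString above, then
// updates the token text, column widths, and multiline flag accordingly.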
757void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
758 FormatToken *CSharpStringLiteral = Tokens.back();
759
760 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
761 return;
762
763 auto &TokenText = CSharpStringLiteral->TokenText;
764
765 bool Verbatim = false;
766 bool Interpolated = false;
767 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
768 Verbatim = true;
769 Interpolated = true;
770 } else if (TokenText.starts_with(R"(@")")) {
771 Verbatim = true;
772 } else if (TokenText.starts_with(R"($")")) {
773 Interpolated = true;
774 }
775
776 // Deal with multiline strings.
777 if (!Verbatim && !Interpolated)
778 return;
779
780 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
781 const char *Offset = StrBegin;
782 if (Verbatim && Interpolated)
783 Offset += 3;
784 else
785 Offset += 2;
786
787 const auto End = Lex->getBuffer().end();
788 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
789
790 // Make no attempt to format code properly if a verbatim string is
791 // unterminated.
792 if (Offset >= End)
793 return;
794
795 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
796 TokenText = LiteralText;
797
798 // Adjust width for potentially multiline string literals.
799 size_t FirstBreak = LiteralText.find('\n');
800 StringRef FirstLineText = FirstBreak == StringRef::npos
801 ? LiteralText
802 : LiteralText.substr(0, FirstBreak);
803 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
804 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
805 Encoding);
806 size_t LastBreak = LiteralText.rfind('\n');
807 if (LastBreak != StringRef::npos) {
808 CSharpStringLiteral->IsMultiline = true;
809 unsigned StartColumn = 0;
810 CSharpStringLiteral->LastLineColumnWidth =
811 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
812 StartColumn, Style.TabWidth, Encoding);
813 }
814
815 assert(Offset < End);
816 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
817}
818
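// Extends a TableGen "[{" token to the closing "}]" so the whole multi-line
// string becomes one token, then records its first- and last-line widths.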
819void FormatTokenLexer::handleTableGenMultilineString() {
820 FormatToken *MultiLineString = Tokens.back();
821 if (MultiLineString->isNot(TT_TableGenMultiLineString))
822 return;
823
824 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
825 // "}]" is the end of multi line string.
826 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
827 if (CloseOffset == StringRef::npos)
828 return;
829 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
830 MultiLineString->TokenText = Text;
831 resetLexer(SourceMgr.getFileOffset(
832 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
833 auto FirstLineText = Text;
834 auto FirstBreak = Text.find('\n');
835 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
836 if (FirstBreak != StringRef::npos) {
837 MultiLineString->IsMultiline = true;
838 FirstLineText = Text.substr(0, FirstBreak + 1);
839 // LastLineColumnWidth holds the width of the last line.
840 auto LastBreak = Text.rfind('\n');
841 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
842 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
843 Style.TabWidth, Encoding);
844 }
845 // ColumnWidth holds only the width of the first line.
846 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
847 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
848}
849
850void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
851 FormatToken *Tok = Tokens.back();
852 // TableGen identifiers can begin with digits. Such tokens are lexed as
853 // numeric_constant now.
854 if (Tok->isNot(tok::numeric_constant))
855 return;
856 StringRef Text = Tok->TokenText;
857 // The following check is based on llvm::TGLexer::LexToken.
858 // That lexes the token as a number if any of the following holds:
859 // 1. It starts with '+', '-'.
860 // 2. All the characters are digits.
861 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
862 // 4. The first non-digit character is 'x', and the next is a hex digit.
863 // Note that in cases 3 and 4, if the next character does not exist in
864 // this token, the token is an identifier.
865 if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
866 return;
867 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
868 // All the characters are digits
869 if (NonDigitPos == StringRef::npos)
870 return;
871 char FirstNonDigit = Text[NonDigitPos];
872 if (NonDigitPos < Text.size() - 1) {
873 char TheNext = Text[NonDigitPos + 1];
874 // Regarded as a binary number.
875 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
876 return;
877 // Regarded as hex number.
878 if (FirstNonDigit == 'x' && isxdigit(TheNext))
879 return;
880 }
881 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
882 // This is actually an identifier in TableGen.
883 Tok->Tok.setKind(tok::identifier);
884 Tok->Tok.setIdentifierInfo(nullptr);
885 }
886}
887
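// Handles JavaScript template strings: a backtick pushes TEMPLATE_STRING
// state, "${" pushes NORMAL state so the embedded expression is lexed as
// regular code, and the matching '}' or the closing backtick returns to the
// string, which is emitted as a (possibly multi-line) string_literal token.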
888void FormatTokenLexer::handleTemplateStrings() {
889 FormatToken *BacktickToken = Tokens.back();
890
891 if (BacktickToken->is(tok::l_brace)) {
892 StateStack.push(LexerState::NORMAL);
893 return;
894 }
895 if (BacktickToken->is(tok::r_brace)) {
896 if (StateStack.size() == 1)
897 return;
898 StateStack.pop();
899 if (StateStack.top() != LexerState::TEMPLATE_STRING)
900 return;
901 // If back in TEMPLATE_STRING, fall through and continue parsing the template string.
902 } else if (BacktickToken->is(tok::unknown) &&
903 BacktickToken->TokenText == "`") {
904 StateStack.push(LexerState::TEMPLATE_STRING);
905 } else {
906 return; // Not actually a template
907 }
908
909 // 'Manually' lex ahead in the current file buffer.
910 const char *Offset = Lex->getBufferLocation();
911 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
912 for (; Offset != Lex->getBuffer().end(); ++Offset) {
913 if (Offset[0] == '`') {
914 StateStack.pop();
915 ++Offset;
916 break;
917 }
918 if (Offset[0] == '\\') {
919 ++Offset; // Skip the escaped character.
920 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
921 Offset[1] == '{') {
922 // '${' introduces an expression interpolation in the template string.
923 StateStack.push(LexerState::NORMAL);
924 Offset += 2;
925 break;
926 }
927 }
928
929 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
930 BacktickToken->setType(TT_TemplateString);
931 BacktickToken->Tok.setKind(tok::string_literal);
932 BacktickToken->TokenText = LiteralText;
933
934 // Adjust width for potentially multiline string literals.
935 size_t FirstBreak = LiteralText.find('\n');
936 StringRef FirstLineText = FirstBreak == StringRef::npos
937 ? LiteralText
938 : LiteralText.substr(0, FirstBreak);
939 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
940 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
941 size_t LastBreak = LiteralText.rfind('\n');
942 if (LastBreak != StringRef::npos) {
943 BacktickToken->IsMultiline = true;
944 unsigned StartColumn = 0; // The template tail spans the entire line.
945 BacktickToken->LastLineColumnWidth =
946 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
947 StartColumn, Style.TabWidth, Encoding);
948 }
949
950 SourceLocation loc = Lex->getSourceLocation(Offset);
951 resetLexer(SourceMgr.getFileOffset(loc));
952}
953
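// For languages with '#' line comments (text protos, see lex() above), turns
// a '#' token and the remainder of its line into a single TT_LineComment.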
954void FormatTokenLexer::tryParsePythonComment() {
955 FormatToken *HashToken = Tokens.back();
956 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
957 return;
958 // Turn the remainder of this line into a comment.
959 const char *CommentBegin =
960 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
961 size_t From = CommentBegin - Lex->getBuffer().begin();
962 size_t To = Lex->getBuffer().find_first_of('\n', From);
963 if (To == StringRef::npos)
964 To = Lex->getBuffer().size();
965 size_t Len = To - From;
966 HashToken->setType(TT_LineComment);
967 HashToken->Tok.setKind(tok::comment);
968 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
969 SourceLocation Loc = To < Lex->getBuffer().size()
970 ? Lex->getSourceLocation(CommentBegin + Len)
971 : SourceMgr.getLocForEndOfFile(ID);
972 resetLexer(SourceMgr.getFileOffset(Loc));
973}
974
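// Collapses the four tokens of a _T("...") invocation, e.g. _T("msg"), into a
// single string-literal token so the macro is wrapped like a plain literal;
// the quoted text here is only an illustration, any literal argument works.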
975bool FormatTokenLexer::tryMerge_TMacro() {
976 if (Tokens.size() < 4)
977 return false;
978 FormatToken *Last = Tokens.back();
979 if (Last->isNot(tok::r_paren))
980 return false;
981
982 FormatToken *String = Tokens[Tokens.size() - 2];
983 if (String->isNot(tok::string_literal) || String->IsMultiline)
984 return false;
985
986 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
987 return false;
988
989 FormatToken *Macro = Tokens[Tokens.size() - 4];
990 if (Macro->TokenText != "_T")
991 return false;
992
993 const char *Start = Macro->TokenText.data();
994 const char *End = Last->TokenText.data() + Last->TokenText.size();
995 String->TokenText = StringRef(Start, End - Start);
996 String->IsFirst = Macro->IsFirst;
997 String->LastNewlineOffset = Macro->LastNewlineOffset;
998 String->WhitespaceRange = Macro->WhitespaceRange;
999 String->OriginalColumn = Macro->OriginalColumn;
1000 String->ColumnWidth = encoding::columnWidthWithTabs(
1001 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1002 String->NewlinesBefore = Macro->NewlinesBefore;
1003 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1004
1005 Tokens.pop_back();
1006 Tokens.pop_back();
1007 Tokens.pop_back();
1008 Tokens.back() = String;
1009 if (FirstInLineIndex >= Tokens.size())
1010 FirstInLineIndex = Tokens.size() - 1;
1011 return true;
1012}
1013
1014bool FormatTokenLexer::tryMergeConflictMarkers() {
1015 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1016 return false;
1017
1018 // Conflict lines look like:
1019 // <marker> <text from the vcs>
1020 // For example:
1021 // >>>>>>> /file/in/file/system at revision 1234
1022 //
1023 // We merge all tokens in a line that starts with a conflict marker
1024 // into a single token with a special token type that the unwrapped line
1025 // parser will use to correctly rebuild the underlying code.
1026
1027 FileID ID;
1028 // Get the position of the first token in the line.
1029 unsigned FirstInLineOffset;
1030 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1031 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1032 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1033 // Calculate the offset of the start of the current line.
1034 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1035 if (LineOffset == StringRef::npos)
1036 LineOffset = 0;
1037 else
1038 ++LineOffset;
1039
1040 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1041 StringRef LineStart;
1042 if (FirstSpace == StringRef::npos)
1043 LineStart = Buffer.substr(LineOffset);
1044 else
1045 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1046
1047 TokenType Type = TT_Unknown;
1048 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1049 Type = TT_ConflictStart;
1050 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1051 LineStart == "====") {
1052 Type = TT_ConflictAlternative;
1053 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1054 Type = TT_ConflictEnd;
1055 }
1056
1057 if (Type != TT_Unknown) {
1058 FormatToken *Next = Tokens.back();
1059
1060 Tokens.resize(FirstInLineIndex + 1);
1061 // We do not need to build a complete token here, as we will skip it
1062 // during parsing anyway (as we must not touch whitespace around conflict
1063 // markers).
1064 Tokens.back()->setType(Type);
1065 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1066
1067 Tokens.push_back(Next);
1068 return true;
1069 }
1070
1071 return false;
1072}
1073
1074FormatToken *FormatTokenLexer::getStashedToken() {
1075 // Create a synthesized second '>' or '<' token.
1076 Token Tok = FormatTok->Tok;
1077 StringRef TokenText = FormatTok->TokenText;
1078
1079 unsigned OriginalColumn = FormatTok->OriginalColumn;
1080 FormatTok = new (Allocator.Allocate()) FormatToken;
1081 FormatTok->Tok = Tok;
1082 SourceLocation TokLocation =
1083 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1084 FormatTok->Tok.setLocation(TokLocation);
1085 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1086 FormatTok->TokenText = TokenText;
1087 FormatTok->ColumnWidth = 1;
1088 FormatTok->OriginalColumn = OriginalColumn + 1;
1089
1090 return FormatTok;
1091}
1092
1093/// Truncate the current token to the new length and make the lexer continue
1094/// from the end of the truncated token. Used for other languages that have
1095/// different token boundaries, like JavaScript in which a comment ends at a
1096/// line break regardless of whether the line break follows a backslash. Also
1097/// used to set the lexer to the end of whitespace if the lexer regards
1098/// whitespace and an unrecognized symbol as one token.
1099void FormatTokenLexer::truncateToken(size_t NewLen) {
1100 assert(NewLen <= FormatTok->TokenText.size());
1101 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1102 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1103 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1104 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1105 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1106 Encoding);
1107 FormatTok->Tok.setLength(NewLen);
1108}
1109
1110/// Count the length of leading whitespace in a token.
1111static size_t countLeadingWhitespace(StringRef Text) {
1112 // Basically counting the length matched by this regex.
1113 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1114 // Directly using the regex turned out to be slow. With the regex
1115 // version formatting all files in this directory took about 1.25
1116 // seconds. This version took about 0.5 seconds.
1117 const unsigned char *const Begin = Text.bytes_begin();
1118 const unsigned char *const End = Text.bytes_end();
1119 const unsigned char *Cur = Begin;
1120 while (Cur < End) {
1121 if (isspace(Cur[0])) {
1122 ++Cur;
1123 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1124 // A '\' followed by a newline always escapes the newline, regardless
1125 // of whether there is another '\' before it.
1126 // The source has a null byte at the end. So the end of the entire input
1127 // isn't reached yet. Also the lexer doesn't break apart an escaped
1128 // newline.
1129 assert(End - Cur >= 2);
1130 Cur += 2;
1131 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1132 (Cur[3] == '\n' || Cur[3] == '\r')) {
1133 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1134 // characters are quoted individually in this comment because if we write
1135 // them together some compilers warn that we have a trigraph in the code.
1136 assert(End - Cur >= 4);
1137 Cur += 4;
1138 } else {
1139 break;
1140 }
1141 }
1142 return Cur - Begin;
1143}
1144
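// Produces the next FormatToken: either the stashed second half of a split
// "<<" / ">>", or a fresh raw token with leading whitespace consumed, column
// geometry computed, and language-specific fix-ups (Verilog number bases,
// JavaScript/Java comment handling, configured macro names) applied.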
1145FormatToken *FormatTokenLexer::getNextToken() {
1146 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1147 StateStack.pop();
1148 return getStashedToken();
1149 }
1150
1151 FormatTok = new (Allocator.Allocate()) FormatToken;
1152 readRawToken(*FormatTok);
1153 SourceLocation WhitespaceStart =
1154 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1155 FormatTok->IsFirst = IsFirstToken;
1156 IsFirstToken = false;
1157
1158 // Consume and record whitespace until we find a significant token.
1159 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1160 // followed by a symbol such as backtick. Those symbols may be
1161 // significant in other languages.
1162 unsigned WhitespaceLength = TrailingWhitespace;
1163 while (FormatTok->isNot(tok::eof)) {
1164 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1165 if (LeadingWhitespace == 0)
1166 break;
1167 if (LeadingWhitespace < FormatTok->TokenText.size())
1168 truncateToken(LeadingWhitespace);
1169 StringRef Text = FormatTok->TokenText;
1170 bool InEscape = false;
1171 for (int i = 0, e = Text.size(); i != e; ++i) {
1172 switch (Text[i]) {
1173 case '\r':
1174 // If this is a CRLF sequence, break here and the LF will be handled on
1175 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1176 // the same as a single LF.
1177 if (i + 1 < e && Text[i + 1] == '\n')
1178 break;
1179 [[fallthrough]];
1180 case '\n':
1181 ++FormatTok->NewlinesBefore;
1182 if (!InEscape)
1183 FormatTok->HasUnescapedNewline = true;
1184 else
1185 InEscape = false;
1186 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1187 Column = 0;
1188 break;
1189 case '\f':
1190 if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
1191 // The form feed is immediately preceded and followed by a newline.
1192 i > 0 && Text[i - 1] == '\n' &&
1193 ((i + 1 < e && Text[i + 1] == '\n') ||
1194 (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
1195 FormatTok->HasFormFeedBefore = true;
1196 }
1197 [[fallthrough]];
1198 case '\v':
1199 Column = 0;
1200 break;
1201 case ' ':
1202 ++Column;
1203 break;
1204 case '\t':
1205 Column +=
1206 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1207 break;
1208 case '\\':
1209 case '?':
1210 case '/':
1211 // The text was entirely whitespace when this loop was entered. Thus
1212 // this has to be an escape sequence.
1213 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1214 Text.substr(i, 4) == "\?\?/\r" ||
1215 Text.substr(i, 4) == "\?\?/\n" ||
1216 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1217 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1218 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1219 Text.substr(i - 2, 4) == "\?\?/\n")));
1220 InEscape = true;
1221 break;
1222 default:
1223 // This shouldn't happen.
1224 assert(false);
1225 break;
1226 }
1227 }
1228 WhitespaceLength += Text.size();
1229 readRawToken(*FormatTok);
1230 }
1231
1232 if (FormatTok->is(tok::unknown))
1233 FormatTok->setType(TT_ImplicitStringLiteral);
1234
1235 // JavaScript and Java do not allow escaping the end of a line with a
1236 // backslash. Backslashes are syntax errors in plain source, but can occur in
1237 // comments. When a single line comment ends with a \, it'll cause the next
1238 // line of code to be lexed as a comment, breaking formatting. The code below
1239 // finds comments that contain a backslash followed by a line break, truncates
1240 // the comment token at the backslash, and resets the lexer to restart behind
1241 // the backslash.
1242 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1243 FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
1244 size_t BackslashPos = FormatTok->TokenText.find('\\');
1245 while (BackslashPos != StringRef::npos) {
1246 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1247 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1248 truncateToken(BackslashPos + 1);
1249 break;
1250 }
1251 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1252 }
1253 }
1254
1255 if (Style.isVerilog()) {
1256 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1257 SmallVector<StringRef, 1> Matches;
1258 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1259 // And it uses the hash for delays and parameter lists. In order to continue
1260 // using `tok::hash` in other places, the backtick gets marked as the hash
1261 // here. And in order to tell the backtick and hash apart for
1262 // Verilog-specific stuff, the hash becomes an identifier.
1263 if (FormatTok->is(tok::numeric_constant)) {
1264 // In Verilog the quote is not part of a number.
1265 auto Quote = FormatTok->TokenText.find('\'');
1266 if (Quote != StringRef::npos)
1267 truncateToken(Quote);
1268 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1269 FormatTok->Tok.setKind(tok::raw_identifier);
1270 } else if (FormatTok->is(tok::raw_identifier)) {
1271 if (FormatTok->TokenText == "`") {
1272 FormatTok->Tok.setIdentifierInfo(nullptr);
1273 FormatTok->Tok.setKind(tok::hash);
1274 } else if (FormatTok->TokenText == "``") {
1275 FormatTok->Tok.setIdentifierInfo(nullptr);
1276 FormatTok->Tok.setKind(tok::hashhash);
1277 } else if (Tokens.size() > 0 &&
1278 Tokens.back()->is(Keywords.kw_apostrophe) &&
1279 NumberBase.match(FormatTok->TokenText, &Matches)) {
1280 // In Verilog in a based number literal like `'b10`, there may be
1281 // whitespace between `'b` and `10`. Therefore we handle the base and
1282 // the rest of the number literal as two tokens. But if there is no
1283 // space in the input code, we need to manually separate the two parts.
1284 truncateToken(Matches[0].size());
1285 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1286 }
1287 }
1288 }
1289
1290 FormatTok->WhitespaceRange = SourceRange(
1291 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1292
1293 FormatTok->OriginalColumn = Column;
1294
1295 TrailingWhitespace = 0;
1296 if (FormatTok->is(tok::comment)) {
1297 // FIXME: Add the trimmed whitespace to Column.
1298 StringRef UntrimmedText = FormatTok->TokenText;
1299 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1300 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1301 } else if (FormatTok->is(tok::raw_identifier)) {
1302 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1303 FormatTok->Tok.setIdentifierInfo(&Info);
1304 FormatTok->Tok.setKind(Info.getTokenID());
1305 if (Style.Language == FormatStyle::LK_Java &&
1306 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1307 tok::kw_operator)) {
1308 FormatTok->Tok.setKind(tok::identifier);
1309 FormatTok->Tok.setIdentifierInfo(nullptr);
1310 } else if (Style.isJavaScript() &&
1311 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1312 tok::kw_operator)) {
1313 FormatTok->Tok.setKind(tok::identifier);
1314 FormatTok->Tok.setIdentifierInfo(nullptr);
1315 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1316 FormatTok->Tok.setKind(tok::identifier);
1317 FormatTok->Tok.setIdentifierInfo(nullptr);
1318 }
1319 } else if (FormatTok->is(tok::greatergreater)) {
1320 FormatTok->Tok.setKind(tok::greater);
1321 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1322 ++Column;
1323 StateStack.push(LexerState::TOKEN_STASHED);
1324 } else if (FormatTok->is(tok::lessless)) {
1325 FormatTok->Tok.setKind(tok::less);
1326 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1327 ++Column;
1328 StateStack.push(LexerState::TOKEN_STASHED);
1329 }
1330
1331 if (Style.isVerilog() && Tokens.size() > 0 &&
1332 Tokens.back()->is(TT_VerilogNumberBase) &&
1333 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1334 // Mark the number following a base like `'h?a0` as a number.
1335 FormatTok->Tok.setKind(tok::numeric_constant);
1336 }
1337
1338 // Now FormatTok is the next non-whitespace token.
1339
1340 StringRef Text = FormatTok->TokenText;
1341 size_t FirstNewlinePos = Text.find('\n');
1342 if (FirstNewlinePos == StringRef::npos) {
1343 // FIXME: ColumnWidth actually depends on the start column, we need to
1344 // take this into account when the token is moved.
1345 FormatTok->ColumnWidth =
1346 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1347 Column += FormatTok->ColumnWidth;
1348 } else {
1349 FormatTok->IsMultiline = true;
1350 // FIXME: ColumnWidth actually depends on the start column, we need to
1351 // take this into account when the token is moved.
1352 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1353 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1354
1355 // The last line of the token always starts in column 0.
1356 // Thus, the length can be precomputed even in the presence of tabs.
1357 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1358 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1359 Column = FormatTok->LastLineColumnWidth;
1360 }
1361
1362 if (Style.isCpp()) {
1363 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1364 auto it = Macros.find(Identifier);
1365 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1366 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1367 tok::pp_define) &&
1368 it != Macros.end()) {
1369 FormatTok->setType(it->second);
1370 if (it->second == TT_IfMacro) {
1371 // The lexer token currently has type tok::kw_unknown. However, for this
1372 // substitution to be treated correctly in the TokenAnnotator, faking
1373 // the tok value seems to be needed. Not sure if there's a more elegant
1374 // way.
1375 FormatTok->Tok.setKind(tok::kw_if);
1376 }
1377 } else if (FormatTok->is(tok::identifier)) {
1378 if (MacroBlockBeginRegex.match(Text))
1379 FormatTok->setType(TT_MacroBlockBegin);
1380 else if (MacroBlockEndRegex.match(Text))
1381 FormatTok->setType(TT_MacroBlockEnd);
1382 else if (TemplateNames.contains(Identifier))
1383 FormatTok->setFinalizedType(TT_TemplateName);
1384 else if (TypeNames.contains(Identifier))
1385 FormatTok->setFinalizedType(TT_TypeName);
1386 else if (VariableTemplates.contains(Identifier))
1387 FormatTok->setFinalizedType(TT_VariableTemplate);
1388 }
1389 }
1390
1391 return FormatTok;
1392}
1393
1394bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1395 // In Verilog the quote is not a character literal.
1396 //
1397 // Make the backtick and double backtick identifiers to match against them
1398 // more easily.
1399 //
1400 // In Verilog an escaped identifier starts with backslash and ends with
1401 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1402 // also begin an escaped newline outside of an escaped identifier. We check
1403 // for that outside of the Regex since we can't use negative lookahead
1404 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1405 // identifier may have a length of 0 according to Section A.9.3.
1406 // FIXME: If there is an escaped newline in the middle of an escaped
1407 // identifier, allow for pasting the two lines together. But escaped
1408 // identifiers usually occur only in generated code anyway.
1409 static const llvm::Regex VerilogToken(R"re(^('|``?|\\‍(\\)re"
1410 "(\r?\n|\r)|[^[:space:]])*)");
1411
1412 SmallVector<StringRef, 4> Matches;
1413 const char *Start = Lex->getBufferLocation();
1414 if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1415 &Matches)) {
1416 return false;
1417 }
1418 // There is a null byte at the end of the buffer, so we don't have to check
1419 // Start[1] is within the buffer.
1420 if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1421 return false;
1422 size_t Len = Matches[0].size();
1423
1424 // The kind has to be an identifier so we can match it against those defined
1425 // in Keywords. The kind has to be set before the length because the setLength
1426 // function checks that the kind is not an annotation.
1427 Tok.setKind(tok::raw_identifier);
1428 Tok.setLength(Len);
1429 Tok.setLocation(Lex->getSourceLocation(Start, Len));
1430 Tok.setRawIdentifierData(Start);
1431 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1432 return true;
1433}
1434
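// Lexes one raw token (using the Verilog-specific path above when it
// matches), records its text, normalizes unterminated or language-specific
// literals to string_literal, and tracks clang-format off/on comments via
// FormattingDisabled.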
1435void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1436 // For Verilog, first see if there is a special token, and fall back to the
1437 // normal lexer if there isn't one.
1438 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1439 Lex->LexFromRawLexer(Tok.Tok);
1440 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1441 Tok.Tok.getLength());
1442 // For formatting, treat unterminated string literals like normal string
1443 // literals.
1444 if (Tok.is(tok::unknown)) {
1445 if (Tok.TokenText.starts_with("\"")) {
1446 Tok.Tok.setKind(tok::string_literal);
1447 Tok.IsUnterminatedLiteral = true;
1448 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1449 Tok.Tok.setKind(tok::string_literal);
1450 }
1451 }
1452
1453 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1454 Tok.Tok.setKind(tok::string_literal);
1455
1456 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1457 FormattingDisabled = false;
1458
1459 Tok.Finalized = FormattingDisabled;
1460
1461 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1462 FormattingDisabled = true;
1463}
1464
1465void FormatTokenLexer::resetLexer(unsigned Offset) {
1466 StringRef Buffer = SourceMgr.getBufferData(ID);
1467 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1468 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1469 Lex->SetKeepWhitespaceMode(true);
1470 TrailingWhitespace = 0;
1471}
1472
1473} // namespace format
1474} // namespace clang