FormatTokenLexer.cpp
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/SourceManager.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36 MacroBlockEndRegex(Style.MacroBlockEnd) {
37 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38 Lex->SetKeepWhitespaceMode(true);
39
40 for (const std::string &ForEachMacro : Style.ForEachMacros) {
41 auto Identifier = &IdentTable.get(ForEachMacro);
42 Macros.insert({Identifier, TT_ForEachMacro});
43 }
44 for (const std::string &IfMacro : Style.IfMacros) {
45 auto Identifier = &IdentTable.get(IfMacro);
46 Macros.insert({Identifier, TT_IfMacro});
47 }
48 for (const std::string &AttributeMacro : Style.AttributeMacros) {
49 auto Identifier = &IdentTable.get(AttributeMacro);
50 Macros.insert({Identifier, TT_AttributeMacro});
51 }
52 for (const std::string &StatementMacro : Style.StatementMacros) {
53 auto Identifier = &IdentTable.get(StatementMacro);
54 Macros.insert({Identifier, TT_StatementMacro});
55 }
56 for (const std::string &TypenameMacro : Style.TypenameMacros) {
57 auto Identifier = &IdentTable.get(TypenameMacro);
58 Macros.insert({Identifier, TT_TypenameMacro});
59 }
60 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61 auto Identifier = &IdentTable.get(NamespaceMacro);
62 Macros.insert({Identifier, TT_NamespaceMacro});
63 }
64 for (const std::string &WhitespaceSensitiveMacro :
65 Style.WhitespaceSensitiveMacros) {
66 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67 Macros.insert({Identifier, TT_UntouchableMacroFunc});
68 }
69 for (const std::string &StatementAttributeLikeMacro :
70 Style.StatementAttributeLikeMacros) {
71 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73 }
74
75 for (const auto &TemplateName : Style.TemplateNames)
76 TemplateNames.insert(&IdentTable.get(TemplateName));
77 for (const auto &TypeName : Style.TypeNames)
78 TypeNames.insert(&IdentTable.get(TypeName));
79}
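// Note on the constructor above: it maps user-configured macro names to token
// types. For example, with a style whose ForEachMacros includes "BOOST_FOREACH"
// and whose IfMacros includes "KJ_IF_MAYBE" (hypothetical configuration values
// for illustration), those identifiers are later annotated as TT_ForEachMacro
// and TT_IfMacro, so BOOST_FOREACH(x, list) { ... } is formatted like a for
// loop rather than like a function call.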
80
81ArrayRef<FormatToken *> FormatTokenLexer::lex() {
82 assert(Tokens.empty());
83 assert(FirstInLineIndex == 0);
84 do {
85 Tokens.push_back(getNextToken());
86 if (Style.isJavaScript()) {
87 tryParseJSRegexLiteral();
88 handleTemplateStrings();
89 }
90 if (Style.Language == FormatStyle::LK_TextProto)
91 tryParsePythonComment();
92 tryMergePreviousTokens();
93 if (Style.isCSharp()) {
94 // This needs to come after tokens have been merged so that C#
95 // string literals are correctly identified.
96 handleCSharpVerbatimAndInterpolatedStrings();
97 }
98 if (Style.isTableGen()) {
99 handleTableGenMultilineString();
100 handleTableGenNumericLikeIdentifier();
101 }
102 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
103 FirstInLineIndex = Tokens.size() - 1;
104 } while (Tokens.back()->isNot(tok::eof));
105 if (Style.InsertNewlineAtEOF) {
106 auto &TokEOF = *Tokens.back();
107 if (TokEOF.NewlinesBefore == 0) {
108 TokEOF.NewlinesBefore = 1;
109 TokEOF.OriginalColumn = 0;
110 }
111 }
112 return Tokens;
113}
114
115void FormatTokenLexer::tryMergePreviousTokens() {
116 if (tryMerge_TMacro())
117 return;
118 if (tryMergeConflictMarkers())
119 return;
120 if (tryMergeLessLess())
121 return;
122 if (tryMergeGreaterGreater())
123 return;
124 if (tryMergeForEach())
125 return;
126 if (Style.isCpp() && tryTransformTryUsageForC())
127 return;
128
129 if (Style.isJavaScript() || Style.isCSharp()) {
130 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
131 tok::question};
132 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
133 tok::period};
134 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
135
136 if (tryMergeTokens(FatArrow, TT_FatArrow))
137 return;
138 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
139 // Treat like the "||" operator (as opposed to the ternary ?).
140 Tokens.back()->Tok.setKind(tok::pipepipe);
141 return;
142 }
143 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
144 // Treat like a regular "." access.
145 Tokens.back()->Tok.setKind(tok::period);
146 return;
147 }
148 if (tryMergeNullishCoalescingEqual())
149 return;
150 }
151
152 if (Style.isCSharp()) {
153 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
154 tok::question, tok::l_square};
155
156 if (tryMergeCSharpKeywordVariables())
157 return;
158 if (tryMergeCSharpStringLiteral())
159 return;
160 if (tryTransformCSharpForEach())
161 return;
162 if (tryMergeTokens(CSharpNullConditionalLSquare,
163 TT_CSharpNullConditionalLSquare)) {
164 // Treat like a regular "[" operator.
165 Tokens.back()->Tok.setKind(tok::l_square);
166 return;
167 }
168 }
169
170 if (tryMergeNSStringLiteral())
171 return;
172
173 if (Style.isJavaScript()) {
174 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
175 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
176 tok::equal};
177 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
178 tok::greaterequal};
179 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
180 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
181 tok::starequal};
182 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
183 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
184
185 // FIXME: Investigate what token type gives the correct operator priority.
186 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
187 return;
188 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
189 return;
190 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
191 return;
192 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
193 return;
194 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
195 Tokens.back()->Tok.setKind(tok::starequal);
196 return;
197 }
198 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
199 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
200 // Treat like the "=" assignment operator.
201 Tokens.back()->Tok.setKind(tok::equal);
202 return;
203 }
204 if (tryMergeJSPrivateIdentifier())
205 return;
206 }
207
208 if (Style.Language == FormatStyle::LK_Java) {
209 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
210 tok::greater, tok::greater, tok::greaterequal};
211 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
212 return;
213 }
214
215 if (Style.isVerilog()) {
216 // Merge the number following a base like `'h?a0`.
217 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
218 Tokens.end()[-2]->is(tok::numeric_constant) &&
219 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
220 tok::question) &&
221 tryMergeTokens(2, TT_Unknown)) {
222 return;
223 }
224 // Part select.
225 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
226 TT_BitFieldColon)) {
227 return;
228 }
229 // Xnor. The combined token is treated as a caret which can also be either a
230 // unary or binary operator. The actual type is determined in
231 // TokenAnnotator. We also check the token length so we know it is not
232 // already a merged token.
233 if (Tokens.back()->TokenText.size() == 1 &&
234 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
235 TT_BinaryOperator)) {
236 Tokens.back()->Tok.setKind(tok::caret);
237 return;
238 }
239 // Signed shift and distribution weight.
240 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
241 Tokens.back()->Tok.setKind(tok::lessless);
242 return;
243 }
244 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
245 Tokens.back()->Tok.setKind(tok::greatergreater);
246 return;
247 }
248 if (tryMergeTokensAny({{tok::lessless, tok::equal},
249 {tok::lessless, tok::lessequal},
250 {tok::greatergreater, tok::equal},
251 {tok::greatergreater, tok::greaterequal},
252 {tok::colon, tok::equal},
253 {tok::colon, tok::slash}},
254 TT_BinaryOperator)) {
255 Tokens.back()->ForcedPrecedence = prec::Assignment;
256 return;
257 }
258 // Exponentiation, signed shift, case equality, and wildcard equality.
259 if (tryMergeTokensAny({{tok::star, tok::star},
260 {tok::lessless, tok::less},
261 {tok::greatergreater, tok::greater},
262 {tok::exclaimequal, tok::equal},
263 {tok::exclaimequal, tok::question},
264 {tok::equalequal, tok::equal},
265 {tok::equalequal, tok::question}},
266 TT_BinaryOperator)) {
267 return;
268 }
269 // Module paths in specify blocks and the implication and boolean equality
270 // operators.
271 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
272 {tok::plus, tok::star, tok::greater},
273 {tok::minusequal, tok::greater},
274 {tok::minus, tok::star, tok::greater},
275 {tok::less, tok::arrow},
276 {tok::equal, tok::greater},
277 {tok::star, tok::greater},
278 {tok::pipeequal, tok::greater},
279 {tok::pipe, tok::arrow},
280 {tok::hash, tok::minus, tok::hash},
281 {tok::hash, tok::equal, tok::hash}},
282 TT_BinaryOperator) ||
283 Tokens.back()->is(tok::arrow)) {
284 Tokens.back()->ForcedPrecedence = prec::Comma;
285 return;
286 }
287 }
288 if (Style.isTableGen()) {
289 // TableGen's multi-line string starts with [{
290 if (tryMergeTokens({tok::l_square, tok::l_brace},
291 TT_TableGenMultiLineString)) {
292 // Set the type again, this time finalized; it must never be annotated as any other type.
293 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
294 Tokens.back()->Tok.setKind(tok::string_literal);
295 return;
296 }
297 // TableGen's bang operator has the form !<name>.
298 // !cond is a special case with specific syntax.
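// For example, "!add(1, 2)" or "!if(p, a, b)" become a single
// TT_TableGenBangOperator token, while "!cond(p1 : v1, p2 : v2)" is given the
// dedicated TT_TableGenCondOperator type below.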
299 if (tryMergeTokens({tok::exclaim, tok::identifier},
300 TT_TableGenBangOperator)) {
301 Tokens.back()->Tok.setKind(tok::identifier);
302 Tokens.back()->Tok.setIdentifierInfo(nullptr);
303 if (Tokens.back()->TokenText == "!cond")
304 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
305 else
306 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
307 return;
308 }
309 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
310 // Here, "! if" becomes "!if". That is, ! captures if even when the space
311 // exists. That is only one possibility in TableGen's syntax.
312 Tokens.back()->Tok.setKind(tok::identifier);
313 Tokens.back()->Tok.setIdentifierInfo(nullptr);
314 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
315 return;
316 }
317 // A '+' or '-' followed by a number is a literal, not a unary operator.
318 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
319 Tokens.back()->Tok.setKind(tok::numeric_constant);
320 return;
321 }
322 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
323 Tokens.back()->Tok.setKind(tok::numeric_constant);
324 return;
325 }
326 }
327}
328
329bool FormatTokenLexer::tryMergeNSStringLiteral() {
330 if (Tokens.size() < 2)
331 return false;
332 auto &At = *(Tokens.end() - 2);
333 auto &String = *(Tokens.end() - 1);
334 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
335 return false;
336 At->Tok.setKind(tok::string_literal);
337 At->TokenText = StringRef(At->TokenText.begin(),
338 String->TokenText.end() - At->TokenText.begin());
339 At->ColumnWidth += String->ColumnWidth;
340 At->setType(TT_ObjCStringLiteral);
341 Tokens.erase(Tokens.end() - 1);
342 return true;
343}
344
345bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
346 // Merges '#' and the following identifier into a single token with the text
347 // #identifier but the kind tok::identifier.
348 if (Tokens.size() < 2)
349 return false;
350 auto &Hash = *(Tokens.end() - 2);
351 auto &Identifier = *(Tokens.end() - 1);
352 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
353 return false;
354 Hash->Tok.setKind(tok::identifier);
355 Hash->TokenText =
356 StringRef(Hash->TokenText.begin(),
357 Identifier->TokenText.end() - Hash->TokenText.begin());
358 Hash->ColumnWidth += Identifier->ColumnWidth;
359 Hash->setType(TT_JsPrivateIdentifier);
360 Tokens.erase(Tokens.end() - 1);
361 return true;
362}
363
364// Search for verbatim or interpolated string literals @"ABC" or
365// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
366// prevent splitting of @, $ and ".
367// Merging of multiline verbatim strings with embedded '"' is handled in
368// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
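// For illustration, assuming typical C# input:
//   @"C:\temp"   - '@' and the string are merged into one TT_CSharpStringLiteral
//   $"x = {x}"   - '$' and the string are merged likewise
//   $@"{a} {b}"  - '$', '@' and the string are merged into a single token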
369bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
370 if (Tokens.size() < 2)
371 return false;
372
373 // Look for @"aaaaaa" or $"aaaaaa".
374 const auto String = *(Tokens.end() - 1);
375 if (String->isNot(tok::string_literal))
376 return false;
377
378 auto Prefix = *(Tokens.end() - 2);
379 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
380 return false;
381
382 if (Tokens.size() > 2) {
383 const auto Tok = *(Tokens.end() - 3);
384 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
385 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
386 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
387 Tok->ColumnWidth += Prefix->ColumnWidth;
388 Tokens.erase(Tokens.end() - 2);
389 Prefix = Tok;
390 }
391 }
392
393 // Convert back into just a string_literal.
394 Prefix->Tok.setKind(tok::string_literal);
395 Prefix->TokenText =
396 StringRef(Prefix->TokenText.begin(),
397 String->TokenText.end() - Prefix->TokenText.begin());
398 Prefix->ColumnWidth += String->ColumnWidth;
399 Prefix->setType(TT_CSharpStringLiteral);
400 Tokens.erase(Tokens.end() - 1);
401 return true;
402}
403
404// Valid C# attribute targets:
405// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
406const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
407 "assembly", "module", "field", "event", "method",
408 "param", "property", "return", "type",
409};
410
411bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
412 if (Tokens.size() < 2)
413 return false;
414 auto &NullishCoalescing = *(Tokens.end() - 2);
415 auto &Equal = *(Tokens.end() - 1);
416 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
417 Equal->isNot(tok::equal)) {
418 return false;
419 }
420 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
421 NullishCoalescing->TokenText =
422 StringRef(NullishCoalescing->TokenText.begin(),
423 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
424 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
425 NullishCoalescing->setType(TT_NullCoalescingEqual);
426 Tokens.erase(Tokens.end() - 1);
427 return true;
428}
429
430bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
431 if (Tokens.size() < 2)
432 return false;
433 const auto At = *(Tokens.end() - 2);
434 if (At->isNot(tok::at))
435 return false;
436 const auto Keyword = *(Tokens.end() - 1);
437 if (Keyword->TokenText == "$")
438 return false;
439 if (!Keywords.isCSharpKeyword(*Keyword))
440 return false;
441
442 At->Tok.setKind(tok::identifier);
443 At->TokenText = StringRef(At->TokenText.begin(),
444 Keyword->TokenText.end() - At->TokenText.begin());
445 At->ColumnWidth += Keyword->ColumnWidth;
446 At->setType(Keyword->getType());
447 Tokens.erase(Tokens.end() - 1);
448 return true;
449}
450
451 // In C#, transform the identifier foreach into kw_for with type TT_ForEachMacro.
452bool FormatTokenLexer::tryTransformCSharpForEach() {
453 if (Tokens.size() < 1)
454 return false;
455 auto &Identifier = *(Tokens.end() - 1);
456 if (Identifier->isNot(tok::identifier))
457 return false;
458 if (Identifier->TokenText != "foreach")
459 return false;
460
461 Identifier->setType(TT_ForEachMacro);
462 Identifier->Tok.setKind(tok::kw_for);
463 return true;
464}
465
466bool FormatTokenLexer::tryMergeForEach() {
467 if (Tokens.size() < 2)
468 return false;
469 auto &For = *(Tokens.end() - 2);
470 auto &Each = *(Tokens.end() - 1);
471 if (For->isNot(tok::kw_for))
472 return false;
473 if (Each->isNot(tok::identifier))
474 return false;
475 if (Each->TokenText != "each")
476 return false;
477
478 For->setType(TT_ForEachMacro);
479 For->Tok.setKind(tok::kw_for);
480
481 For->TokenText = StringRef(For->TokenText.begin(),
482 Each->TokenText.end() - For->TokenText.begin());
483 For->ColumnWidth += Each->ColumnWidth;
484 Tokens.erase(Tokens.end() - 1);
485 return true;
486}
487
488bool FormatTokenLexer::tryTransformTryUsageForC() {
489 if (Tokens.size() < 2)
490 return false;
491 auto &Try = *(Tokens.end() - 2);
492 if (Try->isNot(tok::kw_try))
493 return false;
494 auto &Next = *(Tokens.end() - 1);
495 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
496 return false;
497
498 if (Tokens.size() > 2) {
499 auto &At = *(Tokens.end() - 3);
500 if (At->is(tok::at))
501 return false;
502 }
503
504 Try->Tok.setKind(tok::identifier);
505 return true;
506}
507
508bool FormatTokenLexer::tryMergeLessLess() {
509 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
510 if (Tokens.size() < 3)
511 return false;
512
513 auto First = Tokens.end() - 3;
514 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
515 return false;
516
517 // Only merge if there currently is no whitespace between the two "<".
518 if (First[1]->hasWhitespaceBefore())
519 return false;
520
521 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
522 if (X && X->is(tok::less))
523 return false;
524
525 auto Y = First[2];
526 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
527 return false;
528
529 First[0]->Tok.setKind(tok::lessless);
530 First[0]->TokenText = "<<";
531 First[0]->ColumnWidth += 1;
532 Tokens.erase(Tokens.end() - 2);
533 return true;
534}
535
536bool FormatTokenLexer::tryMergeGreaterGreater() {
537 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
538 if (Tokens.size() < 2)
539 return false;
540
541 auto First = Tokens.end() - 2;
542 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
543 return false;
544
545 // Only merge if there currently is no whitespace between the first two ">".
546 if (First[1]->hasWhitespaceBefore())
547 return false;
548
549 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
550 if (Tok && Tok->isNot(tok::kw_operator))
551 return false;
552
553 First[0]->Tok.setKind(tok::greatergreater);
554 First[0]->TokenText = ">>";
555 First[0]->ColumnWidth += 1;
556 Tokens.erase(Tokens.end() - 1);
557 return true;
558}
559
560bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
561 TokenType NewType) {
562 if (Tokens.size() < Kinds.size())
563 return false;
564
565 SmallVectorImpl<FormatToken *>::const_iterator First =
566 Tokens.end() - Kinds.size();
567 for (unsigned i = 0; i < Kinds.size(); ++i)
568 if (First[i]->isNot(Kinds[i]))
569 return false;
570
571 return tryMergeTokens(Kinds.size(), NewType);
572}
573
574bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
575 if (Tokens.size() < Count)
576 return false;
577
578 SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
579 unsigned AddLength = 0;
580 for (size_t i = 1; i < Count; ++i) {
581 // If there is whitespace separating the token and the previous one,
582 // they should not be merged.
583 if (First[i]->hasWhitespaceBefore())
584 return false;
585 AddLength += First[i]->TokenText.size();
586 }
587
588 Tokens.resize(Tokens.size() - Count + 1);
589 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
590 First[0]->TokenText.size() + AddLength);
591 First[0]->ColumnWidth += AddLength;
592 First[0]->setType(NewType);
593 return true;
594}
595
596bool FormatTokenLexer::tryMergeTokensAny(
597 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
598 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
599 return tryMergeTokens(Kinds, NewType);
600 });
601}
602
603// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
604bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
605 // NB: This is not entirely correct, as an r_paren can introduce an operand
606 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
607 // corner case to not matter in practice, though.
608 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
609 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
610 tok::colon, tok::question, tok::tilde) ||
611 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
612 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
613 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
614 Tok->isBinaryOperator();
615}
616
617bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
618 if (!Prev)
619 return true;
620
621 // Regex literals can only follow after prefix unary operators, not after
622 // postfix unary operators. If the '++' is followed by a non-operand
623 // introducing token, the slash here is the operand and not the start of a
624 // regex.
625 // `!` is a unary prefix operator, but also a postfix operator that casts
626 // away nullability, so the same check applies.
627 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
628 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
629
630 // The previous token must introduce an operand location where regex
631 // literals can occur.
632 if (!precedesOperand(Prev))
633 return false;
634
635 return true;
636}
637
638// Tries to parse a JavaScript Regex literal starting at the current token,
639// if that begins with a slash and is in a location where JavaScript allows
640// regex literals. Changes the current token to a regex literal and updates
641// its text if successful.
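// For illustration, assuming typical JavaScript input: in
//   const re = /ab+c/;
// the '/' follows '=' (a position where an operand may start), so "/ab+c/" is
// re-lexed here as a single TT_RegexLiteral, whereas in "a / b" the '/' after
// an identifier remains a binary operator.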
642void FormatTokenLexer::tryParseJSRegexLiteral() {
643 FormatToken *RegexToken = Tokens.back();
644 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
645 return;
646
647 FormatToken *Prev = nullptr;
648 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
649 // NB: Because previous pointers are not initialized yet, this cannot use
650 // Token.getPreviousNonComment.
651 if (FT->isNot(tok::comment)) {
652 Prev = FT;
653 break;
654 }
655 }
656
657 if (!canPrecedeRegexLiteral(Prev))
658 return;
659
660 // 'Manually' lex ahead in the current file buffer.
661 const char *Offset = Lex->getBufferLocation();
662 const char *RegexBegin = Offset - RegexToken->TokenText.size();
663 StringRef Buffer = Lex->getBuffer();
664 bool InCharacterClass = false;
665 bool HaveClosingSlash = false;
666 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
667 // Regular expressions are terminated with a '/', which can only be
668 // escaped using '\' or a character class between '[' and ']'.
669 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
670 switch (*Offset) {
671 case '\\':
672 // Skip the escaped character.
673 ++Offset;
674 break;
675 case '[':
676 InCharacterClass = true;
677 break;
678 case ']':
679 InCharacterClass = false;
680 break;
681 case '/':
682 if (!InCharacterClass)
683 HaveClosingSlash = true;
684 break;
685 }
686 }
687
688 RegexToken->setType(TT_RegexLiteral);
689 // Treat regex literals like other string_literals.
690 RegexToken->Tok.setKind(tok::string_literal);
691 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
692 RegexToken->ColumnWidth = RegexToken->TokenText.size();
693
694 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
695}
696
697static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
698 bool Interpolated) {
699 auto Repeated = [&Begin, End]() {
700 return Begin + 1 < End && Begin[1] == Begin[0];
701 };
702
703 // Look for a terminating '"' in the current file buffer.
704 // Make no effort to format code within an interpolated or verbatim string.
705 //
706 // Interpolated strings could contain { } with " characters inside.
707 // $"{x ?? "null"}"
708 // should not be split into $"{x ?? ", null, "}" but should be treated as a
709 // single string-literal.
710 //
711 // We opt not to try and format expressions inside {} within a C#
712 // interpolated string. Formatting expressions within an interpolated string
713 // would require similar work as that done for JavaScript template strings
714 // in `handleTemplateStrings()`.
715 for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
716 switch (*Begin) {
717 case '\\':
718 if (!Verbatim)
719 ++Begin;
720 break;
721 case '{':
722 if (Interpolated) {
723 // {{ inside an interpolated string is escaped, so skip it.
724 if (Repeated())
725 ++Begin;
726 else
727 ++UnmatchedOpeningBraceCount;
728 }
729 break;
730 case '}':
731 if (Interpolated) {
732 // }} inside an interpolated string is escaped, so skip it.
733 if (Repeated())
734 ++Begin;
735 else if (UnmatchedOpeningBraceCount > 0)
736 --UnmatchedOpeningBraceCount;
737 else
738 return End;
739 }
740 break;
741 case '"':
742 if (UnmatchedOpeningBraceCount > 0)
743 break;
744 // "" within a verbatim string is an escaped double quote: skip it.
745 if (Verbatim && Repeated()) {
746 ++Begin;
747 break;
748 }
749 return Begin;
750 }
751 }
752
753 return End;
754}
755
756void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
757 FormatToken *CSharpStringLiteral = Tokens.back();
758
759 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
760 return;
761
762 auto &TokenText = CSharpStringLiteral->TokenText;
763
764 bool Verbatim = false;
765 bool Interpolated = false;
766 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
767 Verbatim = true;
768 Interpolated = true;
769 } else if (TokenText.starts_with(R"(@")")) {
770 Verbatim = true;
771 } else if (TokenText.starts_with(R"($")")) {
772 Interpolated = true;
773 }
774
775 // Deal with multiline strings.
776 if (!Verbatim && !Interpolated)
777 return;
778
779 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
780 const char *Offset = StrBegin;
781 if (Verbatim && Interpolated)
782 Offset += 3;
783 else
784 Offset += 2;
785
786 const auto End = Lex->getBuffer().end();
787 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
788
789 // Make no attempt to format code properly if a verbatim string is
790 // unterminated.
791 if (Offset >= End)
792 return;
793
794 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
795 TokenText = LiteralText;
796
797 // Adjust width for potentially multiline string literals.
798 size_t FirstBreak = LiteralText.find('\n');
799 StringRef FirstLineText = FirstBreak == StringRef::npos
800 ? LiteralText
801 : LiteralText.substr(0, FirstBreak);
802 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
803 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
804 Encoding);
805 size_t LastBreak = LiteralText.rfind('\n');
806 if (LastBreak != StringRef::npos) {
807 CSharpStringLiteral->IsMultiline = true;
808 unsigned StartColumn = 0;
809 CSharpStringLiteral->LastLineColumnWidth =
810 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
811 StartColumn, Style.TabWidth, Encoding);
812 }
813
814 assert(Offset < End);
815 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
816}
817
818void FormatTokenLexer::handleTableGenMultilineString() {
819 FormatToken *MultiLineString = Tokens.back();
820 if (MultiLineString->isNot(TT_TableGenMultiLineString))
821 return;
822
823 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
824 // "}]" is the end of multi line string.
825 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
826 if (CloseOffset == StringRef::npos)
827 return;
828 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
829 MultiLineString->TokenText = Text;
830 resetLexer(SourceMgr.getFileOffset(
831 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
832 auto FirstLineText = Text;
833 auto FirstBreak = Text.find('\n');
834 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
835 if (FirstBreak != StringRef::npos) {
836 MultiLineString->IsMultiline = true;
837 FirstLineText = Text.substr(0, FirstBreak + 1);
838 // LastLineColumnWidth holds the width of the last line.
839 auto LastBreak = Text.rfind('\n');
840 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
841 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
842 Style.TabWidth, Encoding);
843 }
844 // ColumnWidth holds only the width of the first line.
845 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
846 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
847}
848
849void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
850 FormatToken *Tok = Tokens.back();
851 // TableGen identifiers can begin with digits. Such tokens are lexed as
852 // numeric_constant now.
853 if (Tok->isNot(tok::numeric_constant))
854 return;
855 StringRef Text = Tok->TokenText;
856 // The following check is based on llvm::TGLexer::LexToken.
857 // That lexes the token as a number if any of the following holds:
858 // 1. It starts with '+', '-'.
859 // 2. All the characters are digits.
860 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
861 // 4. The first non-digit character is 'x', and the next is a hex digit.
862 // Note that in cases 3 and 4, if the next character does not exist in
863 // this token, the token is an identifier.
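// For illustration: "42", "0b1010" and "0x1F" are left as numeric constants by
// this function, while tokens like "0day" or "2x", whose first non-digit
// character does not form a valid binary or hex prefix, are turned back into
// identifiers below.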
864 if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
865 return;
866 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
867 // All the characters are digits
868 if (NonDigitPos == StringRef::npos)
869 return;
870 char FirstNonDigit = Text[NonDigitPos];
871 if (NonDigitPos < Text.size() - 1) {
872 char TheNext = Text[NonDigitPos + 1];
873 // Regarded as a binary number.
874 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
875 return;
876 // Regarded as hex number.
877 if (FirstNonDigit == 'x' && isxdigit(TheNext))
878 return;
879 }
880 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
881 // This is actually an identifier in TableGen.
882 Tok->Tok.setKind(tok::identifier);
883 Tok->Tok.setIdentifierInfo(nullptr);
884 }
885}
886
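// Handles JavaScript template strings such as `Hello ${first} ${last}!`.
// Roughly: a backtick pushes the TEMPLATE_STRING state, "${" pushes a nested
// NORMAL state so the interpolated expression is lexed as ordinary tokens, and
// the matching "}" returns to the template string (see the state handling
// below).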
887void FormatTokenLexer::handleTemplateStrings() {
888 FormatToken *BacktickToken = Tokens.back();
889
890 if (BacktickToken->is(tok::l_brace)) {
891 StateStack.push(LexerState::NORMAL);
892 return;
893 }
894 if (BacktickToken->is(tok::r_brace)) {
895 if (StateStack.size() == 1)
896 return;
897 StateStack.pop();
898 if (StateStack.top() != LexerState::TEMPLATE_STRING)
899 return;
900 // If back in TEMPLATE_STRING, fall through and continue parsing the template string.
901 } else if (BacktickToken->is(tok::unknown) &&
902 BacktickToken->TokenText == "`") {
903 StateStack.push(LexerState::TEMPLATE_STRING);
904 } else {
905 return; // Not actually a template
906 }
907
908 // 'Manually' lex ahead in the current file buffer.
909 const char *Offset = Lex->getBufferLocation();
910 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
911 for (; Offset != Lex->getBuffer().end(); ++Offset) {
912 if (Offset[0] == '`') {
913 StateStack.pop();
914 ++Offset;
915 break;
916 }
917 if (Offset[0] == '\\') {
918 ++Offset; // Skip the escaped character.
919 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
920 Offset[1] == '{') {
921 // '${' introduces an expression interpolation in the template string.
922 StateStack.push(LexerState::NORMAL);
923 Offset += 2;
924 break;
925 }
926 }
927
928 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
929 BacktickToken->setType(TT_TemplateString);
930 BacktickToken->Tok.setKind(tok::string_literal);
931 BacktickToken->TokenText = LiteralText;
932
933 // Adjust width for potentially multiline string literals.
934 size_t FirstBreak = LiteralText.find('\n');
935 StringRef FirstLineText = FirstBreak == StringRef::npos
936 ? LiteralText
937 : LiteralText.substr(0, FirstBreak);
938 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
939 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
940 size_t LastBreak = LiteralText.rfind('\n');
941 if (LastBreak != StringRef::npos) {
942 BacktickToken->IsMultiline = true;
943 unsigned StartColumn = 0; // The template tail spans the entire line.
944 BacktickToken->LastLineColumnWidth =
945 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
946 StartColumn, Style.TabWidth, Encoding);
947 }
948
949 SourceLocation loc = Lex->getSourceLocation(Offset);
950 resetLexer(SourceMgr.getFileOffset(loc));
951}
952
953void FormatTokenLexer::tryParsePythonComment() {
954 FormatToken *HashToken = Tokens.back();
955 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
956 return;
957 // Turn the remainder of this line into a comment.
958 const char *CommentBegin =
959 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
960 size_t From = CommentBegin - Lex->getBuffer().begin();
961 size_t To = Lex->getBuffer().find_first_of('\n', From);
962 if (To == StringRef::npos)
963 To = Lex->getBuffer().size();
964 size_t Len = To - From;
965 HashToken->setType(TT_LineComment);
966 HashToken->Tok.setKind(tok::comment);
967 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
968 SourceLocation Loc = To < Lex->getBuffer().size()
969 ? Lex->getSourceLocation(CommentBegin + Len)
970 : SourceMgr.getLocForEndOfFile(ID);
971 resetLexer(SourceMgr.getFileOffset(Loc));
972}
973
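// Merges a _T("...") invocation, e.g. _T("Hello"), into a single
// string-literal-like token so the whole call is treated as one unit; _T is
// assumed here to be the usual Windows TCHAR text macro.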
974bool FormatTokenLexer::tryMerge_TMacro() {
975 if (Tokens.size() < 4)
976 return false;
977 FormatToken *Last = Tokens.back();
978 if (Last->isNot(tok::r_paren))
979 return false;
980
981 FormatToken *String = Tokens[Tokens.size() - 2];
982 if (String->isNot(tok::string_literal) || String->IsMultiline)
983 return false;
984
985 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
986 return false;
987
988 FormatToken *Macro = Tokens[Tokens.size() - 4];
989 if (Macro->TokenText != "_T")
990 return false;
991
992 const char *Start = Macro->TokenText.data();
993 const char *End = Last->TokenText.data() + Last->TokenText.size();
994 String->TokenText = StringRef(Start, End - Start);
995 String->IsFirst = Macro->IsFirst;
996 String->LastNewlineOffset = Macro->LastNewlineOffset;
997 String->WhitespaceRange = Macro->WhitespaceRange;
998 String->OriginalColumn = Macro->OriginalColumn;
999 String->ColumnWidth = encoding::columnWidthWithTabs(
1000 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1001 String->NewlinesBefore = Macro->NewlinesBefore;
1002 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1003
1004 Tokens.pop_back();
1005 Tokens.pop_back();
1006 Tokens.pop_back();
1007 Tokens.back() = String;
1008 if (FirstInLineIndex >= Tokens.size())
1009 FirstInLineIndex = Tokens.size() - 1;
1010 return true;
1011}
1012
1013bool FormatTokenLexer::tryMergeConflictMarkers() {
1014 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1015 return false;
1016
1017 // Conflict lines look like:
1018 // <marker> <text from the vcs>
1019 // For example:
1020 // >>>>>>> /file/in/file/system at revision 1234
1021 //
1022 // We merge all tokens in a line that starts with a conflict marker
1023 // into a single token with a special token type that the unwrapped line
1024 // parser will use to correctly rebuild the underlying code.
1025
1026 FileID ID;
1027 // Get the position of the first token in the line.
1028 unsigned FirstInLineOffset;
1029 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1030 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1031 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1032 // Calculate the offset of the start of the current line.
1033 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1034 if (LineOffset == StringRef::npos)
1035 LineOffset = 0;
1036 else
1037 ++LineOffset;
1038
1039 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1040 StringRef LineStart;
1041 if (FirstSpace == StringRef::npos)
1042 LineStart = Buffer.substr(LineOffset);
1043 else
1044 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1045
1046 TokenType Type = TT_Unknown;
1047 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1048 Type = TT_ConflictStart;
1049 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1050 LineStart == "====") {
1051 Type = TT_ConflictAlternative;
1052 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1053 Type = TT_ConflictEnd;
1054 }
1055
1056 if (Type != TT_Unknown) {
1057 FormatToken *Next = Tokens.back();
1058
1059 Tokens.resize(FirstInLineIndex + 1);
1060 // We do not need to build a complete token here, as we will skip it
1061 // during parsing anyway (as we must not touch whitespace around conflict
1062 // markers).
1063 Tokens.back()->setType(Type);
1064 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1065
1066 Tokens.push_back(Next);
1067 return true;
1068 }
1069
1070 return false;
1071}
1072
1073FormatToken *FormatTokenLexer::getStashedToken() {
1074 // Create a synthesized second '>' or '<' token.
1075 Token Tok = FormatTok->Tok;
1076 StringRef TokenText = FormatTok->TokenText;
1077
1078 unsigned OriginalColumn = FormatTok->OriginalColumn;
1079 FormatTok = new (Allocator.Allocate()) FormatToken;
1080 FormatTok->Tok = Tok;
1081 SourceLocation TokLocation =
1082 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1083 FormatTok->Tok.setLocation(TokLocation);
1084 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1085 FormatTok->TokenText = TokenText;
1086 FormatTok->ColumnWidth = 1;
1087 FormatTok->OriginalColumn = OriginalColumn + 1;
1088
1089 return FormatTok;
1090}
1091
1092/// Truncate the current token to the new length and make the lexer continue
1093/// from the end of the truncated token. Used for other languages that have
1094/// different token boundaries, like JavaScript in which a comment ends at a
1095/// line break regardless of whether the line break follows a backslash. Also
1096/// used to set the lexer to the end of whitespace if the lexer regards
1097/// whitespace and an unrecognized symbol as one token.
1098void FormatTokenLexer::truncateToken(size_t NewLen) {
1099 assert(NewLen <= FormatTok->TokenText.size());
1100 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1101 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1102 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1104 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1105 Encoding);
1106 FormatTok->Tok.setLength(NewLen);
1107}
1108
1109/// Count the length of leading whitespace in a token.
1110static size_t countLeadingWhitespace(StringRef Text) {
1111 // Basically counting the length matched by this regex.
1112 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1113 // Directly using the regex turned out to be slow. With the regex
1114 // version formatting all files in this directory took about 1.25
1115 // seconds. This version took about 0.5 seconds.
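// For illustration: given text that starts with two spaces, a backslash-newline
// escape, and two more spaces before "int", this returns 6 and stops at the 'i'.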
1116 const unsigned char *const Begin = Text.bytes_begin();
1117 const unsigned char *const End = Text.bytes_end();
1118 const unsigned char *Cur = Begin;
1119 while (Cur < End) {
1120 if (isspace(Cur[0])) {
1121 ++Cur;
1122 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1123 // A '\' followed by a newline always escapes the newline, regardless
1124 // of whether there is another '\' before it.
1125 // The source has a null byte at the end. So the end of the entire input
1126 // isn't reached yet. Also the lexer doesn't break apart an escaped
1127 // newline.
1128 assert(End - Cur >= 2);
1129 Cur += 2;
1130 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1131 (Cur[3] == '\n' || Cur[3] == '\r')) {
1132 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1133 // characters are quoted individually in this comment because if we write
1134 // them together some compilers warn that we have a trigraph in the code.
1135 assert(End - Cur >= 4);
1136 Cur += 4;
1137 } else {
1138 break;
1139 }
1140 }
1141 return Cur - Begin;
1142}
1143
1144FormatToken *FormatTokenLexer::getNextToken() {
1145 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1146 StateStack.pop();
1147 return getStashedToken();
1148 }
1149
1150 FormatTok = new (Allocator.Allocate()) FormatToken;
1151 readRawToken(*FormatTok);
1152 SourceLocation WhitespaceStart =
1153 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1154 FormatTok->IsFirst = IsFirstToken;
1155 IsFirstToken = false;
1156
1157 // Consume and record whitespace until we find a significant token.
1158 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1159 // followed by a symbol such as backtick. Those symbols may be
1160 // significant in other languages.
1161 unsigned WhitespaceLength = TrailingWhitespace;
1162 while (FormatTok->isNot(tok::eof)) {
1163 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1164 if (LeadingWhitespace == 0)
1165 break;
1166 if (LeadingWhitespace < FormatTok->TokenText.size())
1167 truncateToken(LeadingWhitespace);
1168 StringRef Text = FormatTok->TokenText;
1169 bool InEscape = false;
1170 for (int i = 0, e = Text.size(); i != e; ++i) {
1171 switch (Text[i]) {
1172 case '\r':
1173 // If this is a CRLF sequence, break here and the LF will be handled on
1174 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1175 // the same as a single LF.
1176 if (i + 1 < e && Text[i + 1] == '\n')
1177 break;
1178 [[fallthrough]];
1179 case '\n':
1180 ++FormatTok->NewlinesBefore;
1181 if (!InEscape)
1182 FormatTok->HasUnescapedNewline = true;
1183 else
1184 InEscape = false;
1185 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1186 Column = 0;
1187 break;
1188 case '\f':
1189 if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
1190 // The form feed is immediately preceded and followed by a newline.
1191 i > 0 && Text[i - 1] == '\n' &&
1192 ((i + 1 < e && Text[i + 1] == '\n') ||
1193 (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
1194 FormatTok->HasFormFeedBefore = true;
1195 }
1196 [[fallthrough]];
1197 case '\v':
1198 Column = 0;
1199 break;
1200 case ' ':
1201 ++Column;
1202 break;
1203 case '\t':
1204 Column +=
1205 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1206 break;
1207 case '\\':
1208 case '?':
1209 case '/':
1210 // The text was entirely whitespace when this loop was entered. Thus
1211 // this has to be an escape sequence.
1212 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1213 Text.substr(i, 4) == "\?\?/\r" ||
1214 Text.substr(i, 4) == "\?\?/\n" ||
1215 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1216 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1217 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1218 Text.substr(i - 2, 4) == "\?\?/\n")));
1219 InEscape = true;
1220 break;
1221 default:
1222 // This shouldn't happen.
1223 assert(false);
1224 break;
1225 }
1226 }
1227 WhitespaceLength += Text.size();
1228 readRawToken(*FormatTok);
1229 }
1230
1231 if (FormatTok->is(tok::unknown))
1232 FormatTok->setType(TT_ImplicitStringLiteral);
1233
1234 // JavaScript and Java do not allow escaping the end of a line with a
1235 // backslash. Backslashes are syntax errors in plain source, but can occur in
1236 // comments. When a single line comment ends with a \, it'll cause the next
1237 // line of code to be lexed as a comment, breaking formatting. The code below
1238 // finds comments that contain a backslash followed by a line break, truncates
1239 // the comment token at the backslash, and resets the lexer to restart behind
1240 // the backslash.
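// For illustration, a JavaScript line comment whose text ends in a backslash,
// e.g. "// see note \" followed by "let x = 1;" on the next line, would
// otherwise swallow that code line; truncating at the backslash keeps
// "let x = 1;" as code.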
1241 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1242 FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
1243 size_t BackslashPos = FormatTok->TokenText.find('\\');
1244 while (BackslashPos != StringRef::npos) {
1245 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1246 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1247 truncateToken(BackslashPos + 1);
1248 break;
1249 }
1250 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1251 }
1252 }
1253
1254 if (Style.isVerilog()) {
1255 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1256 SmallVector<StringRef, 1> Matches;
1257 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1258 // And it uses the hash for delays and parameter lists. In order to continue
1259 // using `tok::hash` in other places, the backtick gets marked as the hash
1260 // here. And in order to tell the backtick and hash apart for
1261 // Verilog-specific stuff, the hash becomes an identifier.
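// For illustration, assuming typical Verilog input: the backtick in
// `include "pkg.svh" or `MY_MACRO is re-tagged as tok::hash below, while the
// hash in a delay such as #10 becomes an identifier token.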
1262 if (FormatTok->is(tok::numeric_constant)) {
1263 // In Verilog the quote is not part of a number.
1264 auto Quote = FormatTok->TokenText.find('\'');
1265 if (Quote != StringRef::npos)
1266 truncateToken(Quote);
1267 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1268 FormatTok->Tok.setKind(tok::raw_identifier);
1269 } else if (FormatTok->is(tok::raw_identifier)) {
1270 if (FormatTok->TokenText == "`") {
1271 FormatTok->Tok.setIdentifierInfo(nullptr);
1272 FormatTok->Tok.setKind(tok::hash);
1273 } else if (FormatTok->TokenText == "``") {
1274 FormatTok->Tok.setIdentifierInfo(nullptr);
1275 FormatTok->Tok.setKind(tok::hashhash);
1276 } else if (Tokens.size() > 0 &&
1277 Tokens.back()->is(Keywords.kw_apostrophe) &&
1278 NumberBase.match(FormatTok->TokenText, &Matches)) {
1279 // In Verilog in a based number literal like `'b10`, there may be
1280 // whitespace between `'b` and `10`. Therefore we handle the base and
1281 // the rest of the number literal as two tokens. But if there is no
1282 // space in the input code, we need to manually separate the two parts.
1283 truncateToken(Matches[0].size());
1284 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1285 }
1286 }
1287 }
1288
1289 FormatTok->WhitespaceRange = SourceRange(
1290 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1291
1292 FormatTok->OriginalColumn = Column;
1293
1294 TrailingWhitespace = 0;
1295 if (FormatTok->is(tok::comment)) {
1296 // FIXME: Add the trimmed whitespace to Column.
1297 StringRef UntrimmedText = FormatTok->TokenText;
1298 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1299 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1300 } else if (FormatTok->is(tok::raw_identifier)) {
1301 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1302 FormatTok->Tok.setIdentifierInfo(&Info);
1303 FormatTok->Tok.setKind(Info.getTokenID());
1304 if (Style.Language == FormatStyle::LK_Java &&
1305 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1306 tok::kw_operator)) {
1307 FormatTok->Tok.setKind(tok::identifier);
1308 FormatTok->Tok.setIdentifierInfo(nullptr);
1309 } else if (Style.isJavaScript() &&
1310 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1311 tok::kw_operator)) {
1312 FormatTok->Tok.setKind(tok::identifier);
1313 FormatTok->Tok.setIdentifierInfo(nullptr);
1314 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1315 FormatTok->Tok.setKind(tok::identifier);
1316 FormatTok->Tok.setIdentifierInfo(nullptr);
1317 }
1318 } else if (FormatTok->is(tok::greatergreater)) {
1319 FormatTok->Tok.setKind(tok::greater);
1320 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1321 ++Column;
1322 StateStack.push(LexerState::TOKEN_STASHED);
1323 } else if (FormatTok->is(tok::lessless)) {
1324 FormatTok->Tok.setKind(tok::less);
1325 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1326 ++Column;
1327 StateStack.push(LexerState::TOKEN_STASHED);
1328 }
1329
1330 if (Style.isVerilog() && Tokens.size() > 0 &&
1331 Tokens.back()->is(TT_VerilogNumberBase) &&
1332 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1333 // Mark the number following a base like `'h?a0` as a number.
1334 FormatTok->Tok.setKind(tok::numeric_constant);
1335 }
1336
1337 // Now FormatTok is the next non-whitespace token.
1338
1339 StringRef Text = FormatTok->TokenText;
1340 size_t FirstNewlinePos = Text.find('\n');
1341 if (FirstNewlinePos == StringRef::npos) {
1342 // FIXME: ColumnWidth actually depends on the start column, we need to
1343 // take this into account when the token is moved.
1344 FormatTok->ColumnWidth =
1345 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1346 Column += FormatTok->ColumnWidth;
1347 } else {
1348 FormatTok->IsMultiline = true;
1349 // FIXME: ColumnWidth actually depends on the start column, we need to
1350 // take this into account when the token is moved.
1352 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1353
1354 // The last line of the token always starts in column 0.
1355 // Thus, the length can be precomputed even in the presence of tabs.
1357 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1358 Column = FormatTok->LastLineColumnWidth;
1359 }
1360
1361 if (Style.isCpp()) {
1362 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1363 auto it = Macros.find(Identifier);
1364 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1365 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1366 tok::pp_define) &&
1367 it != Macros.end()) {
1368 FormatTok->setType(it->second);
1369 if (it->second == TT_IfMacro) {
1370 // The lexer token currently has type tok::kw_unknown. However, for this
1371 // substitution to be treated correctly in the TokenAnnotator, faking
1372 // the tok value seems to be needed. Not sure if there's a more elegant
1373 // way.
1374 FormatTok->Tok.setKind(tok::kw_if);
1375 }
1376 } else if (FormatTok->is(tok::identifier)) {
1377 if (MacroBlockBeginRegex.match(Text))
1378 FormatTok->setType(TT_MacroBlockBegin);
1379 else if (MacroBlockEndRegex.match(Text))
1380 FormatTok->setType(TT_MacroBlockEnd);
1381 else if (TemplateNames.contains(Identifier))
1382 FormatTok->setFinalizedType(TT_TemplateName);
1383 else if (TypeNames.contains(Identifier))
1384 FormatTok->setFinalizedType(TT_TypeName);
1385 }
1386 }
1387
1388 return FormatTok;
1389}
1390
1391bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1392 // In Verilog the quote is not a character literal.
1393 //
1394 // Make the backtick and double backtick identifiers to match against them
1395 // more easily.
1396 //
1397 // In Verilog an escaped identifier starts with backslash and ends with
1398 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1399 // also begin an escaped newline outside of an escaped identifier. We check
1400 // for that outside of the Regex since we can't use negative lookhead
1401 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1402 // identifier may have a length of 0 according to Section A.9.3.
1403 // FIXME: If there is an escaped newline in the middle of an escaped
1404 // identifier, allow for pasting the two lines together. But escaped
1405 // identifiers usually occur only in generated code anyway.
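// For illustration, assuming typical Verilog input: "\bus+index " is an
// escaped identifier extending up to the terminating whitespace, and "`" and
// "``" are matched here as identifiers so they can be compared against
// Keywords.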
1406 static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1407 "(\r?\n|\r)|[^[:space:]])*)");
1408
1409 SmallVector<StringRef, 4> Matches;
1410 const char *Start = Lex->getBufferLocation();
1411 if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1412 &Matches)) {
1413 return false;
1414 }
1415 // There is a null byte at the end of the buffer, so we don't have to check
1416 // Start[1] is within the buffer.
1417 if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1418 return false;
1419 size_t Len = Matches[0].size();
1420
1421 // The kind has to be an identifier so we can match it against those defined
1422 // in Keywords. The kind has to be set before the length because the setLength
1423 // function checks that the kind is not an annotation.
1424 Tok.setKind(tok::raw_identifier);
1425 Tok.setLength(Len);
1426 Tok.setLocation(Lex->getSourceLocation(Start, Len));
1427 Tok.setRawIdentifierData(Start);
1428 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1429 return true;
1430}
1431
1432void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1433 // For Verilog, first see if there is a special token, and fall back to the
1434 // normal lexer if there isn't one.
1435 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1436 Lex->LexFromRawLexer(Tok.Tok);
1437 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1438 Tok.Tok.getLength());
1439 // For formatting, treat unterminated string literals like normal string
1440 // literals.
1441 if (Tok.is(tok::unknown)) {
1442 if (Tok.TokenText.starts_with("\"")) {
1443 Tok.Tok.setKind(tok::string_literal);
1444 Tok.IsUnterminatedLiteral = true;
1445 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1446 Tok.Tok.setKind(tok::string_literal);
1447 }
1448 }
1449
1450 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1451 Tok.Tok.setKind(tok::string_literal);
1452
1453 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1454 FormattingDisabled = false;
1455
1456 Tok.Finalized = FormattingDisabled;
1457
1458 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1459 FormattingDisabled = true;
1460}
1461
1462void FormatTokenLexer::resetLexer(unsigned Offset) {
1463 StringRef Buffer = SourceMgr.getBufferData(ID);
1464 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1465 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1466 Lex->SetKeepWhitespaceMode(true);
1467 TrailingWhitespace = 0;
1468}
1469
1470} // namespace format
1471} // namespace clang