clang 20.0.0git
FormatTokenLexer.cpp
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36 MacroBlockEndRegex(Style.MacroBlockEnd) {
37 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38 Lex->SetKeepWhitespaceMode(true);
39
40 for (const std::string &ForEachMacro : Style.ForEachMacros) {
41 auto Identifier = &IdentTable.get(ForEachMacro);
42 Macros.insert({Identifier, TT_ForEachMacro});
43 }
44 for (const std::string &IfMacro : Style.IfMacros) {
45 auto Identifier = &IdentTable.get(IfMacro);
46 Macros.insert({Identifier, TT_IfMacro});
47 }
48 for (const std::string &AttributeMacro : Style.AttributeMacros) {
49 auto Identifier = &IdentTable.get(AttributeMacro);
50 Macros.insert({Identifier, TT_AttributeMacro});
51 }
52 for (const std::string &StatementMacro : Style.StatementMacros) {
53 auto Identifier = &IdentTable.get(StatementMacro);
54 Macros.insert({Identifier, TT_StatementMacro});
55 }
56 for (const std::string &TypenameMacro : Style.TypenameMacros) {
57 auto Identifier = &IdentTable.get(TypenameMacro);
58 Macros.insert({Identifier, TT_TypenameMacro});
59 }
60 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61 auto Identifier = &IdentTable.get(NamespaceMacro);
62 Macros.insert({Identifier, TT_NamespaceMacro});
63 }
64 for (const std::string &WhitespaceSensitiveMacro :
65 Style.WhitespaceSensitiveMacros) {
66 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67 Macros.insert({Identifier, TT_UntouchableMacroFunc});
68 }
69 for (const std::string &StatementAttributeLikeMacro :
70 Style.StatementAttributeLikeMacros) {
71 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73 }
74
75 for (const auto &TypeName : Style.TypeNames)
76 TypeNames.insert(&IdentTable.get(TypeName));
77}
78
79ArrayRef<FormatToken *> FormatTokenLexer::lex() {
80 assert(Tokens.empty());
81 assert(FirstInLineIndex == 0);
82 do {
83 Tokens.push_back(getNextToken());
84 if (Style.isJavaScript()) {
85 tryParseJSRegexLiteral();
86 handleTemplateStrings();
87 }
88 if (Style.Language == FormatStyle::LK_TextProto)
89 tryParsePythonComment();
90 tryMergePreviousTokens();
91 if (Style.isCSharp()) {
92 // This needs to come after tokens have been merged so that C#
93 // string literals are correctly identified.
94 handleCSharpVerbatimAndInterpolatedStrings();
95 }
96 if (Style.isTableGen()) {
97 handleTableGenMultilineString();
98 handleTableGenNumericLikeIdentifier();
99 }
100 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
101 FirstInLineIndex = Tokens.size() - 1;
102 } while (Tokens.back()->isNot(tok::eof));
103 return Tokens;
104}
105
106void FormatTokenLexer::tryMergePreviousTokens() {
107 if (tryMerge_TMacro())
108 return;
109 if (tryMergeConflictMarkers())
110 return;
111 if (tryMergeLessLess())
112 return;
113 if (tryMergeGreaterGreater())
114 return;
115 if (tryMergeForEach())
116 return;
117 if (Style.isCpp() && tryTransformTryUsageForC())
118 return;
119
120 if (Style.isJavaScript() || Style.isCSharp()) {
121 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
122 tok::question};
123 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
124 tok::period};
125 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
126
127 if (tryMergeTokens(FatArrow, TT_FatArrow))
128 return;
129 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
130 // Treat like the "||" operator (as opposed to the ternary ?).
131 Tokens.back()->Tok.setKind(tok::pipepipe);
132 return;
133 }
134 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
135 // Treat like a regular "." access.
136 Tokens.back()->Tok.setKind(tok::period);
137 return;
138 }
139 if (tryMergeNullishCoalescingEqual())
140 return;
141 }
142
143 if (Style.isCSharp()) {
144 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
145 tok::question, tok::l_square};
146
147 if (tryMergeCSharpKeywordVariables())
148 return;
149 if (tryMergeCSharpStringLiteral())
150 return;
151 if (tryTransformCSharpForEach())
152 return;
153 if (tryMergeTokens(CSharpNullConditionalLSquare,
154 TT_CSharpNullConditionalLSquare)) {
155 // Treat like a regular "[" operator.
156 Tokens.back()->Tok.setKind(tok::l_square);
157 return;
158 }
159 }
160
161 if (tryMergeNSStringLiteral())
162 return;
163
164 if (Style.isJavaScript()) {
165 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
166 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
167 tok::equal};
168 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
169 tok::greaterequal};
170 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
171 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
172 tok::starequal};
173 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
174 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
175
176 // FIXME: Investigate what token type gives the correct operator priority.
177 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
178 return;
179 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
180 return;
181 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
182 return;
183 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
184 return;
185 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
186 Tokens.back()->Tok.setKind(tok::starequal);
187 return;
188 }
189 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
190 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
191 // Treat like the "=" assignment operator.
192 Tokens.back()->Tok.setKind(tok::equal);
193 return;
194 }
195 if (tryMergeJSPrivateIdentifier())
196 return;
197 }
198
199 if (Style.Language == FormatStyle::LK_Java) {
200 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
201 tok::greater, tok::greater, tok::greaterequal};
202 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
203 return;
204 }
205
206 if (Style.isVerilog()) {
207 // Merge the number following a base like `'h?a0`.
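// In Verilog a based literal such as `4'b1010` or `'h?a0` is lexed as a base
// token (`'b`, `'h`, ...) followed by the value; `?` digits stand for the
// high-impedance value `z`, which is why tok::question is accepted here.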
208 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
209 Tokens.end()[-2]->is(tok::numeric_constant) &&
210 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
211 tok::question) &&
212 tryMergeTokens(2, TT_Unknown)) {
213 return;
214 }
215 // Part select.
216 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
217 TT_BitFieldColon)) {
218 return;
219 }
220 // Xnor. The combined token is treated as a caret which can also be either a
221 // unary or binary operator. The actual type is determined in
222 // TokenAnnotator. We also check the token length so we know it is not
223 // already a merged token.
224 if (Tokens.back()->TokenText.size() == 1 &&
225 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
226 TT_BinaryOperator)) {
227 Tokens.back()->Tok.setKind(tok::caret);
228 return;
229 }
230 // Signed shift and distribution weight.
231 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
232 Tokens.back()->Tok.setKind(tok::lessless);
233 return;
234 }
235 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
236 Tokens.back()->Tok.setKind(tok::greatergreater);
237 return;
238 }
239 if (tryMergeTokensAny({{tok::lessless, tok::equal},
240 {tok::lessless, tok::lessequal},
241 {tok::greatergreater, tok::equal},
242 {tok::greatergreater, tok::greaterequal},
243 {tok::colon, tok::equal},
244 {tok::colon, tok::slash}},
245 TT_BinaryOperator)) {
246 Tokens.back()->ForcedPrecedence = prec::Assignment;
247 return;
248 }
249 // Exponentiation, signed shift, case equality, and wildcard equality.
250 if (tryMergeTokensAny({{tok::star, tok::star},
251 {tok::lessless, tok::less},
252 {tok::greatergreater, tok::greater},
253 {tok::exclaimequal, tok::equal},
254 {tok::exclaimequal, tok::question},
255 {tok::equalequal, tok::equal},
256 {tok::equalequal, tok::question}},
257 TT_BinaryOperator)) {
258 return;
259 }
260 // Module paths in specify blocks and the implication and boolean equality
261 // operators.
262 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
263 {tok::plus, tok::star, tok::greater},
264 {tok::minusequal, tok::greater},
265 {tok::minus, tok::star, tok::greater},
266 {tok::less, tok::arrow},
267 {tok::equal, tok::greater},
268 {tok::star, tok::greater},
269 {tok::pipeequal, tok::greater},
270 {tok::pipe, tok::arrow},
271 {tok::hash, tok::minus, tok::hash},
272 {tok::hash, tok::equal, tok::hash}},
273 TT_BinaryOperator) ||
274 Tokens.back()->is(tok::arrow)) {
275 Tokens.back()->ForcedPrecedence = prec::Comma;
276 return;
277 }
278 }
279 if (Style.isTableGen()) {
280 // TableGen's multi-line string starts with [{
281 if (tryMergeTokens({tok::l_square, tok::l_brace},
282 TT_TableGenMultiLineString)) {
283 // Set the type again, this time finalized; it must never be annotated as any other type.
284 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
285 Tokens.back()->Tok.setKind(tok::string_literal);
286 return;
287 }
288 // TableGen's bang operator is the form !<name>.
289 // !cond is a special case with specific syntax.
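// For example, `!add(1, 2)` is an ordinary bang operator, while
// `!cond(c1 : v1, c2 : v2)` uses the colon-separated case syntax.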
290 if (tryMergeTokens({tok::exclaim, tok::identifier},
291 TT_TableGenBangOperator)) {
292 Tokens.back()->Tok.setKind(tok::identifier);
293 Tokens.back()->Tok.setIdentifierInfo(nullptr);
294 if (Tokens.back()->TokenText == "!cond")
295 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
296 else
297 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
298 return;
299 }
300 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
301 // Here, "! if" becomes "!if". That is, ! captures if even when the space
302 // exists. That is only one possibility in TableGen's syntax.
303 Tokens.back()->Tok.setKind(tok::identifier);
304 Tokens.back()->Tok.setIdentifierInfo(nullptr);
305 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
306 return;
307 }
308 // + and - followed by numbers are literals, not unary operators.
309 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
310 Tokens.back()->Tok.setKind(tok::numeric_constant);
311 return;
312 }
313 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
314 Tokens.back()->Tok.setKind(tok::numeric_constant);
315 return;
316 }
317 }
318}
319
320bool FormatTokenLexer::tryMergeNSStringLiteral() {
321 if (Tokens.size() < 2)
322 return false;
323 auto &At = *(Tokens.end() - 2);
324 auto &String = *(Tokens.end() - 1);
325 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
326 return false;
327 At->Tok.setKind(tok::string_literal);
328 At->TokenText = StringRef(At->TokenText.begin(),
329 String->TokenText.end() - At->TokenText.begin());
330 At->ColumnWidth += String->ColumnWidth;
331 At->setType(TT_ObjCStringLiteral);
332 Tokens.erase(Tokens.end() - 1);
333 return true;
334}
335
336bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
337 // Merges # and identifier into a single identifier with the text
338 // #identifier but the token kind tok::identifier.
339 if (Tokens.size() < 2)
340 return false;
341 auto &Hash = *(Tokens.end() - 2);
342 auto &Identifier = *(Tokens.end() - 1);
343 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
344 return false;
345 Hash->Tok.setKind(tok::identifier);
346 Hash->TokenText =
347 StringRef(Hash->TokenText.begin(),
348 Identifier->TokenText.end() - Hash->TokenText.begin());
349 Hash->ColumnWidth += Identifier->ColumnWidth;
350 Hash->setType(TT_JsPrivateIdentifier);
351 Tokens.erase(Tokens.end() - 1);
352 return true;
353}
354
355// Search for verbatim or interpolated string literals @"ABC" or
356// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
357// prevent splitting of @, $ and ".
358// Merging of multiline verbatim strings with embedded '"' is handled in
359// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
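// For example, @"C:\dir\file" is a verbatim string (backslashes are literal)
// and $"value = {x}" is an interpolated string; $@"..." and @$"..." combine
// both.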
360bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
361 if (Tokens.size() < 2)
362 return false;
363
364 // Look for @"aaaaaa" or $"aaaaaa".
365 const auto String = *(Tokens.end() - 1);
366 if (String->isNot(tok::string_literal))
367 return false;
368
369 auto Prefix = *(Tokens.end() - 2);
370 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
371 return false;
372
373 if (Tokens.size() > 2) {
374 const auto Tok = *(Tokens.end() - 3);
375 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
376 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
377 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
378 Tok->ColumnWidth += Prefix->ColumnWidth;
379 Tokens.erase(Tokens.end() - 2);
380 Prefix = Tok;
381 }
382 }
383
384 // Convert back into just a string_literal.
385 Prefix->Tok.setKind(tok::string_literal);
386 Prefix->TokenText =
387 StringRef(Prefix->TokenText.begin(),
388 String->TokenText.end() - Prefix->TokenText.begin());
389 Prefix->ColumnWidth += String->ColumnWidth;
390 Prefix->setType(TT_CSharpStringLiteral);
391 Tokens.erase(Tokens.end() - 1);
392 return true;
393}
394
395// Valid C# attribute targets:
396// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
397const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
398 "assembly", "module", "field", "event", "method",
399 "param", "property", "return", "type",
400};
401
402bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
403 if (Tokens.size() < 2)
404 return false;
405 auto &NullishCoalescing = *(Tokens.end() - 2);
406 auto &Equal = *(Tokens.end() - 1);
407 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
408 Equal->isNot(tok::equal)) {
409 return false;
410 }
411 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
412 NullishCoalescing->TokenText =
413 StringRef(NullishCoalescing->TokenText.begin(),
414 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
415 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
416 NullishCoalescing->setType(TT_NullCoalescingEqual);
417 Tokens.erase(Tokens.end() - 1);
418 return true;
419}
420
421bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
422 if (Tokens.size() < 2)
423 return false;
424 const auto At = *(Tokens.end() - 2);
425 if (At->isNot(tok::at))
426 return false;
427 const auto Keyword = *(Tokens.end() - 1);
428 if (Keyword->TokenText == "$")
429 return false;
430 if (!Keywords.isCSharpKeyword(*Keyword))
431 return false;
432
433 At->Tok.setKind(tok::identifier);
434 At->TokenText = StringRef(At->TokenText.begin(),
435 Keyword->TokenText.end() - At->TokenText.begin());
436 At->ColumnWidth += Keyword->ColumnWidth;
437 At->setType(Keyword->getType());
438 Tokens.erase(Tokens.end() - 1);
439 return true;
440}
441
442// In C#, transform the identifier foreach into a for keyword annotated as TT_ForEachMacro.
443bool FormatTokenLexer::tryTransformCSharpForEach() {
444 if (Tokens.size() < 1)
445 return false;
446 auto &Identifier = *(Tokens.end() - 1);
447 if (Identifier->isNot(tok::identifier))
448 return false;
449 if (Identifier->TokenText != "foreach")
450 return false;
451
452 Identifier->setType(TT_ForEachMacro);
453 Identifier->Tok.setKind(tok::kw_for);
454 return true;
455}
456
457bool FormatTokenLexer::tryMergeForEach() {
458 if (Tokens.size() < 2)
459 return false;
460 auto &For = *(Tokens.end() - 2);
461 auto &Each = *(Tokens.end() - 1);
462 if (For->isNot(tok::kw_for))
463 return false;
464 if (Each->isNot(tok::identifier))
465 return false;
466 if (Each->TokenText != "each")
467 return false;
468
469 For->setType(TT_ForEachMacro);
470 For->Tok.setKind(tok::kw_for);
471
472 For->TokenText = StringRef(For->TokenText.begin(),
473 Each->TokenText.end() - For->TokenText.begin());
474 For->ColumnWidth += Each->ColumnWidth;
475 Tokens.erase(Tokens.end() - 1);
476 return true;
477}
478
479bool FormatTokenLexer::tryTransformTryUsageForC() {
480 if (Tokens.size() < 2)
481 return false;
482 auto &Try = *(Tokens.end() - 2);
483 if (Try->isNot(tok::kw_try))
484 return false;
485 auto &Next = *(Tokens.end() - 1);
486 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
487 return false;
488
489 if (Tokens.size() > 2) {
490 auto &At = *(Tokens.end() - 3);
491 if (At->is(tok::at))
492 return false;
493 }
494
495 Try->Tok.setKind(tok::identifier);
496 return true;
497}
498
499bool FormatTokenLexer::tryMergeLessLess() {
500 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
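// getNextToken splits every `<<` into two `<` tokens (see the tok::lessless
// case near the end of getNextToken); the pair is re-merged here when it is
// really a single operator, e.g. the `<<` in `out << value`.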
501 if (Tokens.size() < 3)
502 return false;
503
504 auto First = Tokens.end() - 3;
505 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
506 return false;
507
508 // Only merge if there currently is no whitespace between the two "<".
509 if (First[1]->hasWhitespaceBefore())
510 return false;
511
512 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
513 if (X && X->is(tok::less))
514 return false;
515
516 auto Y = First[2];
517 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
518 return false;
519
520 First[0]->Tok.setKind(tok::lessless);
521 First[0]->TokenText = "<<";
522 First[0]->ColumnWidth += 1;
523 Tokens.erase(Tokens.end() - 2);
524 return true;
525}
526
527bool FormatTokenLexer::tryMergeGreaterGreater() {
528 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
529 if (Tokens.size() < 2)
530 return false;
531
532 auto First = Tokens.end() - 2;
533 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
534 return false;
535
536 // Only merge if there currently is no whitespace between the first two ">".
537 if (First[1]->hasWhitespaceBefore())
538 return false;
539
540 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
541 if (Tok && Tok->isNot(tok::kw_operator))
542 return false;
543
544 First[0]->Tok.setKind(tok::greatergreater);
545 First[0]->TokenText = ">>";
546 First[0]->ColumnWidth += 1;
547 Tokens.erase(Tokens.end() - 1);
548 return true;
549}
550
551bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
552 TokenType NewType) {
553 if (Tokens.size() < Kinds.size())
554 return false;
555
556 SmallVectorImpl<FormatToken *>::const_iterator First =
557 Tokens.end() - Kinds.size();
558 for (unsigned i = 0; i < Kinds.size(); ++i)
559 if (First[i]->isNot(Kinds[i]))
560 return false;
561
562 return tryMergeTokens(Kinds.size(), NewType);
563}
564
565bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
566 if (Tokens.size() < Count)
567 return false;
568
569 SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
570 unsigned AddLength = 0;
571 for (size_t i = 1; i < Count; ++i) {
572 // If there is whitespace separating the token and the previous one,
573 // they should not be merged.
574 if (First[i]->hasWhitespaceBefore())
575 return false;
576 AddLength += First[i]->TokenText.size();
577 }
578
579 Tokens.resize(Tokens.size() - Count + 1);
580 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
581 First[0]->TokenText.size() + AddLength);
582 First[0]->ColumnWidth += AddLength;
583 First[0]->setType(NewType);
584 return true;
585}
586
587bool FormatTokenLexer::tryMergeTokensAny(
588 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
589 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
590 return tryMergeTokens(Kinds, NewType);
591 });
592}
593
594// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
595bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
596 // NB: This is not entirely correct, as an r_paren can introduce an operand
597 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
598 // corner case to not matter in practice, though.
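// For example, a `/` after `return` or `(` starts a regex literal
// (`return /ab+c/;`), whereas after an identifier it is a division
// operator (`a / b`).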
599 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
600 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
601 tok::colon, tok::question, tok::tilde) ||
602 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
603 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
604 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
605 Tok->isBinaryOperator();
606}
607
608bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
609 if (!Prev)
610 return true;
611
612 // Regex literals can only follow after prefix unary operators, not after
613 // postfix unary operators. If the '++' is followed by a non-operand
614 // introducing token, the slash here is the operand and not the start of a
615 // regex.
616 // `!` is a unary prefix operator, but also a postfix operator that casts
617 // away nullability, so the same check applies.
618 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
619 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
620
621 // The previous token must introduce an operand location where regex
622 // literals can occur.
623 if (!precedesOperand(Prev))
624 return false;
625
626 return true;
627}
628
629// Tries to parse a JavaScript Regex literal starting at the current token,
630// if that begins with a slash and is in a location where JavaScript allows
631// regex literals. Changes the current token to a regex literal and updates
632// its text if successful.
633void FormatTokenLexer::tryParseJSRegexLiteral() {
634 FormatToken *RegexToken = Tokens.back();
635 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
636 return;
637
638 FormatToken *Prev = nullptr;
639 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
640 // NB: Because previous pointers are not initialized yet, this cannot use
641 // Token.getPreviousNonComment.
642 if (FT->isNot(tok::comment)) {
643 Prev = FT;
644 break;
645 }
646 }
647
648 if (!canPrecedeRegexLiteral(Prev))
649 return;
650
651 // 'Manually' lex ahead in the current file buffer.
652 const char *Offset = Lex->getBufferLocation();
653 const char *RegexBegin = Offset - RegexToken->TokenText.size();
654 StringRef Buffer = Lex->getBuffer();
655 bool InCharacterClass = false;
656 bool HaveClosingSlash = false;
657 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
658 // Regular expressions are terminated with a '/', which can only be
659 // escaped using '\' or a character class between '[' and ']'.
660 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
661 switch (*Offset) {
662 case '\\':
663 // Skip the escaped character.
664 ++Offset;
665 break;
666 case '[':
667 InCharacterClass = true;
668 break;
669 case ']':
670 InCharacterClass = false;
671 break;
672 case '/':
673 if (!InCharacterClass)
674 HaveClosingSlash = true;
675 break;
676 }
677 }
678
679 RegexToken->setType(TT_RegexLiteral);
680 // Treat regex literals like other string_literals.
681 RegexToken->Tok.setKind(tok::string_literal);
682 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
683 RegexToken->ColumnWidth = RegexToken->TokenText.size();
684
685 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
686}
687
688static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
689 bool Interpolated) {
690 auto Repeated = [&Begin, End]() {
691 return Begin + 1 < End && Begin[1] == Begin[0];
692 };
693
694 // Look for a terminating '"' in the current file buffer.
695 // Make no effort to format code within an interpolated or verbatim string.
696 //
697 // Interpolated strings could contain { } with " characters inside.
698 // $"{x ?? "null"}"
699 // should not be split into $"{x ?? ", null, "}" but should be treated as a
700 // single string-literal.
701 //
702 // We opt not to try and format expressions inside {} within a C#
703 // interpolated string. Formatting expressions within an interpolated string
704 // would require similar work as that done for JavaScript template strings
705 // in `handleTemplateStrings()`.
706 for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
707 switch (*Begin) {
708 case '\\':
709 if (!Verbatim)
710 ++Begin;
711 break;
712 case '{':
713 if (Interpolated) {
714 // {{ inside an interpolated string is escaped, so skip it.
715 if (Repeated())
716 ++Begin;
717 else
718 ++UnmatchedOpeningBraceCount;
719 }
720 break;
721 case '}':
722 if (Interpolated) {
723 // }} inside an interpolated string is escaped, so skip it.
724 if (Repeated())
725 ++Begin;
726 else if (UnmatchedOpeningBraceCount > 0)
727 --UnmatchedOpeningBraceCount;
728 else
729 return End;
730 }
731 break;
732 case '"':
733 if (UnmatchedOpeningBraceCount > 0)
734 break;
735 // "" within a verbatim string is an escaped double quote: skip it.
736 if (Verbatim && Repeated()) {
737 ++Begin;
738 break;
739 }
740 return Begin;
741 }
742 }
743
744 return End;
745}
746
747void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
748 FormatToken *CSharpStringLiteral = Tokens.back();
749
750 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
751 return;
752
753 auto &TokenText = CSharpStringLiteral->TokenText;
754
755 bool Verbatim = false;
756 bool Interpolated = false;
757 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
758 Verbatim = true;
759 Interpolated = true;
760 } else if (TokenText.starts_with(R"(@")")) {
761 Verbatim = true;
762 } else if (TokenText.starts_with(R"($")")) {
763 Interpolated = true;
764 }
765
766 // Deal with multiline strings.
767 if (!Verbatim && !Interpolated)
768 return;
769
770 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
771 const char *Offset = StrBegin;
772 if (Verbatim && Interpolated)
773 Offset += 3;
774 else
775 Offset += 2;
776
777 const auto End = Lex->getBuffer().end();
778 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
779
780 // Make no attempt to format code properly if a verbatim string is
781 // unterminated.
782 if (Offset >= End)
783 return;
784
785 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
786 TokenText = LiteralText;
787
788 // Adjust width for potentially multiline string literals.
789 size_t FirstBreak = LiteralText.find('\n');
790 StringRef FirstLineText = FirstBreak == StringRef::npos
791 ? LiteralText
792 : LiteralText.substr(0, FirstBreak);
793 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
794 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
795 Encoding);
796 size_t LastBreak = LiteralText.rfind('\n');
797 if (LastBreak != StringRef::npos) {
798 CSharpStringLiteral->IsMultiline = true;
799 unsigned StartColumn = 0;
800 CSharpStringLiteral->LastLineColumnWidth =
801 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
802 StartColumn, Style.TabWidth, Encoding);
803 }
804
805 assert(Offset < End);
806 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
807}
808
809void FormatTokenLexer::handleTableGenMultilineString() {
810 FormatToken *MultiLineString = Tokens.back();
811 if (MultiLineString->isNot(TT_TableGenMultiLineString))
812 return;
813
814 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
815 // "}]" is the end of multi line string.
816 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
817 if (CloseOffset == StringRef::npos)
818 return;
819 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
820 MultiLineString->TokenText = Text;
821 resetLexer(SourceMgr.getFileOffset(
822 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
823 auto FirstLineText = Text;
824 auto FirstBreak = Text.find('\n');
825 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
826 if (FirstBreak != StringRef::npos) {
827 MultiLineString->IsMultiline = true;
828 FirstLineText = Text.substr(0, FirstBreak + 1);
829 // LastLineColumnWidth holds the width of the last line.
830 auto LastBreak = Text.rfind('\n');
831 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
832 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
833 Style.TabWidth, Encoding);
834 }
835 // ColumnWidth holds only the width of the first line.
836 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
837 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
838}
839
840void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
841 FormatToken *Tok = Tokens.back();
842 // TableGen identifiers can begin with digits. Such tokens are lexed as
843 // numeric_constant now.
844 if (Tok->isNot(tok::numeric_constant))
845 return;
846 StringRef Text = Tok->TokenText;
847 // The following check is based on llvm::TGLexer::LexToken.
848 // That lexes the token as a number if any of the following holds:
849 // 1. It starts with '+', '-'.
850 // 2. All the characters are digits.
851 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
852 // 4. The first non-digit character is 'x', and the next is a hex digit.
853 // Note that in cases 3 and 4, if the next character does not exist in
854 // this token, the token is an identifier.
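// For example, "0b1" and "0x1A" are numbers, while "0bar", "123abc" and
// "0x" are identifiers under these rules.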
855 if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
856 return;
857 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
858 // All the characters are digits
859 if (NonDigitPos == StringRef::npos)
860 return;
861 char FirstNonDigit = Text[NonDigitPos];
862 if (NonDigitPos < Text.size() - 1) {
863 char TheNext = Text[NonDigitPos + 1];
864 // Regarded as a binary number.
865 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
866 return;
867 // Regarded as hex number.
868 if (FirstNonDigit == 'x' && isxdigit(TheNext))
869 return;
870 }
871 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
872 // This is actually an identifier in TableGen.
873 Tok->Tok.setKind(tok::identifier);
874 Tok->Tok.setIdentifierInfo(nullptr);
875 }
876}
877
878void FormatTokenLexer::handleTemplateStrings() {
879 FormatToken *BacktickToken = Tokens.back();
880
881 if (BacktickToken->is(tok::l_brace)) {
882 StateStack.push(LexerState::NORMAL);
883 return;
884 }
885 if (BacktickToken->is(tok::r_brace)) {
886 if (StateStack.size() == 1)
887 return;
888 StateStack.pop();
889 if (StateStack.top() != LexerState::TEMPLATE_STRING)
890 return;
891 // If back in TEMPLATE_STRING, fall through and continue parsing the template string.
892 } else if (BacktickToken->is(tok::unknown) &&
893 BacktickToken->TokenText == "`") {
894 StateStack.push(LexerState::TEMPLATE_STRING);
895 } else {
896 return; // Not actually a template
897 }
898
899 // 'Manually' lex ahead in the current file buffer.
900 const char *Offset = Lex->getBufferLocation();
901 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
902 for (; Offset != Lex->getBuffer().end(); ++Offset) {
903 if (Offset[0] == '`') {
904 StateStack.pop();
905 ++Offset;
906 break;
907 }
908 if (Offset[0] == '\\') {
909 ++Offset; // Skip the escaped character.
910 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
911 Offset[1] == '{') {
912 // '${' introduces an expression interpolation in the template string.
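// e.g. `prefix ${expr + 1} suffix`; lexing of the template string resumes
// once the matching '}' pops the state back to TEMPLATE_STRING.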
913 StateStack.push(LexerState::NORMAL);
914 Offset += 2;
915 break;
916 }
917 }
918
919 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
920 BacktickToken->setType(TT_TemplateString);
921 BacktickToken->Tok.setKind(tok::string_literal);
922 BacktickToken->TokenText = LiteralText;
923
924 // Adjust width for potentially multiline string literals.
925 size_t FirstBreak = LiteralText.find('\n');
926 StringRef FirstLineText = FirstBreak == StringRef::npos
927 ? LiteralText
928 : LiteralText.substr(0, FirstBreak);
929 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
930 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
931 size_t LastBreak = LiteralText.rfind('\n');
932 if (LastBreak != StringRef::npos) {
933 BacktickToken->IsMultiline = true;
934 unsigned StartColumn = 0; // The template tail spans the entire line.
935 BacktickToken->LastLineColumnWidth =
936 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
937 StartColumn, Style.TabWidth, Encoding);
938 }
939
940 SourceLocation loc = Lex->getSourceLocation(Offset);
941 resetLexer(SourceMgr.getFileOffset(loc));
942}
943
944void FormatTokenLexer::tryParsePythonComment() {
945 FormatToken *HashToken = Tokens.back();
946 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
947 return;
948 // Turn the remainder of this line into a comment.
949 const char *CommentBegin =
950 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
951 size_t From = CommentBegin - Lex->getBuffer().begin();
952 size_t To = Lex->getBuffer().find_first_of('\n', From);
953 if (To == StringRef::npos)
954 To = Lex->getBuffer().size();
955 size_t Len = To - From;
956 HashToken->setType(TT_LineComment);
957 HashToken->Tok.setKind(tok::comment);
958 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
959 SourceLocation Loc = To < Lex->getBuffer().size()
960 ? Lex->getSourceLocation(CommentBegin + Len)
961 : SourceMgr.getLocForEndOfFile(ID);
962 resetLexer(SourceMgr.getFileOffset(Loc));
963}
964
965bool FormatTokenLexer::tryMerge_TMacro() {
966 if (Tokens.size() < 4)
967 return false;
968 FormatToken *Last = Tokens.back();
969 if (Last->isNot(tok::r_paren))
970 return false;
971
972 FormatToken *String = Tokens[Tokens.size() - 2];
973 if (String->isNot(tok::string_literal) || String->IsMultiline)
974 return false;
975
976 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
977 return false;
978
979 FormatToken *Macro = Tokens[Tokens.size() - 4];
980 if (Macro->TokenText != "_T")
981 return false;
982
983 const char *Start = Macro->TokenText.data();
984 const char *End = Last->TokenText.data() + Last->TokenText.size();
985 String->TokenText = StringRef(Start, End - Start);
986 String->IsFirst = Macro->IsFirst;
987 String->LastNewlineOffset = Macro->LastNewlineOffset;
988 String->WhitespaceRange = Macro->WhitespaceRange;
989 String->OriginalColumn = Macro->OriginalColumn;
990 String->ColumnWidth = encoding::columnWidthWithTabs(
991 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
992 String->NewlinesBefore = Macro->NewlinesBefore;
993 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
994
995 Tokens.pop_back();
996 Tokens.pop_back();
997 Tokens.pop_back();
998 Tokens.back() = String;
999 if (FirstInLineIndex >= Tokens.size())
1000 FirstInLineIndex = Tokens.size() - 1;
1001 return true;
1002}
1003
1004bool FormatTokenLexer::tryMergeConflictMarkers() {
1005 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1006 return false;
1007
1008 // Conflict lines look like:
1009 // <marker> <text from the vcs>
1010 // For example:
1011 // >>>>>>> /file/in/file/system at revision 1234
1012 //
1013 // We merge all tokens in a line that starts with a conflict marker
1014 // into a single token with a special token type that the unwrapped line
1015 // parser will use to correctly rebuild the underlying code.
1016
1017 FileID ID;
1018 // Get the position of the first token in the line.
1019 unsigned FirstInLineOffset;
1020 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1021 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1022 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1023 // Calculate the offset of the start of the current line.
1024 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1025 if (LineOffset == StringRef::npos)
1026 LineOffset = 0;
1027 else
1028 ++LineOffset;
1029
1030 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1031 StringRef LineStart;
1032 if (FirstSpace == StringRef::npos)
1033 LineStart = Buffer.substr(LineOffset);
1034 else
1035 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1036
1037 TokenType Type = TT_Unknown;
1038 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1039 Type = TT_ConflictStart;
1040 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1041 LineStart == "====") {
1042 Type = TT_ConflictAlternative;
1043 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1044 Type = TT_ConflictEnd;
1045 }
1046
1047 if (Type != TT_Unknown) {
1048 FormatToken *Next = Tokens.back();
1049
1050 Tokens.resize(FirstInLineIndex + 1);
1051 // We do not need to build a complete token here, as we will skip it
1052 // during parsing anyway (as we must not touch whitespace around conflict
1053 // markers).
1054 Tokens.back()->setType(Type);
1055 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1056
1057 Tokens.push_back(Next);
1058 return true;
1059 }
1060
1061 return false;
1062}
1063
1064FormatToken *FormatTokenLexer::getStashedToken() {
1065 // Create a synthesized second '>' or '<' token.
1066 Token Tok = FormatTok->Tok;
1067 StringRef TokenText = FormatTok->TokenText;
1068
1069 unsigned OriginalColumn = FormatTok->OriginalColumn;
1070 FormatTok = new (Allocator.Allocate()) FormatToken;
1071 FormatTok->Tok = Tok;
1072 SourceLocation TokLocation =
1073 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1074 FormatTok->Tok.setLocation(TokLocation);
1075 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1076 FormatTok->TokenText = TokenText;
1077 FormatTok->ColumnWidth = 1;
1078 FormatTok->OriginalColumn = OriginalColumn + 1;
1079
1080 return FormatTok;
1081}
1082
1083/// Truncate the current token to the new length and make the lexer continue
1084/// from the end of the truncated token. Used for other languages that have
1085/// different token boundaries, like JavaScript in which a comment ends at a
1086/// line break regardless of whether the line break follows a backslash. Also
1087/// used to set the lexer to the end of whitespace if the lexer regards
1088/// whitespace and an unrecognized symbol as one token.
1089void FormatTokenLexer::truncateToken(size_t NewLen) {
1090 assert(NewLen <= FormatTok->TokenText.size());
1091 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1092 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1093 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1094 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1095 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1096 Encoding);
1097 FormatTok->Tok.setLength(NewLen);
1098}
1099
1100/// Count the length of leading whitespace in a token.
1101static size_t countLeadingWhitespace(StringRef Text) {
1102 // Basically counting the length matched by this regex.
1103 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1104 // Directly using the regex turned out to be slow. With the regex
1105 // version formatting all files in this directory took about 1.25
1106 // seconds. This version took about 0.5 seconds.
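// For example, given the text "  \\\n  int" (two spaces, a backslash-escaped
// newline, two more spaces, then "int"), this returns 6.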
1107 const unsigned char *const Begin = Text.bytes_begin();
1108 const unsigned char *const End = Text.bytes_end();
1109 const unsigned char *Cur = Begin;
1110 while (Cur < End) {
1111 if (isspace(Cur[0])) {
1112 ++Cur;
1113 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1114 // A '\' followed by a newline always escapes the newline, regardless
1115 // of whether there is another '\' before it.
1116 // The source has a null byte at the end. So the end of the entire input
1117 // isn't reached yet. Also the lexer doesn't break apart an escaped
1118 // newline.
1119 assert(End - Cur >= 2);
1120 Cur += 2;
1121 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1122 (Cur[3] == '\n' || Cur[3] == '\r')) {
1123 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1124 // characters are quoted individually in this comment because if we write
1125 // them together some compilers warn that we have a trigraph in the code.
1126 assert(End - Cur >= 4);
1127 Cur += 4;
1128 } else {
1129 break;
1130 }
1131 }
1132 return Cur - Begin;
1133}
1134
1135FormatToken *FormatTokenLexer::getNextToken() {
1136 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1137 StateStack.pop();
1138 return getStashedToken();
1139 }
1140
1141 FormatTok = new (Allocator.Allocate()) FormatToken;
1142 readRawToken(*FormatTok);
1143 SourceLocation WhitespaceStart =
1144 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1145 FormatTok->IsFirst = IsFirstToken;
1146 IsFirstToken = false;
1147
1148 // Consume and record whitespace until we find a significant token.
1149 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1150 // followed by a symbol such as backtick. Those symbols may be
1151 // significant in other languages.
1152 unsigned WhitespaceLength = TrailingWhitespace;
1153 while (FormatTok->isNot(tok::eof)) {
1154 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1155 if (LeadingWhitespace == 0)
1156 break;
1157 if (LeadingWhitespace < FormatTok->TokenText.size())
1158 truncateToken(LeadingWhitespace);
1159 StringRef Text = FormatTok->TokenText;
1160 bool InEscape = false;
1161 for (int i = 0, e = Text.size(); i != e; ++i) {
1162 switch (Text[i]) {
1163 case '\r':
1164 // If this is a CRLF sequence, break here and the LF will be handled on
1165 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1166 // the same as a single LF.
1167 if (i + 1 < e && Text[i + 1] == '\n')
1168 break;
1169 [[fallthrough]];
1170 case '\n':
1171 ++FormatTok->NewlinesBefore;
1172 if (!InEscape)
1173 FormatTok->HasUnescapedNewline = true;
1174 else
1175 InEscape = false;
1176 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1177 Column = 0;
1178 break;
1179 case '\f':
1180 case '\v':
1181 Column = 0;
1182 break;
1183 case ' ':
1184 ++Column;
1185 break;
1186 case '\t':
1187 Column +=
1188 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1189 break;
1190 case '\\':
1191 case '?':
1192 case '/':
1193 // The text was entirely whitespace when this loop was entered. Thus
1194 // this has to be an escape sequence.
1195 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1196 Text.substr(i, 4) == "\?\?/\r" ||
1197 Text.substr(i, 4) == "\?\?/\n" ||
1198 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1199 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1200 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1201 Text.substr(i - 2, 4) == "\?\?/\n")));
1202 InEscape = true;
1203 break;
1204 default:
1205 // This shouldn't happen.
1206 assert(false);
1207 break;
1208 }
1209 }
1210 WhitespaceLength += Text.size();
1211 readRawToken(*FormatTok);
1212 }
1213
1214 if (FormatTok->is(tok::unknown))
1215 FormatTok->setType(TT_ImplicitStringLiteral);
1216
1217 // JavaScript and Java do not allow escaping the end of a line with a
1218 // backslash. Backslashes are syntax errors in plain source, but can occur in
1219 // comments. When a single line comment ends with a \, it'll cause the next
1220 // line of code to be lexed as a comment, breaking formatting. The code below
1221 // finds comments that contain a backslash followed by a line break, truncates
1222 // the comment token at the backslash, and resets the lexer to restart behind
1223 // the backslash.
1224 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1225 FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
1226 size_t BackslashPos = FormatTok->TokenText.find('\\');
1227 while (BackslashPos != StringRef::npos) {
1228 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1229 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1230 truncateToken(BackslashPos + 1);
1231 break;
1232 }
1233 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1234 }
1235 }
1236
1237 if (Style.isVerilog()) {
1238 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1239 SmallVector<StringRef, 1> Matches;
1240 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1241 // And it uses the hash for delays and parameter lists. In order to continue
1242 // using `tok::hash` in other places, the backtick gets marked as the hash
1243 // here. And in order to tell the backtick and hash apart for
1244 // Verilog-specific stuff, the hash becomes an identifier.
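// For example, the backtick in `define WIDTH 8 is treated like a
// preprocessor hash, while the hash in a delay like #10 or a parameter
// list like #(.WIDTH(8)) becomes an identifier.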
1245 if (FormatTok->is(tok::numeric_constant)) {
1246 // In Verilog the quote is not part of a number.
1247 auto Quote = FormatTok->TokenText.find('\'');
1248 if (Quote != StringRef::npos)
1249 truncateToken(Quote);
1250 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1251 FormatTok->Tok.setKind(tok::raw_identifier);
1252 } else if (FormatTok->is(tok::raw_identifier)) {
1253 if (FormatTok->TokenText == "`") {
1254 FormatTok->Tok.setIdentifierInfo(nullptr);
1255 FormatTok->Tok.setKind(tok::hash);
1256 } else if (FormatTok->TokenText == "``") {
1257 FormatTok->Tok.setIdentifierInfo(nullptr);
1258 FormatTok->Tok.setKind(tok::hashhash);
1259 } else if (Tokens.size() > 0 &&
1260 Tokens.back()->is(Keywords.kw_apostrophe) &&
1261 NumberBase.match(FormatTok->TokenText, &Matches)) {
1262 // In Verilog in a based number literal like `'b10`, there may be
1263 // whitespace between `'b` and `10`. Therefore we handle the base and
1264 // the rest of the number literal as two tokens. But if there is no
1265 // space in the input code, we need to manually separate the two parts.
1266 truncateToken(Matches[0].size());
1267 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1268 }
1269 }
1270 }
1271
1272 FormatTok->WhitespaceRange = SourceRange(
1273 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1274
1275 FormatTok->OriginalColumn = Column;
1276
1277 TrailingWhitespace = 0;
1278 if (FormatTok->is(tok::comment)) {
1279 // FIXME: Add the trimmed whitespace to Column.
1280 StringRef UntrimmedText = FormatTok->TokenText;
1281 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1282 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1283 } else if (FormatTok->is(tok::raw_identifier)) {
1284 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1285 FormatTok->Tok.setIdentifierInfo(&Info);
1286 FormatTok->Tok.setKind(Info.getTokenID());
1287 if (Style.Language == FormatStyle::LK_Java &&
1288 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1289 tok::kw_operator)) {
1290 FormatTok->Tok.setKind(tok::identifier);
1291 FormatTok->Tok.setIdentifierInfo(nullptr);
1292 } else if (Style.isJavaScript() &&
1293 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1294 tok::kw_operator)) {
1295 FormatTok->Tok.setKind(tok::identifier);
1296 FormatTok->Tok.setIdentifierInfo(nullptr);
1297 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1298 FormatTok->Tok.setKind(tok::identifier);
1299 FormatTok->Tok.setIdentifierInfo(nullptr);
1300 }
1301 } else if (FormatTok->is(tok::greatergreater)) {
1302 FormatTok->Tok.setKind(tok::greater);
1303 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1304 ++Column;
1305 StateStack.push(LexerState::TOKEN_STASHED);
1306 } else if (FormatTok->is(tok::lessless)) {
1307 FormatTok->Tok.setKind(tok::less);
1308 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1309 ++Column;
1310 StateStack.push(LexerState::TOKEN_STASHED);
1311 }
1312
1313 if (Style.isVerilog() && Tokens.size() > 0 &&
1314 Tokens.back()->is(TT_VerilogNumberBase) &&
1315 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1316 // Mark the number following a base like `'h?a0` as a number.
1317 FormatTok->Tok.setKind(tok::numeric_constant);
1318 }
1319
1320 // Now FormatTok is the next non-whitespace token.
1321
1322 StringRef Text = FormatTok->TokenText;
1323 size_t FirstNewlinePos = Text.find('\n');
1324 if (FirstNewlinePos == StringRef::npos) {
1325 // FIXME: ColumnWidth actually depends on the start column, we need to
1326 // take this into account when the token is moved.
1327 FormatTok->ColumnWidth =
1328 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1329 Column += FormatTok->ColumnWidth;
1330 } else {
1331 FormatTok->IsMultiline = true;
1332 // FIXME: ColumnWidth actually depends on the start column, we need to
1333 // take this into account when the token is moved.
1334 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1335 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1336
1337 // The last line of the token always starts in column 0.
1338 // Thus, the length can be precomputed even in the presence of tabs.
1339 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1340 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1341 Column = FormatTok->LastLineColumnWidth;
1342 }
1343
1344 if (Style.isCpp()) {
1345 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1346 auto it = Macros.find(Identifier);
1347 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1348 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1349 tok::pp_define) &&
1350 it != Macros.end()) {
1351 FormatTok->setType(it->second);
1352 if (it->second == TT_IfMacro) {
1353 // The lexer token currently has type tok::kw_unknown. However, for this
1354 // substitution to be treated correctly in the TokenAnnotator, faking
1355 // the tok value seems to be needed. Not sure if there's a more elegant
1356 // way.
1357 FormatTok->Tok.setKind(tok::kw_if);
1358 }
1359 } else if (FormatTok->is(tok::identifier)) {
1360 if (MacroBlockBeginRegex.match(Text))
1361 FormatTok->setType(TT_MacroBlockBegin);
1362 else if (MacroBlockEndRegex.match(Text))
1363 FormatTok->setType(TT_MacroBlockEnd);
1364 else if (TypeNames.contains(Identifier))
1365 FormatTok->setFinalizedType(TT_TypeName);
1366 }
1367 }
1368
1369 return FormatTok;
1370}
1371
1372bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1373 // In Verilog the quote is not a character literal.
1374 //
1375 // Make the backtick and double backtick identifiers to match against them
1376 // more easily.
1377 //
1378 // In Verilog an escaped identifier starts with backslash and ends with
1379 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1380 // also begin an escaped newline outside of an escaped identifier. We check
1381 // for that outside of the Regex since we can't use negative lookahead
1382 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1383 // identifier may have a length of 0 according to Section A.9.3.
1384 // FIXME: If there is an escaped newline in the middle of an escaped
1385 // identifier, allow for pasting the two lines together. But escaped
1386 // identifiers usually occur only in generated code anyway.
1387 static const llvm::Regex VerilogToken(R"re(^('|``?|\\‍(\\)re"
1388 "(\r?\n|\r)|[^[:space:]])*)");
1389
1390 SmallVector<StringRef, 4> Matches;
1391 const char *Start = Lex->getBufferLocation();
1392 if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1393 &Matches)) {
1394 return false;
1395 }
1396 // There is a null byte at the end of the buffer, so we don't have to check
1397 // Start[1] is within the buffer.
1398 if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1399 return false;
1400 size_t Len = Matches[0].size();
1401
1402 // The kind has to be an identifier so we can match it against those defined
1403 // in Keywords. The kind has to be set before the length because the setLength
1404 // function checks that the kind is not an annotation.
1405 Tok.setKind(tok::raw_identifier);
1406 Tok.setLength(Len);
1407 Tok.setLocation(Lex->getSourceLocation(Start, Len));
1408 Tok.setRawIdentifierData(Start);
1409 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1410 return true;
1411}
1412
1413void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1414 // For Verilog, first see if there is a special token, and fall back to the
1415 // normal lexer if there isn't one.
1416 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1417 Lex->LexFromRawLexer(Tok.Tok);
1418 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1419 Tok.Tok.getLength());
1420 // For formatting, treat unterminated string literals like normal string
1421 // literals.
1422 if (Tok.is(tok::unknown)) {
1423 if (Tok.TokenText.starts_with("\"")) {
1424 Tok.Tok.setKind(tok::string_literal);
1425 Tok.IsUnterminatedLiteral = true;
1426 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1427 Tok.Tok.setKind(tok::string_literal);
1428 }
1429 }
1430
1431 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1432 Tok.Tok.setKind(tok::string_literal);
1433
1434 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1435 FormattingDisabled = false;
1436
1437 Tok.Finalized = FormattingDisabled;
1438
1439 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1440 FormattingDisabled = true;
1441}
1442
1443void FormatTokenLexer::resetLexer(unsigned Offset) {
1444 StringRef Buffer = SourceMgr.getBufferData(ID);
1445 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1446 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1447 Lex->SetKeepWhitespaceMode(true);
1448 TrailingWhitespace = 0;
1449}
1450
1451} // namespace format
1452} // namespace clang