clang 19.0.0git
FormatTokenLexer.cpp
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
17
18namespace clang {
19namespace format {
20
21FormatTokenLexer::FormatTokenLexer(
22    const SourceManager &SourceMgr, FileID ID, unsigned Column,
23 const FormatStyle &Style, encoding::Encoding Encoding,
24 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
25 IdentifierTable &IdentTable)
26 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
27 Column(Column), TrailingWhitespace(0),
28 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
29 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
30 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
31 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
32 MacroBlockEndRegex(Style.MacroBlockEnd) {
33 assert(IsCpp == Style.isCpp());
34 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
35 Lex->SetKeepWhitespaceMode(true);
36
37 for (const std::string &ForEachMacro : Style.ForEachMacros) {
38 auto Identifier = &IdentTable.get(ForEachMacro);
39 Macros.insert({Identifier, TT_ForEachMacro});
40 }
41 for (const std::string &IfMacro : Style.IfMacros) {
42 auto Identifier = &IdentTable.get(IfMacro);
43 Macros.insert({Identifier, TT_IfMacro});
44 }
45 for (const std::string &AttributeMacro : Style.AttributeMacros) {
46 auto Identifier = &IdentTable.get(AttributeMacro);
47 Macros.insert({Identifier, TT_AttributeMacro});
48 }
49 for (const std::string &StatementMacro : Style.StatementMacros) {
50 auto Identifier = &IdentTable.get(StatementMacro);
51 Macros.insert({Identifier, TT_StatementMacro});
52 }
53 for (const std::string &TypenameMacro : Style.TypenameMacros) {
54 auto Identifier = &IdentTable.get(TypenameMacro);
55 Macros.insert({Identifier, TT_TypenameMacro});
56 }
57 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
58 auto Identifier = &IdentTable.get(NamespaceMacro);
59 Macros.insert({Identifier, TT_NamespaceMacro});
60 }
61  for (const std::string &WhitespaceSensitiveMacro :
62       Style.WhitespaceSensitiveMacros) {
63    auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
64 Macros.insert({Identifier, TT_UntouchableMacroFunc});
65 }
66  for (const std::string &StatementAttributeLikeMacro :
67       Style.StatementAttributeLikeMacros) {
68    auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
69 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
70 }
71
72 for (const auto &TypeName : Style.TypeNames)
73 TypeNames.insert(&IdentTable.get(TypeName));
74}
75
76ArrayRef<FormatToken *> FormatTokenLexer::lex() {
77  assert(Tokens.empty());
78 assert(FirstInLineIndex == 0);
79 do {
80 Tokens.push_back(getNextToken());
81 if (Style.isJavaScript()) {
82 tryParseJSRegexLiteral();
83 handleTemplateStrings();
84    }
85    if (Style.Language == FormatStyle::LK_TextProto)
86      tryParsePythonComment();
87 tryMergePreviousTokens();
88 if (Style.isCSharp()) {
89 // This needs to come after tokens have been merged so that C#
90 // string literals are correctly identified.
91 handleCSharpVerbatimAndInterpolatedStrings();
92 }
93 if (Style.isTableGen()) {
94 handleTableGenMultilineString();
95 handleTableGenNumericLikeIdentifier();
96 }
97 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
98 FirstInLineIndex = Tokens.size() - 1;
99 } while (Tokens.back()->isNot(tok::eof));
100 return Tokens;
101}
102
103void FormatTokenLexer::tryMergePreviousTokens() {
104 if (tryMerge_TMacro())
105 return;
106 if (tryMergeConflictMarkers())
107 return;
108 if (tryMergeLessLess())
109 return;
110 if (tryMergeGreaterGreater())
111 return;
112 if (tryMergeForEach())
113 return;
114 if (IsCpp && tryTransformTryUsageForC())
115 return;
116
117 if (Style.isJavaScript() || Style.isCSharp()) {
118 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
119 tok::question};
120 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
121 tok::period};
122 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
123
124 if (tryMergeTokens(FatArrow, TT_FatArrow))
125 return;
126 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
127 // Treat like the "||" operator (as opposed to the ternary ?).
128 Tokens.back()->Tok.setKind(tok::pipepipe);
129 return;
130 }
131 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
132 // Treat like a regular "." access.
133 Tokens.back()->Tok.setKind(tok::period);
134 return;
135 }
136 if (tryMergeNullishCoalescingEqual())
137 return;
138 }
139
140 if (Style.isCSharp()) {
141 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
142 tok::question, tok::l_square};
143
144 if (tryMergeCSharpKeywordVariables())
145 return;
146 if (tryMergeCSharpStringLiteral())
147 return;
148 if (tryTransformCSharpForEach())
149 return;
150 if (tryMergeTokens(CSharpNullConditionalLSquare,
151 TT_CSharpNullConditionalLSquare)) {
152 // Treat like a regular "[" operator.
153 Tokens.back()->Tok.setKind(tok::l_square);
154 return;
155 }
156 }
157
158 if (tryMergeNSStringLiteral())
159 return;
160
161 if (Style.isJavaScript()) {
162 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
163 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
164 tok::equal};
165 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
166 tok::greaterequal};
167 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
168 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
169 tok::starequal};
170 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
171 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
172
173 // FIXME: Investigate what token type gives the correct operator priority.
174 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
175 return;
176 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
177 return;
178 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
179 return;
180 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
181 return;
182 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
183 Tokens.back()->Tok.setKind(tok::starequal);
184 return;
185 }
186 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
187 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
188 // Treat like the "=" assignment operator.
189 Tokens.back()->Tok.setKind(tok::equal);
190 return;
191 }
192 if (tryMergeJSPrivateIdentifier())
193 return;
194 }
195
196 if (Style.Language == FormatStyle::LK_Java) {
197 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
198 tok::greater, tok::greater, tok::greaterequal};
199 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
200 return;
201 }
202
203 if (Style.isVerilog()) {
204 // Merge the number following a base like `'h?a0`.
205 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
206 Tokens.end()[-2]->is(tok::numeric_constant) &&
207 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
208 tok::question) &&
209 tryMergeTokens(2, TT_Unknown)) {
210 return;
211 }
212 // Part select.
213 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
214 TT_BitFieldColon)) {
215 return;
216 }
217 // Xnor. The combined token is treated as a caret which can also be either a
218 // unary or binary operator. The actual type is determined in
219 // TokenAnnotator. We also check the token length so we know it is not
220 // already a merged token.
221 if (Tokens.back()->TokenText.size() == 1 &&
222 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
223 TT_BinaryOperator)) {
224 Tokens.back()->Tok.setKind(tok::caret);
225 return;
226 }
227 // Signed shift and distribution weight.
228 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
229 Tokens.back()->Tok.setKind(tok::lessless);
230 return;
231 }
232 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
233 Tokens.back()->Tok.setKind(tok::greatergreater);
234 return;
235 }
236 if (tryMergeTokensAny({{tok::lessless, tok::equal},
237 {tok::lessless, tok::lessequal},
238 {tok::greatergreater, tok::equal},
239 {tok::greatergreater, tok::greaterequal},
240 {tok::colon, tok::equal},
241 {tok::colon, tok::slash}},
242 TT_BinaryOperator)) {
243 Tokens.back()->ForcedPrecedence = prec::Assignment;
244 return;
245 }
246 // Exponentiation, signed shift, case equality, and wildcard equality.
247 if (tryMergeTokensAny({{tok::star, tok::star},
248 {tok::lessless, tok::less},
249 {tok::greatergreater, tok::greater},
250 {tok::exclaimequal, tok::equal},
251 {tok::exclaimequal, tok::question},
252 {tok::equalequal, tok::equal},
253 {tok::equalequal, tok::question}},
254 TT_BinaryOperator)) {
255 return;
256 }
257 // Module paths in specify blocks and the implication and boolean equality
258 // operators.
259 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
260 {tok::plus, tok::star, tok::greater},
261 {tok::minusequal, tok::greater},
262 {tok::minus, tok::star, tok::greater},
263 {tok::less, tok::arrow},
264 {tok::equal, tok::greater},
265 {tok::star, tok::greater},
266 {tok::pipeequal, tok::greater},
267 {tok::pipe, tok::arrow},
268 {tok::hash, tok::minus, tok::hash},
269 {tok::hash, tok::equal, tok::hash}},
270 TT_BinaryOperator) ||
271 Tokens.back()->is(tok::arrow)) {
272 Tokens.back()->ForcedPrecedence = prec::Comma;
273 return;
274 }
275 }
276 if (Style.isTableGen()) {
277    // A TableGen multi-line string starts with "[{".
278 if (tryMergeTokens({tok::l_square, tok::l_brace},
279 TT_TableGenMultiLineString)) {
280      // Set the type again, now finalized; it must never get another type.
281 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
282 Tokens.back()->Tok.setKind(tok::string_literal);
283 return;
284 }
285    // TableGen's bang operator has the form !<name>.
286 // !cond is a special case with specific syntax.
287 if (tryMergeTokens({tok::exclaim, tok::identifier},
288 TT_TableGenBangOperator)) {
289 Tokens.back()->Tok.setKind(tok::identifier);
290 Tokens.back()->Tok.setIdentifierInfo(nullptr);
291 if (Tokens.back()->TokenText == "!cond")
292 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
293 else
294 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
295 return;
296 }
297 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
298      // Here, "! if" becomes "!if". That is, "!" captures the "if" even when
299      // a space separates them; it is the only possible parse in TableGen.
300 Tokens.back()->Tok.setKind(tok::identifier);
301 Tokens.back()->Tok.setIdentifierInfo(nullptr);
302 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
303 return;
304 }
305    // A '+' or '-' followed by a number is a literal, not a unary operator.
306 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
307 Tokens.back()->Tok.setKind(tok::numeric_constant);
308 return;
309 }
310 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
311 Tokens.back()->Tok.setKind(tok::numeric_constant);
312 return;
313 }
314 }
315}
316
317bool FormatTokenLexer::tryMergeNSStringLiteral() {
318 if (Tokens.size() < 2)
319 return false;
320 auto &At = *(Tokens.end() - 2);
321 auto &String = *(Tokens.end() - 1);
322 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
323 return false;
324 At->Tok.setKind(tok::string_literal);
325 At->TokenText = StringRef(At->TokenText.begin(),
326 String->TokenText.end() - At->TokenText.begin());
327 At->ColumnWidth += String->ColumnWidth;
328 At->setType(TT_ObjCStringLiteral);
329 Tokens.erase(Tokens.end() - 1);
330 return true;
331}
332
333bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
334  // Merges the tokens "#" and "identifier" into a single identifier token
335  // with the text "#identifier" and the kind tok::identifier.
336 if (Tokens.size() < 2)
337 return false;
338 auto &Hash = *(Tokens.end() - 2);
339 auto &Identifier = *(Tokens.end() - 1);
340 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
341 return false;
342 Hash->Tok.setKind(tok::identifier);
343 Hash->TokenText =
344 StringRef(Hash->TokenText.begin(),
345 Identifier->TokenText.end() - Hash->TokenText.begin());
346 Hash->ColumnWidth += Identifier->ColumnWidth;
347 Hash->setType(TT_JsPrivateIdentifier);
348 Tokens.erase(Tokens.end() - 1);
349 return true;
350}
351
352// Search for verbatim or interpolated string literals @"ABC" or
353// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
354// prevent splitting of @, $ and ".
355// Merging of multiline verbatim strings with embedded '"' is handled in
356// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
357bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
358 if (Tokens.size() < 2)
359 return false;
360
361 // Look for @"aaaaaa" or $"aaaaaa".
362 const auto String = *(Tokens.end() - 1);
363 if (String->isNot(tok::string_literal))
364 return false;
365
366 auto Prefix = *(Tokens.end() - 2);
367 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
368 return false;
369
370 if (Tokens.size() > 2) {
371 const auto Tok = *(Tokens.end() - 3);
372 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
373 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
374 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
375 Tok->ColumnWidth += Prefix->ColumnWidth;
376 Tokens.erase(Tokens.end() - 2);
377 Prefix = Tok;
378 }
379 }
380
381 // Convert back into just a string_literal.
382 Prefix->Tok.setKind(tok::string_literal);
383 Prefix->TokenText =
384 StringRef(Prefix->TokenText.begin(),
385 String->TokenText.end() - Prefix->TokenText.begin());
386 Prefix->ColumnWidth += String->ColumnWidth;
387 Prefix->setType(TT_CSharpStringLiteral);
388 Tokens.erase(Tokens.end() - 1);
389 return true;
390}
391
392// Valid C# attribute targets:
393// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
394const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
395 "assembly", "module", "field", "event", "method",
396 "param", "property", "return", "type",
397};
398
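// Example: for the C#/JavaScript input `x ??= y`, the `?` `?` pair has already
// been merged into a single TT_NullCoalescingOperator token above, so the
// stream now ends with ["??", "="]. The function below folds the two tokens
// into one "??=" of type TT_NullCoalescingEqual, re-kinded as tok::equal so
// later stages treat it like a plain assignment.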
399bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
400 if (Tokens.size() < 2)
401 return false;
402 auto &NullishCoalescing = *(Tokens.end() - 2);
403 auto &Equal = *(Tokens.end() - 1);
404 if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
405 Equal->isNot(tok::equal)) {
406 return false;
407 }
408 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
409 NullishCoalescing->TokenText =
410 StringRef(NullishCoalescing->TokenText.begin(),
411 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
412 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
413 NullishCoalescing->setType(TT_NullCoalescingEqual);
414 Tokens.erase(Tokens.end() - 1);
415 return true;
416}
417
418bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
419 if (Tokens.size() < 2)
420 return false;
421 const auto At = *(Tokens.end() - 2);
422 if (At->isNot(tok::at))
423 return false;
424 const auto Keyword = *(Tokens.end() - 1);
425 if (Keyword->TokenText == "$")
426 return false;
427 if (!Keywords.isCSharpKeyword(*Keyword))
428 return false;
429
430 At->Tok.setKind(tok::identifier);
431 At->TokenText = StringRef(At->TokenText.begin(),
432 Keyword->TokenText.end() - At->TokenText.begin());
433 At->ColumnWidth += Keyword->ColumnWidth;
434 At->setType(Keyword->getType());
435 Tokens.erase(Tokens.end() - 1);
436 return true;
437}
438
439// In C#, transform the identifier "foreach" so it is treated like "for".
440bool FormatTokenLexer::tryTransformCSharpForEach() {
441 if (Tokens.size() < 1)
442 return false;
443 auto &Identifier = *(Tokens.end() - 1);
444 if (Identifier->isNot(tok::identifier))
445 return false;
446 if (Identifier->TokenText != "foreach")
447 return false;
448
449 Identifier->setType(TT_ForEachMacro);
450 Identifier->Tok.setKind(tok::kw_for);
451 return true;
452}
453
454bool FormatTokenLexer::tryMergeForEach() {
455 if (Tokens.size() < 2)
456 return false;
457 auto &For = *(Tokens.end() - 2);
458 auto &Each = *(Tokens.end() - 1);
459 if (For->isNot(tok::kw_for))
460 return false;
461 if (Each->isNot(tok::identifier))
462 return false;
463 if (Each->TokenText != "each")
464 return false;
465
466 For->setType(TT_ForEachMacro);
467 For->Tok.setKind(tok::kw_for);
468
469 For->TokenText = StringRef(For->TokenText.begin(),
470 Each->TokenText.end() - For->TokenText.begin());
471 For->ColumnWidth += Each->ColumnWidth;
472 Tokens.erase(Tokens.end() - 1);
473 return true;
474}
475
476bool FormatTokenLexer::tryTransformTryUsageForC() {
477 if (Tokens.size() < 2)
478 return false;
479 auto &Try = *(Tokens.end() - 2);
480 if (Try->isNot(tok::kw_try))
481 return false;
482 auto &Next = *(Tokens.end() - 1);
483 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
484 return false;
485
486 if (Tokens.size() > 2) {
487 auto &At = *(Tokens.end() - 3);
488 if (At->is(tok::at))
489 return false;
490 }
491
492 Try->Tok.setKind(tok::identifier);
493 return true;
494}
495
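// When the raw lexer produces "<<" or ">>", getNextToken() (below) splits it
// into two '<' or '>' tokens and stashes the second one via
// LexerState::TOKEN_STASHED. The next two helpers undo that split where the
// pair really is a shift operator: e.g. in `cout << x` the two '<' tokens are
// re-merged into "<<", while in `A<B<int>> a;` the trailing '>' '>' stay
// separate so they can close two template argument lists.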
496bool FormatTokenLexer::tryMergeLessLess() {
497 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
498 if (Tokens.size() < 3)
499 return false;
500
501 auto First = Tokens.end() - 3;
502 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
503 return false;
504
505 // Only merge if there currently is no whitespace between the two "<".
506 if (First[1]->hasWhitespaceBefore())
507 return false;
508
509 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
510 if (X && X->is(tok::less))
511 return false;
512
513 auto Y = First[2];
514 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
515 return false;
516
517 First[0]->Tok.setKind(tok::lessless);
518 First[0]->TokenText = "<<";
519 First[0]->ColumnWidth += 1;
520 Tokens.erase(Tokens.end() - 2);
521 return true;
522}
523
524bool FormatTokenLexer::tryMergeGreaterGreater() {
525 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
526 if (Tokens.size() < 2)
527 return false;
528
529 auto First = Tokens.end() - 2;
530 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
531 return false;
532
533 // Only merge if there currently is no whitespace between the first two ">".
534 if (First[1]->hasWhitespaceBefore())
535 return false;
536
537 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
538 if (Tok && Tok->isNot(tok::kw_operator))
539 return false;
540
541 First[0]->Tok.setKind(tok::greatergreater);
542 First[0]->TokenText = ">>";
543 First[0]->ColumnWidth += 1;
544 Tokens.erase(Tokens.end() - 1);
545 return true;
546}
547
548bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
549 TokenType NewType) {
550 if (Tokens.size() < Kinds.size())
551 return false;
552
553 SmallVectorImpl<FormatToken *>::const_iterator First =
554 Tokens.end() - Kinds.size();
555 for (unsigned i = 0; i < Kinds.size(); ++i)
556 if (First[i]->isNot(Kinds[i]))
557 return false;
558
559 return tryMergeTokens(Kinds.size(), NewType);
560}
561
562bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
563 if (Tokens.size() < Count)
564 return false;
565
566 SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
567 unsigned AddLength = 0;
568 for (size_t i = 1; i < Count; ++i) {
569 // If there is whitespace separating the token and the previous one,
570 // they should not be merged.
571 if (First[i]->hasWhitespaceBefore())
572 return false;
573 AddLength += First[i]->TokenText.size();
574 }
575
576 Tokens.resize(Tokens.size() - Count + 1);
577 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
578 First[0]->TokenText.size() + AddLength);
579 First[0]->ColumnWidth += AddLength;
580 First[0]->setType(NewType);
581 return true;
582}
583
584bool FormatTokenLexer::tryMergeTokensAny(
585 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
586 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
587 return tryMergeTokens(Kinds, NewType);
588 });
589}
590
591// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
592bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
593 // NB: This is not entirely correct, as an r_paren can introduce an operand
594 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
595 // corner case to not matter in practice, though.
596 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
597 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
598 tok::colon, tok::question, tok::tilde) ||
599 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
600 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
601 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
602 Tok->isBinaryOperator();
603}
604
605bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
606 if (!Prev)
607 return true;
608
609 // Regex literals can only follow after prefix unary operators, not after
610 // postfix unary operators. If the '++' is followed by a non-operand
611 // introducing token, the slash here is the operand and not the start of a
612 // regex.
613  // `!` is a unary prefix operator, but also a postfix operator that casts
614  // away nullability, so the same check applies.
615 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
616 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
617
618 // The previous token must introduce an operand location where regex
619 // literals can occur.
620 if (!precedesOperand(Prev))
621 return false;
622
623 return true;
624}
625
626// Tries to parse a JavaScript Regex literal starting at the current token,
627// if that begins with a slash and is in a location where JavaScript allows
628// regex literals. Changes the current token to a regex literal and updates
629// its text if successful.
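// For example, in `return /ab+c/.test(s);` the previous significant token is
// `return`, which precedesOperand() accepts, so the '/' starts a regex: the
// buffer is scanned to the unescaped closing '/', and the whole `/ab+c/`
// becomes one TT_RegexLiteral token kinded as tok::string_literal. In `a / b`
// the previous token is an identifier, so the '/' stays a division operator.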
630void FormatTokenLexer::tryParseJSRegexLiteral() {
631 FormatToken *RegexToken = Tokens.back();
632 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
633 return;
634
635 FormatToken *Prev = nullptr;
636 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
637 // NB: Because previous pointers are not initialized yet, this cannot use
638 // Token.getPreviousNonComment.
639 if (FT->isNot(tok::comment)) {
640 Prev = FT;
641 break;
642 }
643 }
644
645 if (!canPrecedeRegexLiteral(Prev))
646 return;
647
648 // 'Manually' lex ahead in the current file buffer.
649 const char *Offset = Lex->getBufferLocation();
650 const char *RegexBegin = Offset - RegexToken->TokenText.size();
651 StringRef Buffer = Lex->getBuffer();
652 bool InCharacterClass = false;
653 bool HaveClosingSlash = false;
654 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
655 // Regular expressions are terminated with a '/', which can only be
656 // escaped using '\' or a character class between '[' and ']'.
657 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
658 switch (*Offset) {
659 case '\\':
660 // Skip the escaped character.
661 ++Offset;
662 break;
663 case '[':
664 InCharacterClass = true;
665 break;
666 case ']':
667 InCharacterClass = false;
668 break;
669 case '/':
670 if (!InCharacterClass)
671 HaveClosingSlash = true;
672 break;
673 }
674 }
675
676 RegexToken->setType(TT_RegexLiteral);
677 // Treat regex literals like other string_literals.
678 RegexToken->Tok.setKind(tok::string_literal);
679 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
680 RegexToken->ColumnWidth = RegexToken->TokenText.size();
681
682 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
683}
684
685static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
686 bool Interpolated) {
687 auto Repeated = [&Begin, End]() {
688 return Begin + 1 < End && Begin[1] == Begin[0];
689 };
690
691 // Look for a terminating '"' in the current file buffer.
692 // Make no effort to format code within an interpolated or verbatim string.
693 //
694 // Interpolated strings could contain { } with " characters inside.
695 // $"{x ?? "null"}"
696 // should not be split into $"{x ?? ", null, "}" but should be treated as a
697 // single string-literal.
698 //
699 // We opt not to try and format expressions inside {} within a C#
700 // interpolated string. Formatting expressions within an interpolated string
701 // would require similar work as that done for JavaScript template strings
702 // in `handleTemplateStrings()`.
703 for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
704 switch (*Begin) {
705 case '\\':
706 if (!Verbatim)
707 ++Begin;
708 break;
709 case '{':
710 if (Interpolated) {
711 // {{ inside an interpolated string is escaped, so skip it.
712 if (Repeated())
713 ++Begin;
714 else
715 ++UnmatchedOpeningBraceCount;
716 }
717 break;
718 case '}':
719 if (Interpolated) {
720 // }} inside an interpolated string is escaped, so skip it.
721 if (Repeated())
722 ++Begin;
723 else if (UnmatchedOpeningBraceCount > 0)
724 --UnmatchedOpeningBraceCount;
725 else
726 return End;
727 }
728 break;
729 case '"':
730 if (UnmatchedOpeningBraceCount > 0)
731 break;
732 // "" within a verbatim string is an escaped double quote: skip it.
733 if (Verbatim && Repeated()) {
734 ++Begin;
735 break;
736 }
737 return Begin;
738 }
739 }
740
741 return End;
742}
743
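// Example: for the C# literal `$"{x ?? "null"} items"`, the raw lexer ends the
// first string literal at the quote before `null`, so the merged token
// initially covers only `$"{x ?? "`. The handler below rescans the buffer with
// lexCSharpString(): '{' raises UnmatchedOpeningBraceCount, the quotes around
// "null" are skipped while the count is positive, '}' lowers it again, and
// only the final '"' terminates the literal, which is then stored as a single
// TT_CSharpStringLiteral token and the lexer is reset to just past it.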
744void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
745 FormatToken *CSharpStringLiteral = Tokens.back();
746
747 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
748 return;
749
750 auto &TokenText = CSharpStringLiteral->TokenText;
751
752 bool Verbatim = false;
753 bool Interpolated = false;
754 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
755 Verbatim = true;
756 Interpolated = true;
757 } else if (TokenText.starts_with(R"(@")")) {
758 Verbatim = true;
759 } else if (TokenText.starts_with(R"($")")) {
760 Interpolated = true;
761 }
762
763 // Deal with multiline strings.
764 if (!Verbatim && !Interpolated)
765 return;
766
767 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
768 const char *Offset = StrBegin;
769 if (Verbatim && Interpolated)
770 Offset += 3;
771 else
772 Offset += 2;
773
774 const auto End = Lex->getBuffer().end();
775 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
776
777 // Make no attempt to format code properly if a verbatim string is
778 // unterminated.
779 if (Offset >= End)
780 return;
781
782 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
783 TokenText = LiteralText;
784
785 // Adjust width for potentially multiline string literals.
786 size_t FirstBreak = LiteralText.find('\n');
787 StringRef FirstLineText = FirstBreak == StringRef::npos
788 ? LiteralText
789 : LiteralText.substr(0, FirstBreak);
790 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
791 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
792 Encoding);
793 size_t LastBreak = LiteralText.rfind('\n');
794 if (LastBreak != StringRef::npos) {
795 CSharpStringLiteral->IsMultiline = true;
796 unsigned StartColumn = 0;
797 CSharpStringLiteral->LastLineColumnWidth =
798 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
799 StartColumn, Style.TabWidth, Encoding);
800 }
801
802 assert(Offset < End);
803 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
804}
805
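// Example: a TableGen code block such as `[{ return X; }]` reaches this point
// as a token merged from '[' and '{' above. The handler below searches the
// buffer for the terminating "}]", stores the whole `[{ return X; }]` text in
// the token, advances the lexer past it, and fills in ColumnWidth (first line)
// and LastLineColumnWidth when the block spans several lines.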
806void FormatTokenLexer::handleTableGenMultilineString() {
807 FormatToken *MultiLineString = Tokens.back();
808 if (MultiLineString->isNot(TT_TableGenMultiLineString))
809 return;
810
811 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
812 // "}]" is the end of multi line string.
813 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
814 if (CloseOffset == StringRef::npos)
815 return;
816 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
817 MultiLineString->TokenText = Text;
818 resetLexer(SourceMgr.getFileOffset(
819 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
820 auto FirstLineText = Text;
821 auto FirstBreak = Text.find('\n');
822 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
823 if (FirstBreak != StringRef::npos) {
824 MultiLineString->IsMultiline = true;
825 FirstLineText = Text.substr(0, FirstBreak + 1);
826 // LastLineColumnWidth holds the width of the last line.
827 auto LastBreak = Text.rfind('\n');
828 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
829 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
830 Style.TabWidth, Encoding);
831 }
832 // ColumnWidth holds only the width of the first line.
833 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
834 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
835}
836
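// Examples: "0b101" and "0x1F" are kept as numeric constants (binary and hex
// forms), while tokens like "0foo" or "123_bar", which the raw lexer also
// emits as numeric constants, are re-kinded to tok::identifier because
// TableGen treats them as identifiers that merely start with digits.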
837void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
838 FormatToken *Tok = Tokens.back();
839 // TableGen identifiers can begin with digits. Such tokens are lexed as
840 // numeric_constant now.
841 if (Tok->isNot(tok::numeric_constant))
842 return;
843 StringRef Text = Tok->TokenText;
844 // The following check is based on llvm::TGLexer::LexToken.
845 // That lexes the token as a number if any of the following holds:
846 // 1. It starts with '+', '-'.
847 // 2. All the characters are digits.
848 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
849 // 4. The first non-digit character is 'x', and the next is a hex digit.
850  // Note that in cases 3 and 4, if the next character does not exist in
851  // this token, the token is an identifier.
852 if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
853 return;
854 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
855 // All the characters are digits
856 if (NonDigitPos == StringRef::npos)
857 return;
858 char FirstNonDigit = Text[NonDigitPos];
859 if (NonDigitPos < Text.size() - 1) {
860 char TheNext = Text[NonDigitPos + 1];
861 // Regarded as a binary number.
862 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
863 return;
864 // Regarded as hex number.
865 if (FirstNonDigit == 'x' && isxdigit(TheNext))
866 return;
867 }
868 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
869 // This is actually an identifier in TableGen.
870 Tok->Tok.setKind(tok::identifier);
871 Tok->Tok.setIdentifierInfo(nullptr);
872 }
873}
874
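// Example: for a JavaScript template literal like `a ${b} c` (backtick
// delimited), the opening backtick pushes TEMPLATE_STRING and the manual scan
// below stops at "${", pushing NORMAL so the expression `b` is lexed as
// ordinary tokens. The later '}' pops back to TEMPLATE_STRING, falls through
// in this function, and the scan resumes up to the closing backtick, which
// pops the state again. Each scanned piece becomes a TT_TemplateString token
// kinded as tok::string_literal.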
875void FormatTokenLexer::handleTemplateStrings() {
876 FormatToken *BacktickToken = Tokens.back();
877
878 if (BacktickToken->is(tok::l_brace)) {
879 StateStack.push(LexerState::NORMAL);
880 return;
881 }
882 if (BacktickToken->is(tok::r_brace)) {
883 if (StateStack.size() == 1)
884 return;
885 StateStack.pop();
886 if (StateStack.top() != LexerState::TEMPLATE_STRING)
887 return;
888    // If back in TEMPLATE_STRING, fall through and continue parsing it.
889 } else if (BacktickToken->is(tok::unknown) &&
890 BacktickToken->TokenText == "`") {
891 StateStack.push(LexerState::TEMPLATE_STRING);
892 } else {
893 return; // Not actually a template
894 }
895
896 // 'Manually' lex ahead in the current file buffer.
897 const char *Offset = Lex->getBufferLocation();
898 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
899 for (; Offset != Lex->getBuffer().end(); ++Offset) {
900 if (Offset[0] == '`') {
901 StateStack.pop();
902 ++Offset;
903 break;
904 }
905 if (Offset[0] == '\\') {
906 ++Offset; // Skip the escaped character.
907 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
908 Offset[1] == '{') {
909 // '${' introduces an expression interpolation in the template string.
910 StateStack.push(LexerState::NORMAL);
911 Offset += 2;
912 break;
913 }
914 }
915
916 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
917 BacktickToken->setType(TT_TemplateString);
918 BacktickToken->Tok.setKind(tok::string_literal);
919 BacktickToken->TokenText = LiteralText;
920
921 // Adjust width for potentially multiline string literals.
922 size_t FirstBreak = LiteralText.find('\n');
923 StringRef FirstLineText = FirstBreak == StringRef::npos
924 ? LiteralText
925 : LiteralText.substr(0, FirstBreak);
926 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
927 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
928 size_t LastBreak = LiteralText.rfind('\n');
929 if (LastBreak != StringRef::npos) {
930 BacktickToken->IsMultiline = true;
931 unsigned StartColumn = 0; // The template tail spans the entire line.
932 BacktickToken->LastLineColumnWidth =
933 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
934 StartColumn, Style.TabWidth, Encoding);
935 }
936
937 SourceLocation loc = Lex->getSourceLocation(Offset);
938 resetLexer(SourceMgr.getFileOffset(loc));
939}
940
941void FormatTokenLexer::tryParsePythonComment() {
942 FormatToken *HashToken = Tokens.back();
943 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
944 return;
945 // Turn the remainder of this line into a comment.
946 const char *CommentBegin =
947 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
948 size_t From = CommentBegin - Lex->getBuffer().begin();
949 size_t To = Lex->getBuffer().find_first_of('\n', From);
950 if (To == StringRef::npos)
951 To = Lex->getBuffer().size();
952 size_t Len = To - From;
953 HashToken->setType(TT_LineComment);
954 HashToken->Tok.setKind(tok::comment);
955 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
956 SourceLocation Loc = To < Lex->getBuffer().size()
957 ? Lex->getSourceLocation(CommentBegin + Len)
958 : SourceMgr.getLocForEndOfFile(ID);
959 resetLexer(SourceMgr.getFileOffset(Loc));
960}
961
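// Example: the _T("text") macro invocation arrives as the four tokens `_T`,
// `(`, `"text"`, and `)`. The function below collapses them into a single
// string token whose text spans the whole `_T("text")`, so it is handled like
// one string literal.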
962bool FormatTokenLexer::tryMerge_TMacro() {
963 if (Tokens.size() < 4)
964 return false;
965 FormatToken *Last = Tokens.back();
966 if (Last->isNot(tok::r_paren))
967 return false;
968
969 FormatToken *String = Tokens[Tokens.size() - 2];
970 if (String->isNot(tok::string_literal) || String->IsMultiline)
971 return false;
972
973 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
974 return false;
975
976 FormatToken *Macro = Tokens[Tokens.size() - 4];
977 if (Macro->TokenText != "_T")
978 return false;
979
980 const char *Start = Macro->TokenText.data();
981 const char *End = Last->TokenText.data() + Last->TokenText.size();
982 String->TokenText = StringRef(Start, End - Start);
983 String->IsFirst = Macro->IsFirst;
984 String->LastNewlineOffset = Macro->LastNewlineOffset;
985 String->WhitespaceRange = Macro->WhitespaceRange;
986 String->OriginalColumn = Macro->OriginalColumn;
987 String->ColumnWidth = encoding::columnWidthWithTabs(
988 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
989 String->NewlinesBefore = Macro->NewlinesBefore;
990 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
991
992 Tokens.pop_back();
993 Tokens.pop_back();
994 Tokens.pop_back();
995 Tokens.back() = String;
996 if (FirstInLineIndex >= Tokens.size())
997 FirstInLineIndex = Tokens.size() - 1;
998 return true;
999}
1000
1001bool FormatTokenLexer::tryMergeConflictMarkers() {
1002 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1003 return false;
1004
1005 // Conflict lines look like:
1006 // <marker> <text from the vcs>
1007 // For example:
1008 // >>>>>>> /file/in/file/system at revision 1234
1009 //
1010 // We merge all tokens in a line that starts with a conflict marker
1011 // into a single token with a special token type that the unwrapped line
1012 // parser will use to correctly rebuild the underlying code.
1013
1014 FileID ID;
1015 // Get the position of the first token in the line.
1016 unsigned FirstInLineOffset;
1017 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1018 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1019 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1020 // Calculate the offset of the start of the current line.
1021 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1022 if (LineOffset == StringRef::npos)
1023 LineOffset = 0;
1024 else
1025 ++LineOffset;
1026
1027 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1028 StringRef LineStart;
1029 if (FirstSpace == StringRef::npos)
1030 LineStart = Buffer.substr(LineOffset);
1031 else
1032 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1033
1034 TokenType Type = TT_Unknown;
1035 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1036 Type = TT_ConflictStart;
1037 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1038 LineStart == "====") {
1039 Type = TT_ConflictAlternative;
1040 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1041 Type = TT_ConflictEnd;
1042 }
1043
1044 if (Type != TT_Unknown) {
1045 FormatToken *Next = Tokens.back();
1046
1047 Tokens.resize(FirstInLineIndex + 1);
1048 // We do not need to build a complete token here, as we will skip it
1049 // during parsing anyway (as we must not touch whitespace around conflict
1050 // markers).
1051 Tokens.back()->setType(Type);
1052 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1053
1054 Tokens.push_back(Next);
1055 return true;
1056 }
1057
1058 return false;
1059}
1060
1061FormatToken *FormatTokenLexer::getStashedToken() {
1062 // Create a synthesized second '>' or '<' token.
1063 Token Tok = FormatTok->Tok;
1064 StringRef TokenText = FormatTok->TokenText;
1065
1066 unsigned OriginalColumn = FormatTok->OriginalColumn;
1067 FormatTok = new (Allocator.Allocate()) FormatToken;
1068 FormatTok->Tok = Tok;
1069 SourceLocation TokLocation =
1070 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1071 FormatTok->Tok.setLocation(TokLocation);
1072 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1073 FormatTok->TokenText = TokenText;
1074 FormatTok->ColumnWidth = 1;
1075 FormatTok->OriginalColumn = OriginalColumn + 1;
1076
1077 return FormatTok;
1078}
1079
1080/// Truncate the current token to the new length and make the lexer continue
1081/// from the end of the truncated token. Used for other languages that have
1082/// different token boundaries, like JavaScript in which a comment ends at a
1083/// line break regardless of whether the line break follows a backslash. Also
1084/// used to set the lexer to the end of whitespace if the lexer regards
1085/// whitespace and an unrecognized symbol as one token.
1086void FormatTokenLexer::truncateToken(size_t NewLen) {
1087 assert(NewLen <= FormatTok->TokenText.size());
1088 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1089 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1090  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1091  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1092      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1093 Encoding);
1094 FormatTok->Tok.setLength(NewLen);
1095}
1096
1097/// Count the length of leading whitespace in a token.
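/// For example, if the text starts with two spaces, a backslash, a newline and
/// three more spaces before the first significant character, the result is 7:
/// plain whitespace counts one byte at a time, a backslash-escaped newline
/// counts as two bytes, and a "??/"-escaped newline counts as four.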
1098static size_t countLeadingWhitespace(StringRef Text) {
1099 // Basically counting the length matched by this regex.
1100 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1101 // Directly using the regex turned out to be slow. With the regex
1102 // version formatting all files in this directory took about 1.25
1103 // seconds. This version took about 0.5 seconds.
1104 const unsigned char *const Begin = Text.bytes_begin();
1105 const unsigned char *const End = Text.bytes_end();
1106 const unsigned char *Cur = Begin;
1107 while (Cur < End) {
1108 if (isspace(Cur[0])) {
1109 ++Cur;
1110 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1111 // A '\' followed by a newline always escapes the newline, regardless
1112 // of whether there is another '\' before it.
1113 // The source has a null byte at the end. So the end of the entire input
1114 // isn't reached yet. Also the lexer doesn't break apart an escaped
1115 // newline.
1116 assert(End - Cur >= 2);
1117 Cur += 2;
1118 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1119 (Cur[3] == '\n' || Cur[3] == '\r')) {
1120 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1121 // characters are quoted individually in this comment because if we write
1122 // them together some compilers warn that we have a trigraph in the code.
1123 assert(End - Cur >= 4);
1124 Cur += 4;
1125 } else {
1126 break;
1127 }
1128 }
1129 return Cur - Begin;
1130}
1131
1132FormatToken *FormatTokenLexer::getNextToken() {
1133 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1134 StateStack.pop();
1135 return getStashedToken();
1136 }
1137
1138 FormatTok = new (Allocator.Allocate()) FormatToken;
1139 readRawToken(*FormatTok);
1140 SourceLocation WhitespaceStart =
1141 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1142 FormatTok->IsFirst = IsFirstToken;
1143 IsFirstToken = false;
1144
1145 // Consume and record whitespace until we find a significant token.
1146 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1147 // followed by a symbol such as backtick. Those symbols may be
1148 // significant in other languages.
1149 unsigned WhitespaceLength = TrailingWhitespace;
1150 while (FormatTok->isNot(tok::eof)) {
1151 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1152 if (LeadingWhitespace == 0)
1153 break;
1154 if (LeadingWhitespace < FormatTok->TokenText.size())
1155 truncateToken(LeadingWhitespace);
1156 StringRef Text = FormatTok->TokenText;
1157 bool InEscape = false;
1158 for (int i = 0, e = Text.size(); i != e; ++i) {
1159 switch (Text[i]) {
1160 case '\r':
1161 // If this is a CRLF sequence, break here and the LF will be handled on
1162 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1163 // the same as a single LF.
1164 if (i + 1 < e && Text[i + 1] == '\n')
1165 break;
1166 [[fallthrough]];
1167 case '\n':
1168 ++FormatTok->NewlinesBefore;
1169 if (!InEscape)
1170 FormatTok->HasUnescapedNewline = true;
1171 else
1172 InEscape = false;
1173 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1174 Column = 0;
1175 break;
1176 case '\f':
1177 case '\v':
1178 Column = 0;
1179 break;
1180 case ' ':
1181 ++Column;
1182 break;
1183 case '\t':
1184 Column +=
1185 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1186 break;
1187 case '\\':
1188 case '?':
1189 case '/':
1190 // The text was entirely whitespace when this loop was entered. Thus
1191 // this has to be an escape sequence.
1192 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1193 Text.substr(i, 4) == "\?\?/\r" ||
1194 Text.substr(i, 4) == "\?\?/\n" ||
1195 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1196 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1197 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1198 Text.substr(i - 2, 4) == "\?\?/\n")));
1199 InEscape = true;
1200 break;
1201 default:
1202 // This shouldn't happen.
1203 assert(false);
1204 break;
1205 }
1206 }
1207 WhitespaceLength += Text.size();
1208 readRawToken(*FormatTok);
1209 }
1210
1211 if (FormatTok->is(tok::unknown))
1212 FormatTok->setType(TT_ImplicitStringLiteral);
1213
1214  // JavaScript and Java do not allow escaping the end of the line with a
1215 // backslash. Backslashes are syntax errors in plain source, but can occur in
1216 // comments. When a single line comment ends with a \, it'll cause the next
1217 // line of code to be lexed as a comment, breaking formatting. The code below
1218 // finds comments that contain a backslash followed by a line break, truncates
1219 // the comment token at the backslash, and resets the lexer to restart behind
1220 // the backslash.
1221 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1222 FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
1223 size_t BackslashPos = FormatTok->TokenText.find('\\');
1224 while (BackslashPos != StringRef::npos) {
1225 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1226 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1227 truncateToken(BackslashPos + 1);
1228 break;
1229 }
1230 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1231 }
1232 }
1233
1234 if (Style.isVerilog()) {
1235 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1236 SmallVector<StringRef, 1> Matches;
1237 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1238 // And it uses the hash for delays and parameter lists. In order to continue
1239 // using `tok::hash` in other places, the backtick gets marked as the hash
1240 // here. And in order to tell the backtick and hash apart for
1241 // Verilog-specific stuff, the hash becomes an identifier.
1242 if (FormatTok->is(tok::numeric_constant)) {
1243 // In Verilog the quote is not part of a number.
1244 auto Quote = FormatTok->TokenText.find('\'');
1245 if (Quote != StringRef::npos)
1246 truncateToken(Quote);
1247 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1248 FormatTok->Tok.setKind(tok::raw_identifier);
1249 } else if (FormatTok->is(tok::raw_identifier)) {
1250 if (FormatTok->TokenText == "`") {
1251 FormatTok->Tok.setIdentifierInfo(nullptr);
1252 FormatTok->Tok.setKind(tok::hash);
1253 } else if (FormatTok->TokenText == "``") {
1254 FormatTok->Tok.setIdentifierInfo(nullptr);
1255 FormatTok->Tok.setKind(tok::hashhash);
1256 } else if (Tokens.size() > 0 &&
1257 Tokens.back()->is(Keywords.kw_apostrophe) &&
1258 NumberBase.match(FormatTok->TokenText, &Matches)) {
1259 // In Verilog in a based number literal like `'b10`, there may be
1260 // whitespace between `'b` and `10`. Therefore we handle the base and
1261 // the rest of the number literal as two tokens. But if there is no
1262 // space in the input code, we need to manually separate the two parts.
1263 truncateToken(Matches[0].size());
1264 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1265 }
1266 }
1267 }
1268
1269 FormatTok->WhitespaceRange = SourceRange(
1270 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1271
1272 FormatTok->OriginalColumn = Column;
1273
1274 TrailingWhitespace = 0;
1275 if (FormatTok->is(tok::comment)) {
1276 // FIXME: Add the trimmed whitespace to Column.
1277 StringRef UntrimmedText = FormatTok->TokenText;
1278 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1279 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1280 } else if (FormatTok->is(tok::raw_identifier)) {
1281 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1282 FormatTok->Tok.setIdentifierInfo(&Info);
1283 FormatTok->Tok.setKind(Info.getTokenID());
1284 if (Style.Language == FormatStyle::LK_Java &&
1285 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1286 tok::kw_operator)) {
1287 FormatTok->Tok.setKind(tok::identifier);
1288 FormatTok->Tok.setIdentifierInfo(nullptr);
1289 } else if (Style.isJavaScript() &&
1290 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1291 tok::kw_operator)) {
1292 FormatTok->Tok.setKind(tok::identifier);
1293 FormatTok->Tok.setIdentifierInfo(nullptr);
1294 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1295 FormatTok->Tok.setKind(tok::identifier);
1296 FormatTok->Tok.setIdentifierInfo(nullptr);
1297 }
1298 } else if (FormatTok->is(tok::greatergreater)) {
1299 FormatTok->Tok.setKind(tok::greater);
1300 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1301 ++Column;
1302 StateStack.push(LexerState::TOKEN_STASHED);
1303 } else if (FormatTok->is(tok::lessless)) {
1304 FormatTok->Tok.setKind(tok::less);
1305 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1306 ++Column;
1307 StateStack.push(LexerState::TOKEN_STASHED);
1308 }
1309
1310 if (Style.isVerilog() && Tokens.size() > 0 &&
1311 Tokens.back()->is(TT_VerilogNumberBase) &&
1312 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1313 // Mark the number following a base like `'h?a0` as a number.
1314 FormatTok->Tok.setKind(tok::numeric_constant);
1315 }
1316
1317 // Now FormatTok is the next non-whitespace token.
1318
1319 StringRef Text = FormatTok->TokenText;
1320 size_t FirstNewlinePos = Text.find('\n');
1321 if (FirstNewlinePos == StringRef::npos) {
1322 // FIXME: ColumnWidth actually depends on the start column, we need to
1323 // take this into account when the token is moved.
1324 FormatTok->ColumnWidth =
1325 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1326 Column += FormatTok->ColumnWidth;
1327 } else {
1328 FormatTok->IsMultiline = true;
1329 // FIXME: ColumnWidth actually depends on the start column, we need to
1330 // take this into account when the token is moved.
1331    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1332        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1333
1334 // The last line of the token always starts in column 0.
1335 // Thus, the length can be precomputed even in the presence of tabs.
1336    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1337        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1338 Column = FormatTok->LastLineColumnWidth;
1339 }
1340
1341 if (IsCpp) {
1342 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1343 auto it = Macros.find(Identifier);
1344 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1345 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1346 tok::pp_define) &&
1347 it != Macros.end()) {
1348 FormatTok->setType(it->second);
1349 if (it->second == TT_IfMacro) {
1350 // The lexer token currently has type tok::kw_unknown. However, for this
1351 // substitution to be treated correctly in the TokenAnnotator, faking
1352 // the tok value seems to be needed. Not sure if there's a more elegant
1353 // way.
1354 FormatTok->Tok.setKind(tok::kw_if);
1355 }
1356 } else if (FormatTok->is(tok::identifier)) {
1357 if (MacroBlockBeginRegex.match(Text))
1358 FormatTok->setType(TT_MacroBlockBegin);
1359 else if (MacroBlockEndRegex.match(Text))
1360 FormatTok->setType(TT_MacroBlockEnd);
1361 else if (TypeNames.contains(Identifier))
1362 FormatTok->setFinalizedType(TT_TypeName);
1363 }
1364 }
1365
1366 return FormatTok;
1367}
1368
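// Example: for the Verilog input `\bus+index [7:0]`, the regex below matches
// the escaped identifier `\bus+index` up to (but not including) the following
// whitespace and returns it as a raw_identifier token. A lone "`" or "``" is
// matched the same way so that getNextToken() can later re-kind it as
// tok::hash or tok::hashhash.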
1369bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1370 // In Verilog the quote is not a character literal.
1371 //
1372 // Make the backtick and double backtick identifiers to match against them
1373 // more easily.
1374 //
1375 // In Verilog an escaped identifier starts with backslash and ends with
1376 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1377 // also begin an escaped newline outside of an escaped identifier. We check
1378  // for that outside of the Regex since we can't use negative lookahead
1379 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1380 // identifier may have a length of 0 according to Section A.9.3.
1381 // FIXME: If there is an escaped newline in the middle of an escaped
1382  // identifier, allow for pasting the two lines together, but escaped
1383 // identifiers usually occur only in generated code anyway.
1384  static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1385 "(\r?\n|\r)|[^[:space:]])*)");
1386
1387 SmallVector<StringRef, 4> Matches;
1388 const char *Start = Lex->getBufferLocation();
1389 if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1390 &Matches)) {
1391 return false;
1392 }
1393 // There is a null byte at the end of the buffer, so we don't have to check
1394 // Start[1] is within the buffer.
1395 if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1396 return false;
1397 size_t Len = Matches[0].size();
1398
1399 // The kind has to be an identifier so we can match it against those defined
1400 // in Keywords. The kind has to be set before the length because the setLength
1401 // function checks that the kind is not an annotation.
1402 Tok.setKind(tok::raw_identifier);
1403 Tok.setLength(Len);
1404 Tok.setLocation(Lex->getSourceLocation(Start, Len));
1405 Tok.setRawIdentifierData(Start);
1406 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1407 return true;
1408}
1409
1410void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1411 // For Verilog, first see if there is a special token, and fall back to the
1412 // normal lexer if there isn't one.
1413 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1414 Lex->LexFromRawLexer(Tok.Tok);
1415 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1416 Tok.Tok.getLength());
1417 // For formatting, treat unterminated string literals like normal string
1418 // literals.
1419 if (Tok.is(tok::unknown)) {
1420 if (Tok.TokenText.starts_with("\"")) {
1421 Tok.Tok.setKind(tok::string_literal);
1422 Tok.IsUnterminatedLiteral = true;
1423 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1424 Tok.Tok.setKind(tok::string_literal);
1425 }
1426 }
1427
1428 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1429 Tok.Tok.setKind(tok::string_literal);
1430
1431 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1432 FormattingDisabled = false;
1433
1434 Tok.Finalized = FormattingDisabled;
1435
1436 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1437 FormattingDisabled = true;
1438}
1439
1440void FormatTokenLexer::resetLexer(unsigned Offset) {
1441 StringRef Buffer = SourceMgr.getBufferData(ID);
1442 LangOpts = getFormattingLangOpts(Style);
1443 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1444 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1445 Lex->SetKeepWhitespaceMode(true);
1446 TrailingWhitespace = 0;
1447}
1448
1449} // namespace format
1450} // namespace clang