clang 23.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
37 MacroBlockEndRegex(Style.MacroBlockEnd), VerilogProtectedBlock(false) {
38 Lex = std::make_unique<Lexer>(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
39 LangOpts);
40 Lex->SetKeepWhitespaceMode(true);
41
42 for (const std::string &ForEachMacro : Style.ForEachMacros) {
43 auto Identifier = &IdentTable.get(ForEachMacro);
44 Macros.insert({Identifier, TT_ForEachMacro});
45 }
46 for (const std::string &IfMacro : Style.IfMacros) {
47 auto Identifier = &IdentTable.get(IfMacro);
48 Macros.insert({Identifier, TT_IfMacro});
49 }
50 for (const std::string &AttributeMacro : Style.AttributeMacros) {
51 auto Identifier = &IdentTable.get(AttributeMacro);
52 Macros.insert({Identifier, TT_AttributeMacro});
53 }
54 for (const std::string &StatementMacro : Style.StatementMacros) {
55 auto Identifier = &IdentTable.get(StatementMacro);
56 Macros.insert({Identifier, TT_StatementMacro});
57 }
58 for (const std::string &TypenameMacro : Style.TypenameMacros) {
59 auto Identifier = &IdentTable.get(TypenameMacro);
60 Macros.insert({Identifier, TT_TypenameMacro});
61 }
62 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
63 auto Identifier = &IdentTable.get(NamespaceMacro);
64 Macros.insert({Identifier, TT_NamespaceMacro});
65 }
66 for (const std::string &WhitespaceSensitiveMacro :
67 Style.WhitespaceSensitiveMacros) {
68 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
69 Macros.insert({Identifier, TT_UntouchableMacroFunc});
70 }
71 for (const std::string &StatementAttributeLikeMacro :
72 Style.StatementAttributeLikeMacros) {
73 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
74 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
75 }
76
77 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
78 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
79 for (const auto &TemplateName : Style.TemplateNames)
80 TemplateNames.insert(&IdentTable.get(TemplateName));
81 for (const auto &TypeName : Style.TypeNames)
82 TypeNames.insert(&IdentTable.get(TypeName));
83 for (const auto &VariableTemplate : Style.VariableTemplates)
84 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
85}
86
88 assert(Tokens.empty());
89 assert(FirstInLineIndex == 0);
90
91 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
92 llvm::Regex FormatOffRegex(Style.OneLineFormatOffRegex);
93 do {
94 Tokens.push_back(getNextToken());
95
96 auto &Tok = *Tokens.back();
97 switch (const auto NewlinesBefore = Tok.NewlinesBefore; FormatOff) {
98 case FO_NextLine:
99 if (NewlinesBefore > 1) {
100 FormatOff = FO_None;
101 } else {
102 Tok.Finalized = true;
103 FormatOff = FO_CurrentLine;
104 }
105 break;
106 case FO_CurrentLine:
107 if (NewlinesBefore == 0) {
108 Tok.Finalized = true;
109 break;
110 }
111 FormatOff = FO_None;
112 [[fallthrough]];
113 default:
114 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
115 if (Tok.is(tok::comment) &&
116 (NewlinesBefore > 0 || Tokens.size() == 1)) {
117 Tok.Finalized = true;
118 FormatOff = FO_NextLine;
119 } else {
120 for (auto *Token : reverse(Tokens)) {
121 Token->Finalized = true;
122 if (Token->NewlinesBefore > 0)
123 break;
124 }
125 FormatOff = FO_CurrentLine;
126 }
127 }
128 }
129
130 if (Style.isJavaScript()) {
131 tryParseJSRegexLiteral();
132 handleTemplateStrings();
133 } else if (Style.isTextProto()) {
134 tryParsePythonComment();
135 }
136
137 tryMergePreviousTokens();
138
139 if (Style.isCSharp()) {
140 // This needs to come after tokens have been merged so that C#
141 // string literals are correctly identified.
142 handleCSharpVerbatimAndInterpolatedStrings();
143 } else if (Style.isTableGen()) {
144 handleTableGenMultilineString();
145 handleTableGenNumericLikeIdentifier();
146 }
147
148 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
149 FirstInLineIndex = Tokens.size() - 1;
150 } while (Tokens.back()->isNot(tok::eof));
151
152 if (Style.InsertNewlineAtEOF) {
153 auto &TokEOF = *Tokens.back();
154 if (TokEOF.NewlinesBefore == 0) {
155 TokEOF.NewlinesBefore = 1;
156 TokEOF.OriginalColumn = 0;
157 }
158 }
159
160 return Tokens;
161}
162
163void FormatTokenLexer::tryMergePreviousTokens() {
164 if (tryMerge_TMacro())
165 return;
166 if (tryMergeConflictMarkers())
167 return;
168 if (tryMergeLessLess())
169 return;
170 if (tryMergeGreaterGreater())
171 return;
172 if (tryMergeForEach())
173 return;
174
175 if ((Style.Language == FormatStyle::LK_Cpp ||
176 Style.Language == FormatStyle::LK_ObjC) &&
177 tryMergeUserDefinedLiteral()) {
178 return;
179 }
180
181 if (Style.isJavaScript() || Style.isCSharp()) {
182 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
183 tok::question};
184 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
185 tok::period};
186 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
187
188 if (tryMergeTokens(FatArrow, TT_FatArrow))
189 return;
190 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
191 // Treat like the "||" operator (as opposed to the ternary ?).
192 Tokens.back()->Tok.setKind(tok::pipepipe);
193 return;
194 }
195 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
196 // Treat like a regular "." access.
197 Tokens.back()->Tok.setKind(tok::period);
198 return;
199 }
200 if (tryMergeNullishCoalescingEqual())
201 return;
202
203 if (Style.isCSharp()) {
204 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
205 tok::question, tok::l_square};
206
207 if (tryMergeCSharpKeywordVariables())
208 return;
209 if (tryMergeCSharpStringLiteral())
210 return;
211 if (tryTransformCSharpForEach())
212 return;
213 if (tryMergeTokens(CSharpNullConditionalLSquare,
214 TT_CSharpNullConditionalLSquare)) {
215 // Treat like a regular "[" operator.
216 Tokens.back()->Tok.setKind(tok::l_square);
217 return;
218 }
219 }
220 }
221
222 if (tryMergeNSStringLiteral())
223 return;
224
225 if (Style.isJavaScript()) {
226 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
227 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
228 tok::equal};
229 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
230 tok::greaterequal};
231 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
232 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
233 tok::starequal};
234 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
235 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
236
237 // FIXME: Investigate what token type gives the correct operator priority.
238 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
239 return;
240 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
241 return;
242 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
243 return;
244 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
245 return;
246 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
247 Tokens.back()->Tok.setKind(tok::starequal);
248 return;
249 }
250 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
251 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
252 // Treat like the "=" assignment operator.
253 Tokens.back()->Tok.setKind(tok::equal);
254 return;
255 }
256 if (tryMergeJSPrivateIdentifier())
257 return;
258 } else if (Style.isJava()) {
259 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
260 tok::greater, tok::greater, tok::greaterequal};
261 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
262 return;
263 } else if (Style.isVerilog()) {
264 // Merge the number following a base like `'h?a0`.
265 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
266 Tokens.end()[-2]->is(tok::numeric_constant) &&
267 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
268 tok::question) &&
269 tryMergeTokens(2, TT_Unknown)) {
270 return;
271 }
272 // Part select.
273 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
274 TT_BitFieldColon)) {
275 return;
276 }
277 // Xnor. The combined token is treated as a caret which can also be either a
278 // unary or binary operator. The actual type is determined in
279 // TokenAnnotator. We also check the token length so we know it is not
280 // already a merged token.
281 if (Tokens.back()->TokenText.size() == 1 &&
282 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
283 TT_BinaryOperator)) {
284 Tokens.back()->Tok.setKind(tok::caret);
285 return;
286 }
287 // Signed shift and distribution weight.
288 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
289 Tokens.back()->Tok.setKind(tok::lessless);
290 return;
291 }
292 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
293 Tokens.back()->Tok.setKind(tok::greatergreater);
294 return;
295 }
296 if (tryMergeTokensAny({{tok::lessless, tok::equal},
297 {tok::lessless, tok::lessequal},
298 {tok::greatergreater, tok::equal},
299 {tok::greatergreater, tok::greaterequal},
300 {tok::colon, tok::equal},
301 {tok::colon, tok::slash}},
302 TT_BinaryOperator)) {
303 Tokens.back()->ForcedPrecedence = prec::Assignment;
304 return;
305 }
306 // Exponentiation, signed shift, case equality, and wildcard equality.
307 if (tryMergeTokensAny({{tok::star, tok::star},
308 {tok::lessless, tok::less},
309 {tok::greatergreater, tok::greater},
310 {tok::exclaimequal, tok::equal},
311 {tok::exclaimequal, tok::question},
312 {tok::equalequal, tok::equal},
313 {tok::equalequal, tok::question}},
314 TT_BinaryOperator)) {
315 return;
316 }
317 // Module paths in specify blocks and the implication and boolean equality
318 // operators.
319 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
320 {tok::plus, tok::star, tok::greater},
321 {tok::minusequal, tok::greater},
322 {tok::minus, tok::star, tok::greater},
323 {tok::less, tok::arrow},
324 {tok::equal, tok::greater},
325 {tok::star, tok::greater},
326 {tok::pipeequal, tok::greater},
327 {tok::pipe, tok::arrow}},
328 TT_BinaryOperator) ||
329 Tokens.back()->is(tok::arrow)) {
330 Tokens.back()->ForcedPrecedence = prec::Comma;
331 return;
332 }
333 if (Tokens.size() >= 3 &&
334 Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
335 Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
336 Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
337 tryMergeTokens(3, TT_BinaryOperator)) {
338 Tokens.back()->setFinalizedType(TT_BinaryOperator);
339 Tokens.back()->ForcedPrecedence = prec::Comma;
340 return;
341 }
342 } else if (Style.isTableGen()) {
343 // TableGen's Multi line string starts with [{
344 if (tryMergeTokens({tok::l_square, tok::l_brace},
345 TT_TableGenMultiLineString)) {
346 // Set again with finalizing. This must never be annotated as other types.
347 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
348 Tokens.back()->Tok.setKind(tok::string_literal);
349 return;
350 }
351 // TableGen's bang operator is the form !<name>.
352 // !cond is a special case with specific syntax.
353 if (tryMergeTokens({tok::exclaim, tok::identifier},
354 TT_TableGenBangOperator)) {
355 Tokens.back()->Tok.setKind(tok::identifier);
356 Tokens.back()->Tok.setIdentifierInfo(nullptr);
357 if (Tokens.back()->TokenText == "!cond")
358 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
359 else
360 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
361 return;
362 }
363 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
364 // Here, "! if" becomes "!if". That is, ! captures if even when the space
365 // exists. That is only one possibility in TableGen's syntax.
366 Tokens.back()->Tok.setKind(tok::identifier);
367 Tokens.back()->Tok.setIdentifierInfo(nullptr);
368 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
369 return;
370 }
371 // +, - with numbers are literals. Not unary operators.
372 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
373 Tokens.back()->Tok.setKind(tok::numeric_constant);
374 return;
375 }
376 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
377 Tokens.back()->Tok.setKind(tok::numeric_constant);
378 return;
379 }
380 }
381}
382
383bool FormatTokenLexer::tryMergeNSStringLiteral() {
384 if (Tokens.size() < 2)
385 return false;
386 auto &At = *(Tokens.end() - 2);
387 auto &String = *(Tokens.end() - 1);
388 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
389 return false;
390 At->Tok.setKind(tok::string_literal);
391 At->TokenText = StringRef(At->TokenText.begin(),
392 String->TokenText.end() - At->TokenText.begin());
393 At->ColumnWidth += String->ColumnWidth;
394 At->setType(TT_ObjCStringLiteral);
395 Tokens.erase(Tokens.end() - 1);
396 return true;
397}
398
399bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
400 // Merges #idenfier into a single identifier with the text #identifier
401 // but the token tok::identifier.
402 if (Tokens.size() < 2)
403 return false;
404 auto &Hash = *(Tokens.end() - 2);
405 auto &Identifier = *(Tokens.end() - 1);
406 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
407 return false;
408 Hash->Tok.setKind(tok::identifier);
409 Hash->TokenText =
410 StringRef(Hash->TokenText.begin(),
411 Identifier->TokenText.end() - Hash->TokenText.begin());
412 Hash->ColumnWidth += Identifier->ColumnWidth;
413 Hash->setType(TT_JsPrivateIdentifier);
414 Tokens.erase(Tokens.end() - 1);
415 return true;
416}
417
418// Search for verbatim or interpolated string literals @"ABC" or
419// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
420// prevent splitting of @, $ and ".
421// Merging of multiline verbatim strings with embedded '"' is handled in
422// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
423bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
424 if (Tokens.size() < 2)
425 return false;
426
427 // Look for @"aaaaaa" or $"aaaaaa".
428 const auto String = *(Tokens.end() - 1);
429 if (String->isNot(tok::string_literal))
430 return false;
431
432 auto Prefix = *(Tokens.end() - 2);
433 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
434 return false;
435
436 if (Tokens.size() > 2) {
437 const auto Tok = *(Tokens.end() - 3);
438 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
439 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
440 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
441 Tok->ColumnWidth += Prefix->ColumnWidth;
442 Tokens.erase(Tokens.end() - 2);
443 Prefix = Tok;
444 }
445 }
446
447 // Convert back into just a string_literal.
448 Prefix->Tok.setKind(tok::string_literal);
449 Prefix->TokenText =
450 StringRef(Prefix->TokenText.begin(),
451 String->TokenText.end() - Prefix->TokenText.begin());
452 Prefix->ColumnWidth += String->ColumnWidth;
453 Prefix->setType(TT_CSharpStringLiteral);
454 Tokens.erase(Tokens.end() - 1);
455 return true;
456}
457
458// Valid C# attribute targets:
459// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
460const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
461 "assembly", "module", "field", "event", "method",
462 "param", "property", "return", "type",
463};
464
465bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
466 if (Tokens.size() < 2)
467 return false;
468 auto &NullishCoalescing = *(Tokens.end() - 2);
469 auto &Equal = *(Tokens.end() - 1);
470 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
471 Equal->isNot(tok::equal)) {
472 return false;
473 }
474 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
475 NullishCoalescing->TokenText =
476 StringRef(NullishCoalescing->TokenText.begin(),
477 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
478 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
479 NullishCoalescing->setType(TT_NullCoalescingEqual);
480 Tokens.erase(Tokens.end() - 1);
481 return true;
482}
483
484bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
485 if (Tokens.size() < 2)
486 return false;
487 const auto At = *(Tokens.end() - 2);
488 if (At->isNot(tok::at))
489 return false;
490 const auto Keyword = *(Tokens.end() - 1);
491 if (Keyword->TokenText == "$")
492 return false;
493 if (!Keywords.isCSharpKeyword(*Keyword))
494 return false;
495
496 At->Tok.setKind(tok::identifier);
497 At->TokenText = StringRef(At->TokenText.begin(),
498 Keyword->TokenText.end() - At->TokenText.begin());
499 At->ColumnWidth += Keyword->ColumnWidth;
500 At->setType(Keyword->getType());
501 Tokens.erase(Tokens.end() - 1);
502 return true;
503}
504
505// In C# transform identifier foreach into kw_foreach
506bool FormatTokenLexer::tryTransformCSharpForEach() {
507 if (Tokens.empty())
508 return false;
509 auto &Identifier = *(Tokens.end() - 1);
510 if (Identifier->isNot(tok::identifier))
511 return false;
512 if (Identifier->TokenText != "foreach")
513 return false;
514
515 Identifier->setType(TT_ForEachMacro);
516 Identifier->Tok.setKind(tok::kw_for);
517 return true;
518}
519
520bool FormatTokenLexer::tryMergeForEach() {
521 if (Tokens.size() < 2)
522 return false;
523 auto &For = *(Tokens.end() - 2);
524 auto &Each = *(Tokens.end() - 1);
525 if (For->isNot(tok::kw_for))
526 return false;
527 if (Each->isNot(tok::identifier))
528 return false;
529 if (Each->TokenText != "each")
530 return false;
531
532 For->setType(TT_ForEachMacro);
533 For->Tok.setKind(tok::kw_for);
534
535 For->TokenText = StringRef(For->TokenText.begin(),
536 Each->TokenText.end() - For->TokenText.begin());
537 For->ColumnWidth += Each->ColumnWidth;
538 Tokens.erase(Tokens.end() - 1);
539 return true;
540}
541
542bool FormatTokenLexer::tryMergeLessLess() {
543 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
544 if (Tokens.size() < 3)
545 return false;
546
547 auto First = Tokens.end() - 3;
548 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
549 return false;
550
551 // Only merge if there currently is no whitespace between the two "<".
552 if (First[1]->hasWhitespaceBefore())
553 return false;
554
555 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
556 if (X && X->is(tok::less))
557 return false;
558
559 auto Y = First[2];
560 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
561 return false;
562
563 First[0]->Tok.setKind(tok::lessless);
564 First[0]->TokenText = "<<";
565 First[0]->ColumnWidth += 1;
566 Tokens.erase(Tokens.end() - 2);
567 return true;
568}
569
570bool FormatTokenLexer::tryMergeGreaterGreater() {
571 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
572 if (Tokens.size() < 2)
573 return false;
574
575 auto First = Tokens.end() - 2;
576 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
577 return false;
578
579 // Only merge if there currently is no whitespace between the first two ">".
580 if (First[1]->hasWhitespaceBefore())
581 return false;
582
583 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
584 if (Tok && Tok->isNot(tok::kw_operator))
585 return false;
586
587 First[0]->Tok.setKind(tok::greatergreater);
588 First[0]->TokenText = ">>";
589 First[0]->ColumnWidth += 1;
590 Tokens.erase(Tokens.end() - 1);
591 return true;
592}
593
594bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
595 if (Tokens.size() < 2)
596 return false;
597
598 auto *First = Tokens.end() - 2;
599 auto &Suffix = First[1];
600 if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
601 return false;
602
603 auto &Literal = First[0];
604 if (!Literal->Tok.isLiteral())
605 return false;
606
607 auto &Text = Literal->TokenText;
608 if (!Text.ends_with("_"))
609 return false;
610
611 Text = StringRef(Text.data(), Text.size() + 1);
612 ++Literal->ColumnWidth;
613 Tokens.erase(&Suffix);
614 return true;
615}
616
617bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
618 TokenType NewType) {
619 if (Tokens.size() < Kinds.size())
620 return false;
621
622 const auto *First = Tokens.end() - Kinds.size();
623 for (unsigned i = 0; i < Kinds.size(); ++i)
624 if (First[i]->isNot(Kinds[i]))
625 return false;
626
627 return tryMergeTokens(Kinds.size(), NewType);
628}
629
630bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
631 if (Tokens.size() < Count)
632 return false;
633
634 const auto *First = Tokens.end() - Count;
635 unsigned AddLength = 0;
636 for (size_t i = 1; i < Count; ++i) {
637 // If there is whitespace separating the token and the previous one,
638 // they should not be merged.
639 if (First[i]->hasWhitespaceBefore())
640 return false;
641 AddLength += First[i]->TokenText.size();
642 }
643
644 Tokens.resize(Tokens.size() - Count + 1);
645 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
646 First[0]->TokenText.size() + AddLength);
647 First[0]->ColumnWidth += AddLength;
648 First[0]->setType(NewType);
649 return true;
650}
651
652bool FormatTokenLexer::tryMergeTokensAny(
654 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
655 return tryMergeTokens(Kinds, NewType);
656 });
657}
658
659// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
660bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
661 // NB: This is not entirely correct, as an r_paren can introduce an operand
662 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
663 // corner case to not matter in practice, though.
664 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
665 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
666 tok::colon, tok::question, tok::tilde) ||
667 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
668 tok::kw_else, tok::kw_void, tok::kw_typeof,
669 Keywords.kw_instanceof, Keywords.kw_in) ||
670 Tok->isPlacementOperator() || Tok->isBinaryOperator();
671}
672
673bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
674 if (!Prev)
675 return true;
676
677 // Regex literals can only follow after prefix unary operators, not after
678 // postfix unary operators. If the '++' is followed by a non-operand
679 // introducing token, the slash here is the operand and not the start of a
680 // regex.
681 // `!` is an unary prefix operator, but also a post-fix operator that casts
682 // away nullability, so the same check applies.
683 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
684 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
685
686 // The previous token must introduce an operand location where regex
687 // literals can occur.
688 if (!precedesOperand(Prev))
689 return false;
690
691 return true;
692}
693
694void FormatTokenLexer::tryParseJavaTextBlock() {
695 if (FormatTok->TokenText != "\"\"")
696 return;
697
698 const auto *S = Lex->getBufferLocation();
699 const auto *End = Lex->getBuffer().end();
700
701 if (S == End || *S != '\"')
702 return;
703
704 ++S; // Skip the `"""` that begins a text block.
705
706 // Find the `"""` that ends the text block.
707 bool Escaped = false;
708 for (int Count = 0; Count < 3 && S < End; ++S) {
709 if (Escaped) {
710 Escaped = false;
711 continue;
712 }
713 switch (*S) {
714 case '\"':
715 ++Count;
716 break;
717 case '\\':
718 Escaped = true;
719 [[fallthrough]];
720 default:
721 Count = 0;
722 }
723 }
724
725 // Ignore the possibly invalid text block.
726 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
727}
728
729// Tries to parse a JavaScript Regex literal starting at the current token,
730// if that begins with a slash and is in a location where JavaScript allows
731// regex literals. Changes the current token to a regex literal and updates
732// its text if successful.
733void FormatTokenLexer::tryParseJSRegexLiteral() {
734 FormatToken *RegexToken = Tokens.back();
735 if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
736 return;
737
738 FormatToken *Prev = nullptr;
739 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
740 // NB: Because previous pointers are not initialized yet, this cannot use
741 // Token.getPreviousNonComment.
742 if (FT->isNot(tok::comment)) {
743 Prev = FT;
744 break;
745 }
746 }
747
748 if (!canPrecedeRegexLiteral(Prev))
749 return;
750
751 // 'Manually' lex ahead in the current file buffer.
752 const char *Offset = Lex->getBufferLocation();
753 const char *RegexBegin = Offset - RegexToken->TokenText.size();
754 StringRef Buffer = Lex->getBuffer();
755 bool InCharacterClass = false;
756 bool HaveClosingSlash = false;
757 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
758 // Regular expressions are terminated with a '/', which can only be
759 // escaped using '\' or a character class between '[' and ']'.
760 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
761 switch (*Offset) {
762 case '\\':
763 // Skip the escaped character.
764 ++Offset;
765 break;
766 case '[':
767 InCharacterClass = true;
768 break;
769 case ']':
770 InCharacterClass = false;
771 break;
772 case '/':
773 if (!InCharacterClass)
774 HaveClosingSlash = true;
775 break;
776 }
777 }
778
779 RegexToken->setType(TT_RegexLiteral);
780 // Treat regex literals like other string_literals.
781 RegexToken->Tok.setKind(tok::string_literal);
782 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
783 RegexToken->ColumnWidth = RegexToken->TokenText.size();
784
785 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
786}
787
// Scans [Begin, End) for the '"' that terminates a C# verbatim and/or
// interpolated string whose opening quote has already been consumed. Returns
// a pointer to that '"', or `End` if the string is unterminated or an
// unmatched '}' is met inside an interpolated string.
//
// No effort is made to format code within an interpolated or verbatim string.
// Interpolated strings can contain { } with " characters inside, e.g.
//   $"{x ?? "null"}"
// which must not be split into $"{x ?? ", null, "}" but treated as a single
// string literal. Formatting the expressions within {} would require work
// similar to that done for JavaScript template strings in
// `handleTemplateStrings()`.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True when the character after the cursor repeats the one at the cursor.
  const auto IsDoubled = [&Begin, End] {
    return Begin + 1 < End && *(Begin + 1) == *Begin;
  };

  int OpenBraces = 0;
  while (Begin < End) {
    const char Current = *Begin;
    if (Current == '\\') {
      // Outside verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Begin;
    } else if (Current == '{' && Interpolated) {
      // {{ inside an interpolated string is escaped, so skip it.
      if (IsDoubled())
        ++Begin;
      else
        ++OpenBraces;
    } else if (Current == '}' && Interpolated) {
      // }} inside an interpolated string is escaped, so skip it.
      if (IsDoubled())
        ++Begin;
      else if (OpenBraces > 0)
        --OpenBraces;
      else
        return End;
    } else if (Current == '"' && OpenBraces == 0) {
      // "" within a verbatim string is an escaped double quote: skip it.
      // Any other quote outside of braces terminates the string.
      if (!(Verbatim && IsDoubled()))
        return Begin;
      ++Begin;
    }
    ++Begin;
  }

  return End;
}
846
847void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
848 FormatToken *CSharpStringLiteral = Tokens.back();
849
850 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
851 return;
852
853 auto &TokenText = CSharpStringLiteral->TokenText;
854
855 bool Verbatim = false;
856 bool Interpolated = false;
857 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
858 Verbatim = true;
859 Interpolated = true;
860 } else if (TokenText.starts_with(R"(@")")) {
861 Verbatim = true;
862 } else if (TokenText.starts_with(R"($")")) {
863 Interpolated = true;
864 }
865
866 // Deal with multiline strings.
867 if (!Verbatim && !Interpolated)
868 return;
869
870 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
871 const char *Offset = StrBegin;
872 Offset += Verbatim && Interpolated ? 3 : 2;
873
874 const auto End = Lex->getBuffer().end();
875 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
876
877 // Make no attempt to format code properly if a verbatim string is
878 // unterminated.
879 if (Offset >= End)
880 return;
881
882 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
883 TokenText = LiteralText;
884
885 // Adjust width for potentially multiline string literals.
886 size_t FirstBreak = LiteralText.find('\n');
887 StringRef FirstLineText = FirstBreak == StringRef::npos
888 ? LiteralText
889 : LiteralText.substr(0, FirstBreak);
890 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
891 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
892 Encoding);
893 size_t LastBreak = LiteralText.rfind('\n');
894 if (LastBreak != StringRef::npos) {
895 CSharpStringLiteral->IsMultiline = true;
896 unsigned StartColumn = 0;
897 CSharpStringLiteral->LastLineColumnWidth =
898 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
899 StartColumn, Style.TabWidth, Encoding);
900 }
901
902 assert(Offset < End);
903 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
904}
905
906void FormatTokenLexer::handleTableGenMultilineString() {
907 FormatToken *MultiLineString = Tokens.back();
908 if (MultiLineString->isNot(TT_TableGenMultiLineString))
909 return;
910
911 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
912 // "}]" is the end of multi line string.
913 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
914 if (CloseOffset == StringRef::npos)
915 return;
916 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
917 MultiLineString->TokenText = Text;
918 resetLexer(SourceMgr.getFileOffset(
919 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
920 auto FirstLineText = Text;
921 auto FirstBreak = Text.find('\n');
922 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
923 if (FirstBreak != StringRef::npos) {
924 MultiLineString->IsMultiline = true;
925 FirstLineText = Text.substr(0, FirstBreak + 1);
926 // LastLineColumnWidth holds the width of the last line.
927 auto LastBreak = Text.rfind('\n');
928 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
929 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
930 Style.TabWidth, Encoding);
931 }
932 // ColumnWidth holds only the width of the first line.
933 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
934 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
935}
936
937void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
938 FormatToken *Tok = Tokens.back();
939 // TableGen identifiers can begin with digits. Such tokens are lexed as
940 // numeric_constant now.
941 if (Tok->isNot(tok::numeric_constant))
942 return;
943 StringRef Text = Tok->TokenText;
944 // The following check is based on llvm::TGLexer::LexToken.
945 // That lexes the token as a number if any of the following holds:
946 // 1. It starts with '+', '-'.
947 // 2. All the characters are digits.
948 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
949 // 4. The first non-digit character is 'x', and the next is a hex digit.
950 // Note that in the case 3 and 4, if the next character does not exists in
951 // this token, the token is an identifier.
952 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
953 return;
954 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
955 // All the characters are digits
956 if (NonDigitPos == StringRef::npos)
957 return;
958 char FirstNonDigit = Text[NonDigitPos];
959 if (NonDigitPos < Text.size() - 1) {
960 char TheNext = Text[NonDigitPos + 1];
961 // Regarded as a binary number.
962 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
963 return;
964 // Regarded as hex number.
965 if (FirstNonDigit == 'x' && isxdigit(TheNext))
966 return;
967 }
968 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
969 // This is actually an identifier in TableGen.
970 Tok->Tok.setKind(tok::identifier);
971 Tok->Tok.setIdentifierInfo(nullptr);
972 }
973}
974
975void FormatTokenLexer::handleTemplateStrings() {
976 FormatToken *BacktickToken = Tokens.back();
977
978 if (BacktickToken->is(tok::l_brace)) {
979 StateStack.push(LexerState::NORMAL);
980 return;
981 }
982 if (BacktickToken->is(tok::r_brace)) {
983 if (StateStack.size() == 1)
984 return;
985 StateStack.pop();
986 if (StateStack.top() != LexerState::TEMPLATE_STRING)
987 return;
988 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
989 } else if (BacktickToken->is(tok::unknown) &&
990 BacktickToken->TokenText == "`") {
991 StateStack.push(LexerState::TEMPLATE_STRING);
992 } else {
993 return; // Not actually a template
994 }
995
996 // 'Manually' lex ahead in the current file buffer.
997 const char *Offset = Lex->getBufferLocation();
998 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
999 for (; Offset != Lex->getBuffer().end(); ++Offset) {
1000 if (Offset[0] == '`') {
1001 StateStack.pop();
1002 ++Offset;
1003 break;
1004 }
1005 if (Offset[0] == '\\') {
1006 ++Offset; // Skip the escaped character.
1007 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
1008 Offset[1] == '{') {
1009 // '${' introduces an expression interpolation in the template string.
1010 StateStack.push(LexerState::NORMAL);
1011 Offset += 2;
1012 break;
1013 }
1014 }
1015
1016 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
1017 BacktickToken->setType(TT_TemplateString);
1018 BacktickToken->Tok.setKind(tok::string_literal);
1019 BacktickToken->TokenText = LiteralText;
1020
1021 // Adjust width for potentially multiline string literals.
1022 size_t FirstBreak = LiteralText.find('\n');
1023 StringRef FirstLineText = FirstBreak == StringRef::npos
1024 ? LiteralText
1025 : LiteralText.substr(0, FirstBreak);
1026 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
1027 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
1028 size_t LastBreak = LiteralText.rfind('\n');
1029 if (LastBreak != StringRef::npos) {
1030 BacktickToken->IsMultiline = true;
1031 unsigned StartColumn = 0; // The template tail spans the entire line.
1032 BacktickToken->LastLineColumnWidth =
1033 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
1034 StartColumn, Style.TabWidth, Encoding);
1035 }
1036
1037 SourceLocation loc = Lex->getSourceLocation(Offset);
1038 resetLexer(SourceMgr.getFileOffset(loc));
1039}
1040
1041void FormatTokenLexer::tryParsePythonComment() {
1042 FormatToken *HashToken = Tokens.back();
1043 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1044 return;
1045 // Turn the remainder of this line into a comment.
1046 const char *CommentBegin =
1047 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1048 size_t From = CommentBegin - Lex->getBuffer().begin();
1049 size_t To = Lex->getBuffer().find_first_of('\n', From);
1050 if (To == StringRef::npos)
1051 To = Lex->getBuffer().size();
1052 size_t Len = To - From;
1053 HashToken->setType(TT_LineComment);
1054 HashToken->Tok.setKind(tok::comment);
1055 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1056 SourceLocation Loc = To < Lex->getBuffer().size()
1057 ? Lex->getSourceLocation(CommentBegin + Len)
1058 : SourceMgr.getLocForEndOfFile(ID);
1059 resetLexer(SourceMgr.getFileOffset(Loc));
1060}
1061
1062bool FormatTokenLexer::tryMerge_TMacro() {
1063 if (Tokens.size() < 4)
1064 return false;
1065 FormatToken *Last = Tokens.back();
1066 if (Last->isNot(tok::r_paren))
1067 return false;
1068
1069 FormatToken *String = Tokens[Tokens.size() - 2];
1070 if (String->isNot(tok::string_literal) || String->IsMultiline)
1071 return false;
1072
1073 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1074 return false;
1075
1076 FormatToken *Macro = Tokens[Tokens.size() - 4];
1077 if (Macro->TokenText != "_T")
1078 return false;
1079
1080 const char *Start = Macro->TokenText.data();
1081 const char *End = Last->TokenText.data() + Last->TokenText.size();
1082 String->TokenText = StringRef(Start, End - Start);
1083 String->IsFirst = Macro->IsFirst;
1084 String->LastNewlineOffset = Macro->LastNewlineOffset;
1085 String->WhitespaceRange = Macro->WhitespaceRange;
1086 String->OriginalColumn = Macro->OriginalColumn;
1087 String->ColumnWidth = encoding::columnWidthWithTabs(
1088 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1089 String->NewlinesBefore = Macro->NewlinesBefore;
1090 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1091
1092 Tokens.pop_back();
1093 Tokens.pop_back();
1094 Tokens.pop_back();
1095 Tokens.back() = String;
1096 if (FirstInLineIndex >= Tokens.size())
1097 FirstInLineIndex = Tokens.size() - 1;
1098 return true;
1099}
1100
1101bool FormatTokenLexer::tryMergeConflictMarkers() {
1102 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1103 return false;
1104
1105 // Conflict lines look like:
1106 // <marker> <text from the vcs>
1107 // For example:
1108 // >>>>>>> /file/in/file/system at revision 1234
1109 //
1110 // We merge all tokens in a line that starts with a conflict marker
1111 // into a single token with a special token type that the unwrapped line
1112 // parser will use to correctly rebuild the underlying code.
1113
1114 FileID ID;
1115 // Get the position of the first token in the line.
1116 unsigned FirstInLineOffset;
1117 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1118 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1119 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1120 // Calculate the offset of the start of the current line.
1121 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1122 if (LineOffset == StringRef::npos)
1123 LineOffset = 0;
1124 else
1125 ++LineOffset;
1126
1127 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1128 StringRef LineStart;
1129 if (FirstSpace == StringRef::npos)
1130 LineStart = Buffer.substr(LineOffset);
1131 else
1132 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1133
1134 TokenType Type = TT_Unknown;
1135 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1136 Type = TT_ConflictStart;
1137 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1138 LineStart == "====") {
1139 Type = TT_ConflictAlternative;
1140 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1141 Type = TT_ConflictEnd;
1142 }
1143
1144 if (Type != TT_Unknown) {
1145 FormatToken *Next = Tokens.back();
1146
1147 Tokens.resize(FirstInLineIndex + 1);
1148 // We do not need to build a complete token here, as we will skip it
1149 // during parsing anyway (as we must not touch whitespace around conflict
1150 // markers).
1151 Tokens.back()->setType(Type);
1152 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1153
1154 Tokens.push_back(Next);
1155 return true;
1156 }
1157
1158 return false;
1159}
1160
1161FormatToken *FormatTokenLexer::getStashedToken() {
1162 // Create a synthesized second '>' or '<' token.
1163 Token Tok = FormatTok->Tok;
1164 StringRef TokenText = FormatTok->TokenText;
1165
1166 unsigned OriginalColumn = FormatTok->OriginalColumn;
1167 FormatTok = new (Allocator.Allocate()) FormatToken;
1168 FormatTok->Tok = Tok;
1169 SourceLocation TokLocation =
1170 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1171 FormatTok->Tok.setLocation(TokLocation);
1172 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1173 FormatTok->TokenText = TokenText;
1174 FormatTok->ColumnWidth = 1;
1175 FormatTok->OriginalColumn = OriginalColumn + 1;
1176
1177 return FormatTok;
1178}
1179
1180/// Truncate the current token to the new length and make the lexer continue
1181/// from the end of the truncated token. Used for other languages that have
1182/// different token boundaries, like JavaScript in which a comment ends at a
1183/// line break regardless of whether the line break follows a backslash. Also
1184/// used to set the lexer to the end of whitespace if the lexer regards
1185/// whitespace and an unrecognized symbol as one token.
1186void FormatTokenLexer::truncateToken(size_t NewLen) {
1187 assert(NewLen <= FormatTok->TokenText.size());
1188 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1189 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1190 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1191 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1192 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1193 Encoding);
1194 FormatTok->Tok.setLength(NewLen);
1195}
1196
1197/// Count the length of leading whitespace in a token.
1198static size_t countLeadingWhitespace(StringRef Text) {
1199 // Basically counting the length matched by this regex.
1200 // "^([\n\r\f\v \t]|\\\\[\n\r])+"
1201 // Directly using the regex turned out to be slow. With the regex
1202 // version formatting all files in this directory took about 1.25
1203 // seconds. This version took about 0.5 seconds.
1204 const unsigned char *const Begin = Text.bytes_begin();
1205 const unsigned char *const End = Text.bytes_end();
1206 const unsigned char *Cur = Begin;
1207 while (Cur < End) {
1208 if (isWhitespace(Cur[0])) {
1209 ++Cur;
1210 } else if (Cur[0] == '\\') {
1211 // A backslash followed by optional horizontal whitespaces (P22232R2) and
1212 // then a newline always escapes the newline.
1213 // The source has a null byte at the end. So the end of the entire input
1214 // isn't reached yet. Also the lexer doesn't break apart an escaped
1215 // newline.
1216 const auto *Lookahead = Cur + 1;
1217 while (isHorizontalWhitespace(*Lookahead))
1218 ++Lookahead;
1219 // No line splice found; the backslash is a token.
1220 if (!isVerticalWhitespace(*Lookahead))
1221 break;
1222 // Splice found, consume it.
1223 Cur = Lookahead + 1;
1224 } else {
1225 break;
1226 }
1227 }
1228 return Cur - Begin;
1229}
1230
1231FormatToken *FormatTokenLexer::getNextToken() {
1232 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1233 StateStack.pop();
1234 return getStashedToken();
1235 }
1236
1237 FormatTok = new (Allocator.Allocate()) FormatToken;
1238 readRawToken(*FormatTok);
1239 SourceLocation WhitespaceStart =
1240 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1241 FormatTok->IsFirst = IsFirstToken;
1242 IsFirstToken = false;
1243
1244 // Consume and record whitespace until we find a significant token.
1245 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1246 // followed by a symbol such as backtick. Those symbols may be
1247 // significant in other languages.
1248 unsigned WhitespaceLength = TrailingWhitespace;
1249 while (FormatTok->isNot(tok::eof)) {
1250 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1251 if (LeadingWhitespace == 0)
1252 break;
1253 if (LeadingWhitespace < FormatTok->TokenText.size())
1254 truncateToken(LeadingWhitespace);
1255 StringRef Text = FormatTok->TokenText;
1256 bool InEscape = false;
1257 for (int i = 0, e = Text.size(); i != e; ++i) {
1258 switch (Text[i]) {
1259 case '\r':
1260 // If this is a CRLF sequence, break here and the LF will be handled on
1261 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1262 // the same as a single LF.
1263 if (i + 1 < e && Text[i + 1] == '\n')
1264 break;
1265 [[fallthrough]];
1266 case '\n':
1267 ++FormatTok->NewlinesBefore;
1268 if (!InEscape)
1269 FormatTok->HasUnescapedNewline = true;
1270 else
1271 InEscape = false;
1272 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1273 Column = 0;
1274 break;
1275 case '\f':
1276 if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
1277 // The form feed is immediately preceded and followed by a newline.
1278 i > 0 && Text[i - 1] == '\n' &&
1279 ((i + 1 < e && Text[i + 1] == '\n') ||
1280 (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
1281 FormatTok->HasFormFeedBefore = true;
1282 }
1283 [[fallthrough]];
1284 case '\v':
1285 Column = 0;
1286 break;
1287 case ' ':
1288 ++Column;
1289 break;
1290 case '\t':
1291 Column +=
1292 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1293 break;
1294 case '\\':
1295 // The code preceding the loop and in the countLeadingWhitespace
1296 // function guarantees that Text is entirely whitespace, not including
1297 // comments but including escaped newlines. So the character shows up,
1298 // then it has to be in an escape sequence.
1299 assert([&]() -> bool {
1300 size_t j = i + 1;
1301 while (j < Text.size() && isHorizontalWhitespace(Text[j]))
1302 ++j;
1303 return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
1304 }());
1305 InEscape = true;
1306 break;
1307 default:
1308 // This shouldn't happen.
1309 assert(false);
1310 break;
1311 }
1312 }
1313 WhitespaceLength += Text.size();
1314 readRawToken(*FormatTok);
1315 }
1316
1317 if (FormatTok->is(tok::unknown))
1318 FormatTok->setType(TT_ImplicitStringLiteral);
1319
1320 const bool IsCpp = Style.isCpp();
1321
1322 // JavaScript and Java do not allow to escape the end of the line with a
1323 // backslash. Backslashes are syntax errors in plain source, but can occur in
1324 // comments. When a single line comment ends with a \, it'll cause the next
1325 // line of code to be lexed as a comment, breaking formatting. The code below
1326 // finds comments that contain a backslash followed by a line break, truncates
1327 // the comment token at the backslash, and resets the lexer to restart behind
1328 // the backslash.
1329 if (const auto Text = FormatTok->TokenText;
1330 Text.starts_with("//") &&
1331 (IsCpp || Style.isJavaScript() || Style.isJava())) {
1332 assert(FormatTok->is(tok::comment));
1333 for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
1334 Pos = Text.find('\\', Pos)) {
1335 if (Pos < Text.size() && Text[Pos] == '\n' &&
1336 (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
1337 truncateToken(Pos);
1338 break;
1339 }
1340 }
1341 }
1342
1343 if (Style.isVerilog()) {
1344 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1345 SmallVector<StringRef, 1> Matches;
1346 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1347 // And it uses the hash for delays and parameter lists. In order to continue
1348 // using `tok::hash` in other places, the backtick gets marked as the hash
1349 // here. And in order to tell the backtick and hash apart for
1350 // Verilog-specific stuff, the hash becomes an identifier.
1351 if (FormatTok->is(tok::numeric_constant)) {
1352 // In Verilog the quote is not part of a number.
1353 auto Quote = FormatTok->TokenText.find('\'');
1354 if (Quote != StringRef::npos)
1355 truncateToken(Quote);
1356 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1357 FormatTok->Tok.setKind(tok::raw_identifier);
1358 } else if (FormatTok->is(tok::raw_identifier)) {
1359 if (FormatTok->TokenText == "`") {
1360 FormatTok->Tok.setIdentifierInfo(nullptr);
1361 FormatTok->Tok.setKind(tok::hash);
1362 } else if (FormatTok->TokenText == "``") {
1363 FormatTok->Tok.setIdentifierInfo(nullptr);
1364 FormatTok->Tok.setKind(tok::hashhash);
1365 } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
1366 NumberBase.match(FormatTok->TokenText, &Matches)) {
1367 // In Verilog in a based number literal like `'b10`, there may be
1368 // whitespace between `'b` and `10`. Therefore we handle the base and
1369 // the rest of the number literal as two tokens. But if there is no
1370 // space in the input code, we need to manually separate the two parts.
1371 truncateToken(Matches[0].size());
1372 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1373 }
1374 }
1375 }
1376
1377 FormatTok->WhitespaceRange = SourceRange(
1378 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1379
1380 FormatTok->OriginalColumn = Column;
1381
1382 TrailingWhitespace = 0;
1383 if (FormatTok->is(tok::comment)) {
1384 // FIXME: Add the trimmed whitespace to Column.
1385 StringRef UntrimmedText = FormatTok->TokenText;
1386 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1387 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1388 } else if (FormatTok->is(tok::raw_identifier)) {
1389 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1390 FormatTok->Tok.setIdentifierInfo(&Info);
1391 FormatTok->Tok.setKind(Info.getTokenID());
1392 if (Style.isJava() &&
1393 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1394 tok::kw_operator)) {
1395 FormatTok->Tok.setKind(tok::identifier);
1396 } else if (Style.isJavaScript() &&
1397 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1398 tok::kw_operator)) {
1399 FormatTok->Tok.setKind(tok::identifier);
1400 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1401 FormatTok->Tok.setKind(tok::identifier);
1402 } else if (Style.isVerilog()) {
1403 if (Keywords.isVerilogIdentifier(*FormatTok))
1404 FormatTok->Tok.setKind(tok::identifier);
1405 // Look for the protect line. The next lines needs to be lexed as a single
1406 // token.
1407 if (Tokens.size() - FirstInLineIndex >= 3u &&
1408 Tokens[FirstInLineIndex]->is(tok::hash) &&
1409 Tokens[FirstInLineIndex + 1u]->is(tok::pp_pragma) &&
1410 Tokens[FirstInLineIndex + 2u]->is(Keywords.kw_protect) &&
1411 FormatTok->isOneOf(
1412 Keywords.kw_data_block, Keywords.kw_data_decrypt_key,
1413 Keywords.kw_data_public_key, Keywords.kw_digest_block,
1414 Keywords.kw_digest_decrypt_key, Keywords.kw_digest_public_key,
1415 Keywords.kw_key_block, Keywords.kw_key_public_key)) {
1416 VerilogProtectedBlock = true;
1417 }
1418 }
1419 } else if (const bool Greater = FormatTok->is(tok::greatergreater);
1420 Greater || FormatTok->is(tok::lessless)) {
1421 FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
1422 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1423 ++Column;
1424 StateStack.push(LexerState::TOKEN_STASHED);
1425 } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
1426 tryParseJavaTextBlock();
1427 }
1428
1429 if (Style.isVerilog() && !Tokens.empty() &&
1430 Tokens.back()->is(TT_VerilogNumberBase) &&
1431 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1432 // Mark the number following a base like `'h?a0` as a number.
1433 FormatTok->Tok.setKind(tok::numeric_constant);
1434 }
1435
1436 // Now FormatTok is the next non-whitespace token.
1437
1438 StringRef Text = FormatTok->TokenText;
1439 size_t FirstNewlinePos = Text.find('\n');
1440 if (FirstNewlinePos == StringRef::npos) {
1441 // FIXME: ColumnWidth actually depends on the start column, we need to
1442 // take this into account when the token is moved.
1443 FormatTok->ColumnWidth =
1444 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1445 Column += FormatTok->ColumnWidth;
1446 } else {
1447 FormatTok->IsMultiline = true;
1448 // FIXME: ColumnWidth actually depends on the start column, we need to
1449 // take this into account when the token is moved.
1450 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1451 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1452
1453 // The last line of the token always starts in column 0.
1454 // Thus, the length can be precomputed even in the presence of tabs.
1455 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1456 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1457 Column = FormatTok->LastLineColumnWidth;
1458 }
1459
1460 if (IsCpp) {
1461 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1462 auto it = Macros.find(Identifier);
1463 if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
1464 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
1465 tok::pp_define) &&
1466 it != Macros.end()) {
1467 FormatTok->setType(it->second);
1468 if (it->second == TT_IfMacro) {
1469 // The lexer token currently has type tok::kw_unknown. However, for this
1470 // substitution to be treated correctly in the TokenAnnotator, faking
1471 // the tok value seems to be needed. Not sure if there's a more elegant
1472 // way.
1473 FormatTok->Tok.setKind(tok::kw_if);
1474 }
1475 } else if (FormatTok->is(tok::identifier)) {
1476 if (MacroBlockBeginRegex.match(Text))
1477 FormatTok->setType(TT_MacroBlockBegin);
1478 else if (MacroBlockEndRegex.match(Text))
1479 FormatTok->setType(TT_MacroBlockEnd);
1480 else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
1481 FormatTok->setFinalizedType(TT_FunctionLikeMacro);
1482 else if (TemplateNames.contains(Identifier))
1483 FormatTok->setFinalizedType(TT_TemplateName);
1484 else if (TypeNames.contains(Identifier))
1485 FormatTok->setFinalizedType(TT_TypeName);
1486 else if (VariableTemplates.contains(Identifier))
1487 FormatTok->setFinalizedType(TT_VariableTemplate);
1488 }
1489 }
1490
1491 return FormatTok;
1492}
1493
1494bool FormatTokenLexer::readVerilogProtected(FormatToken &Tok) {
1495 // The block follows the pragma line.
1496 if (!VerilogProtectedBlock || Tok.NewlinesBefore == 0)
1497 return false;
1498 VerilogProtectedBlock = false;
1499
1500 // The block can be empty. Then no token is necessary. A backtick on its own
1501 // line is likely a uuencode line. A backtick followed by something is assumed
1502 // to be the pragma line that ends the block.
1503 const char *const Start = Lex->getBufferLocation();
1504 size_t Len = Lex->getBuffer().end() - Start;
1505 if (Len == 0 ||
1506 (Len >= 2 && Start[0] == '`' && !isVerticalWhitespace(Start[1]))) {
1507 return false;
1508 }
1509
1510 // The block ends when the next pragma line starts.
1511 static const llvm::Regex NextDirective("[\n\r][ \t]*`[^\n\r]");
1512 SmallVector<StringRef, 1> Matches;
1513 if (NextDirective.match(StringRef(Start, Len), &Matches)) {
1514 assert(Matches.size() == 1);
1515 Len = Matches[0].begin() - Start;
1516 }
1517
1518 Tok.Tok.setKind(tok::string_literal);
1519 Tok.Tok.setLength(Len);
1520 Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
1521 Tok.setFinalizedType(TT_VerilogProtected);
1522 Lex->seek(Lex->getCurrentBufferOffset() + Len,
1523 /*IsAtStartOfLine=*/false);
1524 return true;
1525}
1526
1527bool FormatTokenLexer::readRawTokenVerilogSpecific(FormatToken &Tok) {
1528 if (readVerilogProtected(Tok))
1529 return true;
1530 const char *Start = Lex->getBufferLocation();
1531 size_t Len;
1532 switch (Start[0]) {
1533 // In Verilog the quote is not a character literal.
1534 case '\'':
1535 Len = 1;
1536 break;
1537 // Make the backtick and double backtick identifiers to match against them
1538 // more easily.
1539 case '`':
1540 if (Start[1] == '`')
1541 Len = 2;
1542 else
1543 Len = 1;
1544 break;
1545 // In Verilog an escaped identifier starts with a backslash and ends with
1546 // whitespace. Unless that whitespace is an escaped newline.
1547 // FIXME: If there is an escaped newline in the middle of an escaped
1548 // identifier, allow for pasting the two lines together, But escaped
1549 // identifiers usually occur only in generated code anyway.
1550 case '\\':
1551 // A backslash can also begin an escaped newline outside of an escaped
1552 // identifier.
1553 if (Start[1] == '\r' || Start[1] == '\n')
1554 return false;
1555 Len = 1;
1556 while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
1557 Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
1558 Start[Len] != ' ') {
1559 // There is a null byte at the end of the buffer, so we don't have to
1560 // check whether the next byte is within the buffer.
1561 if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
1562 Start[Len + 2] == '\n') {
1563 Len += 3;
1564 } else if (Start[Len] == '\\' &&
1565 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
1566 Len += 2;
1567 } else {
1568 Len += 1;
1569 }
1570 }
1571 break;
1572 default:
1573 return false;
1574 }
1575
1576 // The kind has to be an identifier so we can match it against those defined
1577 // in Keywords. The kind has to be set before the length because the setLength
1578 // function checks that the kind is not an annotation.
1579 Tok.Tok.setKind(tok::raw_identifier);
1580 Tok.Tok.setLength(Len);
1581 Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
1582 Tok.Tok.setRawIdentifierData(Start);
1583 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1584 return true;
1585}
1586
1587void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1588 // For Verilog, first see if there is a special token, and fall back to the
1589 // normal lexer if there isn't one.
1590 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok))
1591 Lex->LexFromRawLexer(Tok.Tok);
1592 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1593 Tok.Tok.getLength());
1594 // For formatting, treat unterminated string literals like normal string
1595 // literals.
1596 if (Tok.is(tok::unknown)) {
1597 if (Tok.TokenText.starts_with("\"")) {
1598 Tok.Tok.setKind(tok::string_literal);
1599 Tok.IsUnterminatedLiteral = true;
1600 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1601 Tok.Tok.setKind(tok::string_literal);
1602 }
1603 }
1604
1605 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1606 Tok.Tok.setKind(tok::string_literal);
1607
1608 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1609 FormattingDisabled = false;
1610
1611 Tok.Finalized = FormattingDisabled;
1612
1613 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1614 FormattingDisabled = true;
1615}
1616
1617void FormatTokenLexer::resetLexer(unsigned Offset) {
1618 StringRef Buffer = SourceMgr.getBufferData(ID);
1619 Lex = std::make_unique<Lexer>(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1620 Buffer.begin(), Buffer.begin() + Offset,
1621 Buffer.end());
1622 Lex->SetKeepWhitespaceMode(true);
1623 TrailingWhitespace = 0;
1624}
1625
1626} // namespace format
1627} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool is(tok::TokenKind Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
bool isNot(T Kind) const
FormatToken * Next
The next token in the unwrapped line.
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:142
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4864
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4860
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4458
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3951
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5933
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:562
@ Type
The name was classified as a type.
Definition Sema.h:564
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3956
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5923
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5984
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.