clang 23.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
37 MacroBlockBeginRegex(Style.MacroBlockBegin),
38 MacroBlockEndRegex(Style.MacroBlockEnd), VerilogProtectedBlock(false) {
39 Lex = std::make_unique<Lexer>(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
40 LangOpts);
41 Lex->SetKeepWhitespaceMode(true);
42
43 for (const std::string &ForEachMacro : Style.ForEachMacros) {
44 auto Identifier = &IdentTable.get(ForEachMacro);
45 Macros.insert({Identifier, TT_ForEachMacro});
46 }
47 for (const std::string &IfMacro : Style.IfMacros) {
48 auto Identifier = &IdentTable.get(IfMacro);
49 Macros.insert({Identifier, TT_IfMacro});
50 }
51 for (const std::string &AttributeMacro : Style.AttributeMacros) {
52 auto Identifier = &IdentTable.get(AttributeMacro);
53 Macros.insert({Identifier, TT_AttributeMacro});
54 }
55 for (const std::string &StatementMacro : Style.StatementMacros) {
56 auto Identifier = &IdentTable.get(StatementMacro);
57 Macros.insert({Identifier, TT_StatementMacro});
58 }
59 for (const std::string &TypenameMacro : Style.TypenameMacros) {
60 auto Identifier = &IdentTable.get(TypenameMacro);
61 Macros.insert({Identifier, TT_TypenameMacro});
62 }
63 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
64 auto Identifier = &IdentTable.get(NamespaceMacro);
65 Macros.insert({Identifier, TT_NamespaceMacro});
66 }
67 for (const std::string &WhitespaceSensitiveMacro :
68 Style.WhitespaceSensitiveMacros) {
69 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
70 Macros.insert({Identifier, TT_UntouchableMacroFunc});
71 }
72 for (const std::string &StatementAttributeLikeMacro :
73 Style.StatementAttributeLikeMacros) {
74 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
75 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
76 }
77
78 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
79 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
80 for (const auto &TemplateName : Style.TemplateNames)
81 TemplateNames.insert(&IdentTable.get(TemplateName));
82 for (const auto &TypeName : Style.TypeNames)
83 TypeNames.insert(&IdentTable.get(TypeName));
84 for (const auto &VariableTemplate : Style.VariableTemplates)
85 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
86}
87
89 assert(Tokens.empty());
90 assert(FirstInLineIndex == 0);
91 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
92 do {
93 Tokens.push_back(getNextToken());
94 auto &Tok = *Tokens.back();
95 const auto NewlinesBefore = Tok.NewlinesBefore;
96 switch (FormatOff) {
97 case FO_NextLine:
98 if (NewlinesBefore > 1) {
99 FormatOff = FO_None;
100 } else {
101 Tok.Finalized = true;
102 FormatOff = FO_CurrentLine;
103 }
104 break;
105 case FO_CurrentLine:
106 if (NewlinesBefore == 0) {
107 Tok.Finalized = true;
108 break;
109 }
110 FormatOff = FO_None;
111 [[fallthrough]];
112 default:
113 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
114 if (Tok.is(tok::comment) &&
115 (NewlinesBefore > 0 || Tokens.size() == 1)) {
116 Tok.Finalized = true;
117 FormatOff = FO_NextLine;
118 } else {
119 for (auto *Token : reverse(Tokens)) {
120 Token->Finalized = true;
121 if (Token->NewlinesBefore > 0)
122 break;
123 }
124 FormatOff = FO_CurrentLine;
125 }
126 }
127 }
128 if (Style.isJavaScript()) {
129 tryParseJSRegexLiteral();
130 handleTemplateStrings();
131 } else if (Style.isTextProto()) {
132 tryParsePythonComment();
133 }
134 tryMergePreviousTokens();
135 if (Style.isCSharp()) {
136 // This needs to come after tokens have been merged so that C#
137 // string literals are correctly identified.
138 handleCSharpVerbatimAndInterpolatedStrings();
139 } else if (Style.isTableGen()) {
140 handleTableGenMultilineString();
141 handleTableGenNumericLikeIdentifier();
142 }
143 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
144 FirstInLineIndex = Tokens.size() - 1;
145 } while (Tokens.back()->isNot(tok::eof));
146 if (Style.InsertNewlineAtEOF) {
147 auto &TokEOF = *Tokens.back();
148 if (TokEOF.NewlinesBefore == 0) {
149 TokEOF.NewlinesBefore = 1;
150 TokEOF.OriginalColumn = 0;
151 }
152 }
153 return Tokens;
154}
155
156void FormatTokenLexer::tryMergePreviousTokens() {
157 if (tryMerge_TMacro())
158 return;
159 if (tryMergeConflictMarkers())
160 return;
161 if (tryMergeLessLess())
162 return;
163 if (tryMergeGreaterGreater())
164 return;
165 if (tryMergeForEach())
166 return;
167
168 if ((Style.Language == FormatStyle::LK_Cpp ||
169 Style.Language == FormatStyle::LK_ObjC) &&
170 tryMergeUserDefinedLiteral()) {
171 return;
172 }
173
174 if (Style.isJavaScript() || Style.isCSharp()) {
175 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
176 tok::question};
177 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
178 tok::period};
179 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
180
181 if (tryMergeTokens(FatArrow, TT_FatArrow))
182 return;
183 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
184 // Treat like the "||" operator (as opposed to the ternary ?).
185 Tokens.back()->Tok.setKind(tok::pipepipe);
186 return;
187 }
188 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
189 // Treat like a regular "." access.
190 Tokens.back()->Tok.setKind(tok::period);
191 return;
192 }
193 if (tryMergeNullishCoalescingEqual())
194 return;
195
196 if (Style.isCSharp()) {
197 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
198 tok::question, tok::l_square};
199
200 if (tryMergeCSharpKeywordVariables())
201 return;
202 if (tryMergeCSharpStringLiteral())
203 return;
204 if (tryTransformCSharpForEach())
205 return;
206 if (tryMergeTokens(CSharpNullConditionalLSquare,
207 TT_CSharpNullConditionalLSquare)) {
208 // Treat like a regular "[" operator.
209 Tokens.back()->Tok.setKind(tok::l_square);
210 return;
211 }
212 }
213 }
214
215 if (tryMergeNSStringLiteral())
216 return;
217
218 if (Style.isJavaScript()) {
219 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
220 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
221 tok::equal};
222 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
223 tok::greaterequal};
224 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
225 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
226 tok::starequal};
227 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
228 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
229
230 // FIXME: Investigate what token type gives the correct operator priority.
231 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
232 return;
233 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
234 return;
235 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
236 return;
237 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
238 return;
239 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
240 Tokens.back()->Tok.setKind(tok::starequal);
241 return;
242 }
243 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
244 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
245 // Treat like the "=" assignment operator.
246 Tokens.back()->Tok.setKind(tok::equal);
247 return;
248 }
249 if (tryMergeJSPrivateIdentifier())
250 return;
251 } else if (Style.isJava()) {
252 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
253 tok::greater, tok::greater, tok::greaterequal};
254 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
255 return;
256 } else if (Style.isVerilog()) {
257 // Merge the number following a base like `'h?a0`.
258 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
259 Tokens.end()[-2]->is(tok::numeric_constant) &&
260 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
261 tok::question) &&
262 tryMergeTokens(2, TT_Unknown)) {
263 return;
264 }
265 // Part select.
266 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
267 TT_BitFieldColon)) {
268 return;
269 }
270 // Xnor. The combined token is treated as a caret which can also be either a
271 // unary or binary operator. The actual type is determined in
272 // TokenAnnotator. We also check the token length so we know it is not
273 // already a merged token.
274 if (Tokens.back()->TokenText.size() == 1 &&
275 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
276 TT_BinaryOperator)) {
277 Tokens.back()->Tok.setKind(tok::caret);
278 return;
279 }
280 // Signed shift and distribution weight.
281 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
282 Tokens.back()->Tok.setKind(tok::lessless);
283 return;
284 }
285 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
286 Tokens.back()->Tok.setKind(tok::greatergreater);
287 return;
288 }
289 if (tryMergeTokensAny({{tok::lessless, tok::equal},
290 {tok::lessless, tok::lessequal},
291 {tok::greatergreater, tok::equal},
292 {tok::greatergreater, tok::greaterequal},
293 {tok::colon, tok::equal},
294 {tok::colon, tok::slash}},
295 TT_BinaryOperator)) {
296 Tokens.back()->ForcedPrecedence = prec::Assignment;
297 return;
298 }
299 // Exponentiation, signed shift, case equality, and wildcard equality.
300 if (tryMergeTokensAny({{tok::star, tok::star},
301 {tok::lessless, tok::less},
302 {tok::greatergreater, tok::greater},
303 {tok::exclaimequal, tok::equal},
304 {tok::exclaimequal, tok::question},
305 {tok::equalequal, tok::equal},
306 {tok::equalequal, tok::question}},
307 TT_BinaryOperator)) {
308 return;
309 }
310 // Module paths in specify blocks and the implication and boolean equality
311 // operators.
312 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
313 {tok::plus, tok::star, tok::greater},
314 {tok::minusequal, tok::greater},
315 {tok::minus, tok::star, tok::greater},
316 {tok::less, tok::arrow},
317 {tok::equal, tok::greater},
318 {tok::star, tok::greater},
319 {tok::pipeequal, tok::greater},
320 {tok::pipe, tok::arrow}},
321 TT_BinaryOperator) ||
322 Tokens.back()->is(tok::arrow)) {
323 Tokens.back()->ForcedPrecedence = prec::Comma;
324 return;
325 }
326 if (Tokens.size() >= 3 &&
327 Tokens[Tokens.size() - 3]->is(Keywords.kw_verilogHash) &&
328 Tokens[Tokens.size() - 2]->isOneOf(tok::minus, tok::equal) &&
329 Tokens[Tokens.size() - 1]->is(Keywords.kw_verilogHash) &&
330 tryMergeTokens(3, TT_BinaryOperator)) {
331 Tokens.back()->setFinalizedType(TT_BinaryOperator);
332 Tokens.back()->ForcedPrecedence = prec::Comma;
333 return;
334 }
335 } else if (Style.isTableGen()) {
336 // TableGen's Multi line string starts with [{
337 if (tryMergeTokens({tok::l_square, tok::l_brace},
338 TT_TableGenMultiLineString)) {
339 // Set again with finalizing. This must never be annotated as other types.
340 Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
341 Tokens.back()->Tok.setKind(tok::string_literal);
342 return;
343 }
344 // TableGen's bang operator is the form !<name>.
345 // !cond is a special case with specific syntax.
346 if (tryMergeTokens({tok::exclaim, tok::identifier},
347 TT_TableGenBangOperator)) {
348 Tokens.back()->Tok.setKind(tok::identifier);
349 Tokens.back()->Tok.setIdentifierInfo(nullptr);
350 if (Tokens.back()->TokenText == "!cond")
351 Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
352 else
353 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
354 return;
355 }
356 if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
357 // Here, "! if" becomes "!if". That is, ! captures if even when the space
358 // exists. That is only one possibility in TableGen's syntax.
359 Tokens.back()->Tok.setKind(tok::identifier);
360 Tokens.back()->Tok.setIdentifierInfo(nullptr);
361 Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
362 return;
363 }
364 // +, - with numbers are literals. Not unary operators.
365 if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
366 Tokens.back()->Tok.setKind(tok::numeric_constant);
367 return;
368 }
369 if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
370 Tokens.back()->Tok.setKind(tok::numeric_constant);
371 return;
372 }
373 }
374}
375
376bool FormatTokenLexer::tryMergeNSStringLiteral() {
377 if (Tokens.size() < 2)
378 return false;
379 auto &At = *(Tokens.end() - 2);
380 auto &String = *(Tokens.end() - 1);
381 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
382 return false;
383 At->Tok.setKind(tok::string_literal);
384 At->TokenText = StringRef(At->TokenText.begin(),
385 String->TokenText.end() - At->TokenText.begin());
386 At->ColumnWidth += String->ColumnWidth;
387 At->setType(TT_ObjCStringLiteral);
388 Tokens.erase(Tokens.end() - 1);
389 return true;
390}
391
392bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
393 // Merges #idenfier into a single identifier with the text #identifier
394 // but the token tok::identifier.
395 if (Tokens.size() < 2)
396 return false;
397 auto &Hash = *(Tokens.end() - 2);
398 auto &Identifier = *(Tokens.end() - 1);
399 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
400 return false;
401 Hash->Tok.setKind(tok::identifier);
402 Hash->TokenText =
403 StringRef(Hash->TokenText.begin(),
404 Identifier->TokenText.end() - Hash->TokenText.begin());
405 Hash->ColumnWidth += Identifier->ColumnWidth;
406 Hash->setType(TT_JsPrivateIdentifier);
407 Tokens.erase(Tokens.end() - 1);
408 return true;
409}
410
411// Search for verbatim or interpolated string literals @"ABC" or
412// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
413// prevent splitting of @, $ and ".
414// Merging of multiline verbatim strings with embedded '"' is handled in
415// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
416bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
417 if (Tokens.size() < 2)
418 return false;
419
420 // Look for @"aaaaaa" or $"aaaaaa".
421 const auto String = *(Tokens.end() - 1);
422 if (String->isNot(tok::string_literal))
423 return false;
424
425 auto Prefix = *(Tokens.end() - 2);
426 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
427 return false;
428
429 if (Tokens.size() > 2) {
430 const auto Tok = *(Tokens.end() - 3);
431 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
432 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
433 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
434 Tok->ColumnWidth += Prefix->ColumnWidth;
435 Tokens.erase(Tokens.end() - 2);
436 Prefix = Tok;
437 }
438 }
439
440 // Convert back into just a string_literal.
441 Prefix->Tok.setKind(tok::string_literal);
442 Prefix->TokenText =
443 StringRef(Prefix->TokenText.begin(),
444 String->TokenText.end() - Prefix->TokenText.begin());
445 Prefix->ColumnWidth += String->ColumnWidth;
446 Prefix->setType(TT_CSharpStringLiteral);
447 Tokens.erase(Tokens.end() - 1);
448 return true;
449}
450
451// Valid C# attribute targets:
452// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
453const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
454 "assembly", "module", "field", "event", "method",
455 "param", "property", "return", "type",
456};
457
458bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
459 if (Tokens.size() < 2)
460 return false;
461 auto &NullishCoalescing = *(Tokens.end() - 2);
462 auto &Equal = *(Tokens.end() - 1);
463 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
464 Equal->isNot(tok::equal)) {
465 return false;
466 }
467 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
468 NullishCoalescing->TokenText =
469 StringRef(NullishCoalescing->TokenText.begin(),
470 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
471 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
472 NullishCoalescing->setType(TT_NullCoalescingEqual);
473 Tokens.erase(Tokens.end() - 1);
474 return true;
475}
476
477bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
478 if (Tokens.size() < 2)
479 return false;
480 const auto At = *(Tokens.end() - 2);
481 if (At->isNot(tok::at))
482 return false;
483 const auto Keyword = *(Tokens.end() - 1);
484 if (Keyword->TokenText == "$")
485 return false;
486 if (!Keywords.isCSharpKeyword(*Keyword))
487 return false;
488
489 At->Tok.setKind(tok::identifier);
490 At->TokenText = StringRef(At->TokenText.begin(),
491 Keyword->TokenText.end() - At->TokenText.begin());
492 At->ColumnWidth += Keyword->ColumnWidth;
493 At->setType(Keyword->getType());
494 Tokens.erase(Tokens.end() - 1);
495 return true;
496}
497
498// In C# transform identifier foreach into kw_foreach
499bool FormatTokenLexer::tryTransformCSharpForEach() {
500 if (Tokens.empty())
501 return false;
502 auto &Identifier = *(Tokens.end() - 1);
503 if (Identifier->isNot(tok::identifier))
504 return false;
505 if (Identifier->TokenText != "foreach")
506 return false;
507
508 Identifier->setType(TT_ForEachMacro);
509 Identifier->Tok.setKind(tok::kw_for);
510 return true;
511}
512
513bool FormatTokenLexer::tryMergeForEach() {
514 if (Tokens.size() < 2)
515 return false;
516 auto &For = *(Tokens.end() - 2);
517 auto &Each = *(Tokens.end() - 1);
518 if (For->isNot(tok::kw_for))
519 return false;
520 if (Each->isNot(tok::identifier))
521 return false;
522 if (Each->TokenText != "each")
523 return false;
524
525 For->setType(TT_ForEachMacro);
526 For->Tok.setKind(tok::kw_for);
527
528 For->TokenText = StringRef(For->TokenText.begin(),
529 Each->TokenText.end() - For->TokenText.begin());
530 For->ColumnWidth += Each->ColumnWidth;
531 Tokens.erase(Tokens.end() - 1);
532 return true;
533}
534
535bool FormatTokenLexer::tryMergeLessLess() {
536 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
537 if (Tokens.size() < 3)
538 return false;
539
540 auto First = Tokens.end() - 3;
541 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
542 return false;
543
544 // Only merge if there currently is no whitespace between the two "<".
545 if (First[1]->hasWhitespaceBefore())
546 return false;
547
548 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
549 if (X && X->is(tok::less))
550 return false;
551
552 auto Y = First[2];
553 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
554 return false;
555
556 First[0]->Tok.setKind(tok::lessless);
557 First[0]->TokenText = "<<";
558 First[0]->ColumnWidth += 1;
559 Tokens.erase(Tokens.end() - 2);
560 return true;
561}
562
563bool FormatTokenLexer::tryMergeGreaterGreater() {
564 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
565 if (Tokens.size() < 2)
566 return false;
567
568 auto First = Tokens.end() - 2;
569 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
570 return false;
571
572 // Only merge if there currently is no whitespace between the first two ">".
573 if (First[1]->hasWhitespaceBefore())
574 return false;
575
576 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
577 if (Tok && Tok->isNot(tok::kw_operator))
578 return false;
579
580 First[0]->Tok.setKind(tok::greatergreater);
581 First[0]->TokenText = ">>";
582 First[0]->ColumnWidth += 1;
583 Tokens.erase(Tokens.end() - 1);
584 return true;
585}
586
587bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
588 if (Tokens.size() < 2)
589 return false;
590
591 auto *First = Tokens.end() - 2;
592 auto &Suffix = First[1];
593 if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
594 return false;
595
596 auto &Literal = First[0];
597 if (!Literal->Tok.isLiteral())
598 return false;
599
600 auto &Text = Literal->TokenText;
601 if (!Text.ends_with("_"))
602 return false;
603
604 Text = StringRef(Text.data(), Text.size() + 1);
605 ++Literal->ColumnWidth;
606 Tokens.erase(&Suffix);
607 return true;
608}
609
610bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
611 TokenType NewType) {
612 if (Tokens.size() < Kinds.size())
613 return false;
614
615 const auto *First = Tokens.end() - Kinds.size();
616 for (unsigned i = 0; i < Kinds.size(); ++i)
617 if (First[i]->isNot(Kinds[i]))
618 return false;
619
620 return tryMergeTokens(Kinds.size(), NewType);
621}
622
623bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
624 if (Tokens.size() < Count)
625 return false;
626
627 const auto *First = Tokens.end() - Count;
628 unsigned AddLength = 0;
629 for (size_t i = 1; i < Count; ++i) {
630 // If there is whitespace separating the token and the previous one,
631 // they should not be merged.
632 if (First[i]->hasWhitespaceBefore())
633 return false;
634 AddLength += First[i]->TokenText.size();
635 }
636
637 Tokens.resize(Tokens.size() - Count + 1);
638 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
639 First[0]->TokenText.size() + AddLength);
640 First[0]->ColumnWidth += AddLength;
641 First[0]->setType(NewType);
642 return true;
643}
644
645bool FormatTokenLexer::tryMergeTokensAny(
647 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
648 return tryMergeTokens(Kinds, NewType);
649 });
650}
651
652// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
653bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
654 // NB: This is not entirely correct, as an r_paren can introduce an operand
655 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
656 // corner case to not matter in practice, though.
657 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
658 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
659 tok::colon, tok::question, tok::tilde) ||
660 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
661 tok::kw_else, tok::kw_void, tok::kw_typeof,
662 Keywords.kw_instanceof, Keywords.kw_in) ||
663 Tok->isPlacementOperator() || Tok->isBinaryOperator();
664}
665
666bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
667 if (!Prev)
668 return true;
669
670 // Regex literals can only follow after prefix unary operators, not after
671 // postfix unary operators. If the '++' is followed by a non-operand
672 // introducing token, the slash here is the operand and not the start of a
673 // regex.
674 // `!` is an unary prefix operator, but also a post-fix operator that casts
675 // away nullability, so the same check applies.
676 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
677 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
678
679 // The previous token must introduce an operand location where regex
680 // literals can occur.
681 if (!precedesOperand(Prev))
682 return false;
683
684 return true;
685}
686
687void FormatTokenLexer::tryParseJavaTextBlock() {
688 if (FormatTok->TokenText != "\"\"")
689 return;
690
691 const auto *S = Lex->getBufferLocation();
692 const auto *End = Lex->getBuffer().end();
693
694 if (S == End || *S != '\"')
695 return;
696
697 ++S; // Skip the `"""` that begins a text block.
698
699 // Find the `"""` that ends the text block.
700 bool Escaped = false;
701 for (int Count = 0; Count < 3 && S < End; ++S) {
702 if (Escaped) {
703 Escaped = false;
704 continue;
705 }
706 switch (*S) {
707 case '\"':
708 ++Count;
709 break;
710 case '\\':
711 Escaped = true;
712 [[fallthrough]];
713 default:
714 Count = 0;
715 }
716 }
717
718 // Ignore the possibly invalid text block.
719 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
720}
721
722// Tries to parse a JavaScript Regex literal starting at the current token,
723// if that begins with a slash and is in a location where JavaScript allows
724// regex literals. Changes the current token to a regex literal and updates
725// its text if successful.
726void FormatTokenLexer::tryParseJSRegexLiteral() {
727 FormatToken *RegexToken = Tokens.back();
728 if (RegexToken->isNoneOf(tok::slash, tok::slashequal))
729 return;
730
731 FormatToken *Prev = nullptr;
732 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
733 // NB: Because previous pointers are not initialized yet, this cannot use
734 // Token.getPreviousNonComment.
735 if (FT->isNot(tok::comment)) {
736 Prev = FT;
737 break;
738 }
739 }
740
741 if (!canPrecedeRegexLiteral(Prev))
742 return;
743
744 // 'Manually' lex ahead in the current file buffer.
745 const char *Offset = Lex->getBufferLocation();
746 const char *RegexBegin = Offset - RegexToken->TokenText.size();
747 StringRef Buffer = Lex->getBuffer();
748 bool InCharacterClass = false;
749 bool HaveClosingSlash = false;
750 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
751 // Regular expressions are terminated with a '/', which can only be
752 // escaped using '\' or a character class between '[' and ']'.
753 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
754 switch (*Offset) {
755 case '\\':
756 // Skip the escaped character.
757 ++Offset;
758 break;
759 case '[':
760 InCharacterClass = true;
761 break;
762 case ']':
763 InCharacterClass = false;
764 break;
765 case '/':
766 if (!InCharacterClass)
767 HaveClosingSlash = true;
768 break;
769 }
770 }
771
772 RegexToken->setType(TT_RegexLiteral);
773 // Treat regex literals like other string_literals.
774 RegexToken->Tok.setKind(tok::string_literal);
775 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
776 RegexToken->ColumnWidth = RegexToken->TokenText.size();
777
778 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
779}
780
/// Scans the body of a C# string literal for its closing '"'.
///
/// \param Begin        first character after the opening quote.
/// \param End          end of the buffer.
/// \param Verbatim     the literal has the '@' prefix: backslashes are not
///                     escapes and "" stands for an embedded quote.
/// \param Interpolated the literal has the '$' prefix: quotes inside
///                     unmatched {...} do not end the string; {{ and }} are
///                     escapes.
/// \returns a pointer to the closing quote, or \p End if the string is not
///          terminated or an interpolated string has an unbalanced '}'.
///
/// No attempt is made to format the expressions inside the braces of an
/// interpolated string: e.g. $"{x ?? "null"}" is deliberately kept as one
/// literal instead of being split at the inner quotes. Doing better would
/// need work similar to what `handleTemplateStrings()` does for JavaScript
/// template strings.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True if the character after the cursor repeats the one at the cursor.
  const auto NextIsSame = [&Begin, End] {
    return Begin + 1 < End && *(Begin + 1) == *Begin;
  };

  int OpenBraces = 0;
  while (Begin < End) {
    const char C = *Begin;
    if (C == '\\') {
      // Outside verbatim strings a backslash escapes the next character.
      if (!Verbatim)
        ++Begin;
    } else if (C == '{' && Interpolated) {
      // {{ is an escaped brace; a single { opens an interpolation hole.
      if (NextIsSame())
        ++Begin;
      else
        ++OpenBraces;
    } else if (C == '}' && Interpolated) {
      // }} is an escaped brace; a single } closes a hole, and is an error
      // if no hole is open.
      if (NextIsSame())
        ++Begin;
      else if (OpenBraces > 0)
        --OpenBraces;
      else
        return End;
    } else if (C == '"' && OpenBraces == 0) {
      // "" within a verbatim string is an escaped quote; any other quote
      // outside of a hole terminates the literal.
      if (!(Verbatim && NextIsSame()))
        return Begin;
      ++Begin;
    }
    ++Begin;
  }

  return End;
}
839
840void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
841 FormatToken *CSharpStringLiteral = Tokens.back();
842
843 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
844 return;
845
846 auto &TokenText = CSharpStringLiteral->TokenText;
847
848 bool Verbatim = false;
849 bool Interpolated = false;
850 if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
851 Verbatim = true;
852 Interpolated = true;
853 } else if (TokenText.starts_with(R"(@")")) {
854 Verbatim = true;
855 } else if (TokenText.starts_with(R"($")")) {
856 Interpolated = true;
857 }
858
859 // Deal with multiline strings.
860 if (!Verbatim && !Interpolated)
861 return;
862
863 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
864 const char *Offset = StrBegin;
865 Offset += Verbatim && Interpolated ? 3 : 2;
866
867 const auto End = Lex->getBuffer().end();
868 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
869
870 // Make no attempt to format code properly if a verbatim string is
871 // unterminated.
872 if (Offset >= End)
873 return;
874
875 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
876 TokenText = LiteralText;
877
878 // Adjust width for potentially multiline string literals.
879 size_t FirstBreak = LiteralText.find('\n');
880 StringRef FirstLineText = FirstBreak == StringRef::npos
881 ? LiteralText
882 : LiteralText.substr(0, FirstBreak);
883 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
884 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
885 Encoding);
886 size_t LastBreak = LiteralText.rfind('\n');
887 if (LastBreak != StringRef::npos) {
888 CSharpStringLiteral->IsMultiline = true;
889 unsigned StartColumn = 0;
890 CSharpStringLiteral->LastLineColumnWidth =
891 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
892 StartColumn, Style.TabWidth, Encoding);
893 }
894
895 assert(Offset < End);
896 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
897}
898
899void FormatTokenLexer::handleTableGenMultilineString() {
900 FormatToken *MultiLineString = Tokens.back();
901 if (MultiLineString->isNot(TT_TableGenMultiLineString))
902 return;
903
904 auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
905 // "}]" is the end of multi line string.
906 auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
907 if (CloseOffset == StringRef::npos)
908 return;
909 auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
910 MultiLineString->TokenText = Text;
911 resetLexer(SourceMgr.getFileOffset(
912 Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
913 auto FirstLineText = Text;
914 auto FirstBreak = Text.find('\n');
915 // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
916 if (FirstBreak != StringRef::npos) {
917 MultiLineString->IsMultiline = true;
918 FirstLineText = Text.substr(0, FirstBreak + 1);
919 // LastLineColumnWidth holds the width of the last line.
920 auto LastBreak = Text.rfind('\n');
921 MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
922 Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
923 Style.TabWidth, Encoding);
924 }
925 // ColumnWidth holds only the width of the first line.
926 MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
927 FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
928}
929
930void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
931 FormatToken *Tok = Tokens.back();
932 // TableGen identifiers can begin with digits. Such tokens are lexed as
933 // numeric_constant now.
934 if (Tok->isNot(tok::numeric_constant))
935 return;
936 StringRef Text = Tok->TokenText;
937 // The following check is based on llvm::TGLexer::LexToken.
938 // That lexes the token as a number if any of the following holds:
939 // 1. It starts with '+', '-'.
940 // 2. All the characters are digits.
941 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
942 // 4. The first non-digit character is 'x', and the next is a hex digit.
943 // Note that in the case 3 and 4, if the next character does not exists in
944 // this token, the token is an identifier.
945 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
946 return;
947 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
948 // All the characters are digits
949 if (NonDigitPos == StringRef::npos)
950 return;
951 char FirstNonDigit = Text[NonDigitPos];
952 if (NonDigitPos < Text.size() - 1) {
953 char TheNext = Text[NonDigitPos + 1];
954 // Regarded as a binary number.
955 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
956 return;
957 // Regarded as hex number.
958 if (FirstNonDigit == 'x' && isxdigit(TheNext))
959 return;
960 }
961 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
962 // This is actually an identifier in TableGen.
963 Tok->Tok.setKind(tok::identifier);
964 Tok->Tok.setIdentifierInfo(nullptr);
965 }
966}
967
968void FormatTokenLexer::handleTemplateStrings() {
969 FormatToken *BacktickToken = Tokens.back();
970
971 if (BacktickToken->is(tok::l_brace)) {
972 StateStack.push(LexerState::NORMAL);
973 return;
974 }
975 if (BacktickToken->is(tok::r_brace)) {
976 if (StateStack.size() == 1)
977 return;
978 StateStack.pop();
979 if (StateStack.top() != LexerState::TEMPLATE_STRING)
980 return;
981 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
982 } else if (BacktickToken->is(tok::unknown) &&
983 BacktickToken->TokenText == "`") {
984 StateStack.push(LexerState::TEMPLATE_STRING);
985 } else {
986 return; // Not actually a template
987 }
988
989 // 'Manually' lex ahead in the current file buffer.
990 const char *Offset = Lex->getBufferLocation();
991 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
992 for (; Offset != Lex->getBuffer().end(); ++Offset) {
993 if (Offset[0] == '`') {
994 StateStack.pop();
995 ++Offset;
996 break;
997 }
998 if (Offset[0] == '\\') {
999 ++Offset; // Skip the escaped character.
1000 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
1001 Offset[1] == '{') {
1002 // '${' introduces an expression interpolation in the template string.
1003 StateStack.push(LexerState::NORMAL);
1004 Offset += 2;
1005 break;
1006 }
1007 }
1008
1009 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
1010 BacktickToken->setType(TT_TemplateString);
1011 BacktickToken->Tok.setKind(tok::string_literal);
1012 BacktickToken->TokenText = LiteralText;
1013
1014 // Adjust width for potentially multiline string literals.
1015 size_t FirstBreak = LiteralText.find('\n');
1016 StringRef FirstLineText = FirstBreak == StringRef::npos
1017 ? LiteralText
1018 : LiteralText.substr(0, FirstBreak);
1019 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
1020 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
1021 size_t LastBreak = LiteralText.rfind('\n');
1022 if (LastBreak != StringRef::npos) {
1023 BacktickToken->IsMultiline = true;
1024 unsigned StartColumn = 0; // The template tail spans the entire line.
1025 BacktickToken->LastLineColumnWidth =
1026 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
1027 StartColumn, Style.TabWidth, Encoding);
1028 }
1029
1030 SourceLocation loc = Lex->getSourceLocation(Offset);
1031 resetLexer(SourceMgr.getFileOffset(loc));
1032}
1033
1034void FormatTokenLexer::tryParsePythonComment() {
1035 FormatToken *HashToken = Tokens.back();
1036 if (HashToken->isNoneOf(tok::hash, tok::hashhash))
1037 return;
1038 // Turn the remainder of this line into a comment.
1039 const char *CommentBegin =
1040 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1041 size_t From = CommentBegin - Lex->getBuffer().begin();
1042 size_t To = Lex->getBuffer().find_first_of('\n', From);
1043 if (To == StringRef::npos)
1044 To = Lex->getBuffer().size();
1045 size_t Len = To - From;
1046 HashToken->setType(TT_LineComment);
1047 HashToken->Tok.setKind(tok::comment);
1048 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1049 SourceLocation Loc = To < Lex->getBuffer().size()
1050 ? Lex->getSourceLocation(CommentBegin + Len)
1051 : SourceMgr.getLocForEndOfFile(ID);
1052 resetLexer(SourceMgr.getFileOffset(Loc));
1053}
1054
1055bool FormatTokenLexer::tryMerge_TMacro() {
1056 if (Tokens.size() < 4)
1057 return false;
1058 FormatToken *Last = Tokens.back();
1059 if (Last->isNot(tok::r_paren))
1060 return false;
1061
1062 FormatToken *String = Tokens[Tokens.size() - 2];
1063 if (String->isNot(tok::string_literal) || String->IsMultiline)
1064 return false;
1065
1066 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1067 return false;
1068
1069 FormatToken *Macro = Tokens[Tokens.size() - 4];
1070 if (Macro->TokenText != "_T")
1071 return false;
1072
1073 const char *Start = Macro->TokenText.data();
1074 const char *End = Last->TokenText.data() + Last->TokenText.size();
1075 String->TokenText = StringRef(Start, End - Start);
1076 String->IsFirst = Macro->IsFirst;
1077 String->LastNewlineOffset = Macro->LastNewlineOffset;
1078 String->WhitespaceRange = Macro->WhitespaceRange;
1079 String->OriginalColumn = Macro->OriginalColumn;
1080 String->ColumnWidth = encoding::columnWidthWithTabs(
1081 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1082 String->NewlinesBefore = Macro->NewlinesBefore;
1083 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1084
1085 Tokens.pop_back();
1086 Tokens.pop_back();
1087 Tokens.pop_back();
1088 Tokens.back() = String;
1089 if (FirstInLineIndex >= Tokens.size())
1090 FirstInLineIndex = Tokens.size() - 1;
1091 return true;
1092}
1093
1094bool FormatTokenLexer::tryMergeConflictMarkers() {
1095 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1096 return false;
1097
1098 // Conflict lines look like:
1099 // <marker> <text from the vcs>
1100 // For example:
1101 // >>>>>>> /file/in/file/system at revision 1234
1102 //
1103 // We merge all tokens in a line that starts with a conflict marker
1104 // into a single token with a special token type that the unwrapped line
1105 // parser will use to correctly rebuild the underlying code.
1106
1107 FileID ID;
1108 // Get the position of the first token in the line.
1109 unsigned FirstInLineOffset;
1110 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1111 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1112 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1113 // Calculate the offset of the start of the current line.
1114 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1115 if (LineOffset == StringRef::npos)
1116 LineOffset = 0;
1117 else
1118 ++LineOffset;
1119
1120 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1121 StringRef LineStart;
1122 if (FirstSpace == StringRef::npos)
1123 LineStart = Buffer.substr(LineOffset);
1124 else
1125 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1126
1127 TokenType Type = TT_Unknown;
1128 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1129 Type = TT_ConflictStart;
1130 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1131 LineStart == "====") {
1132 Type = TT_ConflictAlternative;
1133 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1134 Type = TT_ConflictEnd;
1135 }
1136
1137 if (Type != TT_Unknown) {
1138 FormatToken *Next = Tokens.back();
1139
1140 Tokens.resize(FirstInLineIndex + 1);
1141 // We do not need to build a complete token here, as we will skip it
1142 // during parsing anyway (as we must not touch whitespace around conflict
1143 // markers).
1144 Tokens.back()->setType(Type);
1145 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1146
1147 Tokens.push_back(Next);
1148 return true;
1149 }
1150
1151 return false;
1152}
1153
1154FormatToken *FormatTokenLexer::getStashedToken() {
1155 // Create a synthesized second '>' or '<' token.
1156 Token Tok = FormatTok->Tok;
1157 StringRef TokenText = FormatTok->TokenText;
1158
1159 unsigned OriginalColumn = FormatTok->OriginalColumn;
1160 FormatTok = new (Allocator.Allocate()) FormatToken;
1161 FormatTok->Tok = Tok;
1162 SourceLocation TokLocation =
1163 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1164 FormatTok->Tok.setLocation(TokLocation);
1165 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1166 FormatTok->TokenText = TokenText;
1167 FormatTok->ColumnWidth = 1;
1168 FormatTok->OriginalColumn = OriginalColumn + 1;
1169
1170 return FormatTok;
1171}
1172
1173/// Truncate the current token to the new length and make the lexer continue
1174/// from the end of the truncated token. Used for other languages that have
1175/// different token boundaries, like JavaScript in which a comment ends at a
1176/// line break regardless of whether the line break follows a backslash. Also
1177/// used to set the lexer to the end of whitespace if the lexer regards
1178/// whitespace and an unrecognized symbol as one token.
1179void FormatTokenLexer::truncateToken(size_t NewLen) {
1180 assert(NewLen <= FormatTok->TokenText.size());
1181 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1182 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1183 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1184 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1185 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1186 Encoding);
1187 FormatTok->Tok.setLength(NewLen);
1188}
1189
1190/// Count the length of leading whitespace in a token.
1191static size_t countLeadingWhitespace(StringRef Text) {
1192 // Basically counting the length matched by this regex.
1193 // "^([\n\r\f\v \t]|\\\\[\n\r])+"
1194 // Directly using the regex turned out to be slow. With the regex
1195 // version formatting all files in this directory took about 1.25
1196 // seconds. This version took about 0.5 seconds.
1197 const unsigned char *const Begin = Text.bytes_begin();
1198 const unsigned char *const End = Text.bytes_end();
1199 const unsigned char *Cur = Begin;
1200 while (Cur < End) {
1201 if (isWhitespace(Cur[0])) {
1202 ++Cur;
1203 } else if (Cur[0] == '\\') {
1204 // A backslash followed by optional horizontal whitespaces (P22232R2) and
1205 // then a newline always escapes the newline.
1206 // The source has a null byte at the end. So the end of the entire input
1207 // isn't reached yet. Also the lexer doesn't break apart an escaped
1208 // newline.
1209 const auto *Lookahead = Cur + 1;
1210 while (isHorizontalWhitespace(*Lookahead))
1211 ++Lookahead;
1212 // No line splice found; the backslash is a token.
1213 if (!isVerticalWhitespace(*Lookahead))
1214 break;
1215 // Splice found, consume it.
1216 Cur = Lookahead + 1;
1217 } else {
1218 break;
1219 }
1220 }
1221 return Cur - Begin;
1222}
1223
/// Produces the next token for the formatter.
///
/// If a token was stashed (the second half of a split `>>` or `<<`), that
/// synthesized token is returned. Otherwise a raw token is read, all leading
/// whitespace tokens are folded into the token's whitespace bookkeeping
/// (NewlinesBefore, HasUnescapedNewline, LastNewlineOffset, WhitespaceRange
/// and the running Column), and the token's kind, type, text and column
/// widths are adjusted for the language being formatted.
///
/// \returns the newly allocated token, which is also stored in FormatTok.
1224FormatToken *FormatTokenLexer::getNextToken() {
1225 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1226 StateStack.pop();
1227 return getStashedToken();
1228 }
1229
1230 FormatTok = new (Allocator.Allocate()) FormatToken;
1231 readRawToken(*FormatTok);
 // Whitespace trimmed from the end of the previous comment token (recorded in
 // TrailingWhitespace) is counted as leading whitespace of this token.
1232 SourceLocation WhitespaceStart =
1233 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1234 FormatTok->IsFirst = IsFirstToken;
1235 IsFirstToken = false;
1236
1237 // Consume and record whitespace until we find a significant token.
1238 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1239 // followed by a symbol such as backtick. Those symbols may be
1240 // significant in other languages.
1241 unsigned WhitespaceLength = TrailingWhitespace;
1242 while (FormatTok->isNot(tok::eof)) {
1243 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1244 if (LeadingWhitespace == 0)
1245 break;
 // Keep only the whitespace part; the lexer restarts behind it.
1246 if (LeadingWhitespace < FormatTok->TokenText.size())
1247 truncateToken(LeadingWhitespace);
1248 StringRef Text = FormatTok->TokenText;
1249 bool InEscape = false;
1250 for (int i = 0, e = Text.size(); i != e; ++i) {
1251 switch (Text[i]) {
1252 case '\r':
1253 // If this is a CRLF sequence, break here and the LF will be handled on
1254 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1255 // the same as a single LF.
1256 if (i + 1 < e && Text[i + 1] == '\n')
1257 break;
1258 [[fallthrough]];
1259 case '\n':
1260 ++FormatTok->NewlinesBefore;
1261 if (!InEscape)
1262 FormatTok->HasUnescapedNewline = true;
1263 else
1264 InEscape = false;
1265 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1266 Column = 0;
1267 break;
1268 case '\f':
1269 if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
1270 // The form feed is immediately preceded and followed by a newline.
1271 i > 0 && Text[i - 1] == '\n' &&
1272 ((i + 1 < e && Text[i + 1] == '\n') ||
1273 (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
1274 FormatTok->HasFormFeedBefore = true;
1275 }
1276 [[fallthrough]];
1277 case '\v':
1278 Column = 0;
1279 break;
1280 case ' ':
1281 ++Column;
1282 break;
1283 case '\t':
 // Advance to the next tab stop; a TabWidth of 0 leaves Column unchanged.
1284 Column +=
1285 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1286 break;
1287 case '\\':
1288 // The code preceding the loop and in the countLeadingWhitespace
1289 // function guarantees that Text is entirely whitespace, not including
1290 // comments but including escaped newlines. So if the character shows
1291 // up, then it has to be in an escape sequence.
1292 assert([&]() -> bool {
1293 size_t j = i + 1;
1294 while (j < Text.size() && isHorizontalWhitespace(Text[j]))
1295 ++j;
1296 return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
1297 }());
1298 InEscape = true;
1299 break;
1300 default:
1301 // This shouldn't happen.
1302 assert(false);
1303 break;
1304 }
1305 }
1306 WhitespaceLength += Text.size();
1307 readRawToken(*FormatTok);
1308 }
1309
1310 if (FormatTok->is(tok::unknown))
1311 FormatTok->setType(TT_ImplicitStringLiteral);
1312
1313 const bool IsCpp = Style.isCpp();
1314
1315 // JavaScript and Java do not allow to escape the end of the line with a
1316 // backslash. Backslashes are syntax errors in plain source, but can occur in
1317 // comments. When a single line comment ends with a \, it'll cause the next
1318 // line of code to be lexed as a comment, breaking formatting. The code below
1319 // finds comments that contain a backslash followed by a line break, truncates
1320 // the comment token at the backslash, and resets the lexer to restart behind
1321 // the backslash.
 // For C++ the comment is only truncated when the text after the line break
 // itself starts with "//" (ignoring leading whitespace).
1322 if (const auto Text = FormatTok->TokenText;
1323 Text.starts_with("//") &&
1324 (IsCpp || Style.isJavaScript() || Style.isJava())) {
1325 assert(FormatTok->is(tok::comment));
1326 for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
1327 Pos = Text.find('\\', Pos)) {
1328 if (Pos < Text.size() && Text[Pos] == '\n' &&
1329 (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
1330 truncateToken(Pos);
1331 break;
1332 }
1333 }
1334 }
1335
1336 if (Style.isVerilog()) {
1337 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1338 SmallVector<StringRef, 1> Matches;
1339 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1340 // And it uses the hash for delays and parameter lists. In order to continue
1341 // using `tok::hash` in other places, the backtick gets marked as the hash
1342 // here. And in order to tell the backtick and hash apart for
1343 // Verilog-specific stuff, the hash becomes an identifier.
1344 if (FormatTok->is(tok::numeric_constant)) {
1345 // In Verilog the quote is not part of a number.
1346 auto Quote = FormatTok->TokenText.find('\'');
1347 if (Quote != StringRef::npos)
1348 truncateToken(Quote);
1349 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1350 FormatTok->Tok.setKind(tok::raw_identifier);
1351 } else if (FormatTok->is(tok::raw_identifier)) {
1352 if (FormatTok->TokenText == "`") {
1353 FormatTok->Tok.setIdentifierInfo(nullptr);
1354 FormatTok->Tok.setKind(tok::hash);
1355 } else if (FormatTok->TokenText == "``") {
1356 FormatTok->Tok.setIdentifierInfo(nullptr);
1357 FormatTok->Tok.setKind(tok::hashhash);
1358 } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
1359 NumberBase.match(FormatTok->TokenText, &Matches)) {
1360 // In Verilog in a based number literal like `'b10`, there may be
1361 // whitespace between `'b` and `10`. Therefore we handle the base and
1362 // the rest of the number literal as two tokens. But if there is no
1363 // space in the input code, we need to manually separate the two parts.
1364 truncateToken(Matches[0].size());
1365 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1366 }
1367 }
1368 }
1369
 // The whitespace range covers everything consumed before this token.
1370 FormatTok->WhitespaceRange = SourceRange(
1371 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1372
1373 FormatTok->OriginalColumn = Column;
1374
1375 TrailingWhitespace = 0;
1376 if (FormatTok->is(tok::comment)) {
1377 // FIXME: Add the trimmed whitespace to Column.
1378 StringRef UntrimmedText = FormatTok->TokenText;
1379 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1380 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1381 } else if (FormatTok->is(tok::raw_identifier)) {
 // Resolve the raw identifier through the identifier table, then undo the
 // keyword kind for names that are handled as plain identifiers in the
 // language being formatted.
1382 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1383 FormatTok->Tok.setIdentifierInfo(&Info);
1384 FormatTok->Tok.setKind(Info.getTokenID());
1385 if (Style.isJava() &&
1386 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1387 tok::kw_operator)) {
1388 FormatTok->Tok.setKind(tok::identifier);
1389 } else if (Style.isJavaScript() &&
1390 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1391 tok::kw_operator)) {
1392 FormatTok->Tok.setKind(tok::identifier);
1393 } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1394 FormatTok->Tok.setKind(tok::identifier);
1395 } else if (Style.isVerilog()) {
1396 if (Keywords.isVerilogIdentifier(*FormatTok))
1397 FormatTok->Tok.setKind(tok::identifier);
1398 // Look for the protect line. The next lines need to be lexed as a single
1399 // token.
1400 if (Tokens.size() - FirstInLineIndex >= 3u &&
1401 Tokens[FirstInLineIndex]->is(tok::hash) &&
1402 Tokens[FirstInLineIndex + 1u]->is(tok::pp_pragma) &&
1403 Tokens[FirstInLineIndex + 2u]->is(Keywords.kw_protect) &&
1404 FormatTok->isOneOf(
1405 Keywords.kw_data_block, Keywords.kw_data_decrypt_key,
1406 Keywords.kw_data_public_key, Keywords.kw_digest_block,
1407 Keywords.kw_digest_decrypt_key, Keywords.kw_digest_public_key,
1408 Keywords.kw_key_block, Keywords.kw_key_public_key)) {
1409 VerilogProtectedBlock = true;
1410 }
1411 }
1412 } else if (const bool Greater = FormatTok->is(tok::greatergreater);
1413 Greater || FormatTok->is(tok::lessless)) {
 // Split `>>` / `<<` in two: this token keeps the first character and the
 // second one is synthesized by getStashedToken() on the next call.
1414 FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
1415 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1416 ++Column;
1417 StateStack.push(LexerState::TOKEN_STASHED);
1418 } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
1419 tryParseJavaTextBlock();
1420 }
1421
1422 if (Style.isVerilog() && !Tokens.empty() &&
1423 Tokens.back()->is(TT_VerilogNumberBase) &&
1424 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1425 // Mark the number following a base like `'h?a0` as a number.
1426 FormatTok->Tok.setKind(tok::numeric_constant);
1427 }
1428
1429 // Now FormatTok is the next non-whitespace token.
1430
1431 StringRef Text = FormatTok->TokenText;
1432 size_t FirstNewlinePos = Text.find('\n');
1433 if (FirstNewlinePos == StringRef::npos) {
1434 // FIXME: ColumnWidth actually depends on the start column, we need to
1435 // take this into account when the token is moved.
1436 FormatTok->ColumnWidth =
1437 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1438 Column += FormatTok->ColumnWidth;
1439 } else {
1440 FormatTok->IsMultiline = true;
1441 // FIXME: ColumnWidth actually depends on the start column, we need to
1442 // take this into account when the token is moved.
1443 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1444 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1445
1446 // The last line of the token always starts in column 0.
1447 // Thus, the length can be precomputed even in the presence of tabs.
1448 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1449 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1450 Column = FormatTok->LastLineColumnWidth;
1451 }
1452
1453 if (IsCpp) {
1454 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1455 auto it = Macros.find(Identifier);
 // A configured macro name is honored unless the previous token is the
 // `define` preprocessor keyword.
1456 if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
1457 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
1458 tok::pp_define) &&
1459 it != Macros.end()) {
1460 FormatTok->setType(it->second);
1461 if (it->second == TT_IfMacro) {
1462 // The lexer token currently has type tok::kw_unknown. However, for this
1463 // substitution to be treated correctly in the TokenAnnotator, faking
1464 // the tok value seems to be needed. Not sure if there's a more elegant
1465 // way.
1466 FormatTok->Tok.setKind(tok::kw_if);
1467 }
1468 } else if (FormatTok->is(tok::identifier)) {
1469 if (MacroBlockBeginRegex.match(Text))
1470 FormatTok->setType(TT_MacroBlockBegin);
1471 else if (MacroBlockEndRegex.match(Text))
1472 FormatTok->setType(TT_MacroBlockEnd);
1473 else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
1474 FormatTok->setFinalizedType(TT_FunctionLikeMacro);
1475 else if (TemplateNames.contains(Identifier))
1476 FormatTok->setFinalizedType(TT_TemplateName);
1477 else if (TypeNames.contains(Identifier))
1478 FormatTok->setFinalizedType(TT_TypeName);
1479 else if (VariableTemplates.contains(Identifier))
1480 FormatTok->setFinalizedType(TT_VariableTemplate);
1481 }
1482 }
1483
1484 return FormatTok;
1485}
1486
1487bool FormatTokenLexer::readVerilogProtected(FormatToken &Tok) {
1488 // The block follows the pragma line.
1489 if (!VerilogProtectedBlock || Tok.NewlinesBefore == 0)
1490 return false;
1491 VerilogProtectedBlock = false;
1492
1493 // The block can be empty. Then no token is necessary. A backtick on its own
1494 // line is likely a uuencode line. A backtick followed by something is assumed
1495 // to be the pragma line that ends the block.
1496 const char *const Start = Lex->getBufferLocation();
1497 size_t Len = Lex->getBuffer().end() - Start;
1498 if (Len == 0 ||
1499 (Len >= 2 && Start[0] == '`' && !isVerticalWhitespace(Start[1]))) {
1500 return false;
1501 }
1502
1503 // The block ends when the next pragma line starts.
1504 static const llvm::Regex NextDirective("[\n\r][ \t]*`[^\n\r]");
1505 SmallVector<StringRef, 1> Matches;
1506 if (NextDirective.match(StringRef(Start, Len), &Matches)) {
1507 assert(Matches.size() == 1);
1508 Len = Matches[0].begin() - Start;
1509 }
1510
1511 Tok.Tok.setKind(tok::string_literal);
1512 Tok.Tok.setLength(Len);
1513 Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
1514 Tok.setFinalizedType(TT_VerilogProtected);
1515 Lex->seek(Lex->getCurrentBufferOffset() + Len,
1516 /*IsAtStartOfLine=*/false);
1517 return true;
1518}
1519
1520bool FormatTokenLexer::readRawTokenVerilogSpecific(FormatToken &Tok) {
1521 if (readVerilogProtected(Tok))
1522 return true;
1523 const char *Start = Lex->getBufferLocation();
1524 size_t Len;
1525 switch (Start[0]) {
1526 // In Verilog the quote is not a character literal.
1527 case '\'':
1528 Len = 1;
1529 break;
1530 // Make the backtick and double backtick identifiers to match against them
1531 // more easily.
1532 case '`':
1533 if (Start[1] == '`')
1534 Len = 2;
1535 else
1536 Len = 1;
1537 break;
1538 // In Verilog an escaped identifier starts with a backslash and ends with
1539 // whitespace. Unless that whitespace is an escaped newline.
1540 // FIXME: If there is an escaped newline in the middle of an escaped
1541 // identifier, allow for pasting the two lines together, But escaped
1542 // identifiers usually occur only in generated code anyway.
1543 case '\\':
1544 // A backslash can also begin an escaped newline outside of an escaped
1545 // identifier.
1546 if (Start[1] == '\r' || Start[1] == '\n')
1547 return false;
1548 Len = 1;
1549 while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
1550 Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
1551 Start[Len] != ' ') {
1552 // There is a null byte at the end of the buffer, so we don't have to
1553 // check whether the next byte is within the buffer.
1554 if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
1555 Start[Len + 2] == '\n') {
1556 Len += 3;
1557 } else if (Start[Len] == '\\' &&
1558 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
1559 Len += 2;
1560 } else {
1561 Len += 1;
1562 }
1563 }
1564 break;
1565 default:
1566 return false;
1567 }
1568
1569 // The kind has to be an identifier so we can match it against those defined
1570 // in Keywords. The kind has to be set before the length because the setLength
1571 // function checks that the kind is not an annotation.
1572 Tok.Tok.setKind(tok::raw_identifier);
1573 Tok.Tok.setLength(Len);
1574 Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
1575 Tok.Tok.setRawIdentifierData(Start);
1576 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1577 return true;
1578}
1579
1580void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1581 // For Verilog, first see if there is a special token, and fall back to the
1582 // normal lexer if there isn't one.
1583 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok))
1584 Lex->LexFromRawLexer(Tok.Tok);
1585 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1586 Tok.Tok.getLength());
1587 // For formatting, treat unterminated string literals like normal string
1588 // literals.
1589 if (Tok.is(tok::unknown)) {
1590 if (Tok.TokenText.starts_with("\"")) {
1591 Tok.Tok.setKind(tok::string_literal);
1592 Tok.IsUnterminatedLiteral = true;
1593 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1594 Tok.Tok.setKind(tok::string_literal);
1595 }
1596 }
1597
1598 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1599 Tok.Tok.setKind(tok::string_literal);
1600
1601 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1602 FormattingDisabled = false;
1603
1604 Tok.Finalized = FormattingDisabled;
1605
1606 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1607 FormattingDisabled = true;
1608}
1609
1610void FormatTokenLexer::resetLexer(unsigned Offset) {
1611 StringRef Buffer = SourceMgr.getBufferData(ID);
1612 Lex = std::make_unique<Lexer>(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1613 Buffer.begin(), Buffer.begin() + Offset,
1614 Buffer.end());
1615 Lex->SetKeepWhitespaceMode(true);
1616 TrailingWhitespace = 0;
1617}
1618
1619} // namespace format
1620} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool is(tok::TokenKind Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
bool isNot(T Kind) const
FormatToken * Next
The next token in the unwrapped line.
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:142
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4864
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4860
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4458
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3951
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of ‘parallel’, 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5933
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:562
@ Type
The name was classified as a type.
Definition Sema.h:564
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3956
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5923
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5984
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.