clang 18.0.0git
FormatTokenLexer.cpp
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
17#include "clang/Basic/SourceLocation.h"
18#include "clang/Basic/SourceManager.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
25FormatTokenLexer::FormatTokenLexer(
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36 MacroBlockEndRegex(Style.MacroBlockEnd) {
37 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38 Lex->SetKeepWhitespaceMode(true);
39
40 for (const std::string &ForEachMacro : Style.ForEachMacros) {
41 auto Identifier = &IdentTable.get(ForEachMacro);
42 Macros.insert({Identifier, TT_ForEachMacro});
43 }
44 for (const std::string &IfMacro : Style.IfMacros) {
45 auto Identifier = &IdentTable.get(IfMacro);
46 Macros.insert({Identifier, TT_IfMacro});
47 }
48 for (const std::string &AttributeMacro : Style.AttributeMacros) {
49 auto Identifier = &IdentTable.get(AttributeMacro);
50 Macros.insert({Identifier, TT_AttributeMacro});
51 }
52 for (const std::string &StatementMacro : Style.StatementMacros) {
53 auto Identifier = &IdentTable.get(StatementMacro);
54 Macros.insert({Identifier, TT_StatementMacro});
55 }
56 for (const std::string &TypenameMacro : Style.TypenameMacros) {
57 auto Identifier = &IdentTable.get(TypenameMacro);
58 Macros.insert({Identifier, TT_TypenameMacro});
59 }
60 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61 auto Identifier = &IdentTable.get(NamespaceMacro);
62 Macros.insert({Identifier, TT_NamespaceMacro});
63 }
64 for (const std::string &WhitespaceSensitiveMacro :
65 Style.WhitespaceSensitiveMacros) {
66 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67 Macros.insert({Identifier, TT_UntouchableMacroFunc});
68 }
69 for (const std::string &StatementAttributeLikeMacro :
70 Style.StatementAttributeLikeMacros) {
71 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73 }
74
75 for (const auto &TypeName : Style.TypeNames)
76 TypeNames.insert(&IdentTable.get(TypeName));
77}
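// The maps built above drive identifier re-typing in getNextToken(): an
// identifier listed in, e.g., Style.ForEachMacros (such as BOOST_FOREACH in
// the default LLVM style) is later tagged TT_ForEachMacro instead of being
// treated as an ordinary function call.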
78
79ArrayRef<FormatToken *> FormatTokenLexer::lex() {
80 assert(Tokens.empty());
81 assert(FirstInLineIndex == 0);
82 do {
83 Tokens.push_back(getNextToken());
84 if (Style.isJavaScript()) {
85 tryParseJSRegexLiteral();
86 handleTemplateStrings();
87 }
88 if (Style.Language == FormatStyle::LK_TextProto)
89 tryParsePythonComment();
90 tryMergePreviousTokens();
91 if (Style.isCSharp()) {
92 // This needs to come after tokens have been merged so that C#
93 // string literals are correctly identified.
94 handleCSharpVerbatimAndInterpolatedStrings();
95 }
96 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
97 FirstInLineIndex = Tokens.size() - 1;
98 } while (Tokens.back()->isNot(tok::eof));
99 return Tokens;
100}
101
102void FormatTokenLexer::tryMergePreviousTokens() {
103 if (tryMerge_TMacro())
104 return;
105 if (tryMergeConflictMarkers())
106 return;
107 if (tryMergeLessLess())
108 return;
109 if (tryMergeGreaterGreater())
110 return;
111 if (tryMergeForEach())
112 return;
113 if (Style.isCpp() && tryTransformTryUsageForC())
114 return;
115
116 if (Style.isJavaScript() || Style.isCSharp()) {
117 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
118 tok::question};
119 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
120 tok::period};
121 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
122
123 if (tryMergeTokens(FatArrow, TT_FatArrow))
124 return;
125 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
126 // Treat like the "||" operator (as opposed to the ternary ?).
127 Tokens.back()->Tok.setKind(tok::pipepipe);
128 return;
129 }
130 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
131 // Treat like a regular "." access.
132 Tokens.back()->Tok.setKind(tok::period);
133 return;
134 }
135 if (tryMergeNullishCoalescingEqual())
136 return;
137 }
138
139 if (Style.isCSharp()) {
140 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
141 tok::question, tok::l_square};
142
143 if (tryMergeCSharpKeywordVariables())
144 return;
145 if (tryMergeCSharpStringLiteral())
146 return;
147 if (tryTransformCSharpForEach())
148 return;
149 if (tryMergeTokens(CSharpNullConditionalLSquare,
150 TT_CSharpNullConditionalLSquare)) {
151 // Treat like a regular "[" operator.
152 Tokens.back()->Tok.setKind(tok::l_square);
153 return;
154 }
155 }
156
157 if (tryMergeNSStringLiteral())
158 return;
159
160 if (Style.isJavaScript()) {
161 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
162 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
163 tok::equal};
164 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
165 tok::greaterequal};
166 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
167 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
168 tok::starequal};
169 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
170 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
171
172 // FIXME: Investigate what token type gives the correct operator priority.
173 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
174 return;
175 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
176 return;
177 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
178 return;
179 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
180 return;
181 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
182 Tokens.back()->Tok.setKind(tok::starequal);
183 return;
184 }
185 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
186 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
187 // Treat like the "=" assignment operator.
188 Tokens.back()->Tok.setKind(tok::equal);
189 return;
190 }
191 if (tryMergeJSPrivateIdentifier())
192 return;
193 }
194
195 if (Style.Language == FormatStyle::LK_Java) {
196 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
197 tok::greater, tok::greater, tok::greaterequal};
198 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
199 return;
200 }
201
202 if (Style.isVerilog()) {
203 // Merge the number following a base like `'h?a0`.
204 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
205 Tokens.end()[-2]->is(tok::numeric_constant) &&
206 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
207 tok::question) &&
208 tryMergeTokens(2, TT_Unknown)) {
209 return;
210 }
211 // Part select.
212 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
213 TT_BitFieldColon)) {
214 return;
215 }
216 // Xnor. The combined token is treated as a caret which can also be either a
217 // unary or binary operator. The actual type is determined in
218 // TokenAnnotator. We also check the token length so we know it is not
219 // already a merged token.
220 if (Tokens.back()->TokenText.size() == 1 &&
221 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
222 TT_BinaryOperator)) {
223 Tokens.back()->Tok.setKind(tok::caret);
224 return;
225 }
226 // Signed shift and distribution weight.
227 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
228 Tokens.back()->Tok.setKind(tok::lessless);
229 return;
230 }
231 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
232 Tokens.back()->Tok.setKind(tok::greatergreater);
233 return;
234 }
235 if (tryMergeTokensAny({{tok::lessless, tok::equal},
236 {tok::lessless, tok::lessequal},
237 {tok::greatergreater, tok::equal},
238 {tok::greatergreater, tok::greaterequal},
239 {tok::colon, tok::equal},
240 {tok::colon, tok::slash}},
241 TT_BinaryOperator)) {
242 Tokens.back()->ForcedPrecedence = prec::Assignment;
243 return;
244 }
245 // Exponentiation, signed shift, case equality, and wildcard equality.
246 if (tryMergeTokensAny({{tok::star, tok::star},
247 {tok::lessless, tok::less},
248 {tok::greatergreater, tok::greater},
249 {tok::exclaimequal, tok::equal},
250 {tok::exclaimequal, tok::question},
251 {tok::equalequal, tok::equal},
252 {tok::equalequal, tok::question}},
253 TT_BinaryOperator)) {
254 return;
255 }
256 // Module paths in specify blocks and implications in properties.
257 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
258 {tok::plus, tok::star, tok::greater},
259 {tok::minusequal, tok::greater},
260 {tok::minus, tok::star, tok::greater},
261 {tok::less, tok::arrow},
262 {tok::equal, tok::greater},
263 {tok::star, tok::greater},
264 {tok::pipeequal, tok::greater},
265 {tok::pipe, tok::arrow},
266 {tok::hash, tok::minus, tok::hash},
267 {tok::hash, tok::equal, tok::hash}},
268 TT_BinaryOperator)) {
269 Tokens.back()->ForcedPrecedence = prec::Comma;
270 return;
271 }
272 }
273}
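// For example, in JavaScript the raw token pair `=` `>` is merged above into
// a single TT_FatArrow token, and in Verilog consecutive `<` tokens are first
// merged into `<<` and then, with a further `<`, into the signed-shift `<<<`,
// because this function runs again after every token that lex() appends.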
274
275bool FormatTokenLexer::tryMergeNSStringLiteral() {
276 if (Tokens.size() < 2)
277 return false;
278 auto &At = *(Tokens.end() - 2);
279 auto &String = *(Tokens.end() - 1);
280 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
281 return false;
282 At->Tok.setKind(tok::string_literal);
283 At->TokenText = StringRef(At->TokenText.begin(),
284 String->TokenText.end() - At->TokenText.begin());
285 At->ColumnWidth += String->ColumnWidth;
286 At->setType(TT_ObjCStringLiteral);
287 Tokens.erase(Tokens.end() - 1);
288 return true;
289}
290
291bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
292 // Merges #identifier into a single identifier with the text #identifier
293 // but the token tok::identifier.
294 if (Tokens.size() < 2)
295 return false;
296 auto &Hash = *(Tokens.end() - 2);
297 auto &Identifier = *(Tokens.end() - 1);
298 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
299 return false;
300 Hash->Tok.setKind(tok::identifier);
301 Hash->TokenText =
302 StringRef(Hash->TokenText.begin(),
303 Identifier->TokenText.end() - Hash->TokenText.begin());
304 Hash->ColumnWidth += Identifier->ColumnWidth;
305 Hash->setType(TT_JsPrivateIdentifier);
306 Tokens.erase(Tokens.end() - 1);
307 return true;
308}
309
310// Search for verbatim or interpolated string literals @"ABC" or
311// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
312// prevent splitting of @, $ and ".
313// Merging of multiline verbatim strings with embedded '"' is handled in
314// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
315bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
316 if (Tokens.size() < 2)
317 return false;
318
319 // Look for @"aaaaaa" or $"aaaaaa".
320 const auto String = *(Tokens.end() - 1);
321 if (String->isNot(tok::string_literal))
322 return false;
323
324 auto Prefix = *(Tokens.end() - 2);
325 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
326 return false;
327
328 if (Tokens.size() > 2) {
329 const auto Tok = *(Tokens.end() - 3);
330 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
331 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
332 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
333 Tok->ColumnWidth += Prefix->ColumnWidth;
334 Tokens.erase(Tokens.end() - 2);
335 Prefix = Tok;
336 }
337 }
338
339 // Convert back into just a string_literal.
340 Prefix->Tok.setKind(tok::string_literal);
341 Prefix->TokenText =
342 StringRef(Prefix->TokenText.begin(),
343 String->TokenText.end() - Prefix->TokenText.begin());
344 Prefix->ColumnWidth += String->ColumnWidth;
345 Prefix->setType(TT_CSharpStringLiteral);
346 Tokens.erase(Tokens.end() - 1);
347 return true;
348}
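// For example, the raw tokens `$` `@` `"..."` (or `@` `$` `"..."`) of a C#
// literal such as $@"{x}" are collapsed above into one TT_CSharpStringLiteral
// token, so the prefix characters are never separated from the quoted text.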
349
350// Valid C# attribute targets:
351// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
352const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
353 "assembly", "module", "field", "event", "method",
354 "param", "property", "return", "type",
355};
356
357bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
358 if (Tokens.size() < 2)
359 return false;
360 auto &NullishCoalescing = *(Tokens.end() - 2);
361 auto &Equal = *(Tokens.end() - 1);
362 if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
363 Equal->isNot(tok::equal)) {
364 return false;
365 }
366 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
367 NullishCoalescing->TokenText =
368 StringRef(NullishCoalescing->TokenText.begin(),
369 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
370 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
371 NullishCoalescing->setType(TT_NullCoalescingEqual);
372 Tokens.erase(Tokens.end() - 1);
373 return true;
374}
375
376bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
377 if (Tokens.size() < 2)
378 return false;
379 const auto At = *(Tokens.end() - 2);
380 if (At->isNot(tok::at))
381 return false;
382 const auto Keyword = *(Tokens.end() - 1);
383 if (Keyword->TokenText == "$")
384 return false;
385 if (!Keywords.isCSharpKeyword(*Keyword))
386 return false;
387
388 At->Tok.setKind(tok::identifier);
389 At->TokenText = StringRef(At->TokenText.begin(),
390 Keyword->TokenText.end() - At->TokenText.begin());
391 At->ColumnWidth += Keyword->ColumnWidth;
392 At->setType(Keyword->getType());
393 Tokens.erase(Tokens.end() - 1);
394 return true;
395}
396
397// In C# transform identifier foreach into kw_foreach
398bool FormatTokenLexer::tryTransformCSharpForEach() {
399 if (Tokens.size() < 1)
400 return false;
401 auto &Identifier = *(Tokens.end() - 1);
402 if (Identifier->isNot(tok::identifier))
403 return false;
404 if (Identifier->TokenText != "foreach")
405 return false;
406
407 Identifier->setType(TT_ForEachMacro);
408 Identifier->Tok.setKind(tok::kw_for);
409 return true;
410}
411
412bool FormatTokenLexer::tryMergeForEach() {
413 if (Tokens.size() < 2)
414 return false;
415 auto &For = *(Tokens.end() - 2);
416 auto &Each = *(Tokens.end() - 1);
417 if (For->isNot(tok::kw_for))
418 return false;
419 if (Each->isNot(tok::identifier))
420 return false;
421 if (Each->TokenText != "each")
422 return false;
423
424 For->setType(TT_ForEachMacro);
425 For->Tok.setKind(tok::kw_for);
426
427 For->TokenText = StringRef(For->TokenText.begin(),
428 Each->TokenText.end() - For->TokenText.begin());
429 For->ColumnWidth += Each->ColumnWidth;
430 Tokens.erase(Tokens.end() - 1);
431 return true;
432}
433
434bool FormatTokenLexer::tryTransformTryUsageForC() {
435 if (Tokens.size() < 2)
436 return false;
437 auto &Try = *(Tokens.end() - 2);
438 if (Try->isNot(tok::kw_try))
439 return false;
440 auto &Next = *(Tokens.end() - 1);
441 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
442 return false;
443
444 if (Tokens.size() > 2) {
445 auto &At = *(Tokens.end() - 3);
446 if (At->is(tok::at))
447 return false;
448 }
449
450 Try->Tok.setKind(tok::identifier);
451 return true;
452}
453
454bool FormatTokenLexer::tryMergeLessLess() {
455 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
456 if (Tokens.size() < 3)
457 return false;
458
459 auto First = Tokens.end() - 3;
460 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
461 return false;
462
463 // Only merge if there currently is no whitespace between the two "<".
464 if (First[1]->hasWhitespaceBefore())
465 return false;
466
467 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
468 if (X && X->is(tok::less))
469 return false;
470
471 auto Y = First[2];
472 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
473 return false;
474
475 First[0]->Tok.setKind(tok::lessless);
476 First[0]->TokenText = "<<";
477 First[0]->ColumnWidth += 1;
478 Tokens.erase(Tokens.end() - 2);
479 return true;
480}
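// For example, the `<` `<` in `cout << value` (re-split by getNextToken()
// below) are merged back into a single `<<` here, while a run of three `<`,
// such as a CUDA-style `kernel<<<grid, block>>>` launch, is left unmerged
// because either X or Y would then be another `<`.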
481
482bool FormatTokenLexer::tryMergeGreaterGreater() {
483 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
484 if (Tokens.size() < 2)
485 return false;
486
487 auto First = Tokens.end() - 2;
488 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
489 return false;
490
491 // Only merge if there currently is no whitespace between the first two ">".
492 if (First[1]->hasWhitespaceBefore())
493 return false;
494
495 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
496 if (Tok && Tok->isNot(tok::kw_operator))
497 return false;
498
499 First[0]->Tok.setKind(tok::greatergreater);
500 First[0]->TokenText = ">>";
501 First[0]->ColumnWidth += 1;
502 Tokens.erase(Tokens.end() - 1);
503 return true;
504}
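// For example, `operator>>` (also re-split by getNextToken() below) is joined
// back into a single `>>` because the preceding token is `operator`, while
// the two `>` closing a nested template like `A<B<int>>` stay separate.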
505
506bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
507 TokenType NewType) {
508 if (Tokens.size() < Kinds.size())
509 return false;
510
511 SmallVectorImpl<FormatToken *>::const_iterator First =
512 Tokens.end() - Kinds.size();
513 for (unsigned i = 0; i < Kinds.size(); ++i)
514 if (First[i]->isNot(Kinds[i]))
515 return false;
516
517 return tryMergeTokens(Kinds.size(), NewType);
518}
519
520bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
521 if (Tokens.size() < Count)
522 return false;
523
524 SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
525 unsigned AddLength = 0;
526 for (size_t i = 1; i < Count; ++i) {
527 // If there is whitespace separating the token and the previous one,
528 // they should not be merged.
529 if (First[i]->hasWhitespaceBefore())
530 return false;
531 AddLength += First[i]->TokenText.size();
532 }
533
534 Tokens.resize(Tokens.size() - Count + 1);
535 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
536 First[0]->TokenText.size() + AddLength);
537 First[0]->ColumnWidth += AddLength;
538 First[0]->setType(NewType);
539 return true;
540}
541
542bool FormatTokenLexer::tryMergeTokensAny(
543 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
544 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
545 return tryMergeTokens(Kinds, NewType);
546 });
547}
548
549// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
550bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
551 // NB: This is not entirely correct, as an r_paren can introduce an operand
552 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
553 // corner case to not matter in practice, though.
554 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
555 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
556 tok::colon, tok::question, tok::tilde) ||
557 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
558 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
559 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
560 Tok->isBinaryOperator();
561}
562
563bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
564 if (!Prev)
565 return true;
566
567 // Regex literals can only follow after prefix unary operators, not after
568 // postfix unary operators. If the '++' is followed by a non-operand
569 // introducing token, the slash here is the operand and not the start of a
570 // regex.
571 // `!` is a unary prefix operator, but also a post-fix operator that casts
572 // away nullability, so the same check applies.
573 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
574 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
575
576 // The previous token must introduce an operand location where regex
577 // literals can occur.
578 if (!precedesOperand(Prev))
579 return false;
580
581 return true;
582}
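// For example, the `/` in `return /abc/.test(s);` follows `return`, which
// precedesOperand() accepts, so it may start a regex literal; the `/` in
// `a / b` follows an identifier and is treated as ordinary division instead.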
583
584// Tries to parse a JavaScript Regex literal starting at the current token,
585// if that begins with a slash and is in a location where JavaScript allows
586// regex literals. Changes the current token to a regex literal and updates
587// its text if successful.
588void FormatTokenLexer::tryParseJSRegexLiteral() {
589 FormatToken *RegexToken = Tokens.back();
590 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
591 return;
592
593 FormatToken *Prev = nullptr;
594 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
595 // NB: Because previous pointers are not initialized yet, this cannot use
596 // Token.getPreviousNonComment.
597 if (FT->isNot(tok::comment)) {
598 Prev = FT;
599 break;
600 }
601 }
602
603 if (!canPrecedeRegexLiteral(Prev))
604 return;
605
606 // 'Manually' lex ahead in the current file buffer.
607 const char *Offset = Lex->getBufferLocation();
608 const char *RegexBegin = Offset - RegexToken->TokenText.size();
609 StringRef Buffer = Lex->getBuffer();
610 bool InCharacterClass = false;
611 bool HaveClosingSlash = false;
612 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
613 // Regular expressions are terminated with a '/', which can only be
614 // escaped using '\' or a character class between '[' and ']'.
615 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
616 switch (*Offset) {
617 case '\\':
618 // Skip the escaped character.
619 ++Offset;
620 break;
621 case '[':
622 InCharacterClass = true;
623 break;
624 case ']':
625 InCharacterClass = false;
626 break;
627 case '/':
628 if (!InCharacterClass)
629 HaveClosingSlash = true;
630 break;
631 }
632 }
633
634 RegexToken->setType(TT_RegexLiteral);
635 // Treat regex literals like other string_literals.
636 RegexToken->Tok.setKind(tok::string_literal);
637 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
638 RegexToken->ColumnWidth = RegexToken->TokenText.size();
639
640 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
641}
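// For example, given `/a[/]b/` the `/` inside the character class does not
// end the literal; the scan above runs to the final unescaped `/` and the
// whole text becomes one token of kind string_literal and type TT_RegexLiteral.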
642
643static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
644 bool Interpolated) {
645 auto Repeated = [&Begin, End]() {
646 return Begin + 1 < End && Begin[1] == Begin[0];
647 };
648
649 // Look for a terminating '"' in the current file buffer.
650 // Make no effort to format code within an interpolated or verbatim string.
651 //
652 // Interpolated strings could contain { } with " characters inside.
653 // $"{x ?? "null"}"
654 // should not be split into $"{x ?? ", null, "}" but should be treated as a
655 // single string-literal.
656 //
657 // We opt not to try and format expressions inside {} within a C#
658 // interpolated string. Formatting expressions within an interpolated string
659 // would require similar work as that done for JavaScript template strings
660 // in `handleTemplateStrings()`.
661 for (int UnmatchedOpeningBraceCount = 0; Begin < End; ++Begin) {
662 switch (*Begin) {
663 case '\\':
664 if (!Verbatim)
665 ++Begin;
666 break;
667 case '{':
668 if (Interpolated) {
669 // {{ inside an interpolated string is escaped, so skip it.
670 if (Repeated())
671 ++Begin;
672 else
673 ++UnmatchedOpeningBraceCount;
674 }
675 break;
676 case '}':
677 if (Interpolated) {
678 // }} inside an interpolated string is escaped, so skip it.
679 if (Repeated())
680 ++Begin;
681 else if (UnmatchedOpeningBraceCount > 0)
682 --UnmatchedOpeningBraceCount;
683 else
684 return End;
685 }
686 break;
687 case '"':
688 if (UnmatchedOpeningBraceCount > 0)
689 break;
690 // "" within a verbatim string is an escaped double quote: skip it.
691 if (Verbatim && Repeated()) {
692 ++Begin;
693 break;
694 }
695 return Begin;
696 }
697 }
698
699 return End;
700}
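// For example, within a verbatim string @"a ""b"" c" the doubled quote is
// skipped above, and within an interpolated string a `"` that appears inside
// an unmatched `{` ... `}` pair does not terminate the literal either.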
701
702void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
703 FormatToken *CSharpStringLiteral = Tokens.back();
704
705 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
706 return;
707
708 auto &TokenText = CSharpStringLiteral->TokenText;
709
710 bool Verbatim = false;
711 bool Interpolated = false;
712 if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
713 Verbatim = true;
714 Interpolated = true;
715 } else if (TokenText.startswith(R"(@")")) {
716 Verbatim = true;
717 } else if (TokenText.startswith(R"($")")) {
718 Interpolated = true;
719 }
720
721 // Deal with multiline strings.
722 if (!Verbatim && !Interpolated)
723 return;
724
725 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
726 const char *Offset = StrBegin;
727 if (Verbatim && Interpolated)
728 Offset += 3;
729 else
730 Offset += 2;
731
732 const auto End = Lex->getBuffer().end();
733 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
734
735 // Make no attempt to format code properly if a verbatim string is
736 // unterminated.
737 if (Offset >= End)
738 return;
739
740 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
741 TokenText = LiteralText;
742
743 // Adjust width for potentially multiline string literals.
744 size_t FirstBreak = LiteralText.find('\n');
745 StringRef FirstLineText = FirstBreak == StringRef::npos
746 ? LiteralText
747 : LiteralText.substr(0, FirstBreak);
748 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
749 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
750 Encoding);
751 size_t LastBreak = LiteralText.rfind('\n');
752 if (LastBreak != StringRef::npos) {
753 CSharpStringLiteral->IsMultiline = true;
754 unsigned StartColumn = 0;
755 CSharpStringLiteral->LastLineColumnWidth =
756 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
757 StartColumn, Style.TabWidth, Encoding);
758 }
759
760 assert(Offset < End);
761 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
762}
763
764void FormatTokenLexer::handleTemplateStrings() {
765 FormatToken *BacktickToken = Tokens.back();
766
767 if (BacktickToken->is(tok::l_brace)) {
768 StateStack.push(LexerState::NORMAL);
769 return;
770 }
771 if (BacktickToken->is(tok::r_brace)) {
772 if (StateStack.size() == 1)
773 return;
774 StateStack.pop();
775 if (StateStack.top() != LexerState::TEMPLATE_STRING)
776 return;
777 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
778 } else if (BacktickToken->is(tok::unknown) &&
779 BacktickToken->TokenText == "`") {
780 StateStack.push(LexerState::TEMPLATE_STRING);
781 } else {
782 return; // Not actually a template
783 }
784
785 // 'Manually' lex ahead in the current file buffer.
786 const char *Offset = Lex->getBufferLocation();
787 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
788 for (; Offset != Lex->getBuffer().end(); ++Offset) {
789 if (Offset[0] == '`') {
790 StateStack.pop();
791 ++Offset;
792 break;
793 }
794 if (Offset[0] == '\\') {
795 ++Offset; // Skip the escaped character.
796 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
797 Offset[1] == '{') {
798 // '${' introduces an expression interpolation in the template string.
799 StateStack.push(LexerState::NORMAL);
800 Offset += 2;
801 break;
802 }
803 }
804
805 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
806 BacktickToken->setType(TT_TemplateString);
807 BacktickToken->Tok.setKind(tok::string_literal);
808 BacktickToken->TokenText = LiteralText;
809
810 // Adjust width for potentially multiline string literals.
811 size_t FirstBreak = LiteralText.find('\n');
812 StringRef FirstLineText = FirstBreak == StringRef::npos
813 ? LiteralText
814 : LiteralText.substr(0, FirstBreak);
815 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
816 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
817 size_t LastBreak = LiteralText.rfind('\n');
818 if (LastBreak != StringRef::npos) {
819 BacktickToken->IsMultiline = true;
820 unsigned StartColumn = 0; // The template tail spans the entire line.
821 BacktickToken->LastLineColumnWidth =
822 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
823 StartColumn, Style.TabWidth, Encoding);
824 }
825
826 SourceLocation loc = Lex->getSourceLocation(Offset);
827 resetLexer(SourceMgr.getFileOffset(loc));
828}
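// For example, while lexing `a ${x + 1} b` the `${` pushes LexerState::NORMAL
// so that `x + 1` is tokenized as regular code; the matching `}` later pops
// back into TEMPLATE_STRING and the tail of the template is consumed here as
// another TT_TemplateString piece.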
829
830void FormatTokenLexer::tryParsePythonComment() {
831 FormatToken *HashToken = Tokens.back();
832 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
833 return;
834 // Turn the remainder of this line into a comment.
835 const char *CommentBegin =
836 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
837 size_t From = CommentBegin - Lex->getBuffer().begin();
838 size_t To = Lex->getBuffer().find_first_of('\n', From);
839 if (To == StringRef::npos)
840 To = Lex->getBuffer().size();
841 size_t Len = To - From;
842 HashToken->setType(TT_LineComment);
843 HashToken->Tok.setKind(tok::comment);
844 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
845 SourceLocation Loc = To < Lex->getBuffer().size()
846 ? Lex->getSourceLocation(CommentBegin + Len)
847 : SourceMgr.getLocForEndOfFile(ID);
848 resetLexer(SourceMgr.getFileOffset(Loc));
849}
850
851bool FormatTokenLexer::tryMerge_TMacro() {
852 if (Tokens.size() < 4)
853 return false;
854 FormatToken *Last = Tokens.back();
855 if (Last->isNot(tok::r_paren))
856 return false;
857
858 FormatToken *String = Tokens[Tokens.size() - 2];
859 if (String->isNot(tok::string_literal) || String->IsMultiline)
860 return false;
861
862 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
863 return false;
864
865 FormatToken *Macro = Tokens[Tokens.size() - 4];
866 if (Macro->TokenText != "_T")
867 return false;
868
869 const char *Start = Macro->TokenText.data();
870 const char *End = Last->TokenText.data() + Last->TokenText.size();
871 String->TokenText = StringRef(Start, End - Start);
872 String->IsFirst = Macro->IsFirst;
873 String->LastNewlineOffset = Macro->LastNewlineOffset;
874 String->WhitespaceRange = Macro->WhitespaceRange;
875 String->OriginalColumn = Macro->OriginalColumn;
876 String->ColumnWidth = encoding::columnWidthWithTabs(
877 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
878 String->NewlinesBefore = Macro->NewlinesBefore;
879 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
880
881 Tokens.pop_back();
882 Tokens.pop_back();
883 Tokens.pop_back();
884 Tokens.back() = String;
885 if (FirstInLineIndex >= Tokens.size())
886 FirstInLineIndex = Tokens.size() - 1;
887 return true;
888}
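// For example, the four tokens `_T` `(` `"text"` `)` are collapsed above into
// a single string_literal token whose text spans the whole `_T("text")` call,
// so the macro wrapper is never broken apart from its argument.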
889
890bool FormatTokenLexer::tryMergeConflictMarkers() {
891 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
892 return false;
893
894 // Conflict lines look like:
895 // <marker> <text from the vcs>
896 // For example:
897 // >>>>>>> /file/in/file/system at revision 1234
898 //
899 // We merge all tokens in a line that starts with a conflict marker
900 // into a single token with a special token type that the unwrapped line
901 // parser will use to correctly rebuild the underlying code.
902
903 FileID ID;
904 // Get the position of the first token in the line.
905 unsigned FirstInLineOffset;
906 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
907 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
908 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
909 // Calculate the offset of the start of the current line.
910 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
911 if (LineOffset == StringRef::npos)
912 LineOffset = 0;
913 else
914 ++LineOffset;
915
916 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
917 StringRef LineStart;
918 if (FirstSpace == StringRef::npos)
919 LineStart = Buffer.substr(LineOffset);
920 else
921 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
922
923 TokenType Type = TT_Unknown;
924 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
925 Type = TT_ConflictStart;
926 } else if (LineStart == "|||||||" || LineStart == "=======" ||
927 LineStart == "====") {
928 Type = TT_ConflictAlternative;
929 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
930 Type = TT_ConflictEnd;
931 }
932
933 if (Type != TT_Unknown) {
934 FormatToken *Next = Tokens.back();
935
936 Tokens.resize(FirstInLineIndex + 1);
937 // We do not need to build a complete token here, as we will skip it
938 // during parsing anyway (as we must not touch whitespace around conflict
939 // markers).
940 Tokens.back()->setType(Type);
941 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
942
943 Tokens.push_back(Next);
944 return true;
945 }
946
947 return false;
948}
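// For example, a line beginning with `<<<<<<< HEAD` is reduced above to a
// single token of type TT_ConflictStart (kind tok::kw___unknown_anytype), so
// the unwrapped-line parser can pass the conflict marker through untouched.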
949
950FormatToken *FormatTokenLexer::getStashedToken() {
951 // Create a synthesized second '>' or '<' token.
952 Token Tok = FormatTok->Tok;
953 StringRef TokenText = FormatTok->TokenText;
954
955 unsigned OriginalColumn = FormatTok->OriginalColumn;
956 FormatTok = new (Allocator.Allocate()) FormatToken;
957 FormatTok->Tok = Tok;
958 SourceLocation TokLocation =
959 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
960 FormatTok->Tok.setLocation(TokLocation);
961 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
962 FormatTok->TokenText = TokenText;
963 FormatTok->ColumnWidth = 1;
964 FormatTok->OriginalColumn = OriginalColumn + 1;
965
966 return FormatTok;
967}
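// getStashedToken() pairs with the tok::lessless / tok::greatergreater
// handling in getNextToken() below: the raw `<<` or `>>` is shortened to its
// first character there, TOKEN_STASHED is pushed, and the next call returns
// this synthesized second `<` or `>` instead of lexing new input.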
968
969/// Truncate the current token to the new length and make the lexer continue
970/// from the end of the truncated token. Used for other languages that have
971/// different token boundaries, like JavaScript in which a comment ends at a
972/// line break regardless of whether the line break follows a backslash. Also
973/// used to set the lexer to the end of whitespace if the lexer regards
974/// whitespace and an unrecognized symbol as one token.
975void FormatTokenLexer::truncateToken(size_t NewLen) {
976 assert(NewLen <= FormatTok->TokenText.size());
977 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
978 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
979 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
980 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
981 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
982 Encoding);
983 FormatTok->Tok.setLength(NewLen);
984}
985
986/// Count the length of leading whitespace in a token.
987static size_t countLeadingWhitespace(StringRef Text) {
988 // Basically counting the length matched by this regex.
989 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
990 // Directly using the regex turned out to be slow. With the regex
991 // version formatting all files in this directory took about 1.25
992 // seconds. This version took about 0.5 seconds.
993 const unsigned char *const Begin = Text.bytes_begin();
994 const unsigned char *const End = Text.bytes_end();
995 const unsigned char *Cur = Begin;
996 while (Cur < End) {
997 if (isspace(Cur[0])) {
998 ++Cur;
999 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1000 // A '\' followed by a newline always escapes the newline, regardless
1001 // of whether there is another '\' before it.
1002 // The source has a null byte at the end. So the end of the entire input
1003 // isn't reached yet. Also the lexer doesn't break apart an escaped
1004 // newline.
1005 assert(End - Cur >= 2);
1006 Cur += 2;
1007 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1008 (Cur[3] == '\n' || Cur[3] == '\r')) {
1009 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1010 // characters are quoted individually in this comment because if we write
1011 // them together some compilers warn that we have a trigraph in the code.
1012 assert(End - Cur >= 4);
1013 Cur += 4;
1014 } else {
1015 break;
1016 }
1017 }
1018 return Cur - Begin;
1019}
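// For example, a backslash at the end of a line (an escaped newline) and the
// trigraph spelling of it ('?' '?' '/' before the newline) are both counted
// as leading whitespace here, in addition to ordinary spaces, tabs, newlines,
// form feeds and vertical tabs.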
1020
1021FormatToken *FormatTokenLexer::getNextToken() {
1022 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1023 StateStack.pop();
1024 return getStashedToken();
1025 }
1026
1027 FormatTok = new (Allocator.Allocate()) FormatToken;
1028 readRawToken(*FormatTok);
1029 SourceLocation WhitespaceStart =
1030 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1031 FormatTok->IsFirst = IsFirstToken;
1032 IsFirstToken = false;
1033
1034 // Consume and record whitespace until we find a significant token.
1035 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1036 // followed by a symbol such as backtick. Those symbols may be
1037 // significant in other languages.
1038 unsigned WhitespaceLength = TrailingWhitespace;
1039 while (FormatTok->isNot(tok::eof)) {
1040 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1041 if (LeadingWhitespace == 0)
1042 break;
1043 if (LeadingWhitespace < FormatTok->TokenText.size())
1044 truncateToken(LeadingWhitespace);
1045 StringRef Text = FormatTok->TokenText;
1046 bool InEscape = false;
1047 for (int i = 0, e = Text.size(); i != e; ++i) {
1048 switch (Text[i]) {
1049 case '\r':
1050 // If this is a CRLF sequence, break here and the LF will be handled on
1051 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1052 // the same as a single LF.
1053 if (i + 1 < e && Text[i + 1] == '\n')
1054 break;
1055 [[fallthrough]];
1056 case '\n':
1057 ++FormatTok->NewlinesBefore;
1058 if (!InEscape)
1059 FormatTok->HasUnescapedNewline = true;
1060 else
1061 InEscape = false;
1062 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1063 Column = 0;
1064 break;
1065 case '\f':
1066 case '\v':
1067 Column = 0;
1068 break;
1069 case ' ':
1070 ++Column;
1071 break;
1072 case '\t':
1073 Column +=
1074 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1075 break;
1076 case '\\':
1077 case '?':
1078 case '/':
1079 // The text was entirely whitespace when this loop was entered. Thus
1080 // this has to be an escape sequence.
1081 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1082 Text.substr(i, 4) == "\?\?/\r" ||
1083 Text.substr(i, 4) == "\?\?/\n" ||
1084 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1085 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1086 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1087 Text.substr(i - 2, 4) == "\?\?/\n")));
1088 InEscape = true;
1089 break;
1090 default:
1091 // This shouldn't happen.
1092 assert(false);
1093 break;
1094 }
1095 }
1096 WhitespaceLength += Text.size();
1097 readRawToken(*FormatTok);
1098 }
1099
1100 if (FormatTok->is(tok::unknown))
1101 FormatTok->setType(TT_ImplicitStringLiteral);
1102
1103 // JavaScript and Java do not allow escaping the end of the line with a
1104 // backslash. Backslashes are syntax errors in plain source, but can occur in
1105 // comments. When a single line comment ends with a \, it'll cause the next
1106 // line of code to be lexed as a comment, breaking formatting. The code below
1107 // finds comments that contain a backslash followed by a line break, truncates
1108 // the comment token at the backslash, and resets the lexer to restart behind
1109 // the backslash.
1110 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1111 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1112 size_t BackslashPos = FormatTok->TokenText.find('\\');
1113 while (BackslashPos != StringRef::npos) {
1114 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1115 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1116 truncateToken(BackslashPos + 1);
1117 break;
1118 }
1119 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1120 }
1121 }
1122
1123 if (Style.isVerilog()) {
1124 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1125 SmallVector<StringRef, 1> Matches;
1126 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1127 // And it uses the hash for delays and parameter lists. In order to continue
1128 // using `tok::hash` in other places, the backtick gets marked as the hash
1129 // here. And in order to tell the backtick and hash apart for
1130 // Verilog-specific stuff, the hash becomes an identifier.
1131 if (FormatTok->is(tok::numeric_constant)) {
1132 // In Verilog the quote is not part of a number.
1133 auto Quote = FormatTok->TokenText.find('\'');
1134 if (Quote != StringRef::npos)
1135 truncateToken(Quote);
1136 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1137 FormatTok->Tok.setKind(tok::raw_identifier);
1138 } else if (FormatTok->is(tok::raw_identifier)) {
1139 if (FormatTok->TokenText == "`") {
1140 FormatTok->Tok.setIdentifierInfo(nullptr);
1141 FormatTok->Tok.setKind(tok::hash);
1142 } else if (FormatTok->TokenText == "``") {
1143 FormatTok->Tok.setIdentifierInfo(nullptr);
1144 FormatTok->Tok.setKind(tok::hashhash);
1145 } else if (Tokens.size() > 0 &&
1146 Tokens.back()->is(Keywords.kw_apostrophe) &&
1147 NumberBase.match(FormatTok->TokenText, &Matches)) {
1148 // In Verilog in a based number literal like `'b10`, there may be
1149 // whitespace between `'b` and `10`. Therefore we handle the base and
1150 // the rest of the number literal as two tokens. But if there is no
1151 // space in the input code, we need to manually separate the two parts.
1152 truncateToken(Matches[0].size());
1153 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1154 }
1155 }
1156 }
1157
1158 FormatTok->WhitespaceRange = SourceRange(
1159 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1160
1161 FormatTok->OriginalColumn = Column;
1162
1163 TrailingWhitespace = 0;
1164 if (FormatTok->is(tok::comment)) {
1165 // FIXME: Add the trimmed whitespace to Column.
1166 StringRef UntrimmedText = FormatTok->TokenText;
1167 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1168 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1169 } else if (FormatTok->is(tok::raw_identifier)) {
1170 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1171 FormatTok->Tok.setIdentifierInfo(&Info);
1172 FormatTok->Tok.setKind(Info.getTokenID());
1173 if (Style.Language == FormatStyle::LK_Java &&
1174 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1175 tok::kw_operator)) {
1176 FormatTok->Tok.setKind(tok::identifier);
1177 FormatTok->Tok.setIdentifierInfo(nullptr);
1178 } else if (Style.isJavaScript() &&
1179 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1180 tok::kw_operator)) {
1181 FormatTok->Tok.setKind(tok::identifier);
1182 FormatTok->Tok.setIdentifierInfo(nullptr);
1183 }
1184 } else if (FormatTok->is(tok::greatergreater)) {
1185 FormatTok->Tok.setKind(tok::greater);
1186 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1187 ++Column;
1188 StateStack.push(LexerState::TOKEN_STASHED);
1189 } else if (FormatTok->is(tok::lessless)) {
1190 FormatTok->Tok.setKind(tok::less);
1191 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1192 ++Column;
1193 StateStack.push(LexerState::TOKEN_STASHED);
1194 }
1195
1196 if (Style.isVerilog() && Tokens.size() > 0 &&
1197 Tokens.back()->is(TT_VerilogNumberBase) &&
1198 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1199 // Mark the number following a base like `'h?a0` as a number.
1200 FormatTok->Tok.setKind(tok::numeric_constant);
1201 }
1202
1203 // Now FormatTok is the next non-whitespace token.
1204
1205 StringRef Text = FormatTok->TokenText;
1206 size_t FirstNewlinePos = Text.find('\n');
1207 if (FirstNewlinePos == StringRef::npos) {
1208 // FIXME: ColumnWidth actually depends on the start column, we need to
1209 // take this into account when the token is moved.
1210 FormatTok->ColumnWidth =
1211 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1212 Column += FormatTok->ColumnWidth;
1213 } else {
1214 FormatTok->IsMultiline = true;
1215 // FIXME: ColumnWidth actually depends on the start column, we need to
1216 // take this into account when the token is moved.
1217 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1218 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1219
1220 // The last line of the token always starts in column 0.
1221 // Thus, the length can be precomputed even in the presence of tabs.
1222 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1223 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1224 Column = FormatTok->LastLineColumnWidth;
1225 }
1226
1227 if (Style.isCpp()) {
1228 auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1229 auto it = Macros.find(Identifier);
1230 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1231 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1232 tok::pp_define) &&
1233 it != Macros.end()) {
1234 FormatTok->setType(it->second);
1235 if (it->second == TT_IfMacro) {
1236 // The lexer token currently has type tok::kw_unknown. However, for this
1237 // substitution to be treated correctly in the TokenAnnotator, faking
1238 // the tok value seems to be needed. Not sure if there's a more elegant
1239 // way.
1240 FormatTok->Tok.setKind(tok::kw_if);
1241 }
1242 } else if (FormatTok->is(tok::identifier)) {
1243 if (MacroBlockBeginRegex.match(Text))
1244 FormatTok->setType(TT_MacroBlockBegin);
1245 else if (MacroBlockEndRegex.match(Text))
1246 FormatTok->setType(TT_MacroBlockEnd);
1247 else if (TypeNames.contains(Identifier))
1248 FormatTok->setFinalizedType(TT_TypeName);
1249 }
1250 }
1251
1252 return FormatTok;
1253}
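// For example, if Style.TypeNames contains a name like "uint", a later use of
// that identifier is finalized as TT_TypeName above, and an identifier
// matching MacroBlockBeginRegex or MacroBlockEndRegex is typed as
// TT_MacroBlockBegin / TT_MacroBlockEnd for the unwrapped-line parser.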
1254
1255bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1256 // In Verilog the quote is not a character literal.
1257 //
1258 // Make the backtick and double backtick identifiers so that they can be
1259 // matched against more easily.
1260 //
1261 // In Verilog an escaped identifier starts with backslash and ends with
1262 // whitespace. Unless that whitespace is an escaped newline. A backslash can
1263 // also begin an escaped newline outside of an escaped identifier. We check
1264 // for that outside of the Regex since we can't use negative lookahead
1265 // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1266 // identifier may have a length of 0 according to Section A.9.3.
1267 // FIXME: If there is an escaped newline in the middle of an escaped
1268 // identifier, allow for pasting the two lines together. But escaped
1269 // identifiers usually occur only in generated code anyway.
1270 static const llvm::Regex VerilogToken(R"re(^('|``?|\\(\\)re"
1271 "(\r?\n|\r)|[^[:space:]])*)");
1272
1273 SmallVector<StringRef, 4> Matches;
1274 const char *Start = Lex->getBufferLocation();
1275 if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1276 &Matches)) {
1277 return false;
1278 }
1279 // There is a null byte at the end of the buffer, so we don't have to check
1280 // Start[1] is within the buffer.
1281 if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1282 return false;
1283 size_t Len = Matches[0].size();
1284
1285 // The kind has to be an identifier so we can match it against those defined
1286 // in Keywords. The kind has to be set before the length because the setLength
1287 // function checks that the kind is not an annotation.
1288 Tok.setKind(tok::raw_identifier);
1289 Tok.setLength(Len);
1290 Tok.setLocation(Lex->getSourceLocation(Start, Len));
1291 Tok.setRawIdentifierData(Start);
1292 Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1293 return true;
1294}
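// For example, this path emits the Verilog-only tokens ' (apostrophe), ` and
// `` as raw identifiers so they can be matched against the extra keywords,
// and it keeps an escaped identifier such as \bus$sel (a backslash up to the
// next unescaped whitespace) together as a single token.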
1295
1296void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1297 // For Verilog, first see if there is a special token, and fall back to the
1298 // normal lexer if there isn't one.
1299 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1300 Lex->LexFromRawLexer(Tok.Tok);
1301 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1302 Tok.Tok.getLength());
1303 // For formatting, treat unterminated string literals like normal string
1304 // literals.
1305 if (Tok.is(tok::unknown)) {
1306 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1307 Tok.Tok.setKind(tok::string_literal);
1308 Tok.IsUnterminatedLiteral = true;
1309 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1310 Tok.Tok.setKind(tok::string_literal);
1311 }
1312 }
1313
1314 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
1315 Style.Language == FormatStyle::LK_TextProto) &&
1316 Tok.is(tok::char_constant)) {
1317 Tok.Tok.setKind(tok::string_literal);
1318 }
1319
1320 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1321 FormattingDisabled = false;
1322
1323 Tok.Finalized = FormattingDisabled;
1324
1325 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1326 FormattingDisabled = true;
1327}
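// Note the ordering above: a "clang-format on" comment clears
// FormattingDisabled before Finalized is assigned, and a "clang-format off"
// comment sets it only afterwards, so both marker comments themselves remain
// formattable while every token between them is finalized.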
1328
1329void FormatTokenLexer::resetLexer(unsigned Offset) {
1330 StringRef Buffer = SourceMgr.getBufferData(ID);
1331 LangOpts = getFormattingLangOpts(Style);
1332 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1333 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1334 Lex->SetKeepWhitespaceMode(true);
1335 TrailingWhitespace = 0;
1336}
1337
1338} // namespace format
1339} // namespace clang