clang 17.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
19#include "clang/Format/Format.h"
20#include "llvm/Support/Regex.h"
21
22namespace clang {
23namespace format {
24
26 const SourceManager &SourceMgr, FileID ID, unsigned Column,
27 const FormatStyle &Style, encoding::Encoding Encoding,
28 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29 IdentifierTable &IdentTable)
30 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31 Column(Column), TrailingWhitespace(0),
32 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36 MacroBlockEndRegex(Style.MacroBlockEnd) {
37 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
38 Lex->SetKeepWhitespaceMode(true);
39
40 for (const std::string &ForEachMacro : Style.ForEachMacros) {
41 auto Identifier = &IdentTable.get(ForEachMacro);
42 Macros.insert({Identifier, TT_ForEachMacro});
43 }
44 for (const std::string &IfMacro : Style.IfMacros) {
45 auto Identifier = &IdentTable.get(IfMacro);
46 Macros.insert({Identifier, TT_IfMacro});
47 }
48 for (const std::string &AttributeMacro : Style.AttributeMacros) {
49 auto Identifier = &IdentTable.get(AttributeMacro);
50 Macros.insert({Identifier, TT_AttributeMacro});
51 }
52 for (const std::string &StatementMacro : Style.StatementMacros) {
53 auto Identifier = &IdentTable.get(StatementMacro);
54 Macros.insert({Identifier, TT_StatementMacro});
55 }
56 for (const std::string &TypenameMacro : Style.TypenameMacros) {
57 auto Identifier = &IdentTable.get(TypenameMacro);
58 Macros.insert({Identifier, TT_TypenameMacro});
59 }
60 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61 auto Identifier = &IdentTable.get(NamespaceMacro);
62 Macros.insert({Identifier, TT_NamespaceMacro});
63 }
64 for (const std::string &WhitespaceSensitiveMacro :
66 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67 Macros.insert({Identifier, TT_UntouchableMacroFunc});
68 }
69 for (const std::string &StatementAttributeLikeMacro :
71 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73 }
74}
75
77 assert(Tokens.empty());
78 assert(FirstInLineIndex == 0);
79 do {
80 Tokens.push_back(getNextToken());
81 if (Style.isJavaScript()) {
82 tryParseJSRegexLiteral();
83 handleTemplateStrings();
84 }
86 tryParsePythonComment();
87 tryMergePreviousTokens();
88 if (Style.isCSharp()) {
89 // This needs to come after tokens have been merged so that C#
90 // string literals are correctly identified.
91 handleCSharpVerbatimAndInterpolatedStrings();
92 }
93 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
94 FirstInLineIndex = Tokens.size() - 1;
95 } while (Tokens.back()->isNot(tok::eof));
96 return Tokens;
97}
98
99void FormatTokenLexer::tryMergePreviousTokens() {
100 if (tryMerge_TMacro())
101 return;
102 if (tryMergeConflictMarkers())
103 return;
104 if (tryMergeLessLess())
105 return;
106 if (tryMergeGreaterGreater())
107 return;
108 if (tryMergeForEach())
109 return;
110 if (Style.isCpp() && tryTransformTryUsageForC())
111 return;
112
113 if (Style.isJavaScript() || Style.isCSharp()) {
114 static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
115 tok::question};
116 static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
117 tok::period};
118 static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
119
120 if (tryMergeTokens(FatArrow, TT_FatArrow))
121 return;
122 if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
123 // Treat like the "||" operator (as opposed to the ternary ?).
124 Tokens.back()->Tok.setKind(tok::pipepipe);
125 return;
126 }
127 if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
128 // Treat like a regular "." access.
129 Tokens.back()->Tok.setKind(tok::period);
130 return;
131 }
132 if (tryMergeNullishCoalescingEqual())
133 return;
134 }
135
136 if (Style.isCSharp()) {
137 static const tok::TokenKind CSharpNullConditionalLSquare[] = {
138 tok::question, tok::l_square};
139
140 if (tryMergeCSharpKeywordVariables())
141 return;
142 if (tryMergeCSharpStringLiteral())
143 return;
144 if (tryTransformCSharpForEach())
145 return;
146 if (tryMergeTokens(CSharpNullConditionalLSquare,
147 TT_CSharpNullConditionalLSquare)) {
148 // Treat like a regular "[" operator.
149 Tokens.back()->Tok.setKind(tok::l_square);
150 return;
151 }
152 }
153
154 if (tryMergeNSStringLiteral())
155 return;
156
157 if (Style.isJavaScript()) {
158 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
159 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
160 tok::equal};
161 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
162 tok::greaterequal};
163 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
164 static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
165 tok::starequal};
166 static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
167 static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
168
169 // FIXME: Investigate what token type gives the correct operator priority.
170 if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
171 return;
172 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
173 return;
174 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
175 return;
176 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
177 return;
178 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
179 Tokens.back()->Tok.setKind(tok::starequal);
180 return;
181 }
182 if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
183 tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
184 // Treat like the "=" assignment operator.
185 Tokens.back()->Tok.setKind(tok::equal);
186 return;
187 }
188 if (tryMergeJSPrivateIdentifier())
189 return;
190 }
191
192 if (Style.Language == FormatStyle::LK_Java) {
193 static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
194 tok::greater, tok::greater, tok::greaterequal};
195 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
196 return;
197 }
198
199 if (Style.isVerilog()) {
200 // Merge the number following a base like `'h?a0`.
201 if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
202 Tokens.end()[-2]->is(tok::numeric_constant) &&
203 Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
204 tok::question) &&
205 tryMergeTokens(2, TT_Unknown)) {
206 return;
207 }
208 // Part select.
209 if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
210 TT_BitFieldColon)) {
211 return;
212 }
213 // Xnor. The combined token is treated as a caret which can also be either a
214 // unary or binary operator. The actual type is determined in
215 // TokenAnnotator. We also check the token length so we know it is not
216 // already a merged token.
217 if (Tokens.back()->TokenText.size() == 1 &&
218 tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
219 TT_BinaryOperator)) {
220 Tokens.back()->Tok.setKind(tok::caret);
221 return;
222 }
223 // Signed shift and distribution weight.
224 if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
225 Tokens.back()->Tok.setKind(tok::lessless);
226 return;
227 }
228 if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
229 Tokens.back()->Tok.setKind(tok::greatergreater);
230 return;
231 }
232 if (tryMergeTokensAny({{tok::lessless, tok::equal},
233 {tok::lessless, tok::lessequal},
234 {tok::greatergreater, tok::equal},
235 {tok::greatergreater, tok::greaterequal},
236 {tok::colon, tok::equal},
237 {tok::colon, tok::slash}},
238 TT_BinaryOperator)) {
239 Tokens.back()->ForcedPrecedence = prec::Assignment;
240 return;
241 }
242 // Exponentiation, signed shift, case equality, and wildcard equality.
243 if (tryMergeTokensAny({{tok::star, tok::star},
244 {tok::lessless, tok::less},
245 {tok::greatergreater, tok::greater},
246 {tok::exclaimequal, tok::equal},
247 {tok::exclaimequal, tok::question},
248 {tok::equalequal, tok::equal},
249 {tok::equalequal, tok::question}},
250 TT_BinaryOperator)) {
251 return;
252 }
253 // Module paths in specify blocks and implications in properties.
254 if (tryMergeTokensAny({{tok::plusequal, tok::greater},
255 {tok::plus, tok::star, tok::greater},
256 {tok::minusequal, tok::greater},
257 {tok::minus, tok::star, tok::greater},
258 {tok::less, tok::arrow},
259 {tok::equal, tok::greater},
260 {tok::star, tok::greater},
261 {tok::pipeequal, tok::greater},
262 {tok::pipe, tok::arrow},
263 {tok::hash, tok::minus, tok::hash},
264 {tok::hash, tok::equal, tok::hash}},
265 TT_BinaryOperator)) {
266 Tokens.back()->ForcedPrecedence = prec::Comma;
267 return;
268 }
269 }
270}
271
272bool FormatTokenLexer::tryMergeNSStringLiteral() {
273 if (Tokens.size() < 2)
274 return false;
275 auto &At = *(Tokens.end() - 2);
276 auto &String = *(Tokens.end() - 1);
277 if (!At->is(tok::at) || !String->is(tok::string_literal))
278 return false;
279 At->Tok.setKind(tok::string_literal);
280 At->TokenText = StringRef(At->TokenText.begin(),
281 String->TokenText.end() - At->TokenText.begin());
282 At->ColumnWidth += String->ColumnWidth;
283 At->setType(TT_ObjCStringLiteral);
284 Tokens.erase(Tokens.end() - 1);
285 return true;
286}
287
288bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
289 // Merges #idenfier into a single identifier with the text #identifier
290 // but the token tok::identifier.
291 if (Tokens.size() < 2)
292 return false;
293 auto &Hash = *(Tokens.end() - 2);
294 auto &Identifier = *(Tokens.end() - 1);
295 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
296 return false;
297 Hash->Tok.setKind(tok::identifier);
298 Hash->TokenText =
299 StringRef(Hash->TokenText.begin(),
300 Identifier->TokenText.end() - Hash->TokenText.begin());
301 Hash->ColumnWidth += Identifier->ColumnWidth;
302 Hash->setType(TT_JsPrivateIdentifier);
303 Tokens.erase(Tokens.end() - 1);
304 return true;
305}
306
307// Search for verbatim or interpolated string literals @"ABC" or
308// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
309// prevent splitting of @, $ and ".
310// Merging of multiline verbatim strings with embedded '"' is handled in
311// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
312bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
313 if (Tokens.size() < 2)
314 return false;
315
316 // Look for @"aaaaaa" or $"aaaaaa".
317 const auto String = *(Tokens.end() - 1);
318 if (String->isNot(tok::string_literal))
319 return false;
320
321 auto Prefix = *(Tokens.end() - 2);
322 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
323 return false;
324
325 if (Tokens.size() > 2) {
326 const auto Tok = *(Tokens.end() - 3);
327 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
328 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
329 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
330 Tok->ColumnWidth += Prefix->ColumnWidth;
331 Tokens.erase(Tokens.end() - 2);
332 Prefix = Tok;
333 }
334 }
335
336 // Convert back into just a string_literal.
337 Prefix->Tok.setKind(tok::string_literal);
338 Prefix->TokenText =
339 StringRef(Prefix->TokenText.begin(),
340 String->TokenText.end() - Prefix->TokenText.begin());
341 Prefix->ColumnWidth += String->ColumnWidth;
342 Prefix->setType(TT_CSharpStringLiteral);
343 Tokens.erase(Tokens.end() - 1);
344 return true;
345}
346
347// Valid C# attribute targets:
348// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
349const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
350 "assembly", "module", "field", "event", "method",
351 "param", "property", "return", "type",
352};
353
354bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
355 if (Tokens.size() < 2)
356 return false;
357 auto &NullishCoalescing = *(Tokens.end() - 2);
358 auto &Equal = *(Tokens.end() - 1);
359 if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
360 !Equal->is(tok::equal)) {
361 return false;
362 }
363 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
364 NullishCoalescing->TokenText =
365 StringRef(NullishCoalescing->TokenText.begin(),
366 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
367 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
368 NullishCoalescing->setType(TT_NullCoalescingEqual);
369 Tokens.erase(Tokens.end() - 1);
370 return true;
371}
372
373bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
374 if (Tokens.size() < 2)
375 return false;
376 const auto At = *(Tokens.end() - 2);
377 if (At->isNot(tok::at))
378 return false;
379 const auto Keyword = *(Tokens.end() - 1);
380 if (Keyword->TokenText == "$")
381 return false;
382 if (!Keywords.isCSharpKeyword(*Keyword))
383 return false;
384
385 At->Tok.setKind(tok::identifier);
386 At->TokenText = StringRef(At->TokenText.begin(),
387 Keyword->TokenText.end() - At->TokenText.begin());
388 At->ColumnWidth += Keyword->ColumnWidth;
389 At->setType(Keyword->getType());
390 Tokens.erase(Tokens.end() - 1);
391 return true;
392}
393
394// In C# transform identifier foreach into kw_foreach
395bool FormatTokenLexer::tryTransformCSharpForEach() {
396 if (Tokens.size() < 1)
397 return false;
398 auto &Identifier = *(Tokens.end() - 1);
399 if (!Identifier->is(tok::identifier))
400 return false;
401 if (Identifier->TokenText != "foreach")
402 return false;
403
404 Identifier->setType(TT_ForEachMacro);
405 Identifier->Tok.setKind(tok::kw_for);
406 return true;
407}
408
409bool FormatTokenLexer::tryMergeForEach() {
410 if (Tokens.size() < 2)
411 return false;
412 auto &For = *(Tokens.end() - 2);
413 auto &Each = *(Tokens.end() - 1);
414 if (!For->is(tok::kw_for))
415 return false;
416 if (!Each->is(tok::identifier))
417 return false;
418 if (Each->TokenText != "each")
419 return false;
420
421 For->setType(TT_ForEachMacro);
422 For->Tok.setKind(tok::kw_for);
423
424 For->TokenText = StringRef(For->TokenText.begin(),
425 Each->TokenText.end() - For->TokenText.begin());
426 For->ColumnWidth += Each->ColumnWidth;
427 Tokens.erase(Tokens.end() - 1);
428 return true;
429}
430
431bool FormatTokenLexer::tryTransformTryUsageForC() {
432 if (Tokens.size() < 2)
433 return false;
434 auto &Try = *(Tokens.end() - 2);
435 if (!Try->is(tok::kw_try))
436 return false;
437 auto &Next = *(Tokens.end() - 1);
438 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
439 return false;
440
441 if (Tokens.size() > 2) {
442 auto &At = *(Tokens.end() - 3);
443 if (At->is(tok::at))
444 return false;
445 }
446
447 Try->Tok.setKind(tok::identifier);
448 return true;
449}
450
451bool FormatTokenLexer::tryMergeLessLess() {
452 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
453 if (Tokens.size() < 3)
454 return false;
455
456 auto First = Tokens.end() - 3;
457 if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
458 return false;
459
460 // Only merge if there currently is no whitespace between the two "<".
461 if (First[1]->hasWhitespaceBefore())
462 return false;
463
464 auto X = Tokens.size() > 3 ? First[-1] : nullptr;
465 if (X && X->is(tok::less))
466 return false;
467
468 auto Y = First[2];
469 if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
470 return false;
471
472 First[0]->Tok.setKind(tok::lessless);
473 First[0]->TokenText = "<<";
474 First[0]->ColumnWidth += 1;
475 Tokens.erase(Tokens.end() - 2);
476 return true;
477}
478
479bool FormatTokenLexer::tryMergeGreaterGreater() {
480 // Merge kw_operator,greater,greater into kw_operator,greatergreater.
481 if (Tokens.size() < 2)
482 return false;
483
484 auto First = Tokens.end() - 2;
485 if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
486 return false;
487
488 // Only merge if there currently is no whitespace between the first two ">".
489 if (First[1]->hasWhitespaceBefore())
490 return false;
491
492 auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
493 if (Tok && Tok->isNot(tok::kw_operator))
494 return false;
495
496 First[0]->Tok.setKind(tok::greatergreater);
497 First[0]->TokenText = ">>";
498 First[0]->ColumnWidth += 1;
499 Tokens.erase(Tokens.end() - 1);
500 return true;
501}
502
503bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
504 TokenType NewType) {
505 if (Tokens.size() < Kinds.size())
506 return false;
507
508 SmallVectorImpl<FormatToken *>::const_iterator First =
509 Tokens.end() - Kinds.size();
510 for (unsigned i = 0; i < Kinds.size(); ++i)
511 if (!First[i]->is(Kinds[i]))
512 return false;
513
514 return tryMergeTokens(Kinds.size(), NewType);
515}
516
517bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
518 if (Tokens.size() < Count)
519 return false;
520
521 SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
522 unsigned AddLength = 0;
523 for (size_t i = 1; i < Count; ++i) {
524 // If there is whitespace separating the token and the previous one,
525 // they should not be merged.
526 if (First[i]->hasWhitespaceBefore())
527 return false;
528 AddLength += First[i]->TokenText.size();
529 }
530
531 Tokens.resize(Tokens.size() - Count + 1);
532 First[0]->TokenText = StringRef(First[0]->TokenText.data(),
533 First[0]->TokenText.size() + AddLength);
534 First[0]->ColumnWidth += AddLength;
535 First[0]->setType(NewType);
536 return true;
537}
538
539bool FormatTokenLexer::tryMergeTokensAny(
540 ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
541 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
542 return tryMergeTokens(Kinds, NewType);
543 });
544}
545
546// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
547bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
548 // NB: This is not entirely correct, as an r_paren can introduce an operand
549 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
550 // corner case to not matter in practice, though.
551 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
552 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
553 tok::colon, tok::question, tok::tilde) ||
554 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
555 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
556 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
557 Tok->isBinaryOperator();
558}
559
560bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
561 if (!Prev)
562 return true;
563
564 // Regex literals can only follow after prefix unary operators, not after
565 // postfix unary operators. If the '++' is followed by a non-operand
566 // introducing token, the slash here is the operand and not the start of a
567 // regex.
568 // `!` is an unary prefix operator, but also a post-fix operator that casts
569 // away nullability, so the same check applies.
570 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
571 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
572
573 // The previous token must introduce an operand location where regex
574 // literals can occur.
575 if (!precedesOperand(Prev))
576 return false;
577
578 return true;
579}
580
581// Tries to parse a JavaScript Regex literal starting at the current token,
582// if that begins with a slash and is in a location where JavaScript allows
583// regex literals. Changes the current token to a regex literal and updates
584// its text if successful.
585void FormatTokenLexer::tryParseJSRegexLiteral() {
586 FormatToken *RegexToken = Tokens.back();
587 if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
588 return;
589
590 FormatToken *Prev = nullptr;
591 for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
592 // NB: Because previous pointers are not initialized yet, this cannot use
593 // Token.getPreviousNonComment.
594 if (FT->isNot(tok::comment)) {
595 Prev = FT;
596 break;
597 }
598 }
599
600 if (!canPrecedeRegexLiteral(Prev))
601 return;
602
603 // 'Manually' lex ahead in the current file buffer.
604 const char *Offset = Lex->getBufferLocation();
605 const char *RegexBegin = Offset - RegexToken->TokenText.size();
606 StringRef Buffer = Lex->getBuffer();
607 bool InCharacterClass = false;
608 bool HaveClosingSlash = false;
609 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
610 // Regular expressions are terminated with a '/', which can only be
611 // escaped using '\' or a character class between '[' and ']'.
612 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
613 switch (*Offset) {
614 case '\\':
615 // Skip the escaped character.
616 ++Offset;
617 break;
618 case '[':
619 InCharacterClass = true;
620 break;
621 case ']':
622 InCharacterClass = false;
623 break;
624 case '/':
625 if (!InCharacterClass)
626 HaveClosingSlash = true;
627 break;
628 }
629 }
630
631 RegexToken->setType(TT_RegexLiteral);
632 // Treat regex literals like other string_literals.
633 RegexToken->Tok.setKind(tok::string_literal);
634 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
635 RegexToken->ColumnWidth = RegexToken->TokenText.size();
636
637 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
638}
639
// Scans a C# string body for its terminating '"', honoring the escaping
// rules of verbatim (@"..."), interpolated ($"...") and combined literals.
// Returns a pointer to the closing quote, or End if the literal is
// unterminated (or an interpolation brace is unbalanced).
//
// Interpolated strings could contain { } with " characters inside:
//   $"{x ?? "null"}"
// should not be split into $"{x ?? ", null, "}" but must be treated as a
// single string-literal. No attempt is made to format expressions inside
// {}; that would require work similar to handleTemplateStrings() for
// JavaScript template strings.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  // True iff the character at Begin is immediately repeated ("" or {{ or }}).
  auto IsDoubled = [&Begin, End] {
    return Begin + 1 < End && Begin[1] == Begin[0];
  };

  for (int OpenBraces = 0; Begin < End; ++Begin) {
    switch (*Begin) {
    case '\\':
      // Backslash escapes exist only in non-verbatim strings.
      if (!Verbatim)
        ++Begin;
      break;
    case '{':
      if (Interpolated) {
        if (IsDoubled())
          ++Begin; // "{{" is an escaped literal brace.
        else
          ++OpenBraces;
      }
      break;
    case '}':
      if (Interpolated) {
        if (IsDoubled())
          ++Begin; // "}}" is an escaped literal brace.
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End; // Unbalanced '}': give up on this literal.
      }
      break;
    case '"':
      if (OpenBraces > 0)
        break; // Quote inside an interpolation expression.
      if (Verbatim && IsDoubled()) {
        ++Begin; // "" inside a verbatim string is an escaped quote.
        break;
      }
      return Begin; // Found the terminating quote.
    }
  }

  return End; // Unterminated literal.
}
698
699void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
700 FormatToken *CSharpStringLiteral = Tokens.back();
701
702 if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
703 return;
704
705 auto &TokenText = CSharpStringLiteral->TokenText;
706
707 bool Verbatim = false;
708 bool Interpolated = false;
709 if (TokenText.startswith(R"($@")") || TokenText.startswith(R"(@$")")) {
710 Verbatim = true;
711 Interpolated = true;
712 } else if (TokenText.startswith(R"(@")")) {
713 Verbatim = true;
714 } else if (TokenText.startswith(R"($")")) {
715 Interpolated = true;
716 }
717
718 // Deal with multiline strings.
719 if (!Verbatim && !Interpolated)
720 return;
721
722 const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
723 const char *Offset = StrBegin;
724 if (Verbatim && Interpolated)
725 Offset += 3;
726 else
727 Offset += 2;
728
729 const auto End = Lex->getBuffer().end();
730 Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
731
732 // Make no attempt to format code properly if a verbatim string is
733 // unterminated.
734 if (Offset >= End)
735 return;
736
737 StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
738 TokenText = LiteralText;
739
740 // Adjust width for potentially multiline string literals.
741 size_t FirstBreak = LiteralText.find('\n');
742 StringRef FirstLineText = FirstBreak == StringRef::npos
743 ? LiteralText
744 : LiteralText.substr(0, FirstBreak);
745 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
746 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
747 Encoding);
748 size_t LastBreak = LiteralText.rfind('\n');
749 if (LastBreak != StringRef::npos) {
750 CSharpStringLiteral->IsMultiline = true;
751 unsigned StartColumn = 0;
752 CSharpStringLiteral->LastLineColumnWidth =
753 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
754 StartColumn, Style.TabWidth, Encoding);
755 }
756
757 assert(Offset < End);
758 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
759}
760
761void FormatTokenLexer::handleTemplateStrings() {
762 FormatToken *BacktickToken = Tokens.back();
763
764 if (BacktickToken->is(tok::l_brace)) {
765 StateStack.push(LexerState::NORMAL);
766 return;
767 }
768 if (BacktickToken->is(tok::r_brace)) {
769 if (StateStack.size() == 1)
770 return;
771 StateStack.pop();
772 if (StateStack.top() != LexerState::TEMPLATE_STRING)
773 return;
774 // If back in TEMPLATE_STRING, fallthrough and continue parsing the
775 } else if (BacktickToken->is(tok::unknown) &&
776 BacktickToken->TokenText == "`") {
777 StateStack.push(LexerState::TEMPLATE_STRING);
778 } else {
779 return; // Not actually a template
780 }
781
782 // 'Manually' lex ahead in the current file buffer.
783 const char *Offset = Lex->getBufferLocation();
784 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
785 for (; Offset != Lex->getBuffer().end(); ++Offset) {
786 if (Offset[0] == '`') {
787 StateStack.pop();
788 ++Offset;
789 break;
790 }
791 if (Offset[0] == '\\') {
792 ++Offset; // Skip the escaped character.
793 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
794 Offset[1] == '{') {
795 // '${' introduces an expression interpolation in the template string.
796 StateStack.push(LexerState::NORMAL);
797 Offset += 2;
798 break;
799 }
800 }
801
802 StringRef LiteralText(TmplBegin, Offset - TmplBegin);
803 BacktickToken->setType(TT_TemplateString);
804 BacktickToken->Tok.setKind(tok::string_literal);
805 BacktickToken->TokenText = LiteralText;
806
807 // Adjust width for potentially multiline string literals.
808 size_t FirstBreak = LiteralText.find('\n');
809 StringRef FirstLineText = FirstBreak == StringRef::npos
810 ? LiteralText
811 : LiteralText.substr(0, FirstBreak);
812 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
813 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
814 size_t LastBreak = LiteralText.rfind('\n');
815 if (LastBreak != StringRef::npos) {
816 BacktickToken->IsMultiline = true;
817 unsigned StartColumn = 0; // The template tail spans the entire line.
818 BacktickToken->LastLineColumnWidth =
819 encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
820 StartColumn, Style.TabWidth, Encoding);
821 }
822
823 SourceLocation loc = Lex->getSourceLocation(Offset);
824 resetLexer(SourceMgr.getFileOffset(loc));
825}
826
827void FormatTokenLexer::tryParsePythonComment() {
828 FormatToken *HashToken = Tokens.back();
829 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
830 return;
831 // Turn the remainder of this line into a comment.
832 const char *CommentBegin =
833 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
834 size_t From = CommentBegin - Lex->getBuffer().begin();
835 size_t To = Lex->getBuffer().find_first_of('\n', From);
836 if (To == StringRef::npos)
837 To = Lex->getBuffer().size();
838 size_t Len = To - From;
839 HashToken->setType(TT_LineComment);
840 HashToken->Tok.setKind(tok::comment);
841 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
842 SourceLocation Loc = To < Lex->getBuffer().size()
843 ? Lex->getSourceLocation(CommentBegin + Len)
844 : SourceMgr.getLocForEndOfFile(ID);
845 resetLexer(SourceMgr.getFileOffset(Loc));
846}
847
848bool FormatTokenLexer::tryMerge_TMacro() {
849 if (Tokens.size() < 4)
850 return false;
851 FormatToken *Last = Tokens.back();
852 if (!Last->is(tok::r_paren))
853 return false;
854
855 FormatToken *String = Tokens[Tokens.size() - 2];
856 if (!String->is(tok::string_literal) || String->IsMultiline)
857 return false;
858
859 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
860 return false;
861
862 FormatToken *Macro = Tokens[Tokens.size() - 4];
863 if (Macro->TokenText != "_T")
864 return false;
865
866 const char *Start = Macro->TokenText.data();
867 const char *End = Last->TokenText.data() + Last->TokenText.size();
868 String->TokenText = StringRef(Start, End - Start);
869 String->IsFirst = Macro->IsFirst;
870 String->LastNewlineOffset = Macro->LastNewlineOffset;
871 String->WhitespaceRange = Macro->WhitespaceRange;
872 String->OriginalColumn = Macro->OriginalColumn;
873 String->ColumnWidth = encoding::columnWidthWithTabs(
874 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
875 String->NewlinesBefore = Macro->NewlinesBefore;
876 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
877
878 Tokens.pop_back();
879 Tokens.pop_back();
880 Tokens.pop_back();
881 Tokens.back() = String;
882 if (FirstInLineIndex >= Tokens.size())
883 FirstInLineIndex = Tokens.size() - 1;
884 return true;
885}
886
887bool FormatTokenLexer::tryMergeConflictMarkers() {
888 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
889 return false;
890
891 // Conflict lines look like:
892 // <marker> <text from the vcs>
893 // For example:
894 // >>>>>>> /file/in/file/system at revision 1234
895 //
896 // We merge all tokens in a line that starts with a conflict marker
897 // into a single token with a special token type that the unwrapped line
898 // parser will use to correctly rebuild the underlying code.
899
900 FileID ID;
901 // Get the position of the first token in the line.
902 unsigned FirstInLineOffset;
903 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
904 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
905 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
906 // Calculate the offset of the start of the current line.
907 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
908 if (LineOffset == StringRef::npos)
909 LineOffset = 0;
910 else
911 ++LineOffset;
912
913 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
914 StringRef LineStart;
915 if (FirstSpace == StringRef::npos)
916 LineStart = Buffer.substr(LineOffset);
917 else
918 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
919
920 TokenType Type = TT_Unknown;
921 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
922 Type = TT_ConflictStart;
923 } else if (LineStart == "|||||||" || LineStart == "=======" ||
924 LineStart == "====") {
925 Type = TT_ConflictAlternative;
926 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
927 Type = TT_ConflictEnd;
928 }
929
930 if (Type != TT_Unknown) {
931 FormatToken *Next = Tokens.back();
932
933 Tokens.resize(FirstInLineIndex + 1);
934 // We do not need to build a complete token here, as we will skip it
935 // during parsing anyway (as we must not touch whitespace around conflict
936 // markers).
937 Tokens.back()->setType(Type);
938 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
939
940 Tokens.push_back(Next);
941 return true;
942 }
943
944 return false;
945}
946
947FormatToken *FormatTokenLexer::getStashedToken() {
948 // Create a synthesized second '>' or '<' token.
949 Token Tok = FormatTok->Tok;
950 StringRef TokenText = FormatTok->TokenText;
951
952 unsigned OriginalColumn = FormatTok->OriginalColumn;
953 FormatTok = new (Allocator.Allocate()) FormatToken;
954 FormatTok->Tok = Tok;
955 SourceLocation TokLocation =
956 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
957 FormatTok->Tok.setLocation(TokLocation);
958 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
959 FormatTok->TokenText = TokenText;
960 FormatTok->ColumnWidth = 1;
961 FormatTok->OriginalColumn = OriginalColumn + 1;
962
963 return FormatTok;
964}
965
966/// Truncate the current token to the new length and make the lexer continue
967/// from the end of the truncated token. Used for other languages that have
968/// different token boundaries, like JavaScript in which a comment ends at a
969/// line break regardless of whether the line break follows a backslash. Also
970/// used to set the lexer to the end of whitespace if the lexer regards
971/// whitespace and an unrecognized symbol as one token.
972void FormatTokenLexer::truncateToken(size_t NewLen) {
973 assert(NewLen <= FormatTok->TokenText.size());
974 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
975 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
976 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
978 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
979 Encoding);
980 FormatTok->Tok.setLength(NewLen);
981}
982
983/// Count the length of leading whitespace in a token.
984static size_t countLeadingWhitespace(StringRef Text) {
985 // Basically counting the length matched by this regex.
986 // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
987 // Directly using the regex turned out to be slow. With the regex
988 // version formatting all files in this directory took about 1.25
989 // seconds. This version took about 0.5 seconds.
990 const unsigned char *const Begin = Text.bytes_begin();
991 const unsigned char *const End = Text.bytes_end();
992 const unsigned char *Cur = Begin;
993 while (Cur < End) {
994 if (isspace(Cur[0])) {
995 ++Cur;
996 } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
997 // A '\' followed by a newline always escapes the newline, regardless
998 // of whether there is another '\' before it.
999 // The source has a null byte at the end. So the end of the entire input
1000 // isn't reached yet. Also the lexer doesn't break apart an escaped
1001 // newline.
1002 assert(End - Cur >= 2);
1003 Cur += 2;
1004 } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1005 (Cur[3] == '\n' || Cur[3] == '\r')) {
1006 // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1007 // characters are quoted individually in this comment because if we write
1008 // them together some compilers warn that we have a trigraph in the code.
1009 assert(End - Cur >= 4);
1010 Cur += 4;
1011 } else {
1012 break;
1013 }
1014 }
1015 return Cur - Begin;
1016}
1017
1018FormatToken *FormatTokenLexer::getNextToken() {
1019 if (StateStack.top() == LexerState::TOKEN_STASHED) {
1020 StateStack.pop();
1021 return getStashedToken();
1022 }
1023
1024 FormatTok = new (Allocator.Allocate()) FormatToken;
1025 readRawToken(*FormatTok);
1026 SourceLocation WhitespaceStart =
1027 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1028 FormatTok->IsFirst = IsFirstToken;
1029 IsFirstToken = false;
1030
1031 // Consume and record whitespace until we find a significant token.
1032 // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1033 // followed by a symbol such as backtick. Those symbols may be
1034 // significant in other languages.
1035 unsigned WhitespaceLength = TrailingWhitespace;
1036 while (FormatTok->isNot(tok::eof)) {
1037 auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1038 if (LeadingWhitespace == 0)
1039 break;
1040 if (LeadingWhitespace < FormatTok->TokenText.size())
1041 truncateToken(LeadingWhitespace);
1042 StringRef Text = FormatTok->TokenText;
1043 bool InEscape = false;
1044 for (int i = 0, e = Text.size(); i != e; ++i) {
1045 switch (Text[i]) {
1046 case '\r':
1047 // If this is a CRLF sequence, break here and the LF will be handled on
1048 // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1049 // the same as a single LF.
1050 if (i + 1 < e && Text[i + 1] == '\n')
1051 break;
1052 [[fallthrough]];
1053 case '\n':
1054 ++FormatTok->NewlinesBefore;
1055 if (!InEscape)
1056 FormatTok->HasUnescapedNewline = true;
1057 else
1058 InEscape = false;
1059 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1060 Column = 0;
1061 break;
1062 case '\f':
1063 case '\v':
1064 Column = 0;
1065 break;
1066 case ' ':
1067 ++Column;
1068 break;
1069 case '\t':
1070 Column +=
1071 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1072 break;
1073 case '\\':
1074 case '?':
1075 case '/':
1076 // The text was entirely whitespace when this loop was entered. Thus
1077 // this has to be an escape sequence.
1078 assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1079 Text.substr(i, 4) == "\?\?/\r" ||
1080 Text.substr(i, 4) == "\?\?/\n" ||
1081 (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1082 Text.substr(i - 1, 4) == "\?\?/\n")) ||
1083 (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1084 Text.substr(i - 2, 4) == "\?\?/\n")));
1085 InEscape = true;
1086 break;
1087 default:
1088 // This shouldn't happen.
1089 assert(false);
1090 break;
1091 }
1092 }
1093 WhitespaceLength += Text.size();
1094 readRawToken(*FormatTok);
1095 }
1096
1097 if (FormatTok->is(tok::unknown))
1098 FormatTok->setType(TT_ImplicitStringLiteral);
1099
1100 // JavaScript and Java do not allow to escape the end of the line with a
1101 // backslash. Backslashes are syntax errors in plain source, but can occur in
1102 // comments. When a single line comment ends with a \, it'll cause the next
1103 // line of code to be lexed as a comment, breaking formatting. The code below
1104 // finds comments that contain a backslash followed by a line break, truncates
1105 // the comment token at the backslash, and resets the lexer to restart behind
1106 // the backslash.
1107 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1108 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
1109 size_t BackslashPos = FormatTok->TokenText.find('\\');
1110 while (BackslashPos != StringRef::npos) {
1111 if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1112 FormatTok->TokenText[BackslashPos + 1] == '\n') {
1113 truncateToken(BackslashPos + 1);
1114 break;
1115 }
1116 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1117 }
1118 }
1119
1120 if (Style.isVerilog()) {
1121 static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1122 SmallVector<StringRef, 1> Matches;
1123 // Verilog uses the backtick instead of the hash for preprocessor stuff.
1124 // And it uses the hash for delays and parameter lists. In order to continue
1125 // using `tok::hash` in other places, the backtick gets marked as the hash
1126 // here. And in order to tell the backtick and hash apart for
1127 // Verilog-specific stuff, the hash becomes an identifier.
1128 if (FormatTok->is(tok::numeric_constant)) {
1129 // In Verilog the quote is not part of a number.
1130 auto Quote = FormatTok->TokenText.find('\'');
1131 if (Quote != StringRef::npos)
1132 truncateToken(Quote);
1133 } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1134 FormatTok->Tok.setKind(tok::raw_identifier);
1135 } else if (FormatTok->is(tok::raw_identifier)) {
1136 if (FormatTok->TokenText == "`") {
1137 FormatTok->Tok.setIdentifierInfo(nullptr);
1138 FormatTok->Tok.setKind(tok::hash);
1139 } else if (FormatTok->TokenText == "``") {
1140 FormatTok->Tok.setIdentifierInfo(nullptr);
1141 FormatTok->Tok.setKind(tok::hashhash);
1142 } else if (Tokens.size() > 0 &&
1143 Tokens.back()->is(Keywords.kw_apostrophe) &&
1144 NumberBase.match(FormatTok->TokenText, &Matches)) {
1145 // In Verilog in a based number literal like `'b10`, there may be
1146 // whitespace between `'b` and `10`. Therefore we handle the base and
1147 // the rest of the number literal as two tokens. But if there is no
1148 // space in the input code, we need to manually separate the two parts.
1149 truncateToken(Matches[0].size());
1150 FormatTok->setFinalizedType(TT_VerilogNumberBase);
1151 }
1152 }
1153 }
1154
1155 FormatTok->WhitespaceRange = SourceRange(
1156 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1157
1158 FormatTok->OriginalColumn = Column;
1159
1160 TrailingWhitespace = 0;
1161 if (FormatTok->is(tok::comment)) {
1162 // FIXME: Add the trimmed whitespace to Column.
1163 StringRef UntrimmedText = FormatTok->TokenText;
1164 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1165 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1166 } else if (FormatTok->is(tok::raw_identifier)) {
1167 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1168 FormatTok->Tok.setIdentifierInfo(&Info);
1169 FormatTok->Tok.setKind(Info.getTokenID());
1170 if (Style.Language == FormatStyle::LK_Java &&
1171 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1172 tok::kw_operator)) {
1173 FormatTok->Tok.setKind(tok::identifier);
1174 FormatTok->Tok.setIdentifierInfo(nullptr);
1175 } else if (Style.isJavaScript() &&
1176 FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1177 tok::kw_operator)) {
1178 FormatTok->Tok.setKind(tok::identifier);
1179 FormatTok->Tok.setIdentifierInfo(nullptr);
1180 }
1181 } else if (FormatTok->is(tok::greatergreater)) {
1182 FormatTok->Tok.setKind(tok::greater);
1183 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1184 ++Column;
1185 StateStack.push(LexerState::TOKEN_STASHED);
1186 } else if (FormatTok->is(tok::lessless)) {
1187 FormatTok->Tok.setKind(tok::less);
1188 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1189 ++Column;
1190 StateStack.push(LexerState::TOKEN_STASHED);
1191 }
1192
1193 if (Style.isVerilog() && Tokens.size() > 0 &&
1194 Tokens.back()->is(TT_VerilogNumberBase) &&
1195 FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1196 // Mark the number following a base like `'h?a0` as a number.
1197 FormatTok->Tok.setKind(tok::numeric_constant);
1198 }
1199
1200 // Now FormatTok is the next non-whitespace token.
1201
1202 StringRef Text = FormatTok->TokenText;
1203 size_t FirstNewlinePos = Text.find('\n');
1204 if (FirstNewlinePos == StringRef::npos) {
1205 // FIXME: ColumnWidth actually depends on the start column, we need to
1206 // take this into account when the token is moved.
1207 FormatTok->ColumnWidth =
1208 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1209 Column += FormatTok->ColumnWidth;
1210 } else {
1211 FormatTok->IsMultiline = true;
1212 // FIXME: ColumnWidth actually depends on the start column, we need to
1213 // take this into account when the token is moved.
1215 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1216
1217 // The last line of the token always starts in column 0.
1218 // Thus, the length can be precomputed even in the presence of tabs.
1220 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1221 Column = FormatTok->LastLineColumnWidth;
1222 }
1223
1224 if (Style.isCpp()) {
1225 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1226 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1227 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1228 tok::pp_define) &&
1229 it != Macros.end()) {
1230 FormatTok->setType(it->second);
1231 if (it->second == TT_IfMacro) {
1232 // The lexer token currently has type tok::kw_unknown. However, for this
1233 // substitution to be treated correctly in the TokenAnnotator, faking
1234 // the tok value seems to be needed. Not sure if there's a more elegant
1235 // way.
1236 FormatTok->Tok.setKind(tok::kw_if);
1237 }
1238 } else if (FormatTok->is(tok::identifier)) {
1239 if (MacroBlockBeginRegex.match(Text))
1240 FormatTok->setType(TT_MacroBlockBegin);
1241 else if (MacroBlockEndRegex.match(Text))
1242 FormatTok->setType(TT_MacroBlockEnd);
1243 }
1244 }
1245
1246 return FormatTok;
1247}
1248
// Tries to lex one Verilog-specific token at the current buffer position.
// Returns true and fills in \p Tok (advancing the lexer past it) on success;
// returns false to let the normal raw lexer handle the input instead.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  // In Verilog the quote is not a character literal.
  //
  // Make the backtick and double backtick identifiers to match against them
  // more easily.
  //
  // In Verilog an escaped identifier starts with backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline. A backslash can
  // also begin an escaped newline outside of an escaped identifier. We check
  // for that outside of the Regex since we can't use negative lookhead
  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
  // identifier may have a length of 0 according to Section A.9.3.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  // NOTE: The raw string below deliberately contains an invisible character
  // between "\\" and "(" (see upstream history) — do not "clean it up".
  static const llvm::Regex VerilogToken(R"re(^('|``?|\\‍(\\)re"
                                        "(\r?\n|\r)|[^[:space:]])*)");

  SmallVector<StringRef, 4> Matches;
  // Match against the remainder of the buffer, starting at the lexer's
  // current position.
  const char *Start = Lex->getBufferLocation();
  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
                          &Matches)) {
    return false;
  }
  // There is a null byte at the end of the buffer, so we don't have to check
  // Start[1] is within the buffer.
  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
    return false;
  size_t Len = Matches[0].size();

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1289
1290void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1291 // For Verilog, first see if there is a special token, and fall back to the
1292 // normal lexer if there isn't one.
1293 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1294 Lex->LexFromRawLexer(Tok.Tok);
1295 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1296 Tok.Tok.getLength());
1297 // For formatting, treat unterminated string literals like normal string
1298 // literals.
1299 if (Tok.is(tok::unknown)) {
1300 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1301 Tok.Tok.setKind(tok::string_literal);
1302 Tok.IsUnterminatedLiteral = true;
1303 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1304 Tok.Tok.setKind(tok::string_literal);
1305 }
1306 }
1307
1308 if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Proto ||
1310 Tok.is(tok::char_constant)) {
1311 Tok.Tok.setKind(tok::string_literal);
1312 }
1313
1314 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1315 FormattingDisabled = false;
1316
1317 Tok.Finalized = FormattingDisabled;
1318
1319 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1320 FormattingDisabled = true;
1321}
1322
1323void FormatTokenLexer::resetLexer(unsigned Offset) {
1324 StringRef Buffer = SourceMgr.getBufferData(ID);
1325 LangOpts = getFormattingLangOpts(Style);
1326 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1327 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1328 Lex->SetKeepWhitespaceMode(true);
1329 TrailingWhitespace = 0;
1330}
1331
1332} // namespace format
1333} // namespace clang
MatchType Type
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:2797
unsigned Offset
Definition: Format.cpp:2798
StringRef Identifier
Definition: Format.cpp:2804
Various functions to configurably format source code.
#define X(type, name)
Definition: Value.h:142
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
SourceLocation Begin
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
llvm::MemoryBufferRef getBufferOrFake(FileID FID, SourceLocation Loc=SourceLocation()) const
Return the buffer for the specified FileID.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:186
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:131
void setLength(unsigned Len)
Definition: Token.h:140
void setKind(tok::TokenKind K)
Definition: Token.h:94
void setLocation(SourceLocation L)
Definition: Token.h:139
bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const
Definition: Token.h:100
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:195
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:61
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition: Format.cpp:3933
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:3639
bool isClangFormatOn(StringRef Comment)
Definition: Format.cpp:3929
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:171
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
#define true
Definition: stdbool.h:21
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is a anything else.
Definition: FormatToken.h:1593
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:55
@ LK_Java
Should be used for Java.
Definition: Format.h:2736
@ LK_Proto
Should be used for Protocol Buffers (https://developers.google.com/protocol-buffers/).
Definition: Format.h:2745
@ LK_TextProto
Should be used for Protocol Buffer messages in text format (https://developers.google....
Definition: Format.h:2750
std::vector< std::string > AttributeMacros
A vector of strings that should be interpreted as attributes/qualifiers instead of identifiers.
Definition: Format.h:882
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:2810
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:2765
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:4212
std::vector< std::string > StatementAttributeLikeMacros
Macros which are ignored in front of a statement, as if they were an attribute.
Definition: Format.h:4197
std::vector< std::string > IfMacros
A vector of macros that should be interpreted as conditionals instead of as function calls.
Definition: Format.h:2242
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls.
Definition: Format.h:2219
bool isCSharp() const
Definition: Format.h:2757
std::vector< std::string > WhitespaceSensitiveMacros
A vector of macros which are whitespace-sensitive and should not be touched.
Definition: Format.h:4285
bool isVerilog() const
Definition: Format.h:2760
bool isJavaScript() const
Definition: Format.h:2759
std::vector< std::string > NamespaceMacros
A vector of macros which are used to open namespace blocks.
Definition: Format.h:2919
std::vector< std::string > StatementMacros
A vector of macros that should be interpreted as complete statements.
Definition: Format.h:4208
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:2814
std::vector< std::string > TypenameMacros
A vector of macros that should be interpreted as type declarations instead of as function calls.
Definition: Format.h:4229
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:451
bool isNot(T Kind) const
Definition: FormatToken.h:569
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:275
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
Definition: FormatToken.h:420
unsigned IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:291
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:416
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:288
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:425
void setType(TokenType T)
Definition: FormatToken.h:385
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:550
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:429
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:562
unsigned IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:294
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:284
void setFinalizedType(TokenType T)
Sets the type and also the finalized flag.
Definition: FormatToken.h:399