clang  5.0.0svn
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief This file implements FormatTokenLexer, which tokenizes a source file
12 /// into a FormatToken stream suitable for ClangFormat.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "FormatTokenLexer.h"
17 #include "FormatToken.h"
20 #include "clang/Format/Format.h"
21 #include "llvm/Support/Regex.h"
22 
23 namespace clang {
24 namespace format {
25 
27  const FormatStyle &Style,
29  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
30  Column(0), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
31  Style(Style), IdentTable(getFormattingLangOpts(Style)),
32  Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
33  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
34  MacroBlockEndRegex(Style.MacroBlockEnd) {
35  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
36  getFormattingLangOpts(Style)));
37  Lex->SetKeepWhitespaceMode(true);
38 
39  for (const std::string &ForEachMacro : Style.ForEachMacros)
40  ForEachMacros.push_back(&IdentTable.get(ForEachMacro));
41  std::sort(ForEachMacros.begin(), ForEachMacros.end());
42 }
43 
45  assert(Tokens.empty());
46  assert(FirstInLineIndex == 0);
47  do {
48  Tokens.push_back(getNextToken());
49  if (Style.Language == FormatStyle::LK_JavaScript) {
50  tryParseJSRegexLiteral();
51  handleTemplateStrings();
52  }
53  tryMergePreviousTokens();
54  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
55  FirstInLineIndex = Tokens.size() - 1;
56  } while (Tokens.back()->Tok.isNot(tok::eof));
57  return Tokens;
58 }
59 
60 void FormatTokenLexer::tryMergePreviousTokens() {
61  if (tryMerge_TMacro())
62  return;
63  if (tryMergeConflictMarkers())
64  return;
65  if (tryMergeLessLess())
66  return;
67 
68  if (Style.Language == FormatStyle::LK_JavaScript) {
69  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
70  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
71  tok::equal};
72  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
73  tok::greaterequal};
74  static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
75  // FIXME: Investigate what token type gives the correct operator priority.
76  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
77  return;
78  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
79  return;
80  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
81  return;
82  if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
83  return;
84  }
85 }
86 
87 bool FormatTokenLexer::tryMergeLessLess() {
88  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
89  if (Tokens.size() < 3)
90  return false;
91 
92  bool FourthTokenIsLess = false;
93  if (Tokens.size() > 3)
94  FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
95 
96  auto First = Tokens.end() - 3;
97  if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
98  First[0]->isNot(tok::less) || FourthTokenIsLess)
99  return false;
100 
101  // Only merge if there currently is no whitespace between the two "<".
102  if (First[1]->WhitespaceRange.getBegin() !=
103  First[1]->WhitespaceRange.getEnd())
104  return false;
105 
106  First[0]->Tok.setKind(tok::lessless);
107  First[0]->TokenText = "<<";
108  First[0]->ColumnWidth += 1;
109  Tokens.erase(Tokens.end() - 2);
110  return true;
111 }
112 
113 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
114  TokenType NewType) {
115  if (Tokens.size() < Kinds.size())
116  return false;
117 
119  Tokens.end() - Kinds.size();
120  if (!First[0]->is(Kinds[0]))
121  return false;
122  unsigned AddLength = 0;
123  for (unsigned i = 1; i < Kinds.size(); ++i) {
124  if (!First[i]->is(Kinds[i]) ||
125  First[i]->WhitespaceRange.getBegin() !=
126  First[i]->WhitespaceRange.getEnd())
127  return false;
128  AddLength += First[i]->TokenText.size();
129  }
130  Tokens.resize(Tokens.size() - Kinds.size() + 1);
131  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
132  First[0]->TokenText.size() + AddLength);
133  First[0]->ColumnWidth += AddLength;
134  First[0]->Type = NewType;
135  return true;
136 }
137 
138 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
139 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
140  // NB: This is not entirely correct, as an r_paren can introduce an operand
141  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
142  // corner case to not matter in practice, though.
143  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
144  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
145  tok::colon, tok::question, tok::tilde) ||
146  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
147  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
148  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
149  Tok->isBinaryOperator();
150 }
151 
152 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
153  if (!Prev)
154  return true;
155 
156  // Regex literals can only follow after prefix unary operators, not after
157  // postfix unary operators. If the '++' is followed by a non-operand
158  // introducing token, the slash here is the operand and not the start of a
159  // regex.
160  // `!` is an unary prefix operator, but also a post-fix operator that casts
161  // away nullability, so the same check applies.
162  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
163  return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
164 
165  // The previous token must introduce an operand location where regex
166  // literals can occur.
167  if (!precedesOperand(Prev))
168  return false;
169 
170  return true;
171 }
172 
173 // Tries to parse a JavaScript Regex literal starting at the current token,
174 // if that begins with a slash and is in a location where JavaScript allows
175 // regex literals. Changes the current token to a regex literal and updates
176 // its text if successful.
177 void FormatTokenLexer::tryParseJSRegexLiteral() {
178  FormatToken *RegexToken = Tokens.back();
179  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
180  return;
181 
182  FormatToken *Prev = nullptr;
183  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
184  // NB: Because previous pointers are not initialized yet, this cannot use
185  // Token.getPreviousNonComment.
186  if ((*I)->isNot(tok::comment)) {
187  Prev = *I;
188  break;
189  }
190  }
191 
192  if (!canPrecedeRegexLiteral(Prev))
193  return;
194 
195  // 'Manually' lex ahead in the current file buffer.
196  const char *Offset = Lex->getBufferLocation();
197  const char *RegexBegin = Offset - RegexToken->TokenText.size();
198  StringRef Buffer = Lex->getBuffer();
199  bool InCharacterClass = false;
200  bool HaveClosingSlash = false;
201  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
202  // Regular expressions are terminated with a '/', which can only be
203  // escaped using '\' or a character class between '[' and ']'.
204  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
205  switch (*Offset) {
206  case '\\':
207  // Skip the escaped character.
208  ++Offset;
209  break;
210  case '[':
211  InCharacterClass = true;
212  break;
213  case ']':
214  InCharacterClass = false;
215  break;
216  case '/':
217  if (!InCharacterClass)
218  HaveClosingSlash = true;
219  break;
220  }
221  }
222 
223  RegexToken->Type = TT_RegexLiteral;
224  // Treat regex literals like other string_literals.
225  RegexToken->Tok.setKind(tok::string_literal);
226  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
227  RegexToken->ColumnWidth = RegexToken->TokenText.size();
228 
229  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
230 }
231 
232 void FormatTokenLexer::handleTemplateStrings() {
233  FormatToken *BacktickToken = Tokens.back();
234 
235  if (BacktickToken->is(tok::l_brace)) {
236  StateStack.push(LexerState::NORMAL);
237  return;
238  }
239  if (BacktickToken->is(tok::r_brace)) {
240  if (StateStack.size() == 1)
241  return;
242  StateStack.pop();
243  if (StateStack.top() != LexerState::TEMPLATE_STRING)
244  return;
245  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
246  } else if (BacktickToken->is(tok::unknown) &&
247  BacktickToken->TokenText == "`") {
248  StateStack.push(LexerState::TEMPLATE_STRING);
249  } else {
250  return; // Not actually a template
251  }
252 
253  // 'Manually' lex ahead in the current file buffer.
254  const char *Offset = Lex->getBufferLocation();
255  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
256  for (; Offset != Lex->getBuffer().end(); ++Offset) {
257  if (Offset[0] == '`') {
258  StateStack.pop();
259  break;
260  }
261  if (Offset[0] == '\\') {
262  ++Offset; // Skip the escaped character.
263  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
264  Offset[1] == '{') {
265  // '${' introduces an expression interpolation in the template string.
266  StateStack.push(LexerState::NORMAL);
267  ++Offset;
268  break;
269  }
270  }
271 
272  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
273  BacktickToken->Type = TT_TemplateString;
274  BacktickToken->Tok.setKind(tok::string_literal);
275  BacktickToken->TokenText = LiteralText;
276 
277  // Adjust width for potentially multiline string literals.
278  size_t FirstBreak = LiteralText.find('\n');
279  StringRef FirstLineText = FirstBreak == StringRef::npos
280  ? LiteralText
281  : LiteralText.substr(0, FirstBreak);
283  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
284  size_t LastBreak = LiteralText.rfind('\n');
285  if (LastBreak != StringRef::npos) {
286  BacktickToken->IsMultiline = true;
287  unsigned StartColumn = 0; // The template tail spans the entire line.
289  LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
290  Style.TabWidth, Encoding);
291  }
292 
293  SourceLocation loc = Offset < Lex->getBuffer().end()
294  ? Lex->getSourceLocation(Offset + 1)
295  : SourceMgr.getLocForEndOfFile(ID);
296  resetLexer(SourceMgr.getFileOffset(loc));
297 }
298 
299 bool FormatTokenLexer::tryMerge_TMacro() {
300  if (Tokens.size() < 4)
301  return false;
302  FormatToken *Last = Tokens.back();
303  if (!Last->is(tok::r_paren))
304  return false;
305 
306  FormatToken *String = Tokens[Tokens.size() - 2];
307  if (!String->is(tok::string_literal) || String->IsMultiline)
308  return false;
309 
310  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
311  return false;
312 
313  FormatToken *Macro = Tokens[Tokens.size() - 4];
314  if (Macro->TokenText != "_T")
315  return false;
316 
317  const char *Start = Macro->TokenText.data();
318  const char *End = Last->TokenText.data() + Last->TokenText.size();
319  String->TokenText = StringRef(Start, End - Start);
320  String->IsFirst = Macro->IsFirst;
321  String->LastNewlineOffset = Macro->LastNewlineOffset;
322  String->WhitespaceRange = Macro->WhitespaceRange;
323  String->OriginalColumn = Macro->OriginalColumn;
325  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
326  String->NewlinesBefore = Macro->NewlinesBefore;
327  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
328 
329  Tokens.pop_back();
330  Tokens.pop_back();
331  Tokens.pop_back();
332  Tokens.back() = String;
333  return true;
334 }
335 
336 bool FormatTokenLexer::tryMergeConflictMarkers() {
337  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
338  return false;
339 
340  // Conflict lines look like:
341  // <marker> <text from the vcs>
342  // For example:
343  // >>>>>>> /file/in/file/system at revision 1234
344  //
345  // We merge all tokens in a line that starts with a conflict marker
346  // into a single token with a special token type that the unwrapped line
347  // parser will use to correctly rebuild the underlying code.
348 
349  FileID ID;
350  // Get the position of the first token in the line.
351  unsigned FirstInLineOffset;
352  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
353  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
354  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
355  // Calculate the offset of the start of the current line.
356  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
357  if (LineOffset == StringRef::npos) {
358  LineOffset = 0;
359  } else {
360  ++LineOffset;
361  }
362 
363  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
364  StringRef LineStart;
365  if (FirstSpace == StringRef::npos) {
366  LineStart = Buffer.substr(LineOffset);
367  } else {
368  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
369  }
370 
371  TokenType Type = TT_Unknown;
372  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
373  Type = TT_ConflictStart;
374  } else if (LineStart == "|||||||" || LineStart == "=======" ||
375  LineStart == "====") {
376  Type = TT_ConflictAlternative;
377  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
378  Type = TT_ConflictEnd;
379  }
380 
381  if (Type != TT_Unknown) {
382  FormatToken *Next = Tokens.back();
383 
384  Tokens.resize(FirstInLineIndex + 1);
385  // We do not need to build a complete token here, as we will skip it
386  // during parsing anyway (as we must not touch whitespace around conflict
387  // markers).
388  Tokens.back()->Type = Type;
389  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
390 
391  Tokens.push_back(Next);
392  return true;
393  }
394 
395  return false;
396 }
397 
398 FormatToken *FormatTokenLexer::getStashedToken() {
399  // Create a synthesized second '>' or '<' token.
400  Token Tok = FormatTok->Tok;
401  StringRef TokenText = FormatTok->TokenText;
402 
403  unsigned OriginalColumn = FormatTok->OriginalColumn;
404  FormatTok = new (Allocator.Allocate()) FormatToken;
405  FormatTok->Tok = Tok;
406  SourceLocation TokLocation =
407  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
408  FormatTok->Tok.setLocation(TokLocation);
409  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
410  FormatTok->TokenText = TokenText;
411  FormatTok->ColumnWidth = 1;
412  FormatTok->OriginalColumn = OriginalColumn + 1;
413 
414  return FormatTok;
415 }
416 
417 FormatToken *FormatTokenLexer::getNextToken() {
418  if (StateStack.top() == LexerState::TOKEN_STASHED) {
419  StateStack.pop();
420  return getStashedToken();
421  }
422 
423  FormatTok = new (Allocator.Allocate()) FormatToken;
424  readRawToken(*FormatTok);
425  SourceLocation WhitespaceStart =
426  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
427  FormatTok->IsFirst = IsFirstToken;
428  IsFirstToken = false;
429 
430  // Consume and record whitespace until we find a significant token.
431  unsigned WhitespaceLength = TrailingWhitespace;
432  while (FormatTok->Tok.is(tok::unknown)) {
433  StringRef Text = FormatTok->TokenText;
434  auto EscapesNewline = [&](int pos) {
435  // A '\r' here is just part of '\r\n'. Skip it.
436  if (pos >= 0 && Text[pos] == '\r')
437  --pos;
438  // See whether there is an odd number of '\' before this.
439  unsigned count = 0;
440  for (; pos >= 0; --pos, ++count)
441  if (Text[pos] != '\\')
442  break;
443  return count & 1;
444  };
445  // FIXME: This miscounts tok:unknown tokens that are not just
446  // whitespace, e.g. a '`' character.
447  for (int i = 0, e = Text.size(); i != e; ++i) {
448  switch (Text[i]) {
449  case '\n':
450  ++FormatTok->NewlinesBefore;
451  FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
452  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
453  Column = 0;
454  break;
455  case '\r':
456  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
457  Column = 0;
458  break;
459  case '\f':
460  case '\v':
461  Column = 0;
462  break;
463  case ' ':
464  ++Column;
465  break;
466  case '\t':
467  Column += Style.TabWidth - Column % Style.TabWidth;
468  break;
469  case '\\':
470  if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
471  FormatTok->Type = TT_ImplicitStringLiteral;
472  break;
473  default:
474  FormatTok->Type = TT_ImplicitStringLiteral;
475  break;
476  }
477  if (FormatTok->Type == TT_ImplicitStringLiteral)
478  break;
479  }
480 
481  if (FormatTok->is(TT_ImplicitStringLiteral))
482  break;
483  WhitespaceLength += FormatTok->Tok.getLength();
484 
485  readRawToken(*FormatTok);
486  }
487 
488  // In case the token starts with escaped newlines, we want to
489  // take them into account as whitespace - this pattern is quite frequent
490  // in macro definitions.
491  // FIXME: Add a more explicit test.
492  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
493  FormatTok->TokenText[1] == '\n') {
494  ++FormatTok->NewlinesBefore;
495  WhitespaceLength += 2;
496  FormatTok->LastNewlineOffset = 2;
497  Column = 0;
498  FormatTok->TokenText = FormatTok->TokenText.substr(2);
499  }
500 
501  FormatTok->WhitespaceRange = SourceRange(
502  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
503 
504  FormatTok->OriginalColumn = Column;
505 
506  TrailingWhitespace = 0;
507  if (FormatTok->Tok.is(tok::comment)) {
508  // FIXME: Add the trimmed whitespace to Column.
509  StringRef UntrimmedText = FormatTok->TokenText;
510  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
511  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
512  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
513  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
514  FormatTok->Tok.setIdentifierInfo(&Info);
515  FormatTok->Tok.setKind(Info.getTokenID());
516  if (Style.Language == FormatStyle::LK_Java &&
517  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
518  tok::kw_operator)) {
519  FormatTok->Tok.setKind(tok::identifier);
520  FormatTok->Tok.setIdentifierInfo(nullptr);
521  } else if (Style.Language == FormatStyle::LK_JavaScript &&
522  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
523  tok::kw_operator)) {
524  FormatTok->Tok.setKind(tok::identifier);
525  FormatTok->Tok.setIdentifierInfo(nullptr);
526  }
527  } else if (FormatTok->Tok.is(tok::greatergreater)) {
528  FormatTok->Tok.setKind(tok::greater);
529  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
530  ++Column;
531  StateStack.push(LexerState::TOKEN_STASHED);
532  } else if (FormatTok->Tok.is(tok::lessless)) {
533  FormatTok->Tok.setKind(tok::less);
534  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
535  ++Column;
536  StateStack.push(LexerState::TOKEN_STASHED);
537  }
538 
539  // Now FormatTok is the next non-whitespace token.
540 
541  StringRef Text = FormatTok->TokenText;
542  size_t FirstNewlinePos = Text.find('\n');
543  if (FirstNewlinePos == StringRef::npos) {
544  // FIXME: ColumnWidth actually depends on the start column, we need to
545  // take this into account when the token is moved.
546  FormatTok->ColumnWidth =
547  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
548  Column += FormatTok->ColumnWidth;
549  } else {
550  FormatTok->IsMultiline = true;
551  // FIXME: ColumnWidth actually depends on the start column, we need to
552  // take this into account when the token is moved.
554  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
555 
556  // The last line of the token always starts in column 0.
557  // Thus, the length can be precomputed even in the presence of tabs.
559  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
560  Column = FormatTok->LastLineColumnWidth;
561  }
562 
563  if (Style.IsCpp()) {
564  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
565  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
566  tok::pp_define) &&
567  std::find(ForEachMacros.begin(), ForEachMacros.end(),
568  FormatTok->Tok.getIdentifierInfo()) != ForEachMacros.end()) {
569  FormatTok->Type = TT_ForEachMacro;
570  } else if (FormatTok->is(tok::identifier)) {
571  if (MacroBlockBeginRegex.match(Text)) {
572  FormatTok->Type = TT_MacroBlockBegin;
573  } else if (MacroBlockEndRegex.match(Text)) {
574  FormatTok->Type = TT_MacroBlockEnd;
575  }
576  }
577  }
578 
579  return FormatTok;
580 }
581 
582 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
583  Lex->LexFromRawLexer(Tok.Tok);
584  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
585  Tok.Tok.getLength());
586  // For formatting, treat unterminated string literals like normal string
587  // literals.
588  if (Tok.is(tok::unknown)) {
589  if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
590  Tok.Tok.setKind(tok::string_literal);
591  Tok.IsUnterminatedLiteral = true;
592  } else if (Style.Language == FormatStyle::LK_JavaScript &&
593  Tok.TokenText == "''") {
594  Tok.Tok.setKind(tok::string_literal);
595  }
596  }
597 
598  if (Style.Language == FormatStyle::LK_JavaScript &&
599  Tok.is(tok::char_constant)) {
600  Tok.Tok.setKind(tok::string_literal);
601  }
602 
603  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
604  Tok.TokenText == "/* clang-format on */")) {
605  FormattingDisabled = false;
606  }
607 
608  Tok.Finalized = FormattingDisabled;
609 
610  if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
611  Tok.TokenText == "/* clang-format off */")) {
612  FormattingDisabled = true;
613  }
614 }
615 
616 void FormatTokenLexer::resetLexer(unsigned Offset) {
617  StringRef Buffer = SourceMgr.getBufferData(ID);
618  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
619  getFormattingLangOpts(Style), Buffer.begin(),
620  Buffer.begin() + Offset, Buffer.end()));
621  Lex->SetKeepWhitespaceMode(true);
622  TrailingWhitespace = 0;
623 }
624 
625 } // namespace format
626 } // namespace clang
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file. ...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens...
Definition: Lexer.h:46
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
Token Tok
The Token.
Definition: FormatToken.h:119
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...
Definition: Token.h:94
Defines the SourceManager interface.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:210
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer...
The base class of the type hierarchy.
Definition: Type.h:1283
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
bool IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:148
bool IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:151
bool isBinaryOperator() const
Definition: FormatToken.h:383
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:125
One of these records is kept for each identifier that is lexed.
Should be used for Java.
Definition: Format.h:955
Token - This structure provides full information about a lexed token.
Definition: Token.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:62
void setKind(tok::TokenKind K)
Definition: Token.h:90
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
tok::TokenKind getTokenID() const
If this is a source-language token (e.g.
uint32_t Offset
Definition: CacheTokens.cpp:43
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls...
Definition: Format.h:825
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:301
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:1000
ID
Defines the set of possible language-specific address spaces.
Definition: AddressSpaces.h:27
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:997
Should be used for JavaScript.
Definition: Format.h:957
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:1892
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file. ...
Definition: Token.h:123
A wrapper around a Token storing information about the whitespace characters preceding it...
Definition: FormatToken.h:115
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart)...
Definition: FormatToken.h:136
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
llvm::MemoryBuffer * getBuffer(FileID FID, SourceLocation Loc, bool *Invalid=nullptr) const
Return the buffer for the specified FileID.
Encodes a location in the source.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:292
Various functions to configurably format source code.
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:176
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:185
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:132
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
bool IsUnterminatedLiteral
Set to true if this token is an unterminated literal.
Definition: FormatToken.h:166
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:163
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, const FormatStyle &Style, encoding::Encoding Encoding)
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:46
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:969
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
\file This file defines classes for searching and analyzing source code clones.
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:141
bool IsCpp() const
Definition: Format.h:966
bool Finalized
If true, this token has been fully formatted (indented and potentially re-formatted inside)...
Definition: FormatToken.h:290
unsigned getLength() const
Definition: Token.h:126
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:1270
Defines the clang::SourceLocation class and associated facilities.
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:1245
void setLocation(SourceLocation L)
Definition: Token.h:131
#define true
Definition: stdbool.h:32
A trivial tuple used to represent a source range.
bool HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:129
This class handles loading and caching of source files into memory.
ArrayRef< FormatToken * > lex()
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:145