clang  10.0.0svn
Tokens.h
Go to the documentation of this file.
1 //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // Record tokens that a preprocessor emits and define operations to map between
9 // the tokens written in a file and tokens produced by the preprocessor.
10 //
11 // When running the compiler, there are two token streams we are interested in:
12 // - "spelled" tokens directly correspond to a substring written in some
13 // source file.
14 // - "expanded" tokens represent the result of preprocessing, the parser
15 // consumes this token stream to produce the AST.
16 //
17 // Expanded tokens correspond directly to locations found in the AST, allowing
18 // us to find subranges of the token stream covered by various AST nodes.
19 // Spelled tokens correspond directly to the source code written by the user.
20 //
21 // To allow composing these two use-cases, we also define operations that map
22 // between expanded and spelled tokens that produced them (macro calls,
23 // directives, etc).
24 //
25 //===----------------------------------------------------------------------===//
26 
27 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28 #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29 
34 #include "clang/Basic/TokenKinds.h"
35 #include "clang/Lex/Token.h"
36 #include "llvm/ADT/ArrayRef.h"
37 #include "llvm/ADT/Optional.h"
38 #include "llvm/ADT/StringRef.h"
39 #include "llvm/Support/Compiler.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include <cstdint>
42 #include <tuple>
43 
44 namespace clang {
45 class Preprocessor;
46 
47 namespace syntax {
48 
49 /// A half-open character range inside a particular file, the start offset is
50 /// included and the end offset is excluded from the range.
51 struct FileRange {
52  /// EXPECTS: File.isValid() && BeginOffset <= EndOffset.
53  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
54  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
55  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
56  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and both
57  /// locations are in the same file.
58  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
59  SourceLocation EndLoc);
60 
61  FileID file() const { return File; }
62  /// Start offset (inclusive) in the corresponding file.
63  unsigned beginOffset() const { return Begin; }
64  /// End offset (exclusive) in the corresponding file.
65  unsigned endOffset() const { return End; }
66 
67  unsigned length() const { return End - Begin; }
68 
69  /// Check if \p Offset is inside the range (the end offset is excluded).
70  bool contains(unsigned Offset) const {
71  return Begin <= Offset && Offset < End;
72  }
73  /// Check if \p Offset is inside the range or equal to its end offset.
74  bool touches(unsigned Offset) const {
75  return Begin <= Offset && Offset <= End;
76  }
77 
78  /// Gets the substring that this FileRange refers to.
79  llvm::StringRef text(const SourceManager &SM) const;
80 
81  friend bool operator==(const FileRange &L, const FileRange &R) {
82  return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
83  }
84  friend bool operator!=(const FileRange &L, const FileRange &R) {
85  return !(L == R);
86  }
87 
88 private:
89  FileID File;
90  unsigned Begin;
91  unsigned End;
92 };
93 
94 /// For debugging purposes.
95 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
96 
97 /// A token coming directly from a file or from a macro invocation. Has just
98 /// enough information to locate the token in the source code.
99 /// Can represent both expanded and spelled tokens.
100 class Token {
101 public:
102  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
103  /// EXPECTS: clang::Token is not an annotation token.
104  explicit Token(const clang::Token &T);
105 
106  tok::TokenKind kind() const { return Kind; }
107  /// Location of the first character of a token.
108  SourceLocation location() const { return Location; }
109  /// Location right after the last character of a token.
111  return Location.getLocWithOffset(Length);
112  }
113  unsigned length() const { return Length; }
114 
115  /// Get the substring covered by the token. Note that this will include
116  /// all digraphs, newline continuations, etc. E.g. tokens for 'int' and
117  /// in\
118  /// t
119  /// both have the same kind tok::kw_int, but results of text() are different.
120  llvm::StringRef text(const SourceManager &SM) const;
121 
122  /// Gets a range of this token.
123  /// EXPECTS: token comes from a file, not from a macro expansion.
124  FileRange range(const SourceManager &SM) const;
125 
126  /// Given two tokens inside the same file, returns a file range that starts at
127  /// \p First and ends at \p Last.
128  /// EXPECTS: First and Last are file tokens from the same file, Last starts
129  /// after First.
130  static FileRange range(const SourceManager &SM, const syntax::Token &First,
131  const syntax::Token &Last);
132 
133  std::string dumpForTests(const SourceManager &SM) const;
134  /// For debugging purposes.
135  std::string str() const;
136 
137 private:
138  SourceLocation Location;
139  unsigned Length;
141 };
142 /// For debugging purposes. Equivalent to a call to Token::str().
143 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
144 
145 /// A list of tokens obtained by preprocessing a text buffer and operations to
146 /// map between the expanded and spelled tokens, i.e. TokenBuffer has
147 /// information about two token streams:
148 /// 1. Expanded tokens: tokens produced by the preprocessor after all macro
149 /// replacements,
150 /// 2. Spelled tokens: corresponding directly to the source code of a file
151 /// before any macro replacements occurred.
152 /// Here's an example to illustrate a difference between those two:
153 /// #define FOO 10
154 /// int a = FOO;
155 ///
156 /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
157 /// Expanded tokens are {'int','a','=','10',';','eof'}.
158 ///
159 /// Note that the expanded token stream has a tok::eof token at the end; the
160 /// spelled tokens never store an 'eof' token.
161 ///
162 /// The full list of expanded tokens can be obtained with expandedTokens().
163 /// Spelled tokens for each file can be obtained via spelledTokens(FileID).
164 ///
165 /// To map between the expanded and spelled tokens use spelledForExpanded().
166 ///
167 /// To build a token buffer use the TokenCollector class. You can also compute
168 /// the spelled tokens of a file using the tokenize() helper.
169 ///
170 /// FIXME: allow to map from spelled to expanded tokens when use-case shows up.
171 /// FIXME: allow mappings into macro arguments.
172 class TokenBuffer {
173 public:
174  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
175  /// All tokens produced by the preprocessor after all macro replacements,
176  /// directives, etc. Source locations found in the clang AST will always
177  /// point to one of these tokens.
178  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
179  /// into two '>' tokens by the parser. However, TokenBuffer currently
180  /// keeps it as a single '>>' token.
182  return ExpandedTokens;
183  }
184 
185  /// Find the subrange of spelled tokens that produced the corresponding \p
186  /// Expanded tokens.
187  ///
188  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
189  ///
190  /// Will fail if the expanded tokens do not correspond to a
191  /// sequence of spelled tokens. E.g. for the following example:
192  ///
193  /// #define FIRST f1 f2 f3
194  /// #define SECOND s1 s2 s3
195  ///
196  /// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
197  ///
198  /// the results would be:
199  /// expanded => spelled
200  /// ------------------------
201  /// a => a
202  /// s1 s2 s3 => SECOND
203  /// a f1 f2 f3 => a FIRST
204  /// a f1 => can't map
205  /// s1 s2 => can't map
206  ///
207  /// If \p Expanded is empty, the returned value is llvm::None.
208  /// Complexity is logarithmic.
210  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
211 
212  /// An expansion produced by the preprocessor, includes macro expansions and
213  /// preprocessor directives. Preprocessor always maps a non-empty range of
214  /// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
215  /// few examples of expansions:
216  /// #pragma once // Expands to an empty range.
217  /// #define FOO 1 2 3 // Expands to an empty range.
218  /// FOO // Expands to "1 2 3".
219  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
220  /// #include <vector> // Expands to tokens produced by the include.
221  struct Expansion {
224  };
225  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
226  /// a preprocessor directive) return the subrange of expanded tokens that the
227  /// macro expands to.
229  expansionStartingAt(const syntax::Token *Spelled) const;
230 
231  /// Lexed tokens of a file before preprocessing. E.g. for the following input
232  /// #define DECL(name) int name = 10
233  /// DECL(a);
234  /// spelledTokens() returns {"#", "define", "DECL", "(", "name", ")", "eof"}.
235  /// FIXME: we do not yet store tokens of directives, like #include, #define,
236  /// #pragma, etc.
237  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
238 
239  /// Get all tokens that expand a macro in \p FID. For the following input
240  /// #define FOO B
241  /// #define FOO2(X) int X
242  /// FOO2(XY)
243  /// int B;
244  /// FOO;
245  /// macroExpansions() returns {"FOO2", "FOO"} (from lines 3 and 5
246  /// respectively).
247  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
248 
249  const SourceManager &sourceManager() const { return *SourceMgr; }
250 
251  std::string dumpForTests() const;
252 
253 private:
254  /// Describes a mapping between a continuous subrange of spelled tokens and
255  /// expanded tokens. Represents macro expansions, preprocessor directives,
256  /// conditionally disabled pp regions, etc.
257  /// #define FOO 1+2
258  /// #define BAR(a) a + 1
259  /// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
260  /// BAR(1) // invocation #2, tokens = {'a', '+', '1'},
261  /// macroTokens = {'BAR', '(', '1', ')'}.
262  struct Mapping {
263  // Positions in the corresponding spelled token stream. The corresponding
264  // range is never empty.
265  unsigned BeginSpelled = 0;
266  unsigned EndSpelled = 0;
267  // Positions in the expanded token stream. The corresponding range can be
268  // empty.
269  unsigned BeginExpanded = 0;
270  unsigned EndExpanded = 0;
271 
272  /// For debugging purposes.
273  std::string str() const;
274  };
275  /// Spelled tokens of the file with information about the subranges.
276  struct MarkedFile {
277  /// Lexed, but not preprocessed, tokens of the file. These map directly to
278  /// text in the corresponding files and include tokens of all preprocessor
279  /// directives.
280  /// FIXME: spelled tokens don't change across FileID that map to the same
281  /// FileEntry. We could consider deduplicating them to save memory.
282  std::vector<syntax::Token> SpelledTokens;
283  /// A sorted list to convert between the spelled and expanded token streams.
284  std::vector<Mapping> Mappings;
285  /// The first expanded token produced for this FileID.
286  unsigned BeginExpanded = 0;
287  unsigned EndExpanded = 0;
288  };
289 
290  friend class TokenCollector;
291 
292  /// Maps a single expanded token to its spelled counterpart or a mapping that
293  /// produced it.
294  std::pair<const syntax::Token *, const Mapping *>
295  spelledForExpandedToken(const syntax::Token *Expanded) const;
296 
297  /// Token stream produced after preprocessing, conceptually this captures the
298  /// same stream as 'clang -E' (excluding the preprocessor directives like
299  /// #file, etc.).
300  std::vector<syntax::Token> ExpandedTokens;
301  llvm::DenseMap<FileID, MarkedFile> Files;
302  // The value is never null, pointer instead of reference to avoid disabling
303  // implicit assignment operator.
304  const SourceManager *SourceMgr;
305 };
306 
307 /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
308 /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
309 /// setting the appropriate token kind (instead of the raw_identifier reported
310 /// by lexer in raw mode). This is a very low-level function, most users should
311 /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
312 /// results from what one might expect when running a C++ frontend, e.g.
313 /// preprocessor does not run at all.
314 /// The result will *not* have an 'eof' token at the end.
315 std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
316  const LangOptions &LO);
317 
318 /// Collects tokens for the main file while running the frontend action. An
319 /// instance of this object should be created on
320 /// FrontendAction::BeginSourceFile() and the results should be consumed after
321 /// FrontendAction::Execute() finishes.
323 public:
324  /// Adds the hooks to collect the tokens. Should be called before the
325  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
326  /// CreateASTConsumer().
328 
329  /// Finalizes token collection. Should be called after preprocessing is
330  /// finished, i.e. after running Execute().
331  LLVM_NODISCARD TokenBuffer consume() &&;
332 
333 private:
334  /// Maps from a start to an end spelling location of transformations
335  /// performed by the preprocessor. These include:
336  /// 1. range from '#' to the last token in the line for PP directives,
337  /// 2. macro name and arguments for macro expansions.
338  /// Note that we record only top-level macro expansions, intermediate
339  /// expansions (e.g. inside macro arguments) are ignored.
340  ///
341  /// Used to find correct boundaries of macro calls and directives when
342  /// building mappings from spelled to expanded tokens.
343  ///
344  /// Logically, at each point of the preprocessor execution there is a stack of
345  /// macro expansions being processed and we could use it to recover the
346  /// location information we need. However, the public preprocessor API only
347  /// exposes the points when macro expansions start (when we push a macro onto
348  /// the stack) and not when they end (when we pop a macro from the stack).
349  /// To work around this limitation, we rely on source location information
350  /// stored in this map.
351  using PPExpansions = llvm::DenseMap</*SourceLocation*/ int, SourceLocation>;
352  class Builder;
353  class CollectPPExpansions;
354 
355  std::vector<syntax::Token> Expanded;
356  // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
357  PPExpansions Expansions;
358  Preprocessor &PP;
359  CollectPPExpansions *Collector;
360 };
361 
362 } // namespace syntax
363 } // namespace clang
364 
365 #endif
llvm::StringRef text(const SourceManager &SM) const
Gets the substring that this FileRange refers to.
Definition: Tokens.cpp:112
const SourceManager & sourceManager() const
Definition: Tokens.h:249
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
friend bool operator==(const FileRange &L, const FileRange &R)
Definition: Tokens.h:81
Defines the clang::FileManager interface and associated types.
Defines the SourceManager interface.
llvm::raw_ostream & operator<<(llvm::raw_ostream &OS, NodeKind K)
For debugging purposes.
Definition: Nodes.cpp:13
StringRef P
A token coming directly from a file or from a macro invocation.
Definition: Tokens.h:100
RangeSelector range(RangeSelector Begin, RangeSelector End)
Selects from the start of Begin and to the end of End.
FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset)
EXPECTS: File.isValid() && Begin <= End.
Definition: Tokens.cpp:78
llvm::ArrayRef< syntax::Token > expandedTokens() const
All tokens produced by the preprocessor after all macro replacements, directives, etc...
Definition: Tokens.h:181
Token - This structure provides full information about a lexed token.
Definition: Token.h:34
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:49
llvm::ArrayRef< syntax::Token > Spelled
Definition: Tokens.h:222
SourceLocation location() const
Location of the first character of a token.
Definition: Tokens.h:108
bool touches(unsigned Offset) const
Check Offset is inside the range or equal to its endpoint.
Definition: Tokens.h:74
SourceLocation endLocation() const
Location right after the last character of a token.
Definition: Tokens.h:110
unsigned length() const
Definition: Tokens.h:113
TokenBuffer(const SourceManager &SourceMgr)
Definition: Tokens.h:174
bool contains(unsigned Offset) const
Check if Offset is inside the range.
Definition: Tokens.h:70
unsigned Offset
Definition: Format.cpp:1809
Defines the clang::LangOptions interface.
unsigned length() const
Definition: Tokens.h:67
const SourceManager & SM
Definition: Format.cpp:1667
Kind
Encodes a location in the source.
tok::TokenKind kind() const
Definition: Tokens.h:106
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:24
FileID file() const
Definition: Tokens.h:61
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
unsigned beginOffset() const
Start is a start offset (inclusive) in the corresponding file.
Definition: Tokens.h:63
Dataflow Directional Tag Classes.
A half-open character range inside a particular file, the start offset is included and the end offset...
Definition: Tokens.h:51
Collects tokens for the main file while running the frontend action.
Definition: Tokens.h:322
std::vector< syntax::Token > tokenize(FileID FID, const SourceManager &SM, const LangOptions &LO)
Lex the text buffer, corresponding to FID, in raw mode and record the resulting spelled tokens...
Definition: Tokens.cpp:250
Defines the clang::TokenKind enum and support functions.
Defines the clang::SourceLocation class and associated facilities.
An expansion produced by the preprocessor, includes macro expansions and preprocessor directives...
Definition: Tokens.h:221
friend bool operator!=(const FileRange &L, const FileRange &R)
Definition: Tokens.h:84
A list of tokens obtained by preprocessing a text buffer and operations to map between the expanded a...
Definition: Tokens.h:172
llvm::ArrayRef< syntax::Token > Expanded
Definition: Tokens.h:223
unsigned endOffset() const
End offset (exclusive) in the corresponding file.
Definition: Tokens.h:65
This class handles loading and caching of source files into memory.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:125