clang  6.0.0svn
CommentLexer.h
Go to the documentation of this file.
1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the lexer for structured comments and its supporting token class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
15 #define LLVM_CLANG_AST_COMMENTLEXER_H
16 
17 #include "clang/Basic/Diagnostic.h"
19 #include "llvm/ADT/SmallString.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/Support/Allocator.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 namespace clang {
25 namespace comments {
26 
27 class Lexer;
28 class TextTokenRetokenizer;
29 struct CommandInfo;
30 class CommandTraits;
31 
32 namespace tok {
33 enum TokenKind {
34  eof,
37  unknown_command, // Command that does not have an ID.
38  backslash_command, // Command with an ID, that used backslash marker.
39  at_command, // Command with an ID, that used 'at' marker.
45  html_start_tag, // <tag
46  html_ident, // attr
48  html_quoted_string, // "blah\"blah" or 'blah\'blah'
51  html_end_tag // </tag
52 };
53 } // end namespace tok
54 
55 /// \brief Comment token.
56 class Token {
57  friend class Lexer;
58  friend class TextTokenRetokenizer;
59 
60  /// The location of the token.
61  SourceLocation Loc;
62 
63  /// The actual kind of the token.
65 
66  /// Length of the token spelling in comment. Can be 0 for synthesized
67  /// tokens.
68  unsigned Length;
69 
70  /// Contains text value associated with a token.
71  const char *TextPtr;
72 
73  /// Integer value associated with a token.
74  ///
75  /// If the token is a known command, contains command ID and TextPtr is
76  /// unused (command spelling can be found with CommandTraits). Otherwise,
77  /// contains the length of the string that starts at TextPtr.
78  unsigned IntVal;
79 
80 public:
81  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
82  void setLocation(SourceLocation SL) { Loc = SL; }
83 
84  SourceLocation getEndLocation() const LLVM_READONLY {
85  if (Length == 0 || Length == 1)
86  return Loc;
87  return Loc.getLocWithOffset(Length - 1);
88  }
89 
90  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
91  void setKind(tok::TokenKind K) { Kind = K; }
92 
93  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
94  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
95 
96  unsigned getLength() const LLVM_READONLY { return Length; }
97  void setLength(unsigned L) { Length = L; }
98 
99  StringRef getText() const LLVM_READONLY {
100  assert(is(tok::text));
101  return StringRef(TextPtr, IntVal);
102  }
103 
104  void setText(StringRef Text) {
105  assert(is(tok::text));
106  TextPtr = Text.data();
107  IntVal = Text.size();
108  }
109 
110  StringRef getUnknownCommandName() const LLVM_READONLY {
111  assert(is(tok::unknown_command));
112  return StringRef(TextPtr, IntVal);
113  }
114 
115  void setUnknownCommandName(StringRef Name) {
116  assert(is(tok::unknown_command));
117  TextPtr = Name.data();
118  IntVal = Name.size();
119  }
120 
121  unsigned getCommandID() const LLVM_READONLY {
122  assert(is(tok::backslash_command) || is(tok::at_command));
123  return IntVal;
124  }
125 
126  void setCommandID(unsigned ID) {
127  assert(is(tok::backslash_command) || is(tok::at_command));
128  IntVal = ID;
129  }
130 
131  unsigned getVerbatimBlockID() const LLVM_READONLY {
133  return IntVal;
134  }
135 
136  void setVerbatimBlockID(unsigned ID) {
138  IntVal = ID;
139  }
140 
141  StringRef getVerbatimBlockText() const LLVM_READONLY {
142  assert(is(tok::verbatim_block_line));
143  return StringRef(TextPtr, IntVal);
144  }
145 
146  void setVerbatimBlockText(StringRef Text) {
147  assert(is(tok::verbatim_block_line));
148  TextPtr = Text.data();
149  IntVal = Text.size();
150  }
151 
152  unsigned getVerbatimLineID() const LLVM_READONLY {
153  assert(is(tok::verbatim_line_name));
154  return IntVal;
155  }
156 
157  void setVerbatimLineID(unsigned ID) {
158  assert(is(tok::verbatim_line_name));
159  IntVal = ID;
160  }
161 
162  StringRef getVerbatimLineText() const LLVM_READONLY {
163  assert(is(tok::verbatim_line_text));
164  return StringRef(TextPtr, IntVal);
165  }
166 
167  void setVerbatimLineText(StringRef Text) {
168  assert(is(tok::verbatim_line_text));
169  TextPtr = Text.data();
170  IntVal = Text.size();
171  }
172 
173  StringRef getHTMLTagStartName() const LLVM_READONLY {
174  assert(is(tok::html_start_tag));
175  return StringRef(TextPtr, IntVal);
176  }
177 
178  void setHTMLTagStartName(StringRef Name) {
179  assert(is(tok::html_start_tag));
180  TextPtr = Name.data();
181  IntVal = Name.size();
182  }
183 
184  StringRef getHTMLIdent() const LLVM_READONLY {
185  assert(is(tok::html_ident));
186  return StringRef(TextPtr, IntVal);
187  }
188 
189  void setHTMLIdent(StringRef Name) {
190  assert(is(tok::html_ident));
191  TextPtr = Name.data();
192  IntVal = Name.size();
193  }
194 
195  StringRef getHTMLQuotedString() const LLVM_READONLY {
196  assert(is(tok::html_quoted_string));
197  return StringRef(TextPtr, IntVal);
198  }
199 
200  void setHTMLQuotedString(StringRef Str) {
201  assert(is(tok::html_quoted_string));
202  TextPtr = Str.data();
203  IntVal = Str.size();
204  }
205 
206  StringRef getHTMLTagEndName() const LLVM_READONLY {
207  assert(is(tok::html_end_tag));
208  return StringRef(TextPtr, IntVal);
209  }
210 
211  void setHTMLTagEndName(StringRef Name) {
212  assert(is(tok::html_end_tag));
213  TextPtr = Name.data();
214  IntVal = Name.size();
215  }
216 
217  void dump(const Lexer &L, const SourceManager &SM) const;
218 };
219 
220 /// \brief Comment lexer.
221 class Lexer {
222 private:
223  Lexer(const Lexer &) = delete;
224  void operator=(const Lexer &) = delete;
225 
226  /// Allocator for strings that are semantic values of tokens and have to be
227  /// computed (for example, resolved decimal character references).
228  llvm::BumpPtrAllocator &Allocator;
229 
230  DiagnosticsEngine &Diags;
231 
232  const CommandTraits &Traits;
233 
234  const char *const BufferStart;
235  const char *const BufferEnd;
236  SourceLocation FileLoc;
237 
238  const char *BufferPtr;
239 
240  /// One past end pointer for the current comment. For BCPL comments points
241  /// to newline or BufferEnd, for C comments points to star in '*/'.
242  const char *CommentEnd;
243 
244  enum LexerCommentState {
245  LCS_BeforeComment,
246  LCS_InsideBCPLComment,
247  LCS_InsideCComment,
248  LCS_BetweenComments
249  };
250 
251  /// Low-level lexer state; tracks whether we are inside or outside of a comment.
252  LexerCommentState CommentState;
253 
254  enum LexerState {
255  /// Lexing normal comment text
256  LS_Normal,
257 
258  /// Finished lexing verbatim block beginning command, will lex first body
259  /// line.
260  LS_VerbatimBlockFirstLine,
261 
262  /// Lexing verbatim block body line-by-line, skipping line-starting
263  /// decorations.
264  LS_VerbatimBlockBody,
265 
266  /// Finished lexing verbatim line beginning command, will lex text (one
267  /// line).
268  LS_VerbatimLineText,
269 
270  /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
271  LS_HTMLStartTag,
272 
273  /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
274  LS_HTMLEndTag
275  };
276 
277  /// Current lexing mode.
278  LexerState State;
279 
280  /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
281  /// the name of the verbatim end command, including command marker.
282  SmallString<16> VerbatimBlockEndCommandName;
283 
284  /// Given a character reference name (e.g., "lt"), return the character that
285  /// it stands for (e.g., "<").
286  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
287 
288  /// Given a Unicode codepoint as a base-10 integer, return the character.
289  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
290 
291  /// Given a Unicode codepoint as a base-16 integer, return the character.
292  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
293 
294  void formTokenWithChars(Token &Result, const char *TokEnd,
296 
297  void formTextToken(Token &Result, const char *TokEnd) {
298  StringRef Text(BufferPtr, TokEnd - BufferPtr);
299  formTokenWithChars(Result, TokEnd, tok::text);
300  Result.setText(Text);
301  }
302 
303  SourceLocation getSourceLocation(const char *Loc) const {
304  assert(Loc >= BufferStart && Loc <= BufferEnd &&
305  "Location out of range for this buffer!");
306 
307  const unsigned CharNo = Loc - BufferStart;
308  return FileLoc.getLocWithOffset(CharNo);
309  }
310 
311  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
312  return Diags.Report(Loc, DiagID);
313  }
314 
315  /// Eat string matching regexp \code \s*\* \endcode.
316  void skipLineStartingDecorations();
317 
318  /// Lex stuff inside comments. CommentEnd should be set correctly.
319  void lexCommentText(Token &T);
320 
321  void setupAndLexVerbatimBlock(Token &T,
322  const char *TextBegin,
323  char Marker, const CommandInfo *Info);
324 
325  void lexVerbatimBlockFirstLine(Token &T);
326 
327  void lexVerbatimBlockBody(Token &T);
328 
329  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
330  const CommandInfo *Info);
331 
332  void lexVerbatimLineText(Token &T);
333 
334  void lexHTMLCharacterReference(Token &T);
335 
336  void setupAndLexHTMLStartTag(Token &T);
337 
338  void lexHTMLStartTag(Token &T);
339 
340  void setupAndLexHTMLEndTag(Token &T);
341 
342  void lexHTMLEndTag(Token &T);
343 
344 public:
345  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
346  const CommandTraits &Traits,
347  SourceLocation FileLoc,
348  const char *BufferStart, const char *BufferEnd);
349 
350  void lex(Token &T);
351 
352  StringRef getSpelling(const Token &Tok,
353  const SourceManager &SourceMgr,
354  bool *Invalid = nullptr) const;
355 };
356 
357 } // end namespace comments
358 } // end namespace clang
359 
360 #endif
361 
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:200
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:178
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:84
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:96
Defines the SourceManager interface.
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:131
void setLength(unsigned L)
Definition: CommentLexer.h:97
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1207
void setText(StringRef Text)
Definition: CommentLexer.h:104
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:206
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:167
Information about a single command.
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:90
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:162
LineState State
void setCommandID(unsigned ID)
Definition: CommentLexer.h:126
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:94
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:136
const FormatToken & Tok
static void dump(llvm::raw_ostream &OS, StringRef FunctionName, ArrayRef< CounterExpression > Expressions, ArrayRef< CounterMappingRegion > Regions)
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:173
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:147
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:82
Defines the Diagnostic-related interfaces.
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:955
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:152
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:115
const FunctionProtoType * T
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:157
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:211
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:81
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:195
const SourceManager & SM
Definition: Format.cpp:1337
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:93
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:146
This class provides information about commands that can be used in comments.
Kind
Re-lexes a sequence of tok::text tokens.
Encodes a location in the source.
Comment lexer.
Definition: CommentLexer.h:221
Dataflow Directional Tag Classes.
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:91
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:121
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:110
Comment token.
Definition: CommentLexer.h:56
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:141
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:99
StringRef Text
Definition: Format.cpp:1346
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:189
This class handles loading and caching of source files into memory.
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:184