clang  9.0.0svn
CommentLexer.h
Go to the documentation of this file.
1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the lexer for structured comments and the supporting token class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14 #define LLVM_CLANG_AST_COMMENTLEXER_H
15 
16 #include "clang/Basic/Diagnostic.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Support/Allocator.h"
21 #include "llvm/Support/raw_ostream.h"
22 
23 namespace clang {
24 namespace comments {
25 
26 class Lexer;
27 class TextTokenRetokenizer;
28 struct CommandInfo;
29 class CommandTraits;
30 
/// Kinds of tokens used by the comment lexer (see Token::getKind()).
///
/// NOTE(review): this listing skips several original line numbers inside the
/// enum, and other code in this file refers to enumerators that are not
/// visible here (e.g. tok::text, tok::verbatim_block_line,
/// tok::verbatim_line_name, tok::verbatim_line_text). The enumerator list
/// shown is presumably incomplete -- confirm against the upstream header.
31 namespace tok {
32 enum TokenKind {
33  eof,
36  unknown_command, // Command that does not have an ID.
37  backslash_command, // Command with an ID, that used backslash marker.
38  at_command, // Command with an ID, that used 'at' marker.
44  html_start_tag, // <tag
45  html_ident, // attr
47  html_quoted_string, // "blah\"blah" or 'blah\'blah'
50  html_end_tag // </tag
51 };
52 } // end namespace tok
53 
54 /// Comment token: a located, typed piece of a comment.
    ///
    /// Besides its location, kind and spelling length, a token carries a
    /// kind-dependent payload in TextPtr/IntVal: either a (pointer, length)
    /// string or an integer ID. The typed accessors below guard payload access
    /// with an assert on the token kind. Lexer and TextTokenRetokenizer are
    /// friends and can therefore access the fields directly.
55 class Token {
56  friend class Lexer;
57  friend class TextTokenRetokenizer;
58 
59  /// The location of the token.
60  SourceLocation Loc;
61 
62  /// The actual kind of the token.
    /// NOTE(review): the member declaration itself (original line 63) is
    /// missing from this listing; the accessors below read and write it as
    /// `Kind`, with type tok::TokenKind.
64 
65  /// Length of the token spelling in comment. Can be 0 for synthesized
66  /// tokens.
67  unsigned Length;
68 
69  /// Start of the text value associated with a token; the length of that
    /// text is kept in IntVal.
70  const char *TextPtr;
71 
72  /// Integer value associated with a token.
73  ///
74  /// If the token is a known command, contains command ID and TextPtr is
75  /// unused (command spelling can be found with CommandTraits). Otherwise,
76  /// contains the length of the string that starts at TextPtr.
77  unsigned IntVal;
78 
79 public:
    /// Location of the token (see Loc).
80  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81  void setLocation(SourceLocation SL) { Loc = SL; }
82 
    /// Returns the location Length - 1 characters past Loc, i.e. the last
    /// character of the spelling. A token of length 0 or 1 ends where it
    /// starts, so Loc itself is returned.
83  SourceLocation getEndLocation() const LLVM_READONLY {
84  if (Length == 0 || Length == 1)
85  return Loc;
86  return Loc.getLocWithOffset(Length - 1);
87  }
88 
    /// Kind of the token.
89  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90  void setKind(tok::TokenKind K) { Kind = K; }
91 
    /// Convenience kind tests: is(K) is true when the kind equals K, isNot(K)
    /// when it differs.
92  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94 
    /// Length of the token spelling (see Length).
95  unsigned getLength() const LLVM_READONLY { return Length; }
96  void setLength(unsigned L) { Length = L; }
97 
    /// Text payload of a tok::text token, stored as (TextPtr, IntVal).
98  StringRef getText() const LLVM_READONLY {
99  assert(is(tok::text));
100  return StringRef(TextPtr, IntVal);
101  }
102 
    /// Stores a pointer to and the size of \p Text; the characters themselves
    /// are not copied.
103  void setText(StringRef Text) {
104  assert(is(tok::text));
105  TextPtr = Text.data();
106  IntVal = Text.size();
107  }
108 
    /// Name payload of a tok::unknown_command token, stored as
    /// (TextPtr, IntVal).
109  StringRef getUnknownCommandName() const LLVM_READONLY {
110  assert(is(tok::unknown_command));
111  return StringRef(TextPtr, IntVal);
112  }
113 
114  void setUnknownCommandName(StringRef Name) {
115  assert(is(tok::unknown_command));
116  TextPtr = Name.data();
117  IntVal = Name.size();
118  }
119 
    /// Command ID of a tok::backslash_command or tok::at_command token, stored
    /// in IntVal.
120  unsigned getCommandID() const LLVM_READONLY {
121  assert(is(tok::backslash_command) || is(tok::at_command));
122  return IntVal;
123  }
124 
125  void setCommandID(unsigned ID) {
126  assert(is(tok::backslash_command) || is(tok::at_command));
127  IntVal = ID;
128  }
129 
    /// Verbatim block ID, stored in IntVal.
    /// NOTE(review): the first body line of this getter and of the setter
    /// below (original lines 131 and 136) is missing from this listing --
    /// presumably the kind assertion, as in the other accessors; confirm
    /// against the upstream header.
130  unsigned getVerbatimBlockID() const LLVM_READONLY {
132  return IntVal;
133  }
134 
135  void setVerbatimBlockID(unsigned ID) {
137  IntVal = ID;
138  }
139 
    /// Text payload of a tok::verbatim_block_line token, stored as
    /// (TextPtr, IntVal).
140  StringRef getVerbatimBlockText() const LLVM_READONLY {
141  assert(is(tok::verbatim_block_line));
142  return StringRef(TextPtr, IntVal);
143  }
144 
145  void setVerbatimBlockText(StringRef Text) {
146  assert(is(tok::verbatim_block_line));
147  TextPtr = Text.data();
148  IntVal = Text.size();
149  }
150 
    /// ID payload of a tok::verbatim_line_name token, stored in IntVal.
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152  assert(is(tok::verbatim_line_name));
153  return IntVal;
154  }
155 
156  void setVerbatimLineID(unsigned ID) {
157  assert(is(tok::verbatim_line_name));
158  IntVal = ID;
159  }
160 
    /// Text payload of a tok::verbatim_line_text token, stored as
    /// (TextPtr, IntVal).
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162  assert(is(tok::verbatim_line_text));
163  return StringRef(TextPtr, IntVal);
164  }
165 
166  void setVerbatimLineText(StringRef Text) {
167  assert(is(tok::verbatim_line_text));
168  TextPtr = Text.data();
169  IntVal = Text.size();
170  }
171 
    /// Tag name payload of a tok::html_start_tag token, stored as
    /// (TextPtr, IntVal).
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173  assert(is(tok::html_start_tag));
174  return StringRef(TextPtr, IntVal);
175  }
176 
177  void setHTMLTagStartName(StringRef Name) {
178  assert(is(tok::html_start_tag));
179  TextPtr = Name.data();
180  IntVal = Name.size();
181  }
182 
    /// Identifier payload of a tok::html_ident token, stored as
    /// (TextPtr, IntVal).
183  StringRef getHTMLIdent() const LLVM_READONLY {
184  assert(is(tok::html_ident));
185  return StringRef(TextPtr, IntVal);
186  }
187 
188  void setHTMLIdent(StringRef Name) {
189  assert(is(tok::html_ident));
190  TextPtr = Name.data();
191  IntVal = Name.size();
192  }
193 
    /// String payload of a tok::html_quoted_string token, stored as
    /// (TextPtr, IntVal).
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195  assert(is(tok::html_quoted_string));
196  return StringRef(TextPtr, IntVal);
197  }
198 
199  void setHTMLQuotedString(StringRef Str) {
200  assert(is(tok::html_quoted_string));
201  TextPtr = Str.data();
202  IntVal = Str.size();
203  }
204 
    /// Tag name payload of a tok::html_end_tag token, stored as
    /// (TextPtr, IntVal).
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206  assert(is(tok::html_end_tag));
207  return StringRef(TextPtr, IntVal);
208  }
209 
210  void setHTMLTagEndName(StringRef Name) {
211  assert(is(tok::html_end_tag));
212  TextPtr = Name.data();
213  IntVal = Name.size();
214  }
215 
    /// Dumps this token; only declared in this header, defined elsewhere.
216  void dump(const Lexer &L, const SourceManager &SM) const;
217 };
218 
219 /// Comment lexer: lexes the buffer delimited by BufferStart and BufferEnd
    /// into Tokens. lex() fills in the Token passed to it. Only the small
    /// inline helpers are defined in this header; the lexing routines are
    /// declared here and defined elsewhere.
220 class Lexer {
221 private:
    /// Lexers are not copyable.
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224 
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228 
    /// Diagnostics engine that Diag() reports through.
229  DiagnosticsEngine &Diags;
230 
    /// Source of information about the commands that can be used in comments.
231  const CommandTraits &Traits;
232 
    /// Bounds of the buffer being lexed. FileLoc is the source location that
    /// corresponds to BufferStart (see getSourceLocation()).
233  const char *const BufferStart;
234  const char *const BufferEnd;
235  SourceLocation FileLoc;
236 
    /// Current position in the buffer; formTextToken() uses it as the start of
    /// the token being formed.
237  const char *BufferPtr;
238 
239  /// One past end pointer for the current comment. For BCPL comments points
240  /// to newline or BufferEnd, for C comments points to star in '*/'.
241  const char *CommentEnd;
242 
243  enum LexerCommentState {
244  LCS_BeforeComment,
245  LCS_InsideBCPLComment,
246  LCS_InsideCComment,
247  LCS_BetweenComments
248  };
249 
250  /// Low-level lexer state, track if we are inside or outside of comment.
251  LexerCommentState CommentState;
252 
253  enum LexerState {
254  /// Lexing normal comment text
255  LS_Normal,
256 
257  /// Finished lexing verbatim block beginning command, will lex first body
258  /// line.
259  LS_VerbatimBlockFirstLine,
260 
261  /// Lexing verbatim block body line-by-line, skipping line-starting
262  /// decorations.
263  LS_VerbatimBlockBody,
264 
265  /// Finished lexing verbatim line beginning command, will lex text (one
266  /// line).
267  LS_VerbatimLineText,
268 
269  /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
270  LS_HTMLStartTag,
271 
272  /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
273  LS_HTMLEndTag
274  };
275 
276  /// Current lexing mode.
277  LexerState State;
278 
279  /// If State is one of the verbatim block states (LS_VerbatimBlockFirstLine
280  /// or LS_VerbatimBlockBody), contains the name of verbatim end command, including command marker.
281  SmallString<16> VerbatimBlockEndCommandName;
282 
283  /// If true, the commands, html tags, etc will be parsed and reported as
284  /// separate tokens inside the comment body. If false, the comment text will
285  /// be parsed into text and newline tokens.
286  bool ParseCommands;
287 
288  /// Given a character reference name (e.g., "lt"), return the character that
289  /// it stands for (e.g., "<").
290  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
291 
292  /// Given a Unicode codepoint as base-10 integer, return the character.
293  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
294 
295  /// Given a Unicode codepoint as base-16 integer, return the character.
296  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
297 
    /// NOTE(review): the declaration below is truncated in this listing
    /// (original line 299 is missing). The call in formTextToken() passes a
    /// tok::TokenKind as the third argument -- confirm the full signature
    /// against the upstream header.
298  void formTokenWithChars(Token &Result, const char *TokEnd,
300 
    /// Forms a tok::text token whose text is the characters from BufferPtr up
    /// to (not including) \p TokEnd. The text range is captured before
    /// formTokenWithChars() is called and attached to \p Result afterwards.
301  void formTextToken(Token &Result, const char *TokEnd) {
302  StringRef Text(BufferPtr, TokEnd - BufferPtr);
303  formTokenWithChars(Result, TokEnd, tok::text);
304  Result.setText(Text);
305  }
306 
    /// Translates a pointer into the buffer into a SourceLocation: FileLoc
    /// advanced by the pointer's offset from BufferStart. \p Loc must lie
    /// within [BufferStart, BufferEnd] (asserted).
307  SourceLocation getSourceLocation(const char *Loc) const {
308  assert(Loc >= BufferStart && Loc <= BufferEnd &&
309  "Location out of range for this buffer!");
310 
311  const unsigned CharNo = Loc - BufferStart;
312  return FileLoc.getLocWithOffset(CharNo);
313  }
314 
    /// Reports diagnostic \p DiagID at \p Loc through Diags and returns the
    /// resulting builder.
315  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
316  return Diags.Report(Loc, DiagID);
317  }
318 
319  /// Eat string matching regexp \code \s*\* \endcode.
320  void skipLineStartingDecorations();
321 
322  /// Lex comment text, including commands if ParseCommands is set to true.
323  void lexCommentText(Token &T);
324 
    /// The lexing routines below are only declared in this header. Judging by
    /// their names they correspond to the LexerState values above -- confirm
    /// against their definitions.
325  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
326  const CommandInfo *Info);
327 
328  void lexVerbatimBlockFirstLine(Token &T);
329 
330  void lexVerbatimBlockBody(Token &T);
331 
332  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
333  const CommandInfo *Info);
334 
335  void lexVerbatimLineText(Token &T);
336 
337  void lexHTMLCharacterReference(Token &T);
338 
339  void setupAndLexHTMLStartTag(Token &T);
340 
341  void lexHTMLStartTag(Token &T);
342 
343  void setupAndLexHTMLEndTag(Token &T);
344 
345  void lexHTMLEndTag(Token &T);
346 
347 public:
    /// Creates a lexer over the buffer delimited by \p BufferStart and
    /// \p BufferEnd. \p ParseCommands defaults to true (see the ParseCommands
    /// member for its effect).
348  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
349  const CommandTraits &Traits, SourceLocation FileLoc,
350  const char *BufferStart, const char *BufferEnd,
351  bool ParseCommands = true);
352 
    /// Lexes into \p T; declared here, defined elsewhere.
353  void lex(Token &T);
354 
    /// Returns the spelling of \p Tok. \p Invalid is an optional out-parameter
    /// (defaults to nullptr); presumably set when the spelling cannot be
    /// obtained -- confirm against the definition.
355  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
356  bool *Invalid = nullptr) const;
357 };
358 
359 } // end namespace comments
360 } // end namespace clang
361 
362 #endif
363 
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc TokLoc, const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, unsigned DiagID)
Produce a diagnostic highlighting some portion of a literal.
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
SourceLocation getLocWithOffset(int Offset) const
Return a source location with the specified offset from this SourceLocation.
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
Defines the SourceManager interface.
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:130
void setLength(unsigned L)
Definition: CommentLexer.h:96
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1294
void setText(StringRef Text)
Definition: CommentLexer.h:103
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:205
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
Information about a single command.
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:89
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:161
LineState State
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:93
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
const FormatToken & Tok
static void dump(llvm::raw_ostream &OS, StringRef FunctionName, ArrayRef< CounterExpression > Expressions, ArrayRef< CounterMappingRegion > Regions)
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:172
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:148
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
Defines the Diagnostic-related interfaces.
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1042
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:151
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:194
const SourceManager & SM
Definition: Format.cpp:1489
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:92
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
This class provides information about commands that can be used in comments.
Kind
Re-lexes a sequence of tok::text tokens.
Encodes a location in the source.
Comment lexer.
Definition: CommentLexer.h:220
Dataflow Directional Tag Classes.
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:120
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:109
Comment token.
Definition: CommentLexer.h:55
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:140
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:98
StringRef Text
Definition: Format.cpp:1629
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
This class handles loading and caching of source files into memory.
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:183