clang  14.0.0git
CommentLexer.h
Go to the documentation of this file.
1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines lexer for structured comments and supporting token class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14 #define LLVM_CLANG_AST_COMMENTLEXER_H
15 
16 #include "clang/Basic/Diagnostic.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Support/Allocator.h"
21 #include "llvm/Support/raw_ostream.h"
22 
23 namespace clang {
24 namespace comments {
25 
26 class Lexer;
27 class TextTokenRetokenizer;
28 struct CommandInfo;
29 class CommandTraits;
30 
31 namespace tok {
/// Kinds of tokens produced by the comment lexer.
///
/// Enumerator order is significant only in that it fixes the underlying
/// values; keep new kinds grouped with their relatives.
enum TokenKind {
  eof,
  newline,
  text,
  unknown_command,      // Command that does not have an ID.
  backslash_command,    // Command with an ID, that used backslash marker.
  at_command,           // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Command that opens a verbatim block.
  verbatim_block_line,  // One line of text inside a verbatim block.
  verbatim_block_end,   // Command that closes a verbatim block.
  verbatim_line_name,   // Command that introduces a verbatim line.
  verbatim_line_text,   // Text that follows a verbatim line command.
  html_start_tag,       // <tag
  html_ident,           // attr
  html_equals,          // =
  html_quoted_string,   // "blah\"blah" or 'blah\'blah'
  html_greater,         // >
  html_slash_greater,   // />
  html_end_tag          // </tag
};
52 } // end namespace tok
53 
54 /// Comment token.
55 class Token {
56  friend class Lexer;
57  friend class TextTokenRetokenizer;
58 
59  /// The location of the token.
60  SourceLocation Loc;
61 
62  /// The actual kind of the token.
64 
65  /// Integer value associated with a token.
66  ///
67  /// If the token is a known command, contains command ID and TextPtr is
68  /// unused (command spelling can be found with CommandTraits). Otherwise,
69  /// contains the length of the string that starts at TextPtr.
70  unsigned IntVal;
71 
72  /// Length of the token spelling in comment. Can be 0 for synthenized
73  /// tokens.
74  unsigned Length;
75 
76  /// Contains text value associated with a token.
77  const char *TextPtr;
78 
79 public:
80  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81  void setLocation(SourceLocation SL) { Loc = SL; }
82 
83  SourceLocation getEndLocation() const LLVM_READONLY {
84  if (Length == 0 || Length == 1)
85  return Loc;
86  return Loc.getLocWithOffset(Length - 1);
87  }
88 
89  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90  void setKind(tok::TokenKind K) { Kind = K; }
91 
92  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94 
95  unsigned getLength() const LLVM_READONLY { return Length; }
96  void setLength(unsigned L) { Length = L; }
97 
98  StringRef getText() const LLVM_READONLY {
99  assert(is(tok::text));
100  return StringRef(TextPtr, IntVal);
101  }
102 
103  void setText(StringRef Text) {
104  assert(is(tok::text));
105  TextPtr = Text.data();
106  IntVal = Text.size();
107  }
108 
109  StringRef getUnknownCommandName() const LLVM_READONLY {
110  assert(is(tok::unknown_command));
111  return StringRef(TextPtr, IntVal);
112  }
113 
114  void setUnknownCommandName(StringRef Name) {
115  assert(is(tok::unknown_command));
116  TextPtr = Name.data();
117  IntVal = Name.size();
118  }
119 
120  unsigned getCommandID() const LLVM_READONLY {
122  return IntVal;
123  }
124 
125  void setCommandID(unsigned ID) {
127  IntVal = ID;
128  }
129 
130  unsigned getVerbatimBlockID() const LLVM_READONLY {
132  return IntVal;
133  }
134 
135  void setVerbatimBlockID(unsigned ID) {
137  IntVal = ID;
138  }
139 
140  StringRef getVerbatimBlockText() const LLVM_READONLY {
141  assert(is(tok::verbatim_block_line));
142  return StringRef(TextPtr, IntVal);
143  }
144 
145  void setVerbatimBlockText(StringRef Text) {
146  assert(is(tok::verbatim_block_line));
147  TextPtr = Text.data();
148  IntVal = Text.size();
149  }
150 
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152  assert(is(tok::verbatim_line_name));
153  return IntVal;
154  }
155 
156  void setVerbatimLineID(unsigned ID) {
157  assert(is(tok::verbatim_line_name));
158  IntVal = ID;
159  }
160 
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162  assert(is(tok::verbatim_line_text));
163  return StringRef(TextPtr, IntVal);
164  }
165 
166  void setVerbatimLineText(StringRef Text) {
167  assert(is(tok::verbatim_line_text));
168  TextPtr = Text.data();
169  IntVal = Text.size();
170  }
171 
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173  assert(is(tok::html_start_tag));
174  return StringRef(TextPtr, IntVal);
175  }
176 
177  void setHTMLTagStartName(StringRef Name) {
178  assert(is(tok::html_start_tag));
179  TextPtr = Name.data();
180  IntVal = Name.size();
181  }
182 
183  StringRef getHTMLIdent() const LLVM_READONLY {
184  assert(is(tok::html_ident));
185  return StringRef(TextPtr, IntVal);
186  }
187 
188  void setHTMLIdent(StringRef Name) {
189  assert(is(tok::html_ident));
190  TextPtr = Name.data();
191  IntVal = Name.size();
192  }
193 
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195  assert(is(tok::html_quoted_string));
196  return StringRef(TextPtr, IntVal);
197  }
198 
199  void setHTMLQuotedString(StringRef Str) {
200  assert(is(tok::html_quoted_string));
201  TextPtr = Str.data();
202  IntVal = Str.size();
203  }
204 
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206  assert(is(tok::html_end_tag));
207  return StringRef(TextPtr, IntVal);
208  }
209 
210  void setHTMLTagEndName(StringRef Name) {
211  assert(is(tok::html_end_tag));
212  TextPtr = Name.data();
213  IntVal = Name.size();
214  }
215 
216  void dump(const Lexer &L, const SourceManager &SM) const;
217 };
218 
219 /// Comment lexer.
220 class Lexer {
221 private:
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224 
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228 
229  DiagnosticsEngine &Diags;
230 
231  const CommandTraits &Traits;
232 
233  const char *const BufferStart;
234  const char *const BufferEnd;
235 
236  const char *BufferPtr;
237 
238  /// One past end pointer for the current comment. For BCPL comments points
239  /// to newline or BufferEnd, for C comments points to star in '*/'.
240  const char *CommentEnd;
241 
242  SourceLocation FileLoc;
243 
244  /// If true, the commands, html tags, etc will be parsed and reported as
245  /// separate tokens inside the comment body. If false, the comment text will
246  /// be parsed into text and newline tokens.
247  bool ParseCommands;
248 
249  enum LexerCommentState : uint8_t {
250  LCS_BeforeComment,
251  LCS_InsideBCPLComment,
252  LCS_InsideCComment,
253  LCS_BetweenComments
254  };
255 
256  /// Low-level lexer state, track if we are inside or outside of comment.
257  LexerCommentState CommentState;
258 
259  enum LexerState : uint8_t {
260  /// Lexing normal comment text
261  LS_Normal,
262 
263  /// Finished lexing verbatim block beginning command, will lex first body
264  /// line.
265  LS_VerbatimBlockFirstLine,
266 
267  /// Lexing verbatim block body line-by-line, skipping line-starting
268  /// decorations.
269  LS_VerbatimBlockBody,
270 
271  /// Finished lexing verbatim line beginning command, will lex text (one
272  /// line).
273  LS_VerbatimLineText,
274 
275  /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276  LS_HTMLStartTag,
277 
278  /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279  LS_HTMLEndTag
280  };
281 
282  /// Current lexing mode.
283  LexerState State;
284 
285  /// If State is LS_VerbatimBlock, contains the name of verbatim end
286  /// command, including command marker.
287  SmallString<16> VerbatimBlockEndCommandName;
288 
289  /// Given a character reference name (e.g., "lt"), return the character that
290  /// it stands for (e.g., "<").
291  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292 
293  /// Given a Unicode codepoint as base-10 integer, return the character.
294  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295 
296  /// Given a Unicode codepoint as base-16 integer, return the character.
297  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298 
299  void formTokenWithChars(Token &Result, const char *TokEnd,
301 
302  void formTextToken(Token &Result, const char *TokEnd) {
303  StringRef Text(BufferPtr, TokEnd - BufferPtr);
304  formTokenWithChars(Result, TokEnd, tok::text);
305  Result.setText(Text);
306  }
307 
308  SourceLocation getSourceLocation(const char *Loc) const {
309  assert(Loc >= BufferStart && Loc <= BufferEnd &&
310  "Location out of range for this buffer!");
311 
312  const unsigned CharNo = Loc - BufferStart;
313  return FileLoc.getLocWithOffset(CharNo);
314  }
315 
316  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317  return Diags.Report(Loc, DiagID);
318  }
319 
320  /// Eat string matching regexp \code \s*\* \endcode.
321  void skipLineStartingDecorations();
322 
323  /// Lex comment text, including commands if ParseCommands is set to true.
324  void lexCommentText(Token &T);
325 
326  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
327  const CommandInfo *Info);
328 
329  void lexVerbatimBlockFirstLine(Token &T);
330 
331  void lexVerbatimBlockBody(Token &T);
332 
333  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
334  const CommandInfo *Info);
335 
336  void lexVerbatimLineText(Token &T);
337 
338  void lexHTMLCharacterReference(Token &T);
339 
340  void setupAndLexHTMLStartTag(Token &T);
341 
342  void lexHTMLStartTag(Token &T);
343 
344  void setupAndLexHTMLEndTag(Token &T);
345 
346  void lexHTMLEndTag(Token &T);
347 
348 public:
349  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
350  const CommandTraits &Traits, SourceLocation FileLoc,
351  const char *BufferStart, const char *BufferEnd,
352  bool ParseCommands = true);
353 
354  void lex(Token &T);
355 
356  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
357 };
358 
359 } // end namespace comments
360 } // end namespace clang
361 
362 #endif
363 
clang::comments::Token::setVerbatimLineText
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
clang::comments::TextTokenRetokenizer
Re-lexes a sequence of tok::text tokens.
Definition: CommentParser.cpp:30
clang::comments::tok::text
@ text
Definition: CommentLexer.h:35
clang::comments::Token::setVerbatimBlockID
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
clang::DiagnosticBuilder
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1264
clang::comments::Token::getVerbatimLineID
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:151
clang::comments::tok::unknown_command
@ unknown_command
Definition: CommentLexer.h:36
clang::comments::Token::setHTMLIdent
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
clang::comments::Token::getVerbatimBlockText
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:140
clang::comments::tok::html_start_tag
@ html_start_tag
Definition: CommentLexer.h:44
clang::comments::Token::setVerbatimBlockText
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:88
clang::comments::Token::isNot
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:93
clang::SourceLocation::getLocWithOffset
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
Definition: SourceLocation.h:136
clang::comments::tok::TokenKind
TokenKind
Definition: CommentLexer.h:32
clang::DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:191
clang::comments::Token::setHTMLTagEndName
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
clang::comments::Token::setHTMLTagStartName
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
clang::comments::Token::getLocation
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
clang::comments::Token::getHTMLQuotedString
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:194
clang::comments::Token::dump
void dump(const Lexer &L, const SourceManager &SM) const
Definition: CommentLexer.cpp:21
SourceManager.h
clang::comments::CommandInfo
Information about a single command.
Definition: CommentCommandTraits.h:32
clang::SourceManager
This class handles loading and caching of source files into memory.
Definition: SourceManager.h:626
clang::comments::Token::setUnknownCommandName
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
clang::comments::tok::html_ident
@ html_ident
Definition: CommentLexer.h:45
clang::comments::Token::getCommandID
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:120
clang::comments::Token::getUnknownCommandName
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:109
Diagnostic.h
clang::comments::Token::getEndLocation
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
clang::comments::tok::verbatim_block_begin
@ verbatim_block_begin
Definition: CommentLexer.h:39
clang::comments::Token::setKind
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
clang::comments::Token::getHTMLIdent
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:183
llvm::SmallString< 16 >
clang::comments::Token::getVerbatimBlockID
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:130
clang::comments::Token::setHTMLQuotedString
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
clang::comments::Token
Comment token.
Definition: CommentLexer.h:55
clang::comments::tok::html_equals
@ html_equals
Definition: CommentLexer.h:46
clang::comments::tok::at_command
@ at_command
Definition: CommentLexer.h:38
clang::comments::tok::html_slash_greater
@ html_slash_greater
Definition: CommentLexer.h:49
clang::comments::tok::verbatim_line_name
@ verbatim_line_name
Definition: CommentLexer.h:42
clang::comments::Token::getKind
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:89
clang::comments::tok::verbatim_line_text
@ verbatim_line_text
Definition: CommentLexer.h:43
clang::comments::Token::setText
void setText(StringRef Text)
Definition: CommentLexer.h:103
clang::comments::Lexer::getSpelling
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Definition: CommentLexer.cpp:851
clang::comments::tok::verbatim_block_line
@ verbatim_block_line
Definition: CommentLexer.h:40
clang::comments::tok::html_greater
@ html_greater
Definition: CommentLexer.h:48
clang::comments::Token::getText
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:98
clang::comments::Token::getVerbatimLineText
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:161
clang::comments::Token::setLocation
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
clang::comments::tok::html_end_tag
@ html_end_tag
Definition: CommentLexer.h:50
clang::comments::Token::is
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:92
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::comments::tok::verbatim_block_end
@ verbatim_block_end
Definition: CommentLexer.h:41
clang::comments::Token::setVerbatimLineID
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
clang::Builtin::ID
ID
Definition: Builtins.h:48
clang
Definition: CalledOnceCheck.h:17
Text
StringRef Text
Definition: Format.cpp:2334
clang::comments::Lexer::lex
void lex(Token &T)
Definition: CommentLexer.cpp:749
clang::comments::Token::getHTMLTagStartName
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:172
clang::comments::Token::getHTMLTagEndName
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:205
clang::comments::tok::eof
@ eof
Definition: CommentLexer.h:33
clang::comments::Token::getLength
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
clang::comments::CommandTraits
This class provides information about commands that can be used in comments.
Definition: CommentCommandTraits.h:127
SM
#define SM(sm)
Definition: Cuda.cpp:78
clang::comments::tok::backslash_command
@ backslash_command
Definition: CommentLexer.h:37
clang::comments::tok::newline
@ newline
Definition: CommentLexer.h:34
clang::comments::Token::setLength
void setLength(unsigned L)
Definition: CommentLexer.h:96
clang::comments::Token::setCommandID
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
clang::comments::Lexer
Comment lexer.
Definition: CommentLexer.h:220
clang::DiagnosticsEngine::Report
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1523
clang::comments::tok::html_quoted_string
@ html_quoted_string
Definition: CommentLexer.h:47