clang 20.0.0git
CommentLexer.h
Go to the documentation of this file.
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the lexer for structured comments and the supporting token class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14#define LLVM_CLANG_AST_COMMENTLEXER_H
15
18#include "llvm/ADT/SmallString.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Support/Allocator.h"
21#include "llvm/Support/raw_ostream.h"
22
23namespace clang {
24namespace comments {
25
26class Lexer;
27class TextTokenRetokenizer;
28struct CommandInfo;
29class CommandTraits;
30
31namespace tok {
36 unknown_command, // Command that does not have an ID.
37 backslash_command, // Command with an ID, that used backslash marker.
38 at_command, // Command with an ID, that used 'at' marker.
45 html_ident, // attr
47 html_quoted_string, // "blah\"blah" or 'blah\'blah'
50 html_end_tag // </tag
51};
52} // end namespace tok
53
54/// Comment token.
55class Token {
56 friend class Lexer;
58
59 /// The location of the token.
61
62 /// The actual kind of the token.
64
65 /// Integer value associated with a token.
66 ///
67 /// If the token is a known command, contains the command ID and TextPtr is
68 /// unused (command spelling can be found with CommandTraits). Otherwise,
69 /// contains the length of the string that starts at TextPtr.
70 unsigned IntVal;
71
72 /// Length of the token spelling in the comment. Can be 0 for synthesized
73 /// tokens.
74 unsigned Length;
75
76 /// Contains the text value associated with a token. Only a pointer is
 /// stored; the accessors below pair it with IntVal as the length.
77 const char *TextPtr;
78
79public:
80 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81 void setLocation(SourceLocation SL) { Loc = SL; }
82
 /// Returns the location of the last character of the token, i.e. the start
 /// location advanced by Length - 1. For tokens of length 0 or 1 this is the
 /// start location itself.
83 SourceLocation getEndLocation() const LLVM_READONLY {
84 if (Length == 0 || Length == 1)
85 return Loc;
86 return Loc.getLocWithOffset(Length - 1);
87 }
88
89 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90 void setKind(tok::TokenKind K) { Kind = K; }
91
92 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94
95 unsigned getLength() const LLVM_READONLY { return Length; }
96 void setLength(unsigned L) { Length = L; }
97
 /// Returns the text of a tok::text token as the IntVal characters starting
 /// at TextPtr.
98 StringRef getText() const LLVM_READONLY {
99 assert(is(tok::text));
100 return StringRef(TextPtr, IntVal);
101 }
102
 /// Records Text as the value of a tok::text token. Only the data pointer and
 /// size are stored; the characters are not copied.
103 void setText(StringRef Text) {
104 assert(is(tok::text));
105 TextPtr = Text.data();
106 IntVal = Text.size();
107 }
108
 /// Returns the name stored for a tok::unknown_command token.
109 StringRef getUnknownCommandName() const LLVM_READONLY {
110 assert(is(tok::unknown_command));
111 return StringRef(TextPtr, IntVal);
112 }
113
 /// Records Name (pointer and size only, no copy) for a tok::unknown_command
 /// token.
114 void setUnknownCommandName(StringRef Name) {
115 assert(is(tok::unknown_command));
116 TextPtr = Name.data();
117 IntVal = Name.size();
118 }
119
 /// Returns IntVal interpreted as a command ID.
120 unsigned getCommandID() const LLVM_READONLY {
122 return IntVal;
123 }
124
 /// Stores ID in IntVal.
125 void setCommandID(unsigned ID) {
127 IntVal = ID;
128 }
129
 /// Returns IntVal interpreted as the ID of a verbatim block command.
130 unsigned getVerbatimBlockID() const LLVM_READONLY {
132 return IntVal;
133 }
134
 /// Stores ID in IntVal.
135 void setVerbatimBlockID(unsigned ID) {
137 IntVal = ID;
138 }
139
 /// Returns the verbatim block text as the IntVal characters starting at
 /// TextPtr.
140 StringRef getVerbatimBlockText() const LLVM_READONLY {
142 return StringRef(TextPtr, IntVal);
143 }
144
 /// Records Text (pointer and size only, no copy) as the verbatim block text.
145 void setVerbatimBlockText(StringRef Text) {
147 TextPtr = Text.data();
148 IntVal = Text.size();
149 }
150
 /// Returns IntVal interpreted as the ID of a verbatim line command.
151 unsigned getVerbatimLineID() const LLVM_READONLY {
153 return IntVal;
154 }
155
 /// Stores ID in IntVal.
156 void setVerbatimLineID(unsigned ID) {
158 IntVal = ID;
159 }
160
 /// Returns the verbatim line text as the IntVal characters starting at
 /// TextPtr.
161 StringRef getVerbatimLineText() const LLVM_READONLY {
163 return StringRef(TextPtr, IntVal);
164 }
165
 /// Records Text (pointer and size only, no copy) as the verbatim line text.
166 void setVerbatimLineText(StringRef Text) {
168 TextPtr = Text.data();
169 IntVal = Text.size();
170 }
171
 /// Returns the tag name stored for a tok::html_start_tag token.
172 StringRef getHTMLTagStartName() const LLVM_READONLY {
173 assert(is(tok::html_start_tag));
174 return StringRef(TextPtr, IntVal);
175 }
176
 /// Records Name (pointer and size only, no copy) for a tok::html_start_tag
 /// token.
177 void setHTMLTagStartName(StringRef Name) {
178 assert(is(tok::html_start_tag));
179 TextPtr = Name.data();
180 IntVal = Name.size();
181 }
182
 /// Returns the identifier stored for a tok::html_ident token.
183 StringRef getHTMLIdent() const LLVM_READONLY {
184 assert(is(tok::html_ident));
185 return StringRef(TextPtr, IntVal);
186 }
187
 /// Records Name (pointer and size only, no copy) for a tok::html_ident
 /// token.
188 void setHTMLIdent(StringRef Name) {
189 assert(is(tok::html_ident));
190 TextPtr = Name.data();
191 IntVal = Name.size();
192 }
193
 /// Returns the quoted-string value as the IntVal characters starting at
 /// TextPtr.
194 StringRef getHTMLQuotedString() const LLVM_READONLY {
196 return StringRef(TextPtr, IntVal);
197 }
198
 /// Records Str (pointer and size only, no copy) as the quoted-string value.
199 void setHTMLQuotedString(StringRef Str) {
201 TextPtr = Str.data();
202 IntVal = Str.size();
203 }
204
 /// Returns the tag name stored for a tok::html_end_tag token.
205 StringRef getHTMLTagEndName() const LLVM_READONLY {
206 assert(is(tok::html_end_tag));
207 return StringRef(TextPtr, IntVal);
208 }
209
 /// Records Name (pointer and size only, no copy) for a tok::html_end_tag
 /// token.
210 void setHTMLTagEndName(StringRef Name) {
211 assert(is(tok::html_end_tag));
212 TextPtr = Name.data();
213 IntVal = Name.size();
214 }
215
216 void dump(const Lexer &L, const SourceManager &SM) const;
217};
218
219/// Comment lexer.
220class Lexer {
221private:
 // The lexer is non-copyable.
222 Lexer(const Lexer &) = delete;
223 void operator=(const Lexer &) = delete;
224
225 /// Allocator for strings that are semantic values of tokens and have to be
226 /// computed (for example, resolved decimal character references).
227 llvm::BumpPtrAllocator &Allocator;
228
229 DiagnosticsEngine &Diags;
230
231 const CommandTraits &Traits;
232
 // Bounds of the buffer being lexed; getSourceLocation() accepts pointers in
 // the closed range [BufferStart, BufferEnd].
233 const char *const BufferStart;
234 const char *const BufferEnd;
235
 // Start of the text used by formTextToken() for the next text token.
236 const char *BufferPtr;
237
238 /// One-past-the-end pointer for the current comment. For BCPL comments
239 /// points to newline or BufferEnd, for C comments points to star in '*/'.
240 const char *CommentEnd;
241
 // Source location that getSourceLocation() offsets from; an offset of zero
 // corresponds to BufferStart.
242 SourceLocation FileLoc;
243
244 /// If true, the commands, HTML tags, etc. will be parsed and reported as
245 /// separate tokens inside the comment body. If false, the comment text will
246 /// be parsed into text and newline tokens.
247 bool ParseCommands;
248
249 enum LexerCommentState : uint8_t {
250 LCS_BeforeComment,
251 LCS_InsideBCPLComment,
252 LCS_InsideCComment,
253 LCS_BetweenComments
254 };
255
256 /// Low-level lexer state, tracks whether we are inside or outside of a
 /// comment.
257 LexerCommentState CommentState;
258
259 enum LexerState : uint8_t {
260 /// Lexing normal comment text
261 LS_Normal,
262
263 /// Finished lexing verbatim block beginning command, will lex first body
264 /// line.
265 LS_VerbatimBlockFirstLine,
266
267 /// Lexing verbatim block body line-by-line, skipping line-starting
268 /// decorations.
269 LS_VerbatimBlockBody,
270
271 /// Finished lexing verbatim line beginning command, will lex text (one
272 /// line).
273 LS_VerbatimLineText,
274
275 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276 LS_HTMLStartTag,
277
278 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279 LS_HTMLEndTag
280 };
281
282 /// Current lexing mode.
283 LexerState State;
284
285 /// If State is one of the verbatim block states (LS_VerbatimBlockFirstLine
286 /// or LS_VerbatimBlockBody), contains the name of the verbatim end command,
 /// including the command marker.
287 SmallString<16> VerbatimBlockEndCommandName;
288
289 /// Given a character reference name (e.g., "lt"), return the character that
290 /// it stands for (e.g., "<").
291 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292
293 /// Given a Unicode codepoint as base-10 integer, return the character.
294 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295
296 /// Given a Unicode codepoint as base-16 integer, return the character.
297 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298
299 void formTokenWithChars(Token &Result, const char *TokEnd,
301
 /// Forms a tok::text token ending at TokEnd whose text is the characters in
 /// [BufferPtr, TokEnd).
 ///
 /// The text range is captured before formTokenWithChars() is called;
 /// NOTE(review): presumably because that call advances BufferPtr -- confirm
 /// against its definition.
302 void formTextToken(Token &Result, const char *TokEnd) {
303 StringRef Text(BufferPtr, TokEnd - BufferPtr);
304 formTokenWithChars(Result, TokEnd, tok::text);
305 Result.setText(Text);
306 }
307
 /// Converts a pointer into the lexed buffer to a SourceLocation by
 /// offsetting FileLoc by the pointer's distance from BufferStart.
 ///
 /// \param Loc pointer within [BufferStart, BufferEnd] (asserted).
308 SourceLocation getSourceLocation(const char *Loc) const {
309 assert(Loc >= BufferStart && Loc <= BufferEnd &&
310 "Location out of range for this buffer!");
311
312 const unsigned CharNo = Loc - BufferStart;
313 return FileLoc.getLocWithOffset(CharNo);
314 }
315
 /// Reports diagnostic \p DiagID at \p Loc through Diags and returns the
 /// resulting builder.
316 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317 return Diags.Report(Loc, DiagID);
318 }
319
320 /// Eat string matching regexp \code \s*\* \endcode.
321 void skipLineStartingDecorations();
322
323 /// Skip over pure text.
324 const char *skipTextToken();
325
326 /// Lex comment text, including commands if ParseCommands is set to true.
327 void lexCommentText(Token &T);
328
329 void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
330 const CommandInfo *Info);
331
332 void lexVerbatimBlockFirstLine(Token &T);
333
334 void lexVerbatimBlockBody(Token &T);
335
336 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
337 const CommandInfo *Info);
338
339 void lexVerbatimLineText(Token &T);
340
341 void lexHTMLCharacterReference(Token &T);
342
343 void setupAndLexHTMLStartTag(Token &T);
344
345 void lexHTMLStartTag(Token &T);
346
347 void setupAndLexHTMLEndTag(Token &T);
348
349 void lexHTMLEndTag(Token &T);
350
351public:
 /// Constructs a lexer over the buffer [BufferStart, BufferEnd).
 ///
 /// \param ParseCommands see the ParseCommands member; defaults to true.
352 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
353 const CommandTraits &Traits, SourceLocation FileLoc,
354 const char *BufferStart, const char *BufferEnd,
355 bool ParseCommands = true);
356
357 void lex(Token &T);
358
359 StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
360};
361
362} // end namespace comments
363} // end namespace clang
364
365#endif
366
static char ID
Definition: Arena.cpp:183
#define SM(sm)
Definition: Cuda.cpp:83
Defines the Diagnostic-related interfaces.
enum clang::sema::@1653::IndirectLocalPathEntry::EntryKind Kind
StringRef Text
Definition: Format.cpp:3002
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the SourceManager interface.
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1271
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:192
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1547
Encodes a location in the source.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
This class provides information about commands that can be used in comments.
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Re-lexes a sequence of tok::text tokens.
Comment token.
Definition: CommentLexer.h:55
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:194
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:93
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:120
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:109
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:98
void dump(const Lexer &L, const SourceManager &SM) const
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:183
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:140
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:151
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:130
void setLength(unsigned L)
Definition: CommentLexer.h:96
void setText(StringRef Text)
Definition: CommentLexer.h:103
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:92
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:161
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:89
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:172
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:205
The JSON file list parser is used to communicate input to InstallAPI.
@ Result
The result type of a method or function.
const FunctionProtoType * T
Information about a single command.