clang 23.0.0git
CommentLexer.h
Go to the documentation of this file.
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the lexer for structured comments and the supporting token class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14#define LLVM_CLANG_AST_COMMENTLEXER_H
15
18#include "llvm/ADT/SmallString.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/Support/Allocator.h"
21#include "llvm/Support/raw_ostream.h"
22
23namespace clang {
24namespace comments {
25
26class Lexer;
28struct CommandInfo;
29class CommandTraits;
30
31namespace tok {
// Enumerators of the comment token kind (referred to elsewhere in this file as
// tok::TokenKind).
// NOTE(review): this listing skips several numbered lines (32-35, 42-47, 49,
// 51-52), so the enum opener and some enumerators (e.g. tok::text and
// tok::html_start_tag, which are used by Token below) are not visible here --
// confirm against the original header.
36 unknown_backslash_command, // Command that does not have an ID, that used
37 // backslash marker.
38 unknown_at_command, // Command that does not have an ID, that used 'at'
39 // marker.
40 backslash_command, // Command with an ID, that used backslash marker.
41 at_command, // Command with an ID, that used 'at' marker.
48 html_ident, // attr
50 html_quoted_string, // "blah\"blah" or 'blah\'blah'
53 html_end_tag // </tag
54};
55} // end namespace tok
56
57/// Comment token.
///
/// A token records its kind, its source location and spelling length, and a
/// kind-dependent payload stored in the (IntVal, TextPtr) pair. The typed
/// accessors below all read or write that same pair; string payloads are
/// stored as pointer + length only, the characters are not copied.
///
/// NOTE(review): the member `Loc` used by the accessors is not declared in the
/// visible listing (listing line 63 is absent), and one line is missing at the
/// top of many accessor bodies (listing lines 113, 118, 124, ...); presumably
/// those are kind assertions like the ones that are visible -- confirm against
/// the original header.
58class Token {
59 friend class Lexer;
61
62 /// The location of the token.
64
65 /// The actual kind of the token.
66 tok::TokenKind Kind;
67
68 /// Integer value associated with a token.
69 ///
70 /// If the token is a known command, contains command ID and TextPtr is
71 /// unused (command spelling can be found with CommandTraits). Otherwise,
72 /// contains the length of the string that starts at TextPtr.
73 unsigned IntVal;
74
75 /// Length of the token spelling in comment. Can be 0 for synthesized
76 /// tokens.
77 unsigned Length;
78
79 /// Contains text value associated with a token.
80 const char *TextPtr;
81
82public:
  /// Returns the stored location of the token.
83 SourceLocation getLocation() const LLVM_READONLY { return Loc; }
  /// Overwrites the stored location of the token.
84 void setLocation(SourceLocation SL) { Loc = SL; }
85
  /// Returns the location of the last character of the token, i.e. the start
  /// location advanced by Length - 1. For tokens of length 0 or 1 this is the
  /// start location itself.
86 SourceLocation getEndLocation() const LLVM_READONLY {
87 if (Length == 0 || Length == 1)
88 return Loc;
89 return Loc.getLocWithOffset(Length - 1);
90 }
91
  /// Returns the kind of the token.
92 tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
  /// Sets the kind of the token; the payload fields are left untouched.
93 void setKind(tok::TokenKind K) { Kind = K; }
94
  /// True if the token kind equals \p K.
95 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
  /// True if the token kind differs from \p K.
96 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
97
  /// Returns the length of the token spelling.
98 unsigned getLength() const LLVM_READONLY { return Length; }
  /// Sets the length of the token spelling.
99 void setLength(unsigned L) { Length = L; }
100
  /// Returns the text payload. The token must be of kind tok::text.
101 StringRef getText() const LLVM_READONLY {
102 assert(is(tok::text));
103 return StringRef(TextPtr, IntVal);
104 }
105
  /// Stores \p Text as the payload (pointer and size only). The token must be
  /// of kind tok::text.
106 void setText(StringRef Text) {
107 assert(is(tok::text));
108 TextPtr = Text.data();
109 IntVal = Text.size();
110 }
111
  /// Returns the string payload interpreted as the name of an unknown command.
112 StringRef getUnknownCommandName() const LLVM_READONLY {
114 return StringRef(TextPtr, IntVal);
115 }
116
  /// Stores \p Name (pointer and size only) as the unknown command name.
117 void setUnknownCommandName(StringRef Name) {
119 TextPtr = Name.data();
120 IntVal = Name.size();
121 }
122
  /// Returns the integer payload interpreted as a command ID.
123 unsigned getCommandID() const LLVM_READONLY {
125 return IntVal;
126 }
127
  /// Stores \p ID in the integer payload; TextPtr is not modified.
128 void setCommandID(unsigned ID) {
130 IntVal = ID;
131 }
132
  /// Returns the integer payload interpreted as a verbatim block command ID.
133 unsigned getVerbatimBlockID() const LLVM_READONLY {
135 return IntVal;
136 }
137
  /// Stores \p ID in the integer payload; TextPtr is not modified.
138 void setVerbatimBlockID(unsigned ID) {
140 IntVal = ID;
141 }
142
  /// Returns the string payload interpreted as verbatim block text.
143 StringRef getVerbatimBlockText() const LLVM_READONLY {
145 return StringRef(TextPtr, IntVal);
146 }
147
  /// Stores \p Text (pointer and size only) as verbatim block text.
148 void setVerbatimBlockText(StringRef Text) {
150 TextPtr = Text.data();
151 IntVal = Text.size();
152 }
153
  /// Returns the integer payload interpreted as a verbatim line command ID.
154 unsigned getVerbatimLineID() const LLVM_READONLY {
156 return IntVal;
157 }
158
  /// Stores \p ID in the integer payload; TextPtr is not modified.
159 void setVerbatimLineID(unsigned ID) {
161 IntVal = ID;
162 }
163
  /// Returns the string payload interpreted as verbatim line text.
164 StringRef getVerbatimLineText() const LLVM_READONLY {
166 return StringRef(TextPtr, IntVal);
167 }
168
  /// Stores \p Text (pointer and size only) as verbatim line text.
169 void setVerbatimLineText(StringRef Text) {
171 TextPtr = Text.data();
172 IntVal = Text.size();
173 }
174
  /// Returns the tag name. The token must be of kind tok::html_start_tag.
175 StringRef getHTMLTagStartName() const LLVM_READONLY {
176 assert(is(tok::html_start_tag));
177 return StringRef(TextPtr, IntVal);
178 }
179
  /// Stores \p Name (pointer and size only) as the tag name. The token must be
  /// of kind tok::html_start_tag.
180 void setHTMLTagStartName(StringRef Name) {
181 assert(is(tok::html_start_tag));
182 TextPtr = Name.data();
183 IntVal = Name.size();
184 }
185
  /// Returns the identifier. The token must be of kind tok::html_ident.
186 StringRef getHTMLIdent() const LLVM_READONLY {
187 assert(is(tok::html_ident));
188 return StringRef(TextPtr, IntVal);
189 }
190
  /// Stores \p Name (pointer and size only) as the identifier. The token must
  /// be of kind tok::html_ident.
191 void setHTMLIdent(StringRef Name) {
192 assert(is(tok::html_ident));
193 TextPtr = Name.data();
194 IntVal = Name.size();
195 }
196
  /// Returns the string payload interpreted as an HTML quoted string.
197 StringRef getHTMLQuotedString() const LLVM_READONLY {
199 return StringRef(TextPtr, IntVal);
200 }
201
  /// Stores \p Str (pointer and size only) as the HTML quoted string.
202 void setHTMLQuotedString(StringRef Str) {
204 TextPtr = Str.data();
205 IntVal = Str.size();
206 }
207
  /// Returns the tag name. The token must be of kind tok::html_end_tag.
208 StringRef getHTMLTagEndName() const LLVM_READONLY {
209 assert(is(tok::html_end_tag));
210 return StringRef(TextPtr, IntVal);
211 }
212
  /// Stores \p Name (pointer and size only) as the tag name. The token must be
  /// of kind tok::html_end_tag.
213 void setHTMLTagEndName(StringRef Name) {
214 assert(is(tok::html_end_tag));
215 TextPtr = Name.data();
216 IntVal = Name.size();
217 }
218
  /// Debugging aid; declared here, defined outside this header.
219 void dump(const Lexer &L, const SourceManager &SM) const;
220};
221
222/// Comment lexer.
///
/// Operates on the character range [BufferStart, BufferEnd) and fills in
/// Token objects through lex(). The lexer is not copyable: both the copy
/// constructor and copy assignment are deleted.
223class Lexer {
224private:
225 Lexer(const Lexer &) = delete;
226 void operator=(const Lexer &) = delete;
227
228 /// Allocator for strings that are semantic values of tokens and have to be
229 /// computed (for example, resolved decimal character references).
230 llvm::BumpPtrAllocator &Allocator;
231
  /// Diagnostics engine that Diag() reports through.
232 DiagnosticsEngine &Diags;
233
  /// Command traits supplied at construction.
234 const CommandTraits &Traits;
235
  /// Bounds of the buffer being lexed; getSourceLocation() asserts that any
  /// pointer it translates lies within [BufferStart, BufferEnd].
236 const char *const BufferStart;
237 const char *const BufferEnd;
238
  /// Current position in the buffer; formTextToken() treats it as the start of
  /// the text token being formed.
239 const char *BufferPtr;
240
241 /// One past end pointer for the current comment. For BCPL comments points
242 /// to newline or BufferEnd, for C comments points to star in '*/'.
243 const char *CommentEnd;
244
  /// Source location corresponding to BufferStart; getSourceLocation() offsets
  /// it by the distance from BufferStart.
245 SourceLocation FileLoc;
246
247 /// If true, the commands, html tags, etc will be parsed and reported as
248 /// separate tokens inside the comment body. If false, the comment text will
249 /// be parsed into text and newline tokens.
250 bool ParseCommands;
251
252 enum LexerCommentState : uint8_t {
253 LCS_BeforeComment,
254 LCS_InsideBCPLComment,
255 LCS_InsideCComment,
256 LCS_BetweenComments
257 };
258
259 /// Low-level lexer state, track if we are inside or outside of comment.
260 LexerCommentState CommentState;
261
262 enum LexerState : uint8_t {
263 /// Lexing normal comment text
264 LS_Normal,
265
266 /// Finished lexing verbatim block beginning command, will lex first body
267 /// line.
268 LS_VerbatimBlockFirstLine,
269
270 /// Lexing verbatim block body line-by-line, skipping line-starting
271 /// decorations.
272 LS_VerbatimBlockBody,
273
274 /// Finished lexing verbatim line beginning command, will lex text (one
275 /// line).
276 LS_VerbatimLineText,
277
278 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
279 LS_HTMLStartTag,
280
281 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
282 LS_HTMLEndTag
283 };
284
285 /// Current lexing mode.
286 LexerState State;
287
288 /// If State is a verbatim block state (presumably LS_VerbatimBlockFirstLine
289 /// or LS_VerbatimBlockBody), contains the name of verbatim end command,
  /// including command marker.
290 SmallString<16> VerbatimBlockEndCommandName;
291
292 /// Given a character reference name (e.g., "lt"), return the character that
293 /// it stands for (e.g., "<").
294 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
295
296 /// Given a Unicode codepoint as base-10 integer, return the character.
297 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
298
299 /// Given a Unicode codepoint as base-16 integer, return the character.
300 StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
301
  /// Forms a token of kind \p Kind ending at \p TokEnd; defined outside this
  /// header.
302 void formTokenWithChars(Token &Result, const char *TokEnd,
303 tok::TokenKind Kind);
304
  /// Forms a tok::text token spanning [BufferPtr, TokEnd) and attaches that
  /// character range as its text.
305 void formTextToken(Token &Result, const char *TokEnd) {
  // The text range is captured before formTokenWithChars() runs; presumably
  // that call advances BufferPtr -- confirm in the implementation file.
306 StringRef Text(BufferPtr, TokEnd - BufferPtr);
307 formTokenWithChars(Result, TokEnd, tok::text);
308 Result.setText(Text);
309 }
310
  /// Translates a pointer into the buffer to a SourceLocation by offsetting
  /// FileLoc with the pointer's distance from BufferStart. Asserts that the
  /// pointer lies within [BufferStart, BufferEnd].
311 SourceLocation getSourceLocation(const char *Loc) const {
312 assert(Loc >= BufferStart && Loc <= BufferEnd &&
313 "Location out of range for this buffer!");
314
315 const unsigned CharNo = Loc - BufferStart;
316 return FileLoc.getLocWithOffset(CharNo);
317 }
318
  /// Convenience wrapper that forwards to Diags.Report(Loc, DiagID).
319 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
320 return Diags.Report(Loc, DiagID);
321 }
322
323 /// Eat string matching regexp \code \s*\* \endcode.
324 void skipLineStartingDecorations();
325
326 /// Skip over pure text.
327 const char *skipTextToken();
328
329 /// Lex comment text, including commands if ParseCommands is set to true.
330 void lexCommentText(Token &T);
331
  // The following helpers are declared here and defined outside this header;
  // going by their names, each handles one of the LexerState modes above.
332 void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
333 const CommandInfo *Info);
334
335 void lexVerbatimBlockFirstLine(Token &T);
336
337 void lexVerbatimBlockBody(Token &T);
338
339 void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
340 const CommandInfo *Info);
341
342 void lexVerbatimLineText(Token &T);
343
344 void lexHTMLCharacterReference(Token &T);
345
346 void setupAndLexHTMLStartTag(Token &T);
347
348 void lexHTMLStartTag(Token &T);
349
350 void setupAndLexHTMLEndTag(Token &T);
351
352 void lexHTMLEndTag(Token &T);
353
354public:
  /// Constructs a lexer over the buffer [BufferStart, BufferEnd).
  /// \p ParseCommands defaults to true (see the ParseCommands member).
355 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
356 const CommandTraits &Traits, SourceLocation FileLoc,
357 const char *BufferStart, const char *BufferEnd,
358 bool ParseCommands = true);
359
  /// Lexes into \p T; defined outside this header.
360 void lex(Token &T);
361
  /// Returns the spelling of \p Tok; defined outside this header.
362 StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
363};
364
365} // end namespace comments
366} // end namespace clang
367
368#endif
369
Defines the Diagnostic-related interfaces.
Token Tok
The Token.
#define SM(sm)
Defines the SourceManager interface.
A little helper class used to produce diagnostics.
Concrete class used by the front-end to report problems and issues.
Definition Diagnostic.h:233
Encodes a location in the source.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
This class provides information about commands that can be used in comments.
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Re-lexes a sequence of tok::text tokens.
StringRef getHTMLQuotedString() const LLVM_READONLY
bool isNot(tok::TokenKind K) const LLVM_READONLY
SourceLocation getEndLocation() const LLVM_READONLY
void setLocation(SourceLocation SL)
void setVerbatimBlockText(StringRef Text)
void setHTMLTagEndName(StringRef Name)
unsigned getCommandID() const LLVM_READONLY
void setVerbatimLineID(unsigned ID)
void setHTMLTagStartName(StringRef Name)
StringRef getUnknownCommandName() const LLVM_READONLY
StringRef getText() const LLVM_READONLY
void dump(const Lexer &L, const SourceManager &SM) const
StringRef getHTMLIdent() const LLVM_READONLY
void setHTMLIdent(StringRef Name)
StringRef getVerbatimBlockText() const LLVM_READONLY
unsigned getVerbatimLineID() const LLVM_READONLY
void setVerbatimLineText(StringRef Text)
unsigned getVerbatimBlockID() const LLVM_READONLY
void setLength(unsigned L)
void setText(StringRef Text)
bool is(tok::TokenKind K) const LLVM_READONLY
void setUnknownCommandName(StringRef Name)
void setCommandID(unsigned ID)
void setHTMLQuotedString(StringRef Str)
friend class TextTokenRetokenizer
unsigned getLength() const LLVM_READONLY
SourceLocation getLocation() const LLVM_READONLY
StringRef getVerbatimLineText() const LLVM_READONLY
tok::TokenKind getKind() const LLVM_READONLY
void setVerbatimBlockID(unsigned ID)
void setKind(tok::TokenKind K)
StringRef getHTMLTagStartName() const LLVM_READONLY
StringRef getHTMLTagEndName() const LLVM_READONLY
The JSON file list parser is used to communicate input to InstallAPI.
@ Result
The result type of a method or function.
Definition TypeBase.h:905
__packed_splat4 __packed_splat2 __packed_splat8 __packed_splat4 __packed_splat2 uint8_t
Information about a single command.