clang-tools  16.0.0git
Markup.cpp
Go to the documentation of this file.
1 //===--- Markup.cpp -----------------------------------------*- C++-*------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "support/Markup.h"
9 #include "llvm/ADT/ArrayRef.h"
10 #include "llvm/ADT/STLExtras.h"
11 #include "llvm/ADT/SmallVector.h"
12 #include "llvm/ADT/StringExtras.h"
13 #include "llvm/ADT/StringRef.h"
14 #include "llvm/Support/Compiler.h"
15 #include "llvm/Support/raw_ostream.h"
16 #include <cstddef>
17 #include <iterator>
18 #include <memory>
19 #include <string>
20 #include <vector>
21 
22 namespace clang {
23 namespace clangd {
24 namespace markup {
25 namespace {
26 
27 // Is <contents a plausible start to an HTML tag?
28 // Contents may not be the rest of the line, but it's the rest of the plain
29 // text, so we expect to see at least the tag name.
30 bool looksLikeTag(llvm::StringRef Contents) {
31  if (Contents.empty())
32  return false;
33  if (Contents.front() == '!' || Contents.front() == '?' ||
34  Contents.front() == '/')
35  return true;
36  // Check the start of the tag name.
37  if (!llvm::isAlpha(Contents.front()))
38  return false;
39  // Drop rest of the tag name, and following whitespace.
40  Contents = Contents
41  .drop_while([](char C) {
42  return llvm::isAlnum(C) || C == '-' || C == '_' || C == ':';
43  })
44  .drop_while(llvm::isSpace);
45  // The rest of the tag consists of attributes, which have restrictive names.
46  // If we hit '=', all bets are off (attribute values can contain anything).
47  for (; !Contents.empty(); Contents = Contents.drop_front()) {
48  if (llvm::isAlnum(Contents.front()) || llvm::isSpace(Contents.front()))
49  continue;
50  if (Contents.front() == '>' || Contents.startswith("/>"))
51  return true; // May close the tag.
52  if (Contents.front() == '=')
53  return true; // Don't try to parse attribute values.
54  return false; // Random punctuation means this isn't a tag.
55  }
56  return true; // Potentially incomplete tag.
57 }
58 
59 // Tests whether C should be backslash-escaped in markdown.
60 // The string being escaped is Before + C + After. This is part of a paragraph.
61 // StartsLine indicates whether `Before` is the start of the line.
62 // After may not be everything until the end of the line.
63 //
64 // It's always safe to escape punctuation, but want minimal escaping.
65 // The strategy is to escape the first character of anything that might start
66 // a markdown grammar construct.
67 bool needsLeadingEscape(char C, llvm::StringRef Before, llvm::StringRef After,
68  bool StartsLine) {
69  assert(Before.take_while(llvm::isSpace).empty());
70  auto RulerLength = [&]() -> /*Length*/ unsigned {
71  if (!StartsLine || !Before.empty())
72  return false;
73  llvm::StringRef A = After.rtrim();
74  return llvm::all_of(A, [C](char D) { return C == D; }) ? 1 + A.size() : 0;
75  };
76  auto IsBullet = [&]() {
77  return StartsLine && Before.empty() &&
78  (After.empty() || After.startswith(" "));
79  };
80  auto SpaceSurrounds = [&]() {
81  return (After.empty() || llvm::isSpace(After.front())) &&
82  (Before.empty() || llvm::isSpace(Before.back()));
83  };
84  auto WordSurrounds = [&]() {
85  return (!After.empty() && llvm::isAlnum(After.front())) &&
86  (!Before.empty() && llvm::isAlnum(Before.back()));
87  };
88 
89  switch (C) {
90  case '\\': // Escaped character.
91  return true;
92  case '`': // Code block or inline code
93  // Any number of backticks can delimit an inline code block that can end
94  // anywhere (including on another line). We must escape them all.
95  return true;
96  case '~': // Code block
97  return StartsLine && Before.empty() && After.startswith("~~");
98  case '#': { // ATX heading.
99  if (!StartsLine || !Before.empty())
100  return false;
101  llvm::StringRef Rest = After.ltrim(C);
102  return Rest.empty() || Rest.startswith(" ");
103  }
104  case ']': // Link or link reference.
105  // We escape ] rather than [ here, because it's more constrained:
106  // ](...) is an in-line link
107  // ]: is a link reference
108  // The following are only links if the link reference exists:
109  // ] by itself is a shortcut link
110  // ][...] is an out-of-line link
111  // Because we never emit link references, we don't need to handle these.
112  return After.startswith(":") || After.startswith("(");
113  case '=': // Setex heading.
114  return RulerLength() > 0;
115  case '_': // Horizontal ruler or matched delimiter.
116  if (RulerLength() >= 3)
117  return true;
118  // Not a delimiter if surrounded by space, or inside a word.
119  // (The rules at word boundaries are subtle).
120  return !(SpaceSurrounds() || WordSurrounds());
121  case '-': // Setex heading, horizontal ruler, or bullet.
122  if (RulerLength() > 0)
123  return true;
124  return IsBullet();
125  case '+': // Bullet list.
126  return IsBullet();
127  case '*': // Bullet list, horizontal ruler, or delimiter.
128  return IsBullet() || RulerLength() >= 3 || !SpaceSurrounds();
129  case '<': // HTML tag (or autolink, which we choose not to escape)
130  return looksLikeTag(After);
131  case '>': // Quote marker. Needs escaping at start of line.
132  return StartsLine && Before.empty();
133  case '&': { // HTML entity reference
134  auto End = After.find(';');
135  if (End == llvm::StringRef::npos)
136  return false;
137  llvm::StringRef Content = After.substr(0, End);
138  if (Content.consume_front("#")) {
139  if (Content.consume_front("x") || Content.consume_front("X"))
140  return llvm::all_of(Content, llvm::isHexDigit);
141  return llvm::all_of(Content, llvm::isDigit);
142  }
143  return llvm::all_of(Content, llvm::isAlpha);
144  }
145  case '.': // Numbered list indicator. Escape 12. -> 12\. at start of line.
146  case ')':
147  return StartsLine && !Before.empty() &&
148  llvm::all_of(Before, llvm::isDigit) && After.startswith(" ");
149  default:
150  return false;
151  }
152 }
153 
154 /// Escape a markdown text block. Ensures the punctuation will not introduce
155 /// any of the markdown constructs.
156 std::string renderText(llvm::StringRef Input, bool StartsLine) {
157  std::string R;
158  for (unsigned I = 0; I < Input.size(); ++I) {
159  if (needsLeadingEscape(Input[I], Input.substr(0, I), Input.substr(I + 1),
160  StartsLine))
161  R.push_back('\\');
162  R.push_back(Input[I]);
163  }
164  return R;
165 }
166 
167 /// Renders \p Input as an inline block of code in markdown. The returned value
168 /// is surrounded by backticks and the inner contents are properly escaped.
169 std::string renderInlineBlock(llvm::StringRef Input) {
170  std::string R;
171  // Double all backticks to make sure we don't close the inline block early.
172  for (size_t From = 0; From < Input.size();) {
173  size_t Next = Input.find("`", From);
174  R += Input.substr(From, Next - From);
175  if (Next == llvm::StringRef::npos)
176  break;
177  R += "``"; // double the found backtick.
178 
179  From = Next + 1;
180  }
181  // If results starts with a backtick, add spaces on both sides. The spaces
182  // are ignored by markdown renderers.
183  if (llvm::StringRef(R).startswith("`") || llvm::StringRef(R).endswith("`"))
184  return "` " + std::move(R) + " `";
185  // Markdown render should ignore first and last space if both are there. We
186  // add an extra pair of spaces in that case to make sure we render what the
187  // user intended.
188  if (llvm::StringRef(R).startswith(" ") && llvm::StringRef(R).endswith(" "))
189  return "` " + std::move(R) + " `";
190  return "`" + std::move(R) + "`";
191 }
192 
193 /// Get marker required for \p Input to represent a markdown codeblock. It
194 /// consists of at least 3 backticks(`). Although markdown also allows to use
195 /// tilde(~) for code blocks, they are never used.
196 std::string getMarkerForCodeBlock(llvm::StringRef Input) {
197  // Count the maximum number of consecutive backticks in \p Input. We need to
198  // start and end the code block with more.
199  unsigned MaxBackticks = 0;
200  unsigned Backticks = 0;
201  for (char C : Input) {
202  if (C == '`') {
203  ++Backticks;
204  continue;
205  }
206  MaxBackticks = std::max(MaxBackticks, Backticks);
207  Backticks = 0;
208  }
209  MaxBackticks = std::max(Backticks, MaxBackticks);
210  // Use the corresponding number of backticks to start and end a code block.
211  return std::string(/*Repeat=*/std::max(3u, MaxBackticks + 1), '`');
212 }
213 
214 // Trims the input and concatenates whitespace blocks into a single ` `.
215 std::string canonicalizeSpaces(llvm::StringRef Input) {
216  llvm::SmallVector<llvm::StringRef> Words;
217  llvm::SplitString(Input, Words);
218  return llvm::join(Words, " ");
219 }
220 
221 std::string renderBlocks(llvm::ArrayRef<std::unique_ptr<Block>> Children,
222  void (Block::*RenderFunc)(llvm::raw_ostream &) const) {
223  std::string R;
224  llvm::raw_string_ostream OS(R);
225 
226  // Trim rulers.
227  Children = Children.drop_while(
228  [](const std::unique_ptr<Block> &C) { return C->isRuler(); });
229  auto Last = llvm::find_if(
230  llvm::reverse(Children),
231  [](const std::unique_ptr<Block> &C) { return !C->isRuler(); });
232  Children = Children.drop_back(Children.end() - Last.base());
233 
234  bool LastBlockWasRuler = true;
235  for (const auto &C : Children) {
236  if (C->isRuler() && LastBlockWasRuler)
237  continue;
238  LastBlockWasRuler = C->isRuler();
239  ((*C).*RenderFunc)(OS);
240  }
241 
242  // Get rid of redundant empty lines introduced in plaintext while imitating
243  // padding in markdown.
244  std::string AdjustedResult;
245  llvm::StringRef TrimmedText(OS.str());
246  TrimmedText = TrimmedText.trim();
247 
248  llvm::copy_if(TrimmedText, std::back_inserter(AdjustedResult),
249  [&TrimmedText](const char &C) {
250  return !llvm::StringRef(TrimmedText.data(),
251  &C - TrimmedText.data() + 1)
252  // We allow at most two newlines.
253  .endswith("\n\n\n");
254  });
255 
256  return AdjustedResult;
257 }
258 
259 // Separates two blocks with extra spacing. Note that it might render strangely
260 // in vscode if the trailing block is a codeblock, see
261 // https://github.com/microsoft/vscode/issues/88416 for details.
262 class Ruler : public Block {
263 public:
264  void renderMarkdown(llvm::raw_ostream &OS) const override {
265  // Note that we need an extra new line before the ruler, otherwise we might
266  // make previous block a title instead of introducing a ruler.
267  OS << "\n---\n";
268  }
269  void renderPlainText(llvm::raw_ostream &OS) const override { OS << '\n'; }
270  std::unique_ptr<Block> clone() const override {
271  return std::make_unique<Ruler>(*this);
272  }
273  bool isRuler() const override { return true; }
274 };
275 
276 class CodeBlock : public Block {
277 public:
278  void renderMarkdown(llvm::raw_ostream &OS) const override {
279  std::string Marker = getMarkerForCodeBlock(Contents);
280  // No need to pad from previous blocks, as they should end with a new line.
281  OS << Marker << Language << '\n' << Contents << '\n' << Marker << '\n';
282  }
283 
284  void renderPlainText(llvm::raw_ostream &OS) const override {
285  // In plaintext we want one empty line before and after codeblocks.
286  OS << '\n' << Contents << "\n\n";
287  }
288 
289  std::unique_ptr<Block> clone() const override {
290  return std::make_unique<CodeBlock>(*this);
291  }
292 
293  CodeBlock(std::string Contents, std::string Language)
294  : Contents(std::move(Contents)), Language(std::move(Language)) {}
295 
296 private:
297  std::string Contents;
298  std::string Language;
299 };
300 
301 // Inserts two spaces after each `\n` to indent each line. First line is not
302 // indented.
303 std::string indentLines(llvm::StringRef Input) {
304  assert(!Input.endswith("\n") && "Input should've been trimmed.");
305  std::string IndentedR;
306  // We'll add 2 spaces after each new line.
307  IndentedR.reserve(Input.size() + Input.count('\n') * 2);
308  for (char C : Input) {
309  IndentedR += C;
310  if (C == '\n')
311  IndentedR.append(" ");
312  }
313  return IndentedR;
314 }
315 
316 class Heading : public Paragraph {
317 public:
318  Heading(size_t Level) : Level(Level) {}
319  void renderMarkdown(llvm::raw_ostream &OS) const override {
320  OS << std::string(Level, '#') << ' ';
322  }
323 
324 private:
325  size_t Level;
326 };
327 
328 } // namespace
329 
330 std::string Block::asMarkdown() const {
331  std::string R;
332  llvm::raw_string_ostream OS(R);
334  return llvm::StringRef(OS.str()).trim().str();
335 }
336 
337 std::string Block::asPlainText() const {
338  std::string R;
339  llvm::raw_string_ostream OS(R);
341  return llvm::StringRef(OS.str()).trim().str();
342 }
343 
344 void Paragraph::renderMarkdown(llvm::raw_ostream &OS) const {
345  bool NeedsSpace = false;
346  bool HasChunks = false;
347  for (auto &C : Chunks) {
348  if (C.SpaceBefore || NeedsSpace)
349  OS << " ";
350  switch (C.Kind) {
351  case Chunk::PlainText:
352  OS << renderText(C.Contents, !HasChunks);
353  break;
354  case Chunk::InlineCode:
355  OS << renderInlineBlock(C.Contents);
356  break;
357  }
358  HasChunks = true;
359  NeedsSpace = C.SpaceAfter;
360  }
361  // Paragraphs are translated into markdown lines, not markdown paragraphs.
362  // Therefore it only has a single linebreak afterwards.
363  // VSCode requires two spaces at the end of line to start a new one.
364  OS << " \n";
365 }
366 
367 std::unique_ptr<Block> Paragraph::clone() const {
368  return std::make_unique<Paragraph>(*this);
369 }
370 
371 /// Choose a marker to delimit `Text` from a prioritized list of options.
372 /// This is more readable than escaping for plain-text.
373 llvm::StringRef chooseMarker(llvm::ArrayRef<llvm::StringRef> Options,
374  llvm::StringRef Text) {
375  // Prefer a delimiter whose characters don't appear in the text.
376  for (llvm::StringRef S : Options)
377  if (Text.find_first_of(S) == llvm::StringRef::npos)
378  return S;
379  return Options.front();
380 }
381 
382 void Paragraph::renderPlainText(llvm::raw_ostream &OS) const {
383  bool NeedsSpace = false;
384  for (auto &C : Chunks) {
385  if (C.SpaceBefore || NeedsSpace)
386  OS << " ";
387  llvm::StringRef Marker = "";
388  if (C.Preserve && C.Kind == Chunk::InlineCode)
389  Marker = chooseMarker({"`", "'", "\""}, C.Contents);
390  OS << Marker << C.Contents << Marker;
391  NeedsSpace = C.SpaceAfter;
392  }
393  OS << '\n';
394 }
395 
// The constructor and destructor are defaulted here rather than at their
// declaration.
// NOTE(review): presumably because the item type (Document) is not complete
// where BulletList is declared - confirm against Markup.h.
396 BulletList::BulletList() = default;
397 BulletList::~BulletList() = default;
398 
399 void BulletList::renderMarkdown(llvm::raw_ostream &OS) const {
400  for (auto &D : Items) {
401  // Instead of doing this we might prefer passing Indent to children to get
402  // rid of the copies, if it turns out to be a bottleneck.
403  OS << "- " << indentLines(D.asMarkdown()) << '\n';
404  }
405  // We need a new line after list to terminate it in markdown.
406  OS << '\n';
407 }
408 
409 void BulletList::renderPlainText(llvm::raw_ostream &OS) const {
410  for (auto &D : Items) {
411  // Instead of doing this we might prefer passing Indent to children to get
412  // rid of the copies, if it turns out to be a bottleneck.
413  OS << "- " << indentLines(D.asPlainText()) << '\n';
414  }
415 }
416 
418  if (!Chunks.empty())
419  Chunks.back().SpaceAfter = true;
420  return *this;
421 }
422 
423 Paragraph &Paragraph::appendText(llvm::StringRef Text) {
424  std::string Norm = canonicalizeSpaces(Text);
425  if (Norm.empty())
426  return *this;
427  Chunks.emplace_back();
428  Chunk &C = Chunks.back();
429  C.Contents = std::move(Norm);
430  C.Kind = Chunk::PlainText;
431  C.SpaceBefore = llvm::isSpace(Text.front());
432  C.SpaceAfter = llvm::isSpace(Text.back());
433  return *this;
434 }
435 
436 Paragraph &Paragraph::appendCode(llvm::StringRef Code, bool Preserve) {
437  bool AdjacentCode =
438  !Chunks.empty() && Chunks.back().Kind == Chunk::InlineCode;
439  std::string Norm = canonicalizeSpaces(std::move(Code));
440  if (Norm.empty())
441  return *this;
442  Chunks.emplace_back();
443  Chunk &C = Chunks.back();
444  C.Contents = std::move(Norm);
445  C.Kind = Chunk::InlineCode;
446  C.Preserve = Preserve;
447  // Disallow adjacent code spans without spaces, markdown can't render them.
448  C.SpaceBefore = AdjacentCode;
449  return *this;
450 }
451 
452 std::unique_ptr<Block> BulletList::clone() const {
453  return std::make_unique<BulletList>(*this);
454 }
455 
457  Items.emplace_back();
458  return Items.back();
459 }
460 
462  Children.clear();
463  for (const auto &C : Other.Children)
464  Children.push_back(C->clone());
465  return *this;
466 }
467 
469  std::move(Other.Children.begin(), Other.Children.end(),
470  std::back_inserter(Children));
471 }
472 
474  Children.push_back(std::make_unique<Paragraph>());
475  return *static_cast<Paragraph *>(Children.back().get());
476 }
477 
478 void Document::addRuler() { Children.push_back(std::make_unique<Ruler>()); }
479 
480 void Document::addCodeBlock(std::string Code, std::string Language) {
481  Children.emplace_back(
482  std::make_unique<CodeBlock>(std::move(Code), std::move(Language)));
483 }
484 
485 std::string Document::asMarkdown() const {
486  return renderBlocks(Children, &Block::renderMarkdown);
487 }
488 
489 std::string Document::asPlainText() const {
490  return renderBlocks(Children, &Block::renderPlainText);
491 }
492 
494  Children.emplace_back(std::make_unique<BulletList>());
495  return *static_cast<BulletList *>(Children.back().get());
496 }
497 
499  assert(Level > 0);
500  Children.emplace_back(std::make_unique<Heading>(Level));
501  return *static_cast<Paragraph *>(Children.back().get());
502 }
503 } // namespace markup
504 } // namespace clangd
505 } // namespace clang
clang::clangd::markup::Document::asPlainText
std::string asPlainText() const
Doesn't contain any trailing newlines.
Definition: Markup.cpp:489
clang::clangd::markup::BulletList
Represents a sequence of one or more documents.
Definition: Markup.h:80
clang::clangd::markup::Paragraph
Represents parts of the markup that can contain strings, like inline code, code block or plain text.
Definition: Markup.h:43
clang::clangd::markup::BulletList::clone
std::unique_ptr< Block > clone() const override
Definition: Markup.cpp:452
clang::clangd::markup::Block::renderMarkdown
virtual void renderMarkdown(llvm::raw_ostream &OS) const =0
clang::clangd::markup::Paragraph::appendText
Paragraph & appendText(llvm::StringRef Text)
Append plain text to the end of the string.
Definition: Markup.cpp:423
clang::clangd::markup::Document::addCodeBlock
void addCodeBlock(std::string Code, std::string Language="cpp")
Adds a block of code.
Definition: Markup.cpp:480
clang::clangd::markup::Block
Holds text and knows how to lay it out.
Definition: Markup.h:28
clang::clangd::markup::Document::addParagraph
Paragraph & addParagraph()
Adds a semantical block that will be separate from others.
Definition: Markup.cpp:473
Text
std::string Text
Definition: HTMLGenerator.cpp:80
clang::clangd::markup::Document
A format-agnostic representation for structured text.
Definition: Markup.h:97
clang::tidy::cppcoreguidelines::join
static std::string join(ArrayRef< SpecialMemberFunctionsCheck::SpecialMemberFunctionKind > SMFS, llvm::StringRef AndOr)
Definition: SpecialMemberFunctionsCheck.cpp:78
clang::clangd::markup::Document::addBulletList
BulletList & addBulletList()
Definition: Markup.cpp:493
clang::clangd::markup::Document::addRuler
void addRuler()
Inserts a horizontal separator to the document.
Definition: Markup.cpp:478
clang::clangd::markup::BulletList::renderMarkdown
void renderMarkdown(llvm::raw_ostream &OS) const override
Definition: Markup.cpp:399
clang::clangd::markup::Block::asMarkdown
std::string asMarkdown() const
Definition: Markup.cpp:330
clang::clangd::markup::Document::asMarkdown
std::string asMarkdown() const
Doesn't contain any trailing newlines.
Definition: Markup.cpp:485
Children
std::vector< std::unique_ptr< HTMLNode > > Children
Definition: HTMLGenerator.cpp:91
clang::clangd::markup::Paragraph::appendSpace
Paragraph & appendSpace()
Ensure there is space between the surrounding chunks.
Definition: Markup.cpp:417
clang::clangd::markup::Paragraph::renderMarkdown
void renderMarkdown(llvm::raw_ostream &OS) const override
Definition: Markup.cpp:344
ns1::ns2::A
@ A
Definition: CategoricalFeature.h:3
Code
std::string Code
Definition: FindTargetTests.cpp:67
ns1::ns2::D
@ D
Definition: CategoricalFeature.h:3
clang::clangd::markup::Document::addHeading
Paragraph & addHeading(size_t Level)
Heading is a special type of paragraph that will be prepended with Level many '#'s in markdown.
Definition: Markup.cpp:498
Markup.h
clang::clangd::markup::BulletList::renderPlainText
void renderPlainText(llvm::raw_ostream &OS) const override
Definition: Markup.cpp:409
clang::clangd::markup::Document::append
void append(Document Other)
Definition: Markup.cpp:468
clang::clangd::markup::Document::operator=
Document & operator=(const Document &)
Definition: Markup.cpp:461
clang::clangd::markup::Paragraph::appendCode
Paragraph & appendCode(llvm::StringRef Code, bool Preserve=false)
Append inline code, this translates to the ` block in markdown.
Definition: Markup.cpp:436
clang::clangd::markup::Block::asPlainText
std::string asPlainText() const
Definition: Markup.cpp:337
clang::clangd::markup::Paragraph::clone
std::unique_ptr< Block > clone() const override
Definition: Markup.cpp:367
clang::clangd::markup::Block::renderPlainText
virtual void renderPlainText(llvm::raw_ostream &OS) const =0
clang::clangd::markup::BulletList::BulletList
BulletList()
C
const Criteria C
Definition: FunctionCognitiveComplexityCheck.cpp:93
clang
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Definition: ApplyReplacements.h:27
OS
llvm::raw_string_ostream OS
Definition: TraceTests.cpp:160
clang::clangd::markup::BulletList::addItem
class Document & addItem()
Definition: Markup.cpp:456
clang::clangd::markup::Paragraph::renderPlainText
void renderPlainText(llvm::raw_ostream &OS) const override
Definition: Markup.cpp:382
clang::clangd::markup::BulletList::~BulletList
~BulletList()
clang::clangd::markup::chooseMarker
llvm::StringRef chooseMarker(llvm::ArrayRef< llvm::StringRef > Options, llvm::StringRef Text)
Choose a marker to delimit Text from a prioritized list of options.
Definition: Markup.cpp:373