clang-tools  14.0.0git
cppreference_parser.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #===- cppreference_parser.py - ------------------------------*- python -*--===#
3 #
4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 # See https://llvm.org/LICENSE.txt for license information.
6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 #
8 #===------------------------------------------------------------------------===#
9 
10 from bs4 import BeautifulSoup, NavigableString
11 
12 import collections
13 import multiprocessing
14 import os
15 import re
16 import signal
17 import sys
18 
19 
class Symbol:
  """A symbol name together with its namespace and the headers providing it."""

  def __init__(self, name, namespace, headers):
    # A list of the headers that correspond to this symbol.
    self.headers = headers
    # Namespace of the symbol, including the trailing "::" (e.g. "std::").
    # "" denotes the global scope; None is used for C symbols.
    self.namespace = namespace
    # Unqualified symbol name, e.g. "move".
    self.name = name
30 
31 
def _HasClass(tag, *classes):
  """Return True if `tag` carries at least one of the given CSS classes.

  `tag` only needs a dict-style get(); a missing 'class' attribute is treated
  as an empty class list.
  """
  return any(css_class in classes for css_class in tag.get('class', []))
37 
38 
def _ParseSymbolPage(symbol_page_html, symbol_name):
  """Parse symbol page and retrieve the include headers defined in this page.

  The symbol page provides header for the symbol, specifically in
  "Defined in header <header>" section. An example:

  <tr class="t-dsc-header">
    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
  </td></tr>

  Args:
    symbol_page_html: HTML text of the symbol page.
    symbol_name: symbol name to look for in the declaration rows.

  Returns:
    A set of header strings. These are the headers attached to declaration
    rows that name `symbol_name`; if no row names it, every header mentioned
    in a "Defined in header" row of the page is returned instead.
  """
  headers = set()
  all_headers = set()

  soup = BeautifulSoup(symbol_page_html, "html.parser")
  # Rows in table are like:
  #   Defined in header <foo>      .t-dsc-header
  #   Defined in header <bar>      .t-dsc-header
  #   decl1                        .t-dcl
  #   Defined in header <baz>      .t-dsc-header
  #   decl2                        .t-dcl
  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
    current_headers = []
    was_decl = False
    for row in table.select('tr'):
      if _HasClass(row, 't-dcl', 't-dsc'):
        was_decl = True
        # Symbols are in the first cell. find() returns None when the row has
        # no <td> at all; such a row cannot name the symbol, so skip it rather
        # than crash on the attribute access.
        first_cell = row.find('td')
        if first_cell is None:
          continue
        if symbol_name not in first_cell.stripped_strings:
          continue
        headers.update(current_headers)
      elif _HasClass(row, 't-dsc-header'):
        # If we saw a decl since the last header, this is a new block of
        # headers for a new block of decls.
        if was_decl:
          current_headers = []
        was_decl = False
        # There are also .t-dsc-header for "defined in namespace".
        if "Defined in header " not in row.text:
          continue
        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
        for header_code in row.find_all("code"):
          current_headers.append(header_code.text)
          all_headers.add(header_code.text)
  # If the symbol was never named, consider all named headers.
  return headers or all_headers
86 
87 
def _ParseIndexPage(index_page_html):
  """Extract the symbol entries listed on an index page.

  The index page lists all std symbols and hrefs to their detailed pages
  (which contain the defined header). An example:

  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>

  Returns a list of tuples (symbol_name, relative_path_to_symbol_page, variant).
  """
  soup = BeautifulSoup(index_page_html, "html.parser")
  entries = []
  for link in soup.select("a[title]"):
    name_tag = link.find("tt")
    if not name_tag:
      continue
    # An annotated symbol such as "acos<>() (std::complex)" is flagged as a
    # variant: these tend to be overloads, and the primary is more useful.
    # This accidentally accepts begin/end despite the (iterator) caption: the
    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
    trailing_text = link.next_sibling
    is_variant = isinstance(trailing_text, NavigableString) and "(" in trailing_text
    # Strip any trailing <>() from the displayed name.
    bare_name = name_tag.text.rstrip("<>()")
    entries.append((bare_name, link["href"], is_variant))
  return entries
112 
113 
def _ReadSymbolPage(path, name):
  """Read the symbol page at `path` and return the headers found for `name`.

  Args:
    path: filesystem path of the symbol page.
    name: symbol name passed on to _ParseSymbolPage.

  Returns:
    Whatever _ParseSymbolPage returns for the file's contents.
  """
  # Decode explicitly instead of relying on the locale-dependent default
  # encoding of open(). NOTE(review): assumes the pages are UTF-8 -- confirm
  # against the cppreference archive in use.
  with open(path, encoding="utf-8") as f:
    return _ParseSymbolPage(f.read(), name)
117 
118 
def _GetSymbols(pool, root_dir, index_page_name, namespace):
  """Get all symbols listed in the index page. All symbols should be in the
  given namespace.

  Args:
    pool: worker pool providing apply_async(); symbol pages are parsed on it.
    root_dir: directory containing the index page; symbol page hrefs are
      joined onto it.
    index_page_name: file name of the index page inside `root_dir`.
    namespace: namespace stored on every returned Symbol.

  Returns a list of Symbols, sorted by name, each with a sorted header list.
  """

  # Workflow steps:
  #   1. Parse index page which lists all symbols to get symbol
  #      name (unqualified name) and its href link to the symbol page which
  #      contains the defined header.
  #   2. Parse the symbol page to get the defined header.
  index_page_path = os.path.join(root_dir, index_page_name)
  # Decode explicitly instead of relying on the locale-dependent default
  # encoding. NOTE(review): assumes the index page is UTF-8 -- confirm.
  with open(index_page_path, "r", encoding="utf-8") as f:
    index_page_html = f.read()

  # Read each symbol page in parallel.
  results = []  # (symbol_name, promise of [header...])
  for symbol_name, symbol_page_path, variant in _ParseIndexPage(index_page_html):
    # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
    # FIXME: use these as a fallback rather than ignoring entirely.
    if variant:
      continue
    path = os.path.join(root_dir, symbol_page_path)
    results.append((symbol_name,
                    pool.apply_async(_ReadSymbolPage, (path, symbol_name))))

  # Build map from symbol name to a set of headers.
  symbol_headers = collections.defaultdict(set)
  for symbol_name, lazy_headers in results:
    symbol_headers[symbol_name].update(lazy_headers.get())

  # Sort the headers too, so the result does not depend on set iteration order.
  return [Symbol(name, namespace, sorted(headers))
          for name, headers in sorted(symbol_headers.items(),
                                      key=lambda t: t[0])]
153 
154 
def _IgnoreSigint():
  """Pool worker initializer: make the worker process ignore SIGINT (Ctrl-C)."""
  signal.signal(signal.SIGINT, signal.SIG_IGN)


def GetSymbols(parse_pages):
  """Get all symbols by parsing the given pages.

  Args:
    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)

  Returns:
    A list with the symbols of every page, in the order the pages were given.
  """
  symbols = []
  # Run many workers to process individual symbol pages under the symbol index.
  # Don't allow workers to capture Ctrl-C.
  # The initializer is a module-level function rather than a lambda because it
  # must be picklable when worker processes are started with "spawn".
  pool = multiprocessing.Pool(initializer=_IgnoreSigint)
  try:
    for root_dir, page_name, namespace in parse_pages:
      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
  finally:
    # Always shut the workers down, including when parsing raised.
    pool.terminate()
    pool.join()
  return symbols
cppreference_parser.Symbol.headers
headers
Definition: cppreference_parser.py:29
cppreference_parser.Symbol
Definition: cppreference_parser.py:20
cppreference_parser.Symbol.namespace
namespace
Definition: cppreference_parser.py:27
cppreference_parser.GetSymbols
def GetSymbols(parse_pages)
Definition: cppreference_parser.py:155
set
set(LLVM_LINK_COMPONENTS Support) add_clang_library(clangApplyReplacements lib/Tooling/ApplyReplacements.cpp) clang_target_link_libraries(clangApplyReplacements PRIVATE clangAST clangBasic clangRewrite clangToolingCore clangToolingRefactoring) include_directories($
Definition: clang-apply-replacements/CMakeLists.txt:1
cppreference_parser.Symbol.__init__
def __init__(self, name, namespace, headers)
Definition: cppreference_parser.py:22
cppreference_parser.Symbol.name
name
Definition: cppreference_parser.py:24