clang-tools  12.0.0git
Dex.cpp
Go to the documentation of this file.
1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Dex.h"
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
12 #include "Quality.h"
13 #include "index/Index.h"
14 #include "index/dex/Iterator.h"
15 #include "support/Logger.h"
16 #include "support/Trace.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/ScopedPrinter.h"
19 #include <algorithm>
20 #include <queue>
21 
22 namespace clang {
23 namespace clangd {
24 namespace dex {
25 
26 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
27  RelationSlab Rels) {
28  auto Size = Symbols.bytes() + Refs.bytes();
29  // There is no need to include "Rels" in Data because the relations are self-
30  // contained, without references into a backing store.
31  auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
32  return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
33  Size);
34 }
35 
36 namespace {
37 
38 // Mark symbols which are can be used for code completion.
39 const Token RestrictedForCodeCompletion =
40  Token(Token::Kind::Sentinel, "Restricted For Code Completion");
41 
42 // Helper to efficiently assemble the inverse index (token -> matching docs).
43 // The output is a nice uniform structure keyed on Token, but constructing
44 // the Token object every time we want to insert into the map is wasteful.
45 // Instead we have various maps keyed on things that are cheap to compute,
46 // and produce the Token keys once at the end.
47 class IndexBuilder {
48  llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
49  std::vector<DocID> RestrictedCCDocs;
50  llvm::StringMap<std::vector<DocID>> TypeDocs;
51  llvm::StringMap<std::vector<DocID>> ScopeDocs;
52  llvm::StringMap<std::vector<DocID>> ProximityDocs;
53  std::vector<Trigram> TrigramScratch;
54 
55 public:
56  // Add the tokens which are given symbol's characteristics.
57  // This includes fuzzy matching trigrams, symbol's scope, etc.
58  // FIXME(kbobyrev): Support more token types:
59  // * Namespace proximity
60  void add(const Symbol &Sym, DocID D) {
61  generateIdentifierTrigrams(Sym.Name, TrigramScratch);
62  for (Trigram T : TrigramScratch)
63  TrigramDocs[T].push_back(D);
64  ScopeDocs[Sym.Scope].push_back(D);
65  if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
66  for (const auto &ProximityURI :
68  ProximityDocs[ProximityURI].push_back(D);
70  RestrictedCCDocs.push_back(D);
71  if (!Sym.Type.empty())
72  TypeDocs[Sym.Type].push_back(D);
73  }
74 
75  // Assemble the final compressed posting lists for the added symbols.
76  llvm::DenseMap<Token, PostingList> build() {
77  llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
78  TrigramDocs.size() +
79  RestrictedCCDocs.size() +
80  TypeDocs.size() +
81  ScopeDocs.size() +
82  ProximityDocs.size());
83  for (const auto &E : TrigramDocs)
84  Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
85  for (const auto &E : TypeDocs)
86  Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second);
87  for (const auto &E : ScopeDocs)
88  Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second);
89  for (const auto &E : ProximityDocs)
90  Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second);
91  if (!RestrictedCCDocs.empty())
92  Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs);
93  return Result;
94  }
95 };
96 
97 } // namespace
98 
99 void Dex::buildIndex() {
100  this->Corpus = dex::Corpus(Symbols.size());
101  std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
102 
103  for (size_t I = 0; I < Symbols.size(); ++I) {
104  const Symbol *Sym = Symbols[I];
105  LookupTable[Sym->ID] = Sym;
106  ScoredSymbols[I] = {quality(*Sym), Sym};
107  }
108 
109  // Symbols are sorted by symbol qualities so that items in the posting lists
110  // are stored in the descending order of symbol quality.
111  llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
112 
113  // SymbolQuality was empty up until now.
114  SymbolQuality.resize(Symbols.size());
115  // Populate internal storage using Symbol + Score pairs.
116  for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
117  SymbolQuality[I] = ScoredSymbols[I].first;
118  Symbols[I] = ScoredSymbols[I].second;
119  }
120 
121  // Build posting lists for symbols.
122  IndexBuilder Builder;
123  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
124  Builder.add(*Symbols[SymbolRank], SymbolRank);
125  InvertedIndex = Builder.build();
126 }
127 
128 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
129  auto It = InvertedIndex.find(Tok);
130  return It == InvertedIndex.end() ? Corpus.none()
131  : It->second.iterator(&It->first);
132 }
133 
134 // Constructs BOOST iterators for Path Proximities.
135 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
136  llvm::ArrayRef<std::string> ProximityPaths) const {
137  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
138  // Deduplicate parent URIs extracted from the ProximityPaths.
139  llvm::StringSet<> ParentURIs;
140  llvm::StringMap<SourceParams> Sources;
141  for (const auto &Path : ProximityPaths) {
142  Sources[Path] = SourceParams();
143  auto PathURI = URI::create(Path);
144  const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
145  for (const auto &ProximityURI : PathProximityURIs)
146  ParentURIs.insert(ProximityURI);
147  }
148  // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
149  // for all parameters except for Proximity Path distance signal.
150  SymbolRelevanceSignals PathProximitySignals;
151  // DistanceCalculator will find the shortest distance from ProximityPaths to
152  // any URI extracted from the ProximityPaths.
153  URIDistance DistanceCalculator(Sources);
154  PathProximitySignals.FileProximityMatch = &DistanceCalculator;
155  // Try to build BOOST iterator for each Proximity Path provided by
156  // ProximityPaths. Boosting factor should depend on the distance to the
157  // Proximity Path: the closer processed path is, the higher boosting factor.
158  for (const auto &ParentURI : ParentURIs.keys()) {
159  // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
160  auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
161  if (It->kind() != Iterator::Kind::False) {
162  PathProximitySignals.SymbolURI = ParentURI;
163  BoostingIterators.push_back(
164  Corpus.boost(std::move(It), PathProximitySignals.evaluate()));
165  }
166  }
167  BoostingIterators.push_back(Corpus.all());
168  return Corpus.unionOf(std::move(BoostingIterators));
169 }
170 
171 // Constructs BOOST iterators for preferred types.
172 std::unique_ptr<Iterator>
173 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
174  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
175  SymbolRelevanceSignals PreferredTypeSignals;
176  PreferredTypeSignals.TypeMatchesPreferred = true;
177  auto Boost = PreferredTypeSignals.evaluate();
178  for (const auto &T : Types)
179  BoostingIterators.push_back(
180  Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
181  BoostingIterators.push_back(Corpus.all());
182  return Corpus.unionOf(std::move(BoostingIterators));
183 }
184 
185 /// Constructs iterators over tokens extracted from the query and exhausts it
186 /// while applying Callback to each symbol in the order of decreasing quality
187 /// of the matched symbols.
189  llvm::function_ref<void(const Symbol &)> Callback) const {
190  assert(!StringRef(Req.Query).contains("::") &&
191  "There must be no :: in query.");
192  trace::Span Tracer("Dex fuzzyFind");
193  FuzzyMatcher Filter(Req.Query);
194  // For short queries we use specialized trigrams that don't yield all results.
195  // Prevent clients from postfiltering them for longer queries.
196  bool More = !Req.Query.empty() && Req.Query.size() < 3;
197 
198  std::vector<std::unique_ptr<Iterator>> Criteria;
199  const auto TrigramTokens = generateQueryTrigrams(Req.Query);
200 
201  // Generate query trigrams and construct AND iterator over all query
202  // trigrams.
203  std::vector<std::unique_ptr<Iterator>> TrigramIterators;
204  for (const auto &Trigram : TrigramTokens)
205  TrigramIterators.push_back(iterator(Trigram));
206  Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
207 
208  // Generate scope tokens for search query.
209  std::vector<std::unique_ptr<Iterator>> ScopeIterators;
210  for (const auto &Scope : Req.Scopes)
211  ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
212  if (Req.AnyScope)
213  ScopeIterators.push_back(
214  Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
215  Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
216 
217  // Add proximity paths boosting (all symbols, some boosted).
218  Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
219  // Add boosting for preferred types.
220  Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
221 
223  Criteria.push_back(iterator(RestrictedForCodeCompletion));
224 
225  // Use TRUE iterator if both trigrams and scopes from the query are not
226  // present in the symbol index.
227  auto Root = Corpus.intersect(move(Criteria));
228  // Retrieve more items than it was requested: some of the items with high
229  // final score might not be retrieved otherwise.
230  // FIXME(kbobyrev): Tune this ratio.
231  if (Req.Limit)
232  Root = Corpus.limit(move(Root), *Req.Limit * 100);
233  SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
234  vlog("Dex query tree: {0}", *Root);
235 
236  using IDAndScore = std::pair<DocID, float>;
237  std::vector<IDAndScore> IDAndScores = consume(*Root);
238 
239  auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
240  return LHS.second > RHS.second;
241  };
243  Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
244  for (const auto &IDAndScore : IDAndScores) {
245  const DocID SymbolDocID = IDAndScore.first;
246  const auto *Sym = Symbols[SymbolDocID];
247  const llvm::Optional<float> Score = Filter.match(Sym->Name);
248  if (!Score)
249  continue;
250  // Combine Fuzzy Matching score, precomputed symbol quality and boosting
251  // score for a cumulative final symbol score.
252  const float FinalScore =
253  (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
254  // If Top.push(...) returns true, it means that it had to pop an item. In
255  // this case, it is possible to retrieve more symbols.
256  if (Top.push({SymbolDocID, FinalScore}))
257  More = true;
258  }
259 
260  // Apply callback to the top Req.Limit items in the descending
261  // order of cumulative score.
262  for (const auto &Item : std::move(Top).items())
263  Callback(*Symbols[Item.first]);
264  return More;
265 }
266 
267 void Dex::lookup(const LookupRequest &Req,
268  llvm::function_ref<void(const Symbol &)> Callback) const {
269  trace::Span Tracer("Dex lookup");
270  for (const auto &ID : Req.IDs) {
271  auto I = LookupTable.find(ID);
272  if (I != LookupTable.end())
273  Callback(*I->second);
274  }
275 }
276 
277 bool Dex::refs(const RefsRequest &Req,
278  llvm::function_ref<void(const Ref &)> Callback) const {
279  trace::Span Tracer("Dex refs");
280  uint32_t Remaining =
281  Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
282  for (const auto &ID : Req.IDs)
283  for (const auto &Ref : Refs.lookup(ID)) {
284  if (!static_cast<int>(Req.Filter & Ref.Kind))
285  continue;
286  if (Remaining == 0)
287  return true; // More refs were available.
288  --Remaining;
289  Callback(Ref);
290  }
291  return false; // We reported all refs.
292 }
293 
295  const RelationsRequest &Req,
296  llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
297  trace::Span Tracer("Dex relations");
298  uint32_t Remaining =
299  Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
300  for (const SymbolID &Subject : Req.Subjects) {
301  LookupRequest LookupReq;
302  auto It = Relations.find(
303  std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
304  if (It != Relations.end()) {
305  for (const auto &Object : It->second) {
306  if (Remaining > 0) {
307  --Remaining;
308  LookupReq.IDs.insert(Object);
309  }
310  }
311  }
312  lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
313  }
314 }
315 
316 size_t Dex::estimateMemoryUsage() const {
317  size_t Bytes = Symbols.size() * sizeof(const Symbol *);
318  Bytes += SymbolQuality.size() * sizeof(float);
319  Bytes += LookupTable.getMemorySize();
320  Bytes += InvertedIndex.getMemorySize();
321  for (const auto &TokenToPostingList : InvertedIndex)
322  Bytes += TokenToPostingList.second.bytes();
323  Bytes += Refs.getMemorySize();
324  Bytes += Relations.getMemorySize();
325  return Bytes + BackingDataSize;
326 }
327 
328 std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
329  std::vector<std::string> Result;
330  auto ParsedURI = URI::parse(URIPath);
331  assert(ParsedURI &&
332  "Non-empty argument of generateProximityURIs() should be a valid "
333  "URI.");
334  llvm::StringRef Body = ParsedURI->body();
335  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
336  // size of resulting vector. Some projects might want to have higher limit if
337  // the file hierarchy is deeper. For the generic case, it would be useful to
338  // calculate Limit in the index build stage by calculating the maximum depth
339  // of the project source tree at runtime.
340  size_t Limit = 5;
341  // Insert original URI before the loop: this would save a redundant iteration
342  // with a URI parse.
343  Result.emplace_back(ParsedURI->toString());
344  while (!Body.empty() && --Limit > 0) {
345  // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
346  // could be optimized.
347  Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
348  if (!Body.empty())
349  Result.emplace_back(
350  URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
351  }
352  return Result;
353 }
354 
355 } // namespace dex
356 } // namespace clangd
357 } // namespace clang
size_t bytes() const
Definition: Symbol.h:193
An immutable symbol container that stores a set of symbols.
Definition: Symbol.h:177
llvm::DenseSet< SymbolID > IDs
Definition: Index.h:68
bool AnyScope
If set to true, allow symbols from any scope.
Definition: Index.h:39
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator >> Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
Definition: Iterator.cpp:358
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
Definition: Index.h:44
An efficient structure of storing large set of symbol references in memory.
Definition: Ref.h:104
This defines Dex - a symbol index implementation based on query iterators over symbol tokens...
llvm::Optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
Definition: Index.h:80
llvm::DenseSet< SymbolID > IDs
Definition: Index.h:64
Represents a symbol occurrence in the source file.
Definition: Ref.h:87
Path Proximity URI to symbol declaration.
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
Definition: Function.h:28
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
Definition: Symbol.h:44
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
Definition: Dex.cpp:188
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
Definition: Index.h:49
void vlog(const char *Fmt, Ts &&... Vals)
Definition: Logger.h:67
Represents trigram used for fuzzy search of unqualified symbol names.
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
Definition: Index.h:36
SymbolID ID
The ID of the symbol.
Definition: Symbol.h:38
llvm::Optional< float > Score
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:350
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator >> Children) const
Returns OR Iterator which performs the union of the PostingLists of its children. ...
Definition: Iterator.cpp:388
Whether or not this symbol is meant to be used for the code completion.
Definition: Symbol.h:119
std::vector< std::string > generateProximityURIs(llvm::StringRef URIPath)
Returns Search Token for a number of parent directories of given Path.
Definition: Dex.cpp:328
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:101
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:81
SymbolFlag Flags
Definition: Symbol.h:128
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab)
Builds an index from slabs. The index takes ownership of the slab.
Definition: Dex.cpp:26
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
Definition: Iterator.cpp:434
static const char * toString(OffsetEncoding OE)
Definition: Protocol.cpp:1170
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
Definition: Iterator.cpp:421
std::string Path
A typedef to represent a file path.
Definition: Path.h:20
std::string Query
A query string for the fuzzy find.
Definition: Index.h:29
llvm::Optional< float > match(llvm::StringRef Word)
Definition: FuzzyMatch.cpp:92
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.
Definition: Symbol.h:56
bool push(value_type &&V)
Definition: Quality.h:160
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:46
void relations(const RelationsRequest &Req, llvm::function_ref< void(const SymbolID &, const Symbol &)> Callback) const override
Definition: Dex.cpp:294
SymbolSlab Symbols
llvm::DenseSet< SymbolID > Subjects
Definition: Index.h:77
void lookup(const LookupRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Looks up symbols with any of the given symbol IDs and applies Callback on each matched symbol...
Definition: Dex.cpp:267
llvm::StringRef SymbolURI
These are used to calculate proximity between the index symbol and the query.
Definition: Quality.h:102
CodeCompletionBuilder Builder
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
Definition: URI.cpp:196
The class presents a C++ symbol, e.g.
Definition: Symbol.h:36
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
Definition: Index.h:47
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
Definition: Symbol.h:42
std::unique_ptr< trace::EventTracer > Tracer
Definition: TraceTests.cpp:163
llvm::Optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
Definition: Index.h:73
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
A URI describes the location of a source file.
Definition: URI.h:28
llvm::Optional< uint32_t > Limit
The number of top candidates to return.
Definition: Index.h:42
const Expr * E
size_t bytes() const
Definition: Ref.h:122
Internal Token type for invalid/special tokens, e.g.
static llvm::Expected< URI > parse(llvm::StringRef Uri)
Parse a URI string "<scheme>:[//<authority>/]<path>".
Definition: URI.cpp:163
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
Definition: Iterator.cpp:425
RefSlab Refs
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0...
Definition: Iterator.cpp:417
size_t estimateMemoryUsage() const override
Returns estimated size of index (in bytes).
Definition: Dex.cpp:316
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
Definition: Symbol.h:85
Records an event whose duration is the lifetime of the Span object.
Definition: Trace.h:135
Type of symbol (see Symbol::Type).
Attributes of a symbol-query pair that affect how much we like it.
Definition: Quality.h:87
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
Definition: Trace.h:154
RefKind Kind
Definition: Ref.h:90
bool refs(const RefsRequest &Req, llvm::function_ref< void(const Ref &)> Callback) const override
Finds all occurrences (e.g.
Definition: Dex.cpp:277
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
Definition: Symbol.cpp:29
TopN<T> is a lossy container that preserves only the "best" N elements.
Definition: Quality.h:152