clang-tools  10.0.0svn
Dex.cpp
Go to the documentation of this file.
1 //===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Dex.h"
10 #include "FileDistance.h"
11 #include "FuzzyMatch.h"
12 #include "Logger.h"
13 #include "Quality.h"
14 #include "Trace.h"
15 #include "index/Index.h"
16 #include "index/dex/Iterator.h"
17 #include "llvm/ADT/StringSet.h"
18 #include "llvm/Support/ScopedPrinter.h"
19 #include <algorithm>
20 #include <queue>
21 
22 namespace clang {
23 namespace clangd {
24 namespace dex {
25 
26 std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
27  RelationSlab Rels) {
28  auto Size = Symbols.bytes() + Refs.bytes();
29  // There is no need to include "Rels" in Data because the relations are self-
30  // contained, without references into a backing store.
31  auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
32  return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
33  Size);
34 }
35 
36 namespace {
37 
38 // Mark symbols which are can be used for code completion.
39 const Token RestrictedForCodeCompletion =
40  Token(Token::Kind::Sentinel, "Restricted For Code Completion");
41 
42 // Returns the tokens which are given symbol's characteristics. Currently, the
43 // generated tokens only contain fuzzy matching trigrams and symbol's scope,
44 // but in the future this will also return path proximity tokens and other
45 // types of tokens such as symbol type (if applicable).
46 // Returns the tokens which are given symbols's characteristics. For example,
47 // trigrams and scopes.
48 // FIXME(kbobyrev): Support more token types:
49 // * Namespace proximity
50 std::vector<Token> generateSearchTokens(const Symbol &Sym) {
51  std::vector<Token> Result = generateIdentifierTrigrams(Sym.Name);
52  Result.emplace_back(Token::Kind::Scope, Sym.Scope);
53  // Skip token generation for symbols with unknown declaration location.
54  if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
55  for (const auto &ProximityURI :
57  Result.emplace_back(Token::Kind::ProximityURI, ProximityURI);
59  Result.emplace_back(RestrictedForCodeCompletion);
60  if (!Sym.Type.empty())
61  Result.emplace_back(Token::Kind::Type, Sym.Type);
62  return Result;
63 }
64 
65 } // namespace
66 
67 void Dex::buildIndex() {
68  this->Corpus = dex::Corpus(Symbols.size());
69  std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
70 
71  for (size_t I = 0; I < Symbols.size(); ++I) {
72  const Symbol *Sym = Symbols[I];
73  LookupTable[Sym->ID] = Sym;
74  ScoredSymbols[I] = {quality(*Sym), Sym};
75  }
76 
77  // Symbols are sorted by symbol qualities so that items in the posting lists
78  // are stored in the descending order of symbol quality.
79  llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
80 
81  // SymbolQuality was empty up until now.
82  SymbolQuality.resize(Symbols.size());
83  // Populate internal storage using Symbol + Score pairs.
84  for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
85  SymbolQuality[I] = ScoredSymbols[I].first;
86  Symbols[I] = ScoredSymbols[I].second;
87  }
88 
89  // Populate TempInvertedIndex with lists for index symbols.
90  llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
91  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) {
92  const auto *Sym = Symbols[SymbolRank];
93  for (const auto &Token : generateSearchTokens(*Sym))
94  TempInvertedIndex[Token].push_back(SymbolRank);
95  }
96 
97  // Convert lists of items to posting lists.
98  for (const auto &TokenToPostingList : TempInvertedIndex)
99  InvertedIndex.insert(
100  {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
101 }
102 
103 std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
104  auto It = InvertedIndex.find(Tok);
105  return It == InvertedIndex.end() ? Corpus.none()
106  : It->second.iterator(&It->first);
107 }
108 
109 // Constructs BOOST iterators for Path Proximities.
110 std::unique_ptr<Iterator> Dex::createFileProximityIterator(
111  llvm::ArrayRef<std::string> ProximityPaths) const {
112  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
113  // Deduplicate parent URIs extracted from the ProximityPaths.
114  llvm::StringSet<> ParentURIs;
115  llvm::StringMap<SourceParams> Sources;
116  for (const auto &Path : ProximityPaths) {
117  Sources[Path] = SourceParams();
118  auto PathURI = URI::create(Path);
119  const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
120  for (const auto &ProximityURI : PathProximityURIs)
121  ParentURIs.insert(ProximityURI);
122  }
123  // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
124  // for all parameters except for Proximity Path distance signal.
125  SymbolRelevanceSignals PathProximitySignals;
126  // DistanceCalculator will find the shortest distance from ProximityPaths to
127  // any URI extracted from the ProximityPaths.
128  URIDistance DistanceCalculator(Sources);
129  PathProximitySignals.FileProximityMatch = &DistanceCalculator;
130  // Try to build BOOST iterator for each Proximity Path provided by
131  // ProximityPaths. Boosting factor should depend on the distance to the
132  // Proximity Path: the closer processed path is, the higher boosting factor.
133  for (const auto &ParentURI : ParentURIs.keys()) {
134  // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
135  auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
136  if (It->kind() != Iterator::Kind::False) {
137  PathProximitySignals.SymbolURI = ParentURI;
138  BoostingIterators.push_back(
139  Corpus.boost(std::move(It), PathProximitySignals.evaluate()));
140  }
141  }
142  BoostingIterators.push_back(Corpus.all());
143  return Corpus.unionOf(std::move(BoostingIterators));
144 }
145 
146 // Constructs BOOST iterators for preferred types.
147 std::unique_ptr<Iterator>
148 Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
149  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
150  SymbolRelevanceSignals PreferredTypeSignals;
151  PreferredTypeSignals.TypeMatchesPreferred = true;
152  auto Boost = PreferredTypeSignals.evaluate();
153  for (const auto &T : Types)
154  BoostingIterators.push_back(
155  Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
156  BoostingIterators.push_back(Corpus.all());
157  return Corpus.unionOf(std::move(BoostingIterators));
158 }
159 
160 /// Constructs iterators over tokens extracted from the query and exhausts it
161 /// while applying Callback to each symbol in the order of decreasing quality
162 /// of the matched symbols.
164  llvm::function_ref<void(const Symbol &)> Callback) const {
165  assert(!StringRef(Req.Query).contains("::") &&
166  "There must be no :: in query.");
167  trace::Span Tracer("Dex fuzzyFind");
168  FuzzyMatcher Filter(Req.Query);
169  // For short queries we use specialized trigrams that don't yield all results.
170  // Prevent clients from postfiltering them for longer queries.
171  bool More = !Req.Query.empty() && Req.Query.size() < 3;
172 
173  std::vector<std::unique_ptr<Iterator>> Criteria;
174  const auto TrigramTokens = generateQueryTrigrams(Req.Query);
175 
176  // Generate query trigrams and construct AND iterator over all query
177  // trigrams.
178  std::vector<std::unique_ptr<Iterator>> TrigramIterators;
179  for (const auto &Trigram : TrigramTokens)
180  TrigramIterators.push_back(iterator(Trigram));
181  Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
182 
183  // Generate scope tokens for search query.
184  std::vector<std::unique_ptr<Iterator>> ScopeIterators;
185  for (const auto &Scope : Req.Scopes)
186  ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
187  if (Req.AnyScope)
188  ScopeIterators.push_back(
189  Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
190  Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
191 
192  // Add proximity paths boosting (all symbols, some boosted).
193  Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
194  // Add boosting for preferred types.
195  Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));
196 
198  Criteria.push_back(iterator(RestrictedForCodeCompletion));
199 
200  // Use TRUE iterator if both trigrams and scopes from the query are not
201  // present in the symbol index.
202  auto Root = Corpus.intersect(move(Criteria));
203  // Retrieve more items than it was requested: some of the items with high
204  // final score might not be retrieved otherwise.
205  // FIXME(kbobyrev): Tune this ratio.
206  if (Req.Limit)
207  Root = Corpus.limit(move(Root), *Req.Limit * 100);
208  SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
209  vlog("Dex query tree: {0}", *Root);
210 
211  using IDAndScore = std::pair<DocID, float>;
212  std::vector<IDAndScore> IDAndScores = consume(*Root);
213 
214  auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
215  return LHS.second > RHS.second;
216  };
218  Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
219  for (const auto &IDAndScore : IDAndScores) {
220  const DocID SymbolDocID = IDAndScore.first;
221  const auto *Sym = Symbols[SymbolDocID];
222  const llvm::Optional<float> Score = Filter.match(Sym->Name);
223  if (!Score)
224  continue;
225  // Combine Fuzzy Matching score, precomputed symbol quality and boosting
226  // score for a cumulative final symbol score.
227  const float FinalScore =
228  (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
229  // If Top.push(...) returns true, it means that it had to pop an item. In
230  // this case, it is possible to retrieve more symbols.
231  if (Top.push({SymbolDocID, FinalScore}))
232  More = true;
233  }
234 
235  // Apply callback to the top Req.Limit items in the descending
236  // order of cumulative score.
237  for (const auto &Item : std::move(Top).items())
238  Callback(*Symbols[Item.first]);
239  return More;
240 }
241 
242 void Dex::lookup(const LookupRequest &Req,
243  llvm::function_ref<void(const Symbol &)> Callback) const {
244  trace::Span Tracer("Dex lookup");
245  for (const auto &ID : Req.IDs) {
246  auto I = LookupTable.find(ID);
247  if (I != LookupTable.end())
248  Callback(*I->second);
249  }
250 }
251 
252 void Dex::refs(const RefsRequest &Req,
253  llvm::function_ref<void(const Ref &)> Callback) const {
254  trace::Span Tracer("Dex refs");
255  uint32_t Remaining =
256  Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
257  for (const auto &ID : Req.IDs)
258  for (const auto &Ref : Refs.lookup(ID)) {
259  if (Remaining > 0 && static_cast<int>(Req.Filter & Ref.Kind)) {
260  --Remaining;
261  Callback(Ref);
262  }
263  }
264 }
265 
267  const RelationsRequest &Req,
268  llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
269  trace::Span Tracer("Dex relations");
270  uint32_t Remaining =
271  Req.Limit.getValueOr(std::numeric_limits<uint32_t>::max());
272  for (const SymbolID &Subject : Req.Subjects) {
273  LookupRequest LookupReq;
274  auto It = Relations.find(
275  std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
276  if (It != Relations.end()) {
277  for (const auto &Object : It->second) {
278  if (Remaining > 0) {
279  --Remaining;
280  LookupReq.IDs.insert(Object);
281  }
282  }
283  }
284  lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
285  }
286 }
287 
288 size_t Dex::estimateMemoryUsage() const {
289  size_t Bytes = Symbols.size() * sizeof(const Symbol *);
290  Bytes += SymbolQuality.size() * sizeof(float);
291  Bytes += LookupTable.getMemorySize();
292  Bytes += InvertedIndex.getMemorySize();
293  for (const auto &TokenToPostingList : InvertedIndex)
294  Bytes += TokenToPostingList.second.bytes();
295  Bytes += Refs.getMemorySize();
296  Bytes += Relations.getMemorySize();
297  return Bytes + BackingDataSize;
298 }
299 
300 std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
301  std::vector<std::string> Result;
302  auto ParsedURI = URI::parse(URIPath);
303  assert(ParsedURI &&
304  "Non-empty argument of generateProximityURIs() should be a valid "
305  "URI.");
306  llvm::StringRef Body = ParsedURI->body();
307  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
308  // size of resulting vector. Some projects might want to have higher limit if
309  // the file hierarchy is deeper. For the generic case, it would be useful to
310  // calculate Limit in the index build stage by calculating the maximum depth
311  // of the project source tree at runtime.
312  size_t Limit = 5;
313  // Insert original URI before the loop: this would save a redundant iteration
314  // with a URI parse.
315  Result.emplace_back(ParsedURI->toString());
316  while (!Body.empty() && --Limit > 0) {
317  // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
318  // could be optimized.
319  Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
320  if (!Body.empty())
321  Result.emplace_back(
322  URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
323  }
324  return Result;
325 }
326 
327 } // namespace dex
328 } // namespace clangd
329 } // namespace clang
int Limit
size_t bytes() const
Definition: Symbol.h:192
An immutable symbol container that stores a set of symbols.
Definition: Symbol.h:177
llvm::DenseSet< SymbolID > IDs
Definition: Index.h:68
bool AnyScope
If set to true, allow symbols from any scope.
Definition: Index.h:39
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator >> Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
Definition: Iterator.cpp:358
void refs(const RefsRequest &Req, llvm::function_ref< void(const Ref &)> Callback) const override
Finds all occurrences (e.g.
Definition: Dex.cpp:252
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
Definition: Index.h:44
An efficient structure of storing large set of symbol references in memory.
Definition: Ref.h:69
This defines Dex - a symbol index implementation based on query iterators over symbol tokens...
llvm::Optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
Definition: Index.h:80
PostingList is the storage of DocIDs which can be inserted to the Query Tree as a leaf by constructin...
Definition: PostingList.h:59
llvm::DenseSet< SymbolID > IDs
Definition: Index.h:64
Represents a symbol occurrence in the source file.
Definition: Ref.h:52
Path Proximity URI to symbol declaration.
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
Definition: Function.h:28
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
Definition: Symbol.h:44
std::vector< Token > generateIdentifierTrigrams(llvm::StringRef Identifier)
Returns list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:23
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
Definition: Dex.cpp:163
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
Definition: Index.h:49
void vlog(const char *Fmt, Ts &&... Vals)
Definition: Logger.h:67
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
Definition: Index.h:36
SymbolID ID
The ID of the symbol.
Definition: Symbol.h:38
llvm::Optional< float > Score
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:350
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator >> Children) const
Returns OR Iterator which performs the union of the PostingLists of its children. ...
Definition: Iterator.cpp:388
Whether or not this symbol is meant to be used for the code completion.
Definition: Symbol.h:119
std::vector< std::string > generateProximityURIs(llvm::StringRef URIPath)
Returns Search Token for a number of parent directories of given Path.
Definition: Dex.cpp:300
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:86
SymbolFlag Flags
Definition: Symbol.h:128
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab)
Builds an index from slabs. The index takes ownership of the slab.
Definition: Dex.cpp:26
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
Definition: Iterator.cpp:434
static const char * toString(OffsetEncoding OE)
Definition: Protocol.cpp:1030
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
Definition: Iterator.cpp:421
std::string Path
A typedef to represent a file path.
Definition: Path.h:20
std::string Query
A query string for the fuzzy find.
Definition: Index.h:29
llvm::Optional< float > match(llvm::StringRef Word)
Definition: FuzzyMatch.cpp:92
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.
Definition: Symbol.h:56
bool push(value_type &&V)
Definition: Quality.h:160
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:46
void relations(const RelationsRequest &Req, llvm::function_ref< void(const SymbolID &, const Symbol &)> Callback) const override
Definition: Dex.cpp:266
SymbolSlab Symbols
llvm::DenseSet< SymbolID > Subjects
Definition: Index.h:77
void lookup(const LookupRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Looks up symbols with any of the given symbol IDs and applies Callback on each matched symbol...
Definition: Dex.cpp:242
llvm::StringRef SymbolURI
These are used to calculate proximity between the index symbol and the query.
Definition: Quality.h:102
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
Definition: URI.cpp:197
The class presents a C++ symbol, e.g.
Definition: Symbol.h:36
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
Definition: Index.h:47
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
Definition: Symbol.h:42
llvm::Optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
Definition: Index.h:73
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
A URI describes the location of a source file.
Definition: URI.h:28
llvm::Optional< uint32_t > Limit
The number of top candidates to return.
Definition: Index.h:42
size_t bytes() const
Definition: Ref.h:87
Internal Token type for invalid/special tokens, e.g.
static llvm::Expected< URI > parse(llvm::StringRef Uri)
Parse a URI string "<scheme>:[//<authority>/]<path>".
Definition: URI.cpp:164
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
Definition: Iterator.cpp:425
RefSlab Refs
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0...
Definition: Iterator.cpp:417
size_t estimateMemoryUsage() const override
Returns estimated size of index (in bytes).
Definition: Dex.cpp:288
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
Definition: Symbol.h:85
Records an event whose duration is the lifetime of the Span object.
Definition: Trace.h:81
Type of symbol (see Symbol::Type).
Attributes of a symbol-query pair that affect how much we like it.
Definition: Quality.h:87
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
Definition: Trace.h:97
RefKind Kind
Definition: Ref.h:55
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
Definition: Symbol.cpp:29
TopN<T> is a lossy container that preserves only the "best" N elements.
Definition: Quality.h:152