20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/StringMap.h"
22#include "llvm/ADT/StringSet.h"
23#include "llvm/Support/Path.h"
24#include "llvm/Support/ScopedPrinter.h"
37 bool SupportContainedRefs) {
38 auto Size = Symbols.bytes() + Refs.bytes();
41 auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
42 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
43 Size, SupportContainedRefs);
49const Token RestrictedForCodeCompletion =
58 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
59 std::vector<DocID> RestrictedCCDocs;
60 llvm::StringMap<std::vector<DocID>> TypeDocs;
61 llvm::StringMap<std::vector<DocID>> ScopeDocs;
62 llvm::StringMap<std::vector<DocID>> ProximityDocs;
63 std::vector<Trigram> TrigramScratch;
72 for (
Trigram T : TrigramScratch)
73 TrigramDocs[T].push_back(D);
74 ScopeDocs[Sym.
Scope].push_back(D);
76 for (
const auto &ProximityURI :
78 ProximityDocs[ProximityURI].push_back(D);
80 RestrictedCCDocs.push_back(D);
81 if (!Sym.
Type.empty())
82 TypeDocs[Sym.
Type].push_back(D);
86 llvm::DenseMap<Token, PostingList> build() && {
87 llvm::DenseMap<Token, PostingList> Result(
89 RestrictedCCDocs.size() +
92 ProximityDocs.size());
96 auto CreatePostingList =
97 [&Result](
Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
98 for (
auto &E : Docs) {
99 Result.try_emplace(
Token(TK, E.first()), E.second);
110 for (
auto &E : TrigramDocs) {
114 TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
115 if (!RestrictedCCDocs.empty())
116 Result.try_emplace(RestrictedForCodeCompletion,
117 std::move(RestrictedCCDocs));
124void Dex::buildIndex(
bool SupportContainedRefs) {
125 this->Corpus = dex::Corpus(Symbols.size());
126 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
128 for (
size_t I = 0; I < Symbols.size(); ++I) {
129 const Symbol *Sym = Symbols[I];
130 LookupTable[Sym->ID] = Sym;
131 ScoredSymbols[I] = {
quality(*Sym), Sym};
136 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
139 SymbolQuality.resize(Symbols.size());
141 for (
size_t I = 0; I < ScoredSymbols.size(); ++I) {
142 SymbolQuality[I] = ScoredSymbols[I].first;
143 Symbols[I] = ScoredSymbols[I].second;
147 IndexBuilder Builder;
148 for (
DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
149 Builder.add(*Symbols[SymbolRank], SymbolRank);
150 InvertedIndex = std::move(Builder).build();
154 if (!SupportContainedRefs)
156 for (
const auto &[ID, RefList] : Refs)
157 for (
const auto &R : RefList)
160 RevRefs.emplace_back(R, ID);
162 llvm::sort(RevRefs, [](
const RevRef &A,
const RevRef &B) {
163 return A.ref().Container <
B.ref().Container;
167std::unique_ptr<Iterator> Dex::iterator(
const Token &Tok)
const {
168 auto It = InvertedIndex.find(Tok);
169 return It == InvertedIndex.end() ? Corpus.none()
170 : It->second.iterator(&It->first);
174std::unique_ptr<Iterator> Dex::createFileProximityIterator(
175 llvm::ArrayRef<std::string> ProximityPaths)
const {
176 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
178 llvm::StringSet<> ParentURIs;
179 llvm::StringMap<SourceParams> Sources;
180 for (
const auto &
Path : ProximityPaths) {
181 Sources[
Path] = SourceParams();
184 ParentURIs.insert_range(PathProximityURIs);
188 SymbolRelevanceSignals PathProximitySignals;
191 URIDistance DistanceCalculator(Sources);
192 PathProximitySignals.FileProximityMatch = &DistanceCalculator;
196 for (
const auto &ParentURI : ParentURIs.keys()) {
200 PathProximitySignals.SymbolURI = ParentURI;
201 BoostingIterators.push_back(Corpus.boost(
202 std::move(It), PathProximitySignals.evaluateHeuristics()));
205 BoostingIterators.push_back(Corpus.all());
206 return Corpus.unionOf(std::move(BoostingIterators));
210std::unique_ptr<Iterator>
211Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types)
const {
212 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
213 SymbolRelevanceSignals PreferredTypeSignals;
214 PreferredTypeSignals.TypeMatchesPreferred =
true;
215 auto Boost = PreferredTypeSignals.evaluateHeuristics();
216 for (
const auto &T : Types)
217 BoostingIterators.push_back(
219 BoostingIterators.push_back(Corpus.all());
220 return Corpus.unionOf(std::move(BoostingIterators));
228 assert(!StringRef(Req.
Query).contains(
"::") &&
229 "There must be no :: in query.");
234 bool More = !Req.
Query.empty() && Req.
Query.size() < 3;
236 std::vector<std::unique_ptr<Iterator>> Criteria;
241 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
242 for (
const auto &
Trigram : TrigramTokens)
243 TrigramIterators.push_back(iterator(
Trigram));
244 Criteria.push_back(Corpus.intersect(std::move(TrigramIterators)));
247 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
248 for (
const auto &Scope : Req.
Scopes)
251 ScopeIterators.push_back(
252 Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
253 Criteria.push_back(Corpus.unionOf(std::move(ScopeIterators)));
256 Criteria.push_back(createFileProximityIterator(Req.
ProximityPaths));
258 Criteria.push_back(createTypeBoostingIterator(Req.
PreferredTypes));
261 Criteria.push_back(iterator(RestrictedForCodeCompletion));
265 auto Root = Corpus.intersect(std::move(Criteria));
270 Root = Corpus.limit(std::move(Root), *Req.
Limit * 100);
271 SPAN_ATTACH(Tracer,
"query", llvm::to_string(*Root));
272 vlog(
"Dex query tree: {0}", *Root);
274 using IDAndScore = std::pair<DocID, float>;
275 std::vector<IDAndScore> IDAndScores =
consume(*Root);
277 auto Compare = [](
const IDAndScore &LHS,
const IDAndScore &RHS) {
278 return LHS.second > RHS.second;
280 TopN<IDAndScore,
decltype(Compare)> Top(
281 Req.
Limit.value_or(std::numeric_limits<size_t>::max()), Compare);
282 for (
const auto &IDAndScore : IDAndScores) {
283 const DocID SymbolDocID = IDAndScore.first;
284 const auto *Sym = Symbols[SymbolDocID];
285 const std::optional<float> Score = Filter.
match(Sym->
Name);
290 const float FinalScore =
291 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
294 if (Top.push({SymbolDocID, FinalScore}))
300 for (
const auto &Item : std::move(Top).items())
308 for (
const auto &ID : Req.
IDs) {
309 auto I = LookupTable.find(ID);
310 if (I != LookupTable.end())
316 llvm::function_ref<
void(
const Ref &)>
Callback)
const {
318 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
319 for (
const auto &ID : Req.
IDs)
320 for (
const auto &
Ref : Refs.lookup(ID)) {
331llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator>
332Dex::lookupRevRefs(
const SymbolID &Container)
const {
336 QueryRef.Container = Container;
339 auto ItPair = std::equal_range(RevRefs.cbegin(), RevRefs.cend(), Query,
340 [](
const RevRef &A,
const RevRef &B) {
341 return A.ref().Container < B.ref().Container;
343 return {ItPair.first, ItPair.second};
350 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
351 for (
const auto &Rev : lookupRevRefs(Req.
ID)) {
356 Callback(Rev.containedRefsResult());
365 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
368 auto It = Relations.find(
369 std::make_pair(Subject,
static_cast<uint8_t
>(Req.
Predicate)));
370 if (It != Relations.end()) {
371 for (
const auto &
Object : It->second) {
384 return [
this](llvm::StringRef FileURI) {
390 size_t Bytes = Symbols.size() *
sizeof(
const Symbol *);
391 Bytes += SymbolQuality.size() *
sizeof(float);
392 Bytes += LookupTable.getMemorySize();
393 Bytes += InvertedIndex.getMemorySize();
394 for (
const auto &TokenToPostingList : InvertedIndex)
395 Bytes += TokenToPostingList.second.bytes();
396 Bytes += Refs.getMemorySize();
397 Bytes += RevRefs.size() *
sizeof(RevRef);
398 Bytes += Relations.getMemorySize();
399 return Bytes + BackingDataSize;
405 S = S.split(
':').second;
406 if (S.consume_front(
"//"))
407 S = S.drop_until([](
char C) {
return C ==
'/'; });
418llvm::SmallVector<llvm::StringRef, ProximityURILimit>
433 llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {
URI};
435 for (
auto Slash =
Path.rfind(
'/'); Slash > 0 && Slash != StringRef::npos;
436 Slash =
Path.rfind(
'/')) {
438 Result.push_back(
URI.substr(0,
Path.end() -
URI.data()));
443 if (
Path.starts_with(
"/"))
444 Result.push_back(
URI.substr(0,
Path.begin() + 1 -
URI.data()));
This defines Dex - a symbol index implementation based on query iterators over symbol tokens,...
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
std::optional< float > match(llvm::StringRef Word)
An efficient structure of storing large set of symbol references in memory.
An immutable symbol container that stores a set of symbols.
TopN<T> is a lossy container that preserves only the "best" N elements.
A URI describes the location of a source file.
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
size_t estimateMemoryUsage() const override
Returns estimated size of index (in bytes).
bool refs(const RefsRequest &Req, llvm::function_ref< void(const Ref &)> Callback) const override
Finds all occurrences (e.g.
void relations(const RelationsRequest &Req, llvm::function_ref< void(const SymbolID &, const Symbol &)> Callback) const override
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab, bool SupportContainedRefs)
Builds an index from slabs. The index takes ownership of the slab.
bool containedRefs(const ContainedRefsRequest &Req, llvm::function_ref< void(const ContainedRefsResult &)> Callback) const override
Find all symbols that are referenced by a symbol and apply Callback on each result.
void lookup(const LookupRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Looks up symbols with any of the given symbol IDs and applies Callback on each matched symbol.
llvm::unique_function< IndexContents(llvm::StringRef) const > indexedFiles() const override
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
@ ProximityURI
Path Proximity URI to symbol declaration.
@ Scope
Scope primitives, e.g.
@ Sentinel
Internal Token type for invalid/special tokens, e.g.
@ Trigram
Represents trigram used for fuzzy search of unqualified symbol names.
@ Type
Type of symbol (see Symbol::Type).
Records an event whose duration is the lifetime of the Span object.
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
llvm::SmallVector< llvm::StringRef, ProximityURILimit > generateProximityURIs(llvm::StringRef URI)
Returns Search Token for a number of parent directories of given Path.
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
llvm::StringRef findPathInURI(llvm::StringRef S)
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
constexpr unsigned ProximityURILimit
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
FIXME: Skip testing on windows temporarily due to the different escaping code mode.
IndexContents
Describes what data is covered by an index.
void vlog(const char *Fmt, Ts &&... Vals)
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
std::string Path
A typedef to represent a file path.
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
std::optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
static const RefKind SupportedRefKinds
Note that RefKind::Call just restricts the matched SymbolKind to functions, not the form of the refer...
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
std::string Query
A query string for the fuzzy find.
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
bool AnyScope
If set to true, allow symbols from any scope.
std::optional< uint32_t > Limit
The number of top candidates to return.
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
llvm::DenseSet< SymbolID > IDs
Represents a symbol occurrence in the source file.
llvm::DenseSet< SymbolID > IDs
std::optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
std::optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
llvm::DenseSet< SymbolID > Subjects
The class presents a C++ symbol, e.g.
@ IndexedForCodeCompletion
Whether or not this symbol is meant to be used for the code completion.
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.
A single C++ or preprocessor token.
clang::tok::TokenKind Kind
The type of token as determined by clang's lexer.