20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/StringMap.h"
22#include "llvm/ADT/StringSet.h"
23#include "llvm/Support/Path.h"
24#include "llvm/Support/ScopedPrinter.h"
37 bool SupportContainedRefs) {
38 auto Size = Symbols.bytes() + Refs.bytes();
41 auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
42 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
43 Size, SupportContainedRefs);
49const Token RestrictedForCodeCompletion =
58 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
59 std::vector<DocID> RestrictedCCDocs;
60 llvm::StringMap<std::vector<DocID>> TypeDocs;
61 llvm::StringMap<std::vector<DocID>> ScopeDocs;
62 llvm::StringMap<std::vector<DocID>> ProximityDocs;
63 std::vector<Trigram> TrigramScratch;
72 for (
Trigram T : TrigramScratch)
73 TrigramDocs[T].push_back(D);
74 ScopeDocs[Sym.
Scope].push_back(D);
76 for (
const auto &ProximityURI :
78 ProximityDocs[ProximityURI].push_back(D);
80 RestrictedCCDocs.push_back(D);
81 if (!Sym.
Type.empty())
82 TypeDocs[Sym.
Type].push_back(D);
86 llvm::DenseMap<Token, PostingList> build() && {
87 llvm::DenseMap<Token, PostingList> Result(
89 RestrictedCCDocs.size() +
92 ProximityDocs.size());
96 auto CreatePostingList =
97 [&Result](
Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
98 for (
auto &
E : Docs) {
99 Result.try_emplace(
Token(TK,
E.first()),
E.second);
110 for (
auto &
E : TrigramDocs) {
114 TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
115 if (!RestrictedCCDocs.empty())
116 Result.try_emplace(RestrictedForCodeCompletion,
117 std::move(RestrictedCCDocs));
124void Dex::buildIndex(
bool SupportContainedRefs) {
125 this->Corpus = dex::Corpus(Symbols.size());
126 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
128 for (
size_t I = 0; I < Symbols.size(); ++I) {
129 const Symbol *Sym = Symbols[I];
130 LookupTable[Sym->ID] = Sym;
131 ScoredSymbols[I] = {
quality(*Sym), Sym};
136 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
139 SymbolQuality.resize(Symbols.size());
141 for (
size_t I = 0; I < ScoredSymbols.size(); ++I) {
142 SymbolQuality[I] = ScoredSymbols[I].first;
143 Symbols[I] = ScoredSymbols[I].second;
148 for (
DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
149 Builder.add(*Symbols[SymbolRank], SymbolRank);
150 InvertedIndex = std::move(
Builder).build();
154 if (!SupportContainedRefs)
156 for (
const auto &[
ID, RefList] : Refs)
157 for (
const auto &R : RefList)
160 RevRefs.emplace_back(R,
ID);
162 llvm::sort(RevRefs, [](
const RevRef &A,
const RevRef &B) {
167std::unique_ptr<Iterator> Dex::iterator(
const Token &Tok)
const {
168 auto It = InvertedIndex.find(Tok);
169 return It == InvertedIndex.end() ? Corpus.
none()
170 : It->second.iterator(&It->first);
174std::unique_ptr<Iterator> Dex::createFileProximityIterator(
175 llvm::ArrayRef<std::string> ProximityPaths)
const {
176 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
178 llvm::StringSet<> ParentURIs;
179 llvm::StringMap<SourceParams> Sources;
180 for (
const auto &
Path : ProximityPaths) {
181 Sources[
Path] = SourceParams();
184 for (
const auto &ProximityURI : PathProximityURIs)
185 ParentURIs.insert(ProximityURI);
189 SymbolRelevanceSignals PathProximitySignals;
192 URIDistance DistanceCalculator(Sources);
193 PathProximitySignals.FileProximityMatch = &DistanceCalculator;
197 for (
const auto &ParentURI : ParentURIs.keys()) {
201 PathProximitySignals.SymbolURI = ParentURI;
202 BoostingIterators.push_back(Corpus.
boost(
203 std::move(It), PathProximitySignals.evaluateHeuristics()));
206 BoostingIterators.push_back(Corpus.
all());
207 return Corpus.
unionOf(std::move(BoostingIterators));
211std::unique_ptr<Iterator>
212Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types)
const {
213 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
214 SymbolRelevanceSignals PreferredTypeSignals;
215 PreferredTypeSignals.TypeMatchesPreferred =
true;
216 auto Boost = PreferredTypeSignals.evaluateHeuristics();
217 for (
const auto &T : Types)
218 BoostingIterators.push_back(
220 BoostingIterators.push_back(Corpus.
all());
221 return Corpus.
unionOf(std::move(BoostingIterators));
229 assert(!StringRef(Req.
Query).contains(
"::") &&
230 "There must be no :: in query.");
235 bool More = !Req.
Query.empty() && Req.
Query.size() < 3;
237 std::vector<std::unique_ptr<Iterator>> Criteria;
242 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
243 for (
const auto &
Trigram : TrigramTokens)
244 TrigramIterators.push_back(iterator(
Trigram));
248 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
249 for (
const auto &Scope : Req.
Scopes)
252 ScopeIterators.push_back(
254 Criteria.push_back(
Corpus.
unionOf(std::move(ScopeIterators)));
257 Criteria.push_back(createFileProximityIterator(Req.
ProximityPaths));
259 Criteria.push_back(createTypeBoostingIterator(Req.
PreferredTypes));
262 Criteria.push_back(iterator(RestrictedForCodeCompletion));
275 using IDAndScore = std::pair<DocID, float>;
276 std::vector<IDAndScore> IDAndScores =
consume(*
Root);
278 auto Compare = [](
const IDAndScore &LHS,
const IDAndScore &RHS) {
279 return LHS.second > RHS.second;
281 TopN<IDAndScore,
decltype(Compare)> Top(
282 Req.
Limit.value_or(std::numeric_limits<size_t>::max()), Compare);
283 for (
const auto &IDAndScore : IDAndScores) {
284 const DocID SymbolDocID = IDAndScore.first;
285 const auto *Sym = Symbols[SymbolDocID];
286 const std::optional<float>
Score = Filter.match(Sym->
Name);
291 const float FinalScore =
292 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
295 if (Top.push({SymbolDocID, FinalScore}))
301 for (
const auto &Item : std::move(Top).items())
309 for (
const auto &
ID : Req.
IDs) {
310 auto I = LookupTable.find(
ID);
311 if (I != LookupTable.end())
317 llvm::function_ref<
void(
const Ref &)>
Callback)
const {
319 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
320 for (
const auto &
ID : Req.
IDs)
321 for (
const auto &
Ref : Refs.lookup(
ID)) {
332llvm::iterator_range<std::vector<Dex::RevRef>::const_iterator>
340 auto ItPair = std::equal_range(RevRefs.cbegin(), RevRefs.cend(), Query,
341 [](
const RevRef &A,
const RevRef &B) {
342 return A.ref().Container < B.ref().Container;
344 return {ItPair.first, ItPair.second};
347bool Dex::containedRefs(
351 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
352 for (
const auto &Rev : lookupRevRefs(Req.
ID)) {
357 Callback(Rev.containedRefsResult());
366 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
369 auto It = Relations.find(
370 std::make_pair(Subject,
static_cast<uint8_t
>(Req.
Predicate)));
371 if (It != Relations.end()) {
372 for (
const auto &
Object : It->second) {
384Dex::indexedFiles()
const {
385 return [
this](llvm::StringRef FileURI) {
390size_t Dex::estimateMemoryUsage()
const {
392 Bytes += SymbolQuality.size() *
sizeof(float);
393 Bytes += LookupTable.getMemorySize();
394 Bytes += InvertedIndex.getMemorySize();
395 for (
const auto &TokenToPostingList : InvertedIndex)
396 Bytes += TokenToPostingList.second.bytes();
397 Bytes += Refs.getMemorySize();
398 Bytes += RevRefs.size() *
sizeof(RevRef);
399 Bytes += Relations.getMemorySize();
400 return Bytes + BackingDataSize;
406 S = S.split(
':').second;
407 if (S.consume_front(
"//"))
408 S = S.drop_until([](
char C) {
return C ==
'/'; });
419llvm::SmallVector<llvm::StringRef, ProximityURILimit>
434 llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {
URI};
436 for (
auto Slash =
Path.rfind(
'/'); Slash > 0 && Slash != StringRef::npos;
437 Slash =
Path.rfind(
'/')) {
439 Result.push_back(
URI.substr(0,
Path.end() -
URI.data()));
444 if (
Path.starts_with(
"/"))
445 Result.push_back(
URI.substr(0,
Path.begin() + 1 -
URI.data()));
CodeCompletionBuilder Builder
This defines Dex - a symbol index implementation based on query iterators over symbol tokens,...
std::optional< float > Score
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
An efficient structure of storing large set of symbol references in memory.
An immutable symbol container that stores a set of symbols.
TopN<T> is a lossy container that preserves only the "best" N elements.
A URI describes the location of a source file.
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator > > Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0,...
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator > > Children) const
Returns OR Iterator which performs the union of the PostingLists of its children.
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab, bool SupportContainedRefs)
Builds an index from slabs. The index takes ownership of the slab.
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Kind
Kind specifies Token type which defines semantics for the internal representation.
@ ProximityURI
Path Proximity URI to symbol declaration.
@ Scope
Scope primitives, e.g.
@ Sentinel
Internal Token type for invalid/special tokens, e.g.
@ Trigram
Represents trigram used for fuzzy search of unqualified symbol names.
@ Type
Type of symbol (see Symbol::Type).
Records an event whose duration is the lifetime of the Span object.
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
llvm::SmallVector< llvm::StringRef, ProximityURILimit > generateProximityURIs(llvm::StringRef URI)
Returns Search Token for a number of parent directories of given Path.
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
llvm::StringRef findPathInURI(llvm::StringRef S)
constexpr unsigned ProximityURILimit
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
IndexContents
Describes what data is covered by an index.
std::string Path
A typedef to represent a file path.
void vlog(const char *Fmt, Ts &&... Vals)
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
std::vector< std::string > lookup(const SymbolIndex &I, llvm::ArrayRef< SymbolID > IDs)
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
std::optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
static const RefKind SupportedRefKinds
Note that RefKind::Call just restricts the matched SymbolKind to functions, not the form of the refer...
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
std::string Query
A query string for the fuzzy find.
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
bool AnyScope
If set to true, allow symbols from any scope.
std::optional< uint32_t > Limit
The number of top candidates to return.
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
llvm::DenseSet< SymbolID > IDs
Represents a symbol occurrence in the source file.
llvm::DenseSet< SymbolID > IDs
std::optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
std::optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
llvm::DenseSet< SymbolID > Subjects
The class presents a C++ symbol, e.g.
@ IndexedForCodeCompletion
Whether or not this symbol is meant to be used for the code completion.
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.
A single C++ or preprocessor token.