20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/StringMap.h"
22#include "llvm/ADT/StringSet.h"
23#include "llvm/Support/Path.h"
24#include "llvm/Support/ScopedPrinter.h"
37 auto Size = Symbols.bytes() + Refs.bytes();
40 auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
41 return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
48const Token RestrictedForCodeCompletion =
57 llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
58 std::vector<DocID> RestrictedCCDocs;
59 llvm::StringMap<std::vector<DocID>> TypeDocs;
60 llvm::StringMap<std::vector<DocID>> ScopeDocs;
61 llvm::StringMap<std::vector<DocID>> ProximityDocs;
62 std::vector<Trigram> TrigramScratch;
71 for (
Trigram T : TrigramScratch)
72 TrigramDocs[T].push_back(D);
73 ScopeDocs[Sym.
Scope].push_back(D);
75 for (
const auto &ProximityURI :
77 ProximityDocs[ProximityURI].push_back(D);
79 RestrictedCCDocs.push_back(D);
80 if (!Sym.
Type.empty())
81 TypeDocs[Sym.
Type].push_back(D);
85 llvm::DenseMap<Token, PostingList> build() && {
86 llvm::DenseMap<Token, PostingList> Result(
88 RestrictedCCDocs.size() +
91 ProximityDocs.size());
95 auto CreatePostingList =
96 [&Result](
Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
97 for (
auto &
E : Docs) {
98 Result.try_emplace(Token(TK,
E.first()),
E.second);
109 for (
auto &
E : TrigramDocs) {
113 TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
114 if (!RestrictedCCDocs.empty())
115 Result.try_emplace(RestrictedForCodeCompletion,
116 std::move(RestrictedCCDocs));
123void Dex::buildIndex() {
124 this->Corpus = dex::Corpus(Symbols.size());
125 std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
127 for (
size_t I = 0; I < Symbols.size(); ++I) {
128 const Symbol *Sym = Symbols[I];
129 LookupTable[Sym->ID] = Sym;
130 ScoredSymbols[I] = {
quality(*Sym), Sym};
135 llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());
138 SymbolQuality.resize(Symbols.size());
140 for (
size_t I = 0; I < ScoredSymbols.size(); ++I) {
141 SymbolQuality[I] = ScoredSymbols[I].first;
142 Symbols[I] = ScoredSymbols[I].second;
147 for (
DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
148 Builder.add(*Symbols[SymbolRank], SymbolRank);
149 InvertedIndex = std::move(
Builder).build();
152std::unique_ptr<Iterator> Dex::iterator(
const Token &Tok)
const {
153 auto It = InvertedIndex.find(Tok);
154 return It == InvertedIndex.end() ? Corpus.
none()
155 : It->second.iterator(&It->first);
159std::unique_ptr<Iterator> Dex::createFileProximityIterator(
160 llvm::ArrayRef<std::string> ProximityPaths)
const {
161 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
163 llvm::StringSet<> ParentURIs;
164 llvm::StringMap<SourceParams> Sources;
165 for (
const auto &
Path : ProximityPaths) {
166 Sources[
Path] = SourceParams();
169 for (
const auto &ProximityURI : PathProximityURIs)
170 ParentURIs.insert(ProximityURI);
174 SymbolRelevanceSignals PathProximitySignals;
177 URIDistance DistanceCalculator(Sources);
178 PathProximitySignals.FileProximityMatch = &DistanceCalculator;
182 for (
const auto &ParentURI : ParentURIs.keys()) {
186 PathProximitySignals.SymbolURI = ParentURI;
187 BoostingIterators.push_back(Corpus.
boost(
188 std::move(It), PathProximitySignals.evaluateHeuristics()));
191 BoostingIterators.push_back(Corpus.
all());
192 return Corpus.
unionOf(std::move(BoostingIterators));
196std::unique_ptr<Iterator>
197Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types)
const {
198 std::vector<std::unique_ptr<Iterator>> BoostingIterators;
199 SymbolRelevanceSignals PreferredTypeSignals;
200 PreferredTypeSignals.TypeMatchesPreferred =
true;
201 auto Boost = PreferredTypeSignals.evaluateHeuristics();
202 for (
const auto &T : Types)
203 BoostingIterators.push_back(
205 BoostingIterators.push_back(Corpus.
all());
206 return Corpus.
unionOf(std::move(BoostingIterators));
214 assert(!StringRef(Req.
Query).contains(
"::") &&
215 "There must be no :: in query.");
220 bool More = !Req.
Query.empty() && Req.
Query.size() < 3;
222 std::vector<std::unique_ptr<Iterator>> Criteria;
227 std::vector<std::unique_ptr<Iterator>> TrigramIterators;
228 for (
const auto &
Trigram : TrigramTokens)
229 TrigramIterators.push_back(iterator(
Trigram));
233 std::vector<std::unique_ptr<Iterator>> ScopeIterators;
234 for (
const auto &Scope : Req.
Scopes)
237 ScopeIterators.push_back(
239 Criteria.push_back(
Corpus.
unionOf(std::move(ScopeIterators)));
242 Criteria.push_back(createFileProximityIterator(Req.
ProximityPaths));
244 Criteria.push_back(createTypeBoostingIterator(Req.
PreferredTypes));
247 Criteria.push_back(iterator(RestrictedForCodeCompletion));
260 using IDAndScore = std::pair<DocID, float>;
261 std::vector<IDAndScore> IDAndScores =
consume(*
Root);
263 auto Compare = [](
const IDAndScore &LHS,
const IDAndScore &RHS) {
264 return LHS.second > RHS.second;
266 TopN<IDAndScore,
decltype(Compare)> Top(
267 Req.
Limit ? *Req.
Limit : std::numeric_limits<size_t>::max(), Compare);
268 for (
const auto &IDAndScore : IDAndScores) {
269 const DocID SymbolDocID = IDAndScore.first;
270 const auto *Sym = Symbols[SymbolDocID];
271 const std::optional<float>
Score = Filter.match(Sym->
Name);
276 const float FinalScore =
277 (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
280 if (Top.push({SymbolDocID, FinalScore}))
286 for (
const auto &Item : std::move(Top).items())
294 for (
const auto &
ID : Req.
IDs) {
295 auto I = LookupTable.find(
ID);
296 if (I != LookupTable.end())
302 llvm::function_ref<
void(
const Ref &)>
Callback)
const {
304 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
305 for (
const auto &
ID : Req.
IDs)
306 for (
const auto &
Ref : Refs.lookup(
ID)) {
321 uint32_t Remaining = Req.
Limit.value_or(std::numeric_limits<uint32_t>::max());
324 auto It = Relations.find(
325 std::make_pair(Subject,
static_cast<uint8_t
>(Req.
Predicate)));
326 if (It != Relations.end()) {
327 for (
const auto &
Object : It->second) {
339Dex::indexedFiles()
const {
340 return [
this](llvm::StringRef FileURI) {
345size_t Dex::estimateMemoryUsage()
const {
347 Bytes += SymbolQuality.size() *
sizeof(float);
348 Bytes += LookupTable.getMemorySize();
349 Bytes += InvertedIndex.getMemorySize();
350 for (
const auto &TokenToPostingList : InvertedIndex)
351 Bytes += TokenToPostingList.second.bytes();
352 Bytes += Refs.getMemorySize();
353 Bytes += Relations.getMemorySize();
354 return Bytes + BackingDataSize;
360 S = S.split(
':').second;
361 if (S.consume_front(
"//"))
362 S = S.drop_until([](
char C) {
return C ==
'/'; });
373llvm::SmallVector<llvm::StringRef, ProximityURILimit>
388 llvm::SmallVector<llvm::StringRef, ProximityURILimit> Result = {
URI};
390 for (
auto Slash =
Path.rfind(
'/'); Slash > 0 && Slash != StringRef::npos;
391 Slash =
Path.rfind(
'/')) {
393 Result.push_back(
URI.substr(0,
Path.end() -
URI.data()));
398 if (
Path.starts_with(
"/"))
399 Result.push_back(
URI.substr(0,
Path.begin() + 1 -
URI.data()));
CodeCompletionBuilder Builder
This defines Dex - a symbol index implementation based on query iterators over symbol tokens,...
std::optional< float > Score
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
#define SPAN_ATTACH(S, Name, Expr)
Attach a key-value pair to a Span event.
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
An efficient structure of storing large set of symbol references in memory.
An immutable symbol container that stores a set of symbols.
TopN<T> is a lossy container that preserves only the "best" N elements.
A URI describes the location of a source file.
static llvm::Expected< URI > create(llvm::StringRef AbsolutePath, llvm::StringRef Scheme)
Creates a URI for a file in the given scheme.
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator > > Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0,...
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator > > Children) const
Returns OR Iterator which performs the union of the PostingLists of its children.
static std::unique_ptr< SymbolIndex > build(SymbolSlab, RefSlab, RelationSlab)
Builds an index from slabs. The index takes ownership of the slab.
bool fuzzyFind(const FuzzyFindRequest &Req, llvm::function_ref< void(const Symbol &)> Callback) const override
Constructs iterators over tokens extracted from the query and exhausts it while applying Callback to ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Kind
Kind specifies Token type which defines semantics for the internal representation.
@ ProximityURI
Path Proximity URI to symbol declaration.
@ Scope
Scope primitives, e.g.
@ Sentinel
Internal Token type for invalid/special tokens, e.g.
@ Trigram
Represents trigram used for fuzzy search of unqualified symbol names.
@ Type
Type of symbol (see Symbol::Type).
Records an event whose duration is the lifetime of the Span object.
llvm::SmallVector< llvm::StringRef, ProximityURILimit > generateProximityURIs(llvm::StringRef URI)
Returns Search Token for a number of parent directories of given Path.
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
llvm::StringRef findPathInURI(llvm::StringRef S)
constexpr unsigned ProximityURILimit
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
IndexContents
Describes what data is covered by an index.
std::string Path
A typedef to represent a file path.
void vlog(const char *Fmt, Ts &&... Vals)
llvm::unique_function< void(llvm::Expected< T >)> Callback
A Callback<T> is a void function that accepts Expected<T>.
std::vector< std::string > lookup(const SymbolIndex &I, llvm::ArrayRef< SymbolID > IDs)
float quality(const Symbol &S)
Computes query-independent quality score for a Symbol.
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
std::vector< std::string > Scopes
If this is non-empty, symbols must be in at least one of the scopes (e.g.
bool RestrictForCodeCompletion
If set to true, only symbols for completion support will be considered.
std::string Query
A query string for the fuzzy find.
std::vector< std::string > ProximityPaths
Contextually relevant files (e.g.
bool AnyScope
If set to true, allow symbols from any scope.
std::optional< uint32_t > Limit
The number of top candidates to return.
std::vector< std::string > PreferredTypes
Preferred types of symbols. These are raw representation of OpaqueType.
llvm::DenseSet< SymbolID > IDs
Represents a symbol occurrence in the source file.
llvm::DenseSet< SymbolID > IDs
std::optional< uint32_t > Limit
If set, limit the number of refers returned from the index.
std::optional< uint32_t > Limit
If set, limit the number of relations returned from the index.
llvm::DenseSet< SymbolID > Subjects
The class presents a C++ symbol, e.g.
@ IndexedForCodeCompletion
Whether or not this symbol is meant to be used for the code completion.
llvm::StringRef Type
Raw representation of the OpaqueType of the symbol, used for scoring purposes.
llvm::StringRef Name
The unqualified name of the symbol, e.g. "bar" (for ns::bar).
llvm::StringRef Scope
The containing namespace. e.g. "" (global), "ns::" (top-level namespace).
SymbolLocation CanonicalDeclaration
The location of the preferred declaration of the symbol.