clang-tools 20.0.0git
Iterator.h
Go to the documentation of this file.
1//===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// Symbol index queries consist of specific requirements for the requested
11/// symbol, such as high fuzzy matching score, scope, type etc. The lists of all
12/// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope)
13/// are expressed in a form of Search Tokens which are stored in the inverted
14/// index. Inverted index maps these tokens to the posting lists - sorted (by
15/// symbol quality) sequences of symbol IDs matching the token, e.g. scope token
16/// "clangd::clangd::" is mapped to the list of IDs of all symbols which are
17/// declared in this namespace. Search queries are build from a set of
18/// requirements which can be combined with each other forming the query trees.
19/// The leafs of such trees are posting lists, and the nodes are operations on
20/// these posting lists, e.g. intersection or union. Efficient processing of
21/// these multi-level queries is handled by Iterators. Iterators advance through
22/// all leaf posting lists producing the result of search query, which preserves
23/// the sorted order of IDs. Having the resulting IDs sorted is important,
24/// because it allows receiving a certain number of the most valuable items
25/// (e.g. symbols with highest quality which was the sorting key in the first
26/// place) without processing all items with requested properties (this might
27/// not be computationally effective if search request is not very restrictive).
28//
29//===----------------------------------------------------------------------===//
30
31#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
32#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
33
34#include "llvm/Support/raw_ostream.h"
35#include <algorithm>
36#include <memory>
37#include <vector>
38
39namespace clang {
40namespace clangd {
41namespace dex {
42
43/// Symbol position in the list of all index symbols sorted by a pre-computed
44/// symbol quality.
45using DocID = uint32_t;
46
47/// Iterator is the interface for Query Tree node. The simplest type of Iterator
48/// is DocumentIterator which is simply a wrapper around PostingList iterator
49/// and serves as the Query Tree leaf. More sophisticated examples of iterators
50/// can manage intersection, union of the elements produced by other iterators
51/// (their children) to form a multi-level Query Tree. The interface is designed
52/// to be extensible in order to support multiple types of iterators.
53class Iterator {
54public:
55 /// Returns true if all valid DocIDs were processed and hence the iterator is
56 /// exhausted.
57 virtual bool reachedEnd() const = 0;
58 /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted
59 /// and proceeds to the END.
60 ///
61 /// Note: reachedEnd() must be false.
62 virtual void advance() = 0;
63 /// Moves to the first valid DocID which is equal or higher than given ID. If
64 /// it doesn't exist, the iterator is exhausted and proceeds to the END.
65 ///
66 /// Note: reachedEnd() must be false.
67 virtual void advanceTo(DocID ID) = 0;
68 /// Returns the current element this iterator points to.
69 ///
70 /// Note: reachedEnd() must be false.
71 virtual DocID peek() const = 0;
72 /// Informs the iterator that the current document was consumed, and returns
73 /// its boost.
74 ///
75 /// Note: If this iterator has any child iterators that contain the document,
76 /// consume() should be called on those and their boosts incorporated.
77 /// consume() must *not* be called on children that don't contain the current
78 /// doc.
79 virtual float consume() = 0;
80 /// Returns an estimate of advance() calls before the iterator is exhausted.
81 virtual size_t estimateSize() const = 0;
82
83 virtual ~Iterator() {}
84
85 /// Prints a convenient human-readable iterator representation by recursively
86 /// dumping iterators in the following format:
87 ///
88 /// (Type Child1 Child2 ...)
89 ///
90 /// Where Type is the iterator type representation: "&" for And, "|" for Or,
91 /// ChildN is N-th iterator child. Raw iterators over PostingList are
92 /// represented as "[... CurID ...]" where CurID is the current PostingList
93 /// entry being inspected.
94 friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
95 const Iterator &Iterator) {
96 return Iterator.dump(OS);
97 }
98
99 /// Inspect iterator type, used internally for optimizing query trees.
100 enum class Kind { And, Or, True, False, Other };
101 Kind kind() const { return MyKind; }
102
103protected:
104 Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
105
106private:
107 virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
108 Kind MyKind;
109};
110
111/// Advances the iterator until it is exhausted. Returns pairs of document IDs
112/// with the corresponding boosting score.
113///
114/// Boosting can be seen as a compromise between retrieving too many items and
115/// calculating finals score for each of them (which might be very expensive)
116/// and not retrieving enough items so that items with very high final score
117/// would not be processed. Boosting score is a computationally efficient way
118/// to acquire preliminary scores of requested items.
119std::vector<std::pair<DocID, float>> consume(Iterator &It);
120
121namespace detail {
122// Variadic template machinery.
123inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {}
124template <typename... TailT>
125void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
126 std::unique_ptr<Iterator> Head, TailT... Tail) {
127 Children.push_back(std::move(Head));
128 populateChildren(Children, std::move(Tail)...);
129}
130} // namespace detail
131
132// A corpus is a set of documents, and a factory for iterators over them.
133class Corpus {
134 DocID Size;
135
136public:
137 explicit Corpus(DocID Size) : Size(Size) {}
138
139 /// Returns AND Iterator which performs the intersection of the PostingLists
140 /// of its children.
141 ///
142 /// consume(): AND Iterator returns the product of Childrens' boosting
143 /// scores.
144 std::unique_ptr<Iterator>
145 intersect(std::vector<std::unique_ptr<Iterator>> Children) const;
146
147 /// Returns OR Iterator which performs the union of the PostingLists of its
148 /// children.
149 ///
150 /// consume(): OR Iterator returns the highest boost value among children
151 /// containing the requested item.
152 std::unique_ptr<Iterator>
153 unionOf(std::vector<std::unique_ptr<Iterator>> Children) const;
154
155 /// Returns TRUE Iterator which iterates over "virtual" PostingList
156 /// containing all items in range [0, Size) in an efficient manner.
157 std::unique_ptr<Iterator> all() const;
158
159 /// Returns FALSE Iterator which iterates over no documents.
160 std::unique_ptr<Iterator> none() const;
161
162 /// Returns BOOST iterator which multiplies the score of each item by given
163 /// factor. Boosting can be used as a computationally inexpensive filtering.
164 /// Users can return significantly more items using consumeAndBoost() and
165 /// then trim Top K using retrieval score.
166 std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child,
167 float Factor) const;
168
169 /// Returns LIMIT iterator, which yields up to N elements of its child
170 /// iterator. Elements only count towards the limit if they are part of the
171 /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
172 /// 1)) yields (2), not ().
173 std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child,
174 size_t Limit) const;
175
176 /// This allows intersect(create(...), create(...)) syntax.
177 template <typename... Args>
178 std::unique_ptr<Iterator> intersect(Args... args) const {
179 std::vector<std::unique_ptr<Iterator>> Children;
180 detail::populateChildren(Children, std::forward<Args>(args)...);
181 return intersect(std::move(Children));
182 }
183
184 /// This allows unionOf(create(...), create(...)) syntax.
185 template <typename... Args>
186 std::unique_ptr<Iterator> unionOf(Args... args) const {
187 std::vector<std::unique_ptr<Iterator>> Children;
188 detail::populateChildren(Children, std::forward<Args>(args)...);
189 return unionOf(std::move(Children));
190 }
191};
192
193} // namespace dex
194} // namespace clangd
195} // namespace clang
196
197#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H
llvm::raw_string_ostream OS
Definition: TraceTests.cpp:160
llvm::json::Object Args
Definition: Trace.cpp:138
std::unique_ptr< Iterator > none() const
Returns FALSE Iterator which iterates over no documents.
Definition: Iterator.cpp:428
std::unique_ptr< Iterator > intersect(Args... args) const
This allows intersect(create(...), create(...)) syntax.
Definition: Iterator.h:178
std::unique_ptr< Iterator > limit(std::unique_ptr< Iterator > Child, size_t Limit) const
Returns LIMIT iterator, which yields up to N elements of its child iterator.
Definition: Iterator.cpp:441
std::unique_ptr< Iterator > intersect(std::vector< std::unique_ptr< Iterator > > Children) const
Returns AND Iterator which performs the intersection of the PostingLists of its children.
Definition: Iterator.cpp:365
std::unique_ptr< Iterator > boost(std::unique_ptr< Iterator > Child, float Factor) const
Returns BOOST iterator which multiplies the score of each item by given factor.
Definition: Iterator.cpp:432
std::unique_ptr< Iterator > all() const
Returns TRUE Iterator which iterates over "virtual" PostingList containing all items in range [0,...
Definition: Iterator.cpp:424
std::unique_ptr< Iterator > unionOf(Args... args) const
This allows unionOf(create(...), create(...)) syntax.
Definition: Iterator.h:186
std::unique_ptr< Iterator > unionOf(std::vector< std::unique_ptr< Iterator > > Children) const
Returns OR Iterator which performs the union of the PostingLists of its children.
Definition: Iterator.cpp:395
Iterator is the interface for Query Tree node.
Definition: Iterator.h:53
virtual float consume()=0
Informs the iterator that the current document was consumed, and returns its boost.
Iterator(Kind MyKind=Kind::Other)
Definition: Iterator.h:104
virtual bool reachedEnd() const =0
Returns true if all valid DocIDs were processed and hence the iterator is exhausted.
virtual void advance()=0
Moves to next valid DocID.
friend llvm::raw_ostream & operator<<(llvm::raw_ostream &OS, const Iterator &Iterator)
Prints a convenient human-readable iterator representation by recursively dumping iterators in the fo...
Definition: Iterator.h:94
virtual DocID peek() const =0
Returns the current element this iterator points to.
Kind
Inspect iterator type, used internally for optimizing query trees.
Definition: Iterator.h:100
virtual size_t estimateSize() const =0
Returns an estimate of advance() calls before the iterator is exhausted.
virtual void advanceTo(DocID ID)=0
Moves to the first valid DocID which is equal or higher than given ID.
void populateChildren(std::vector< std::unique_ptr< Iterator > > &)
Definition: Iterator.h:123
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:357
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:45
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//