clang-tools 20.0.0git
PostingList.cpp
Go to the documentation of this file.
1//===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "PostingList.h"
10#include "index/dex/Iterator.h"
11#include "index/dex/Token.h"
12#include "llvm/Support/MathExtras.h"
13#include <optional>
14
15namespace clang {
16namespace clangd {
17namespace dex {
18namespace {
19
20/// Implements iterator of PostingList chunks. This requires iterating over two
21/// levels: the first level iterator iterates over the chunks and decompresses
22/// them on-the-fly when the contents of chunk are to be seen.
23class ChunkIterator : public Iterator {
24public:
25 explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
26 : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
27 if (!Chunks.empty()) {
28 DecompressedChunk = CurrentChunk->decompress();
29 CurrentID = DecompressedChunk.begin();
30 }
31 }
32
33 bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
34
35 /// Advances cursor to the next item.
36 void advance() override {
37 assert(!reachedEnd() &&
38 "Posting List iterator can't advance() at the end.");
39 ++CurrentID;
40 normalizeCursor();
41 }
42
43 /// Applies binary search to advance cursor to the next item with DocID
44 /// equal or higher than the given one.
45 void advanceTo(DocID ID) override {
46 assert(!reachedEnd() &&
47 "Posting List iterator can't advance() at the end.");
48 if (ID <= peek())
49 return;
50 advanceToChunk(ID);
51 // Try to find ID within current chunk.
52 CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
53 [&](const DocID D) { return D < ID; });
54 normalizeCursor();
55 }
56
57 DocID peek() const override {
58 assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
59 return *CurrentID;
60 }
61
62 float consume() override {
63 assert(!reachedEnd() &&
64 "Posting List iterator can't consume() at the end.");
65 return 1;
66 }
67
68 size_t estimateSize() const override {
69 return Chunks.size() * ApproxEntriesPerChunk;
70 }
71
72private:
73 llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
74 if (Tok != nullptr)
75 return OS << *Tok;
76 OS << '[';
77 const char *Sep = "";
78 for (const Chunk &C : Chunks)
79 for (const DocID Doc : C.decompress()) {
80 OS << Sep << Doc;
81 Sep = " ";
82 }
83 return OS << ']';
84 }
85
86 /// If the cursor is at the end of a chunk, place it at the start of the next
87 /// chunk.
88 void normalizeCursor() {
89 // Invariant is already established if examined chunk is not exhausted.
90 if (CurrentID != std::end(DecompressedChunk))
91 return;
92 // Advance to next chunk if current one is exhausted.
93 ++CurrentChunk;
94 if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
95 return;
96 DecompressedChunk = CurrentChunk->decompress();
97 CurrentID = DecompressedChunk.begin();
98 }
99
100 /// Advances CurrentChunk to the chunk which might contain ID.
101 void advanceToChunk(DocID ID) {
102 if ((CurrentChunk != Chunks.end() - 1) &&
103 ((CurrentChunk + 1)->Head <= ID)) {
104 CurrentChunk =
105 std::partition_point(CurrentChunk + 1, Chunks.end(),
106 [&](const Chunk &C) { return C.Head < ID; });
107 --CurrentChunk;
108 DecompressedChunk = CurrentChunk->decompress();
109 CurrentID = DecompressedChunk.begin();
110 }
111 }
112
113 const Token *Tok;
114 llvm::ArrayRef<Chunk> Chunks;
115 /// Iterator over chunks.
116 /// If CurrentChunk is valid, then DecompressedChunk is
117 /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
118 /// into it.
119 decltype(Chunks)::const_iterator CurrentChunk;
120 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
121 /// Iterator over DecompressedChunk.
122 decltype(DecompressedChunk)::iterator CurrentID;
123
124 static constexpr size_t ApproxEntriesPerChunk = 15;
125};
126
127static constexpr size_t BitsPerEncodingByte = 7;
128
129/// Writes a variable length DocID into the buffer and updates the buffer size.
130/// If it doesn't fit, returns false and doesn't write to the buffer.
131bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
132 assert(Delta != 0 && "0 is not a valid PostingList delta.");
133 // Calculate number of bytes Delta encoding would take by examining the
134 // meaningful bits.
135 unsigned Width = 1 + llvm::Log2_64(Delta) / BitsPerEncodingByte;
136 if (Width > Payload.size())
137 return false;
138
139 do {
140 uint8_t Encoding = Delta & 0x7f;
141 Delta >>= 7;
142 Payload.front() = Delta ? Encoding | 0x80 : Encoding;
143 Payload = Payload.drop_front();
144 } while (Delta != 0);
145 return true;
146}
147
148/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
149/// DocIDs. The compression stores deltas (differences) between subsequent
150/// DocIDs and encodes these deltas utilizing the least possible number of
151/// bytes.
152///
153/// Each encoding byte consists of two parts: the first bit (continuation bit)
154/// indicates whether this is the last byte (0 if this byte is the last) of
155/// current encoding and seven bytes a piece of DocID (payload). DocID contains
156/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
157/// payloads and one 4-bit payload), but in practice it is expected that gaps
158/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
159/// In very dense posting lists (with average gaps less than 128) this
160/// representation would be 4 times more efficient than raw DocID array.
161///
162/// PostingList encoding example:
163///
164/// DocIDs 42 47 7000
165/// gaps 5 6958
166/// Encoding (raw number) 00000101 10110110 00101110
167std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
168 assert(!Documents.empty() && "Can't encode empty sequence.");
169 std::vector<Chunk> Result;
170 Result.emplace_back();
171 DocID Last = Result.back().Head = Documents.front();
172 llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
173 for (DocID Doc : Documents.drop_front()) {
174 if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
175 Result.emplace_back();
176 Result.back().Head = Doc;
177 RemainingPayload = Result.back().Payload;
178 }
179 Last = Doc;
180 }
181 return std::vector<Chunk>(Result); // no move, shrink-to-fit
182}
183
184/// Reads variable length DocID from the buffer and updates the buffer size. If
185/// the stream is terminated, return std::nullopt.
186std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
187 if (Bytes.front() == 0 || Bytes.empty())
188 return std::nullopt;
189 DocID Result = 0;
190 bool HasNextByte = true;
191 for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
192 assert(Length <= 5 && "Malformed VByte encoding sequence.");
193 // Write meaningful bits to the correct place in the document decoding.
194 Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
195 if ((Bytes.front() & 0x80) == 0)
196 HasNextByte = false;
197 Bytes = Bytes.drop_front();
198 }
199 return Result;
200}
201
202} // namespace
203
204llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
205 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
206 llvm::ArrayRef<uint8_t> Bytes(Payload);
207 DocID Delta;
208 for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
209 auto MaybeDelta = readVByte(Bytes);
210 if (!MaybeDelta)
211 break;
212 Delta = *MaybeDelta;
213 Result.push_back(Current + Delta);
214 }
215 return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
216}
217
218PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
219 : Chunks(encodeStream(Documents)) {}
220
221std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
222 return std::make_unique<ChunkIterator>(Tok, Chunks);
223}
224
225} // namespace dex
226} // namespace clangd
227} // namespace clang
const Criteria C
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
This defines posting list interface: a storage for identifiers of symbols which can be characterized ...
unsigned Length
std::string Payload
Definition: SourceCode.cpp:673
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
llvm::raw_string_ostream OS
Definition: TraceTests.cpp:160
std::unique_ptr< Iterator > iterator(const Token *Tok=nullptr) const
Constructs DocumentIterator over given posting list.
PostingList(llvm::ArrayRef< DocID > Documents)
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:39
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:357
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:45
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
Definition: PostingList.h:51
DocID Head
The first element of decompressed Chunk.
Definition: PostingList.h:49