12#include "llvm/Support/MathExtras.h"
23class ChunkIterator :
public Iterator {
25 explicit ChunkIterator(
const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
26 : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
27 if (!Chunks.empty()) {
28 DecompressedChunk = CurrentChunk->decompress();
29 CurrentID = DecompressedChunk.begin();
33 bool reachedEnd()
const override {
return CurrentChunk == Chunks.end(); }
36 void advance()
override {
37 assert(!reachedEnd() &&
38 "Posting List iterator can't advance() at the end.");
45 void advanceTo(
DocID ID)
override {
46 assert(!reachedEnd() &&
47 "Posting List iterator can't advance() at the end.");
52 CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
53 [&](
const DocID D) { return D < ID; });
57 DocID peek()
const override {
58 assert(!reachedEnd() &&
"Posting List iterator can't peek() at the end.");
63 assert(!reachedEnd() &&
64 "Posting List iterator can't consume() at the end.");
68 size_t estimateSize()
const override {
69 return Chunks.size() * ApproxEntriesPerChunk;
73 llvm::raw_ostream &dump(llvm::raw_ostream &
OS)
const override {
78 for (
const Chunk &
C : Chunks)
79 for (
const DocID Doc :
C.decompress()) {
88 void normalizeCursor() {
90 if (CurrentID != std::end(DecompressedChunk))
94 if (CurrentChunk == Chunks.end())
96 DecompressedChunk = CurrentChunk->decompress();
97 CurrentID = DecompressedChunk.begin();
101 void advanceToChunk(
DocID ID) {
102 if ((CurrentChunk != Chunks.end() - 1) &&
103 ((CurrentChunk + 1)->Head <=
ID)) {
105 std::partition_point(CurrentChunk + 1, Chunks.end(),
106 [&](
const Chunk &
C) { return C.Head < ID; });
108 DecompressedChunk = CurrentChunk->decompress();
109 CurrentID = DecompressedChunk.begin();
114 llvm::ArrayRef<Chunk> Chunks;
119 decltype(Chunks)::const_iterator CurrentChunk;
120 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
122 decltype(DecompressedChunk)::iterator CurrentID;
124 static constexpr size_t ApproxEntriesPerChunk = 15;
127static constexpr size_t BitsPerEncodingByte = 7;
131bool encodeVByte(
DocID Delta, llvm::MutableArrayRef<uint8_t> &
Payload) {
132 assert(Delta != 0 &&
"0 is not a valid PostingList delta.");
135 unsigned Width = 1 + llvm::Log2_64(Delta) / BitsPerEncodingByte;
140 uint8_t Encoding = Delta & 0x7f;
142 Payload.front() = Delta ? Encoding | 0x80 : Encoding;
144 }
while (Delta != 0);
167std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
168 assert(!Documents.empty() &&
"Can't encode empty sequence.");
169 std::vector<Chunk> Result;
170 Result.emplace_back();
171 DocID Last = Result.back().Head = Documents.front();
172 llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
173 for (
DocID Doc : Documents.drop_front()) {
174 if (!encodeVByte(Doc - Last, RemainingPayload)) {
175 Result.emplace_back();
176 Result.back().Head = Doc;
177 RemainingPayload = Result.back().Payload;
181 return std::vector<Chunk>(Result);
186std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
187 if (Bytes.front() == 0 || Bytes.empty())
190 bool HasNextByte =
true;
191 for (
size_t Length = 0; HasNextByte && !Bytes.empty(); ++
Length) {
192 assert(
Length <= 5 &&
"Malformed VByte encoding sequence.");
194 Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte *
Length);
195 if ((Bytes.front() & 0x80) == 0)
197 Bytes = Bytes.drop_front();
205 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{
Head};
206 llvm::ArrayRef<uint8_t> Bytes(
Payload);
208 for (
DocID Current =
Head; !Bytes.empty(); Current += Delta) {
209 auto MaybeDelta = readVByte(Bytes);
213 Result.push_back(Current + Delta);
215 return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
219 : Chunks(encodeStream(Documents)) {}
222 return std::make_unique<ChunkIterator>(Tok, Chunks);
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
This defines posting list interface: a storage for identifiers of symbols which can be characterized ...
std::unique_ptr< Iterator > iterator(const Token *Tok=nullptr) const
Constructs DocumentIterator over given posting list.
PostingList(llvm::ArrayRef< DocID > Documents)
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
DocID Head
The first element of decompressed Chunk.