clang-tools 20.0.0git
PostingList.h
Go to the documentation of this file.
1//===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This defines posting list interface: a storage for identifiers of symbols
11/// which can be characterized by a specific feature (such as fuzzy-find
12/// trigram, scope, type or any other Search Token). Posting lists can be
13/// traversed in order using an iterator and are values for inverted index,
14/// which maps search tokens to corresponding posting lists.
15///
16/// In order to decrease size of Index in-memory representation, Variable Byte
17/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
18/// algorithm can be found in "Introduction to Information Retrieval" book:
19/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
20///
21//===----------------------------------------------------------------------===//
22
23#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
24#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
25
26#include "Iterator.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/SmallVector.h"
29#include <cstdint>
30#include <vector>
31
32namespace clang {
33namespace clangd {
34namespace dex {
35class Token;
36
37/// NOTE: This is an implementation detail.
38///
39/// Chunk is a fixed-width piece of PostingList which contains the first DocID
40/// in uncompressed format (Head) and delta-encoded Payload. It can be
41/// decompressed upon request.
42struct Chunk {
43 /// Keep sizeof(Chunk) == 32.
44 static constexpr size_t PayloadSize = 32 - sizeof(DocID);
45
46 llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
47
48 /// The first element of decompressed Chunk.
50 /// VByte-encoded deltas.
51 std::array<uint8_t, PayloadSize> Payload;
52};
53static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
54
55/// PostingList is the storage of DocIDs which can be inserted to the Query
56/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
57/// are stored in underlying chunks. Compression saves memory at a small cost
58/// in access time, which is still fast enough in practice.
60public:
61 explicit PostingList(llvm::ArrayRef<DocID> Documents);
62
63 /// Constructs DocumentIterator over given posting list. DocumentIterator will
64 /// go through the chunks and decompress them on-the-fly when necessary.
65 /// If given, Tok is only used for the string representation.
66 std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const;
67
68 /// Returns in-memory size of external storage.
69 size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
70
71private:
72 const std::vector<Chunk> Chunks;
73};
74
75} // namespace dex
76} // namespace clangd
77} // namespace clang
78
79#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
PostingList is the storage of DocIDs which can be inserted to the Query Tree as a leaf by constructin...
Definition: PostingList.h:59
std::unique_ptr< Iterator > iterator(const Token *Tok=nullptr) const
Constructs DocumentIterator over given posting list.
size_t bytes() const
Returns in-memory size of external storage.
Definition: PostingList.h:69
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:45
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
NOTE: This is an implementation detail.
Definition: PostingList.h:42
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
Definition: PostingList.h:51
static constexpr size_t PayloadSize
Keep sizeof(Chunk) == 32.
Definition: PostingList.h:44
DocID Head
The first element of decompressed Chunk.
Definition: PostingList.h:49