clang-tools  10.0.0svn
PostingList.h
Go to the documentation of this file.
1 //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This defines posting list interface: a storage for identifiers of symbols
11 /// which can be characterized by a specific feature (such as fuzzy-find
12 /// trigram, scope, type or any other Search Token). Posting lists can be
13 /// traversed in order using an iterator and are values for inverted index,
14 /// which maps search tokens to corresponding posting lists.
15 ///
16 /// In order to decrease size of Index in-memory representation, Variable Byte
17 /// Encoding (VByte) is used for PostingLists compression. An overview of VByte
18 /// algorithm can be found in "Introduction to Information Retrieval" book:
19 /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
20 ///
21 //===----------------------------------------------------------------------===//
22 
23 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
24 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
25 
26 #include "Iterator.h"
27 #include "llvm/ADT/ArrayRef.h"
28 #include "llvm/ADT/SmallVector.h"
29 #include <cstdint>
30 #include <vector>
31 
32 namespace clang {
33 namespace clangd {
34 namespace dex {
35 class Token;
36 
37 /// NOTE: This is an implementation detail.
38 ///
39 /// Chunk is a fixed-width piece of PostingList which contains the first DocID
40 /// in uncompressed format (Head) and delta-encoded Payload. It can be
41 /// decompressed upon request.
42 struct Chunk {
43  /// Keep sizeof(Chunk) == 32.
44  static constexpr size_t PayloadSize = 32 - sizeof(DocID);
45 
46  llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
47 
48  /// The first element of decompressed Chunk.
50  /// VByte-encoded deltas.
51  std::array<uint8_t, PayloadSize> Payload;
52 };
53 static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
54 
55 /// PostingList is the storage of DocIDs which can be inserted to the Query
56 /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
57 /// are stored in underlying chunks. Compression saves memory at a small cost
58 /// in access time, which is still fast enough in practice.
59 class PostingList {
60 public:
61  explicit PostingList(llvm::ArrayRef<DocID> Documents);
62 
63  /// Constructs DocumentIterator over given posting list. DocumentIterator will
64  /// go through the chunks and decompress them on-the-fly when necessary.
65  /// If given, Tok is only used for the string representation.
66  std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const;
67 
68  /// Returns in-memory size of external storage.
69  size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
70 
71 private:
72  const std::vector<Chunk> Chunks;
73 };
74 
75 } // namespace dex
76 } // namespace clangd
77 } // namespace clang
78 
79 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
DocID Head
The first element of decompressed Chunk.
Definition: PostingList.h:49
PostingList is the storage of DocIDs which can be inserted to the Query Tree as a leaf by constructin...
Definition: PostingList.h:59
size_t bytes() const
Returns in-memory size of external storage.
Definition: PostingList.h:69
static constexpr size_t PayloadSize
Keep sizeof(Chunk) == 32.
Definition: PostingList.h:44
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:46
NOTE: This is an implementation detail.
Definition: PostingList.h:42
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
Definition: PostingList.h:51
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const