extra/doxygen/PostingList_8cpp_source.html

//===--- PostingList.cpp - Symbol identifiers storage interface -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


#include "PostingList.h"

#include "index/dex/Iterator.h"

#include "index/dex/Token.h"

#include "llvm/Support/MathExtras.h"

#include <optional>


namespace clang {

namespace clangd {

namespace dex {

namespace {


/// Iterator over the DocIDs stored in a PostingList.
///
/// The list is kept as a sequence of compressed chunks, so iteration happens
/// on two levels: CurrentChunk walks over the chunks, while CurrentID walks
/// over the decompressed contents of the chunk CurrentChunk points to. Only
/// one chunk is held in decompressed form at any time.
class ChunkIterator final : public Iterator {
public:
  /// \param Tok token to print in dump(); may be null, in which case dump()
  /// prints the DocIDs instead.
  /// \param Chunks compressed chunks to iterate over. Only a reference is
  /// stored, so the underlying storage has to outlive the iterator.
  explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
      : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
    // Establish the cursor invariant for the first chunk, if there is one.
    if (!Chunks.empty()) {
      DecompressedChunk = CurrentChunk->decompress();
      CurrentID = DecompressedChunk.begin();
    }
  }

  /// The iterator is exhausted once the chunk cursor has moved past the last
  /// chunk.
  bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }

  /// Advances cursor to the next item.
  void advance() override {
    assert(!reachedEnd() &&
           "Posting List iterator can't advance() at the end.");
    ++CurrentID;
    normalizeCursor();
  }

  /// Applies binary search to advance cursor to the next item with DocID
  /// equal or higher than the given one. Does nothing if the current item
  /// already satisfies that.
  void advanceTo(DocID ID) override {
    assert(!reachedEnd() &&
           "Posting List iterator can't advanceTo() at the end.");
    if (ID <= peek())
      return;
    // First pick the right chunk, then search inside it.
    advanceToChunk(ID);
    CurrentID = std::partition_point(CurrentID, DecompressedChunk.end(),
                                     [&](const DocID D) { return D < ID; });
    // The chunk may not contain anything >= ID; move on to the next one then.
    normalizeCursor();
  }

  /// Returns the DocID under the cursor without moving it.
  DocID peek() const override {
    assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
    return *CurrentID;
  }

  /// Always returns 1: this iterator applies no boosting to its items.
  float consume() override {
    assert(!reachedEnd() &&
           "Posting List iterator can't consume() at the end.");
    return 1;
  }

  /// Rough upper estimate: number of chunks times ApproxEntriesPerChunk.
  size_t estimateSize() const override {
    return Chunks.size() * ApproxEntriesPerChunk;
  }

private:
  /// Prints the token if one was given; otherwise prints every DocID of the
  /// list (decompressing each chunk) as "[id id ...]".
  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
    if (Tok != nullptr)
      return OS << *Tok;
    OS << '[';
    const char *Sep = "";
    for (const Chunk &C : Chunks)
      for (const DocID Doc : C.decompress()) {
        OS << Sep << Doc;
        Sep = " ";
      }
    return OS << ']';
  }

  /// If the cursor is at the end of a chunk, place it at the start of the next
  /// chunk.
  void normalizeCursor() {
    // Invariant is already established if examined chunk is not exhausted.
    if (CurrentID != std::end(DecompressedChunk))
      return;
    // Advance to next chunk if current one is exhausted.
    ++CurrentChunk;
    if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
      return;
    DecompressedChunk = CurrentChunk->decompress();
    CurrentID = DecompressedChunk.begin();
  }

  /// Advances CurrentChunk to the chunk which might contain ID, i.e. the last
  /// chunk whose Head is not greater than ID. Leaves the cursor untouched if
  /// that is the current chunk. Relies on chunk heads being in increasing
  /// order.
  void advanceToChunk(DocID ID) {
    // Nothing to do unless a later chunk starts at or before ID.
    if (CurrentChunk == Chunks.end() - 1 || (CurrentChunk + 1)->Head > ID)
      return;
    // Locate the first chunk that starts after ID and step back by one. Using
    // "Head <= ID" (rather than "Head < ID") lands directly on a chunk whose
    // Head equals ID, so that chunk is decompressed only once.
    CurrentChunk =
        std::partition_point(CurrentChunk + 1, Chunks.end(),
                             [&](const Chunk &C) { return C.Head <= ID; });
    --CurrentChunk;
    DecompressedChunk = CurrentChunk->decompress();
    CurrentID = DecompressedChunk.begin();
  }

  /// Token printed by dump(); may be null.
  const Token *Tok;
  /// All chunks of the posting list (not owned).
  llvm::ArrayRef<Chunk> Chunks;
  /// Iterator over chunks.
  /// If CurrentChunk is valid, then DecompressedChunk is
  /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
  /// into it.
  decltype(Chunks)::const_iterator CurrentChunk;
  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
  /// Iterator over DecompressedChunk. Stays null for an empty posting list.
  decltype(DecompressedChunk)::iterator CurrentID = nullptr;

  /// Multiplier used by estimateSize() to guess the number of DocIDs per chunk.
  static constexpr size_t ApproxEntriesPerChunk = 15;
};


/// Number of payload bits carried by every VByte-encoded byte; the remaining
/// (highest) bit of the byte is the continuation flag.
static constexpr size_t BitsPerEncodingByte = 7;


/// Appends the VByte encoding of \p Delta to the front of \p Payload and
/// shrinks \p Payload so that it covers only the bytes that are still free.
/// The value is emitted in 7-bit groups, least significant group first; every
/// byte except the last one has its highest bit set.
///
/// \returns false, leaving \p Payload untouched, if the encoding needs more
/// bytes than \p Payload has left; true otherwise.
bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
  assert(Delta != 0 && "0 is not a valid PostingList delta.");
  // The position of the highest set bit determines how many 7-bit groups are
  // required to represent Delta.
  const unsigned BytesNeeded =
      llvm::Log2_64(Delta) / BitsPerEncodingByte + 1;
  if (Payload.size() < BytesNeeded)
    return false;

  for (;;) {
    const uint8_t LowBits = Delta & 0x7f;
    Delta >>= 7;
    const bool IsLastByte = (Delta == 0);
    // Flag every byte that is followed by another one.
    Payload.front() = IsLastByte ? LowBits : (LowBits | 0x80);
    Payload = Payload.drop_front();
    if (IsLastByte)
      return true;
  }
}


/// Compresses a sorted list of DocIDs with Variable-length Byte (VByte) delta
/// encoding. Instead of the DocIDs themselves, the gaps (deltas) between
/// consecutive DocIDs are stored, each in as few bytes as possible.
///
/// Every encoding byte has two parts: the highest bit is the continuation bit
/// (set if more bytes of the same delta follow, 0 in the last byte) and the
/// remaining seven bits hold a piece of the delta, least significant piece
/// first. A DocID has 32 bits, so a delta takes at most 5 bytes (four full
/// 7-bit pieces and one 4-bit piece); in practice gaps are expected to be small
/// enough to need fewer. In very dense posting lists (average gap below 128)
/// this representation is about 4 times smaller than a raw DocID array.
///
/// The first DocID of every chunk is stored raw in Chunk::Head; a new chunk is
/// started whenever the next delta no longer fits into the current payload.
///
/// PostingList encoding example:
///
/// DocIDs    42            47        7000
/// gaps                    5         6953
/// Encoding  (raw number)  00000101  10101001 00110110
std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
  assert(!Documents.empty() && "Can't encode empty sequence.");
  std::vector<Chunk> Encoded;
  llvm::MutableArrayRef<uint8_t> FreeBytes;
  // Opens a fresh chunk headed by First and directs further writes into its
  // (still empty) payload.
  auto StartChunk = [&](DocID First) {
    Encoded.emplace_back();
    Encoded.back().Head = First;
    FreeBytes = Encoded.back().Payload;
  };

  StartChunk(Documents.front());
  DocID Previous = Documents.front();
  for (const DocID Current : Documents.drop_front()) {
    // A delta that does not fit makes Current the head of the next chunk.
    if (!encodeVByte(Current - Previous, FreeBytes))
      StartChunk(Current);
    Previous = Current;
  }
  // Deliberately copy rather than move: the copy drops any excess capacity.
  return std::vector<Chunk>(Encoded);
}


/// Reads one variable length encoded DocID delta from the front of \p Bytes
/// and advances \p Bytes past the bytes that were consumed.
///
/// \returns std::nullopt if the stream is terminated, i.e. \p Bytes is empty
/// or starts with a zero byte (0 is not a valid delta, so it marks the unused
/// tail of a payload); otherwise the decoded value.
std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
  // Test for emptiness first: front() must not be called on an empty range.
  if (Bytes.empty() || Bytes.front() == 0)
    return std::nullopt;
  // Largest number of 7-bit groups a DocID can be split into.
  constexpr size_t MaxEncodedBytes =
      (sizeof(DocID) * 8 + BitsPerEncodingByte - 1) / BitsPerEncodingByte;
  DocID Result = 0;
  bool HasNextByte = true;
  for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
    // A longer sequence would shift bits past the width of DocID.
    assert(Length < MaxEncodedBytes && "Malformed VByte encoding sequence.");
    // Write meaningful bits to the correct place in the document decoding.
    // Widen to DocID before shifting so the shift is done on an unsigned
    // value and cannot overflow a signed int.
    Result |= static_cast<DocID>(Bytes.front() & 0x7f)
              << (BitsPerEncodingByte * Length);
    if ((Bytes.front() & 0x80) == 0)
      HasNextByte = false;
    Bytes = Bytes.drop_front();
  }
  return Result;
}


} // namespace


/// Decodes the chunk into the list of DocIDs it represents: Head followed by
/// the running sums of the VByte-encoded deltas stored in Payload. Decoding
/// stops at the end of the payload or at the first terminator reported by
/// readVByte().
llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
  llvm::ArrayRef<uint8_t> Bytes(Payload);
  DocID Current = Head;
  while (!Bytes.empty()) {
    const std::optional<DocID> Delta = readVByte(Bytes);
    if (!Delta)
      break;
    Current += *Delta;
    Result.push_back(Current);
  }
  // Return the local directly so it can be elided or moved, not copied.
  return Result;
}


/// Builds the posting list by compressing \p Documents into chunks with
/// encodeStream(). \p Documents must not be empty (encodeStream() asserts
/// this) and consecutive DocIDs must differ, since encodeVByte() rejects a
/// zero delta; the encoder is documented as taking a sorted list.
PostingList::PostingList(llvm::ArrayRef<DocID> Documents)

    : Chunks(encodeStream(Documents)) {}


/// Creates an iterator over the DocIDs of this posting list. \p Tok is handed
/// to the iterator, which only uses it when dumping itself (a non-null token is
/// printed instead of the DocIDs). The iterator refers to this list's chunks
/// without copying them, so it must not outlive the PostingList.
std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {

  return std::make_unique<ChunkIterator>(Tok, Chunks);

}


} // namespace dex

} // namespace clangd

} // namespace clang

C
const Criteria C
Definition: FunctionCognitiveComplexityCheck.cpp:92

Iterator.h
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...

PostingList.h
This defines posting list interface: a storage for identifiers of symbols which can be characterized ...

ID
StringRef ID
Definition: ReferenceToConstructedTemporaryCheck.cpp:35

Length
unsigned Length
Definition: SourceCodeTests.cpp:97

Payload
std::string Payload
Definition: SourceCode.cpp:673

Token.h
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.

OS
llvm::raw_string_ostream OS
Definition: TraceTests.cpp:160

clang::clangd::dex::PostingList::iterator
std::unique_ptr< Iterator > iterator(const Token *Tok=nullptr) const
Constructs DocumentIterator over given posting list.
Definition: PostingList.cpp:221

clang::clangd::dex::PostingList::PostingList
PostingList(llvm::ArrayRef< DocID > Documents)
Definition: PostingList.cpp:218

clang::clangd::dex::Token
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:39

clang::clangd::dex::consume
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:357

clang::clangd::dex::DocID
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:45

clang
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Definition: ApplyReplacements.h:27

clang::clangd::dex::Chunk::decompress
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const
Definition: PostingList.cpp:204

clang::clangd::dex::Chunk::Payload
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
Definition: PostingList.h:51

clang::clangd::dex::Chunk::Head
DocID Head
The first element of decompressed Chunk.
Definition: PostingList.h:49