clang-tools  8.0.0
PostingList.h
Go to the documentation of this file.
1 //===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// This defines posting list interface: a storage for identifiers of symbols
12 /// which can be characterized by a specific feature (such as fuzzy-find
13 /// trigram, scope, type or any other Search Token). Posting lists can be
14 /// traversed in order using an iterator and are values for inverted index,
15 /// which maps search tokens to corresponding posting lists.
16 ///
17 /// In order to decrease size of Index in-memory representation, Variable Byte
18 /// Encoding (VByte) is used for PostingLists compression. An overview of VByte
19 /// algorithm can be found in "Introduction to Information Retrieval" book:
20 /// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
21 ///
22 //===----------------------------------------------------------------------===//
23 
24 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
25 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
26 
27 #include "Iterator.h"
28 #include "llvm/ADT/ArrayRef.h"
29 #include "llvm/ADT/SmallVector.h"
30 #include <cstdint>
31 #include <vector>
32 
33 namespace clang {
34 namespace clangd {
35 namespace dex {
36 struct Token;
37 
38 /// NOTE: This is an implementation detail.
39 ///
40 /// Chunk is a fixed-width piece of PostingList which contains the first DocID
41 /// in uncompressed format (Head) and delta-encoded Payload. It can be
42 /// decompressed upon request.
43 struct Chunk {
44  /// Keep sizeof(Chunk) == 32.
45  static constexpr size_t PayloadSize = 32 - sizeof(DocID);
46 
47  llvm::SmallVector<DocID, PayloadSize + 1> decompress() const;
48 
49  /// The first element of decompressed Chunk.
51  /// VByte-encoded deltas.
52  std::array<uint8_t, PayloadSize> Payload;
53 };
54 static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
55 
56 /// PostingList is the storage of DocIDs which can be inserted to the Query
57 /// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
58 /// are stored in underlying chunks. Compression saves memory at a small cost
59 /// in access time, which is still fast enough in practice.
60 class PostingList {
61 public:
62  explicit PostingList(llvm::ArrayRef<DocID> Documents);
63 
64  /// Constructs DocumentIterator over given posting list. DocumentIterator will
65  /// go through the chunks and decompress them on-the-fly when necessary.
66  /// If given, Tok is only used for the string representation.
67  std::unique_ptr<Iterator> iterator(const Token *Tok = nullptr) const;
68 
69  /// Returns in-memory size of external storage.
70  size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
71 
72 private:
73  const std::vector<Chunk> Chunks;
74 };
75 
76 } // namespace dex
77 } // namespace clangd
78 } // namespace clang
79 
80 #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
DocID Head
The first element of decompressed Chunk.
Definition: PostingList.h:50
PostingList is the storage of DocIDs which can be inserted to the Query Tree as a leaf by constructin...
Definition: PostingList.h:60
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:41
size_t bytes() const
Returns in-memory size of external storage.
Definition: PostingList.h:70
static constexpr size_t PayloadSize
Keep sizeof(Chunk) == 32.
Definition: PostingList.h:45
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:47
NOTE: This is an implementation detail.
Definition: PostingList.h:43
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
std::array< uint8_t, PayloadSize > Payload
VByte-encoded deltas.
Definition: PostingList.h:52
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const