clang-tools  8.0.0
PostingList.cpp
Go to the documentation of this file.
1 //===--- PostingList.cpp - Symbol identifiers storage interface -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "PostingList.h"
11 #include "Iterator.h"
12 #include "Token.h"
13 #include "llvm/Support/Error.h"
14 #include "llvm/Support/MathExtras.h"
15 
16 namespace clang {
17 namespace clangd {
18 namespace dex {
19 namespace {
20 
21 /// Implements iterator of PostingList chunks. This requires iterating over two
22 /// levels: the first level iterator iterates over the chunks and decompresses
23 /// them on-the-fly when the contents of chunk are to be seen.
24 class ChunkIterator : public Iterator {
25 public:
26  explicit ChunkIterator(const Token *Tok, llvm::ArrayRef<Chunk> Chunks)
27  : Tok(Tok), Chunks(Chunks), CurrentChunk(Chunks.begin()) {
28  if (!Chunks.empty()) {
29  DecompressedChunk = CurrentChunk->decompress();
30  CurrentID = DecompressedChunk.begin();
31  }
32  }
33 
34  bool reachedEnd() const override { return CurrentChunk == Chunks.end(); }
35 
36  /// Advances cursor to the next item.
37  void advance() override {
38  assert(!reachedEnd() &&
39  "Posting List iterator can't advance() at the end.");
40  ++CurrentID;
41  normalizeCursor();
42  }
43 
44  /// Applies binary search to advance cursor to the next item with DocID
45  /// equal or higher than the given one.
46  void advanceTo(DocID ID) override {
47  assert(!reachedEnd() &&
48  "Posting List iterator can't advance() at the end.");
49  if (ID <= peek())
50  return;
51  advanceToChunk(ID);
52  // Try to find ID within current chunk.
53  CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID);
54  normalizeCursor();
55  }
56 
57  DocID peek() const override {
58  assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
59  return *CurrentID;
60  }
61 
62  float consume() override {
63  assert(!reachedEnd() &&
64  "Posting List iterator can't consume() at the end.");
65  return 1;
66  }
67 
68  size_t estimateSize() const override {
69  return Chunks.size() * ApproxEntriesPerChunk;
70  }
71 
72 private:
73  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
74  if (Tok != nullptr)
75  return OS << *Tok;
76  OS << '[';
77  const char *Sep = "";
78  for (const Chunk &C : Chunks)
79  for (const DocID Doc : C.decompress()) {
80  OS << Sep << Doc;
81  Sep = " ";
82  }
83  return OS << ']';
84  }
85 
86  /// If the cursor is at the end of a chunk, place it at the start of the next
87  /// chunk.
88  void normalizeCursor() {
89  // Invariant is already established if examined chunk is not exhausted.
90  if (CurrentID != std::end(DecompressedChunk))
91  return;
92  // Advance to next chunk if current one is exhausted.
93  ++CurrentChunk;
94  if (CurrentChunk == Chunks.end()) // Reached the end of PostingList.
95  return;
96  DecompressedChunk = CurrentChunk->decompress();
97  CurrentID = DecompressedChunk.begin();
98  }
99 
100  /// Advances CurrentChunk to the chunk which might contain ID.
101  void advanceToChunk(DocID ID) {
102  if ((CurrentChunk != Chunks.end() - 1) &&
103  ((CurrentChunk + 1)->Head <= ID)) {
104  // Find the next chunk with Head >= ID.
105  CurrentChunk = std::lower_bound(
106  CurrentChunk + 1, Chunks.end(), ID,
107  [](const Chunk &C, const DocID ID) { return C.Head <= ID; });
108  --CurrentChunk;
109  DecompressedChunk = CurrentChunk->decompress();
110  CurrentID = DecompressedChunk.begin();
111  }
112  }
113 
114  const Token *Tok;
115  llvm::ArrayRef<Chunk> Chunks;
116  /// Iterator over chunks.
117  /// If CurrentChunk is valid, then DecompressedChunk is
118  /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator
119  /// into it.
120  decltype(Chunks)::const_iterator CurrentChunk;
121  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> DecompressedChunk;
122  /// Iterator over DecompressedChunk.
123  decltype(DecompressedChunk)::iterator CurrentID;
124 
125  static constexpr size_t ApproxEntriesPerChunk = 15;
126 };
127 
128 static constexpr size_t BitsPerEncodingByte = 7;
129 
130 /// Writes a variable length DocID into the buffer and updates the buffer size.
131 /// If it doesn't fit, returns false and doesn't write to the buffer.
132 bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) {
133  assert(Delta != 0 && "0 is not a valid PostingList delta.");
134  // Calculate number of bytes Delta encoding would take by examining the
135  // meaningful bits.
136  unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte;
137  if (Width > Payload.size())
138  return false;
139 
140  do {
141  uint8_t Encoding = Delta & 0x7f;
142  Delta >>= 7;
143  Payload.front() = Delta ? Encoding | 0x80 : Encoding;
144  Payload = Payload.drop_front();
145  } while (Delta != 0);
146  return true;
147 }
148 
149 /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
150 /// DocIDs. The compression stores deltas (differences) between subsequent
151 /// DocIDs and encodes these deltas utilizing the least possible number of
152 /// bytes.
153 ///
154 /// Each encoding byte consists of two parts: the first bit (continuation bit)
155 /// indicates whether this is the last byte (0 if this byte is the last) of
156 /// current encoding and seven bytes a piece of DocID (payload). DocID contains
157 /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
158 /// payloads and one 4-bit payload), but in practice it is expected that gaps
159 /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes.
160 /// In very dense posting lists (with average gaps less than 128) this
161 /// representation would be 4 times more efficient than raw DocID array.
162 ///
163 /// PostingList encoding example:
164 ///
165 /// DocIDs 42 47 7000
166 /// gaps 5 6958
167 /// Encoding (raw number) 00000101 10110110 00101110
168 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
169  assert(!Documents.empty() && "Can't encode empty sequence.");
170  std::vector<Chunk> Result;
171  Result.emplace_back();
172  DocID Last = Result.back().Head = Documents.front();
173  llvm::MutableArrayRef<uint8_t> RemainingPayload = Result.back().Payload;
174  for (DocID Doc : Documents.drop_front()) {
175  if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk
176  Result.emplace_back();
177  Result.back().Head = Doc;
178  RemainingPayload = Result.back().Payload;
179  }
180  Last = Doc;
181  }
182  return std::vector<Chunk>(Result); // no move, shrink-to-fit
183 }
184 
185 /// Reads variable length DocID from the buffer and updates the buffer size. If
186 /// the stream is terminated, return None.
187 llvm::Optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) {
188  if (Bytes.front() == 0 || Bytes.empty())
189  return None;
190  DocID Result = 0;
191  bool HasNextByte = true;
192  for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) {
193  assert(Length <= 5 && "Malformed VByte encoding sequence.");
194  // Write meaningful bits to the correct place in the document decoding.
195  Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length);
196  if ((Bytes.front() & 0x80) == 0)
197  HasNextByte = false;
198  Bytes = Bytes.drop_front();
199  }
200  return Result;
201 }
202 
203 } // namespace
204 
205 llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const {
206  llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Result{Head};
207  llvm::ArrayRef<uint8_t> Bytes(Payload);
208  DocID Delta;
209  for (DocID Current = Head; !Bytes.empty(); Current += Delta) {
210  auto MaybeDelta = readVByte(Bytes);
211  if (!MaybeDelta)
212  break;
213  Delta = *MaybeDelta;
214  Result.push_back(Current + Delta);
215  }
216  return llvm::SmallVector<DocID, Chunk::PayloadSize + 1>{Result};
217 }
218 
219 PostingList::PostingList(llvm::ArrayRef<DocID> Documents)
220  : Chunks(encodeStream(Documents)) {}
221 
222 std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const {
223  return llvm::make_unique<ChunkIterator>(Tok, Chunks);
224 }
225 
226 } // namespace dex
227 } // namespace clangd
228 } // namespace clang
PostingList(llvm::ArrayRef< DocID > Documents)
std::unique_ptr< Iterator > iterator(const Token *Tok=nullptr) const
Constructs DocumentIterator over given posting list.
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:41
std::vector< std::pair< DocID, float > > consume(Iterator &It)
Advances the iterator until it is exhausted.
Definition: Iterator.cpp:351
uint32_t DocID
Symbol position in the list of all index symbols sorted by a pre-computed symbol quality.
Definition: Iterator.h:47
llvm::Optional< llvm::Expected< tooling::AtomicChanges > > Result
This defines posting list interface: a storage for identifiers of symbols which can be characterized ...
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Symbol index queries consist of specific requirements for the requested symbol, such as high fuzzy ma...
Token objects represent a characteristic of a symbol, which can be used to perform efficient search...
llvm::SmallVector< DocID, PayloadSize+1 > decompress() const