clang-tools  8.0.0
Trigram.cpp
Go to the documentation of this file.
1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "Trigram.h"
11 #include "FuzzyMatch.h"
12 #include "Token.h"
13 #include "llvm/ADT/ArrayRef.h"
14 #include "llvm/ADT/DenseSet.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include <cctype>
17 #include <queue>
18 #include <string>
19 
20 namespace clang {
21 namespace clangd {
22 namespace dex {
23 
24 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
25  // Apply fuzzy matching text segmentation.
26  std::vector<CharRole> Roles(Identifier.size());
27  calculateRoles(Identifier,
28  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
29 
30  std::string LowercaseIdentifier = Identifier.lower();
31 
32  // For each character, store indices of the characters to which fuzzy matching
33  // algorithm can jump. There are 3 possible variants:
34  //
35  // * Next Tail - next character from the same segment
36  // * Next Head - front character of the next segment
37  //
38  // Next stores tuples of three indices in the presented order, if a variant is
39  // not available then 0 is stored.
40  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
41  unsigned NextTail = 0, NextHead = 0;
42  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
43  Next[I] = {{NextTail, NextHead}};
44  NextTail = Roles[I] == Tail ? I : 0;
45  if (Roles[I] == Head) {
46  NextHead = I;
47  }
48  }
49 
50  llvm::DenseSet<Token> UniqueTrigrams;
51 
52  auto Add = [&](std::string Chars) {
53  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
54  };
55 
56  // Iterate through valid sequneces of three characters Fuzzy Matcher can
57  // process.
58  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
59  // Skip delimiters.
60  if (Roles[I] != Head && Roles[I] != Tail)
61  continue;
62  for (const unsigned J : Next[I]) {
63  if (J == 0)
64  continue;
65  for (const unsigned K : Next[J]) {
66  if (K == 0)
67  continue;
68  Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
69  LowercaseIdentifier[K]}});
70  }
71  }
72  }
73  // Emit short-query trigrams: FooBar -> f, fo, fb.
74  if (!LowercaseIdentifier.empty())
75  Add({LowercaseIdentifier[0]});
76  if (LowercaseIdentifier.size() >= 2)
77  Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
78  for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
79  if (Roles[I] == Head) {
80  Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
81  break;
82  }
83 
84  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
85 }
86 
87 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
88  if (Query.empty())
89  return {};
90  std::string LowercaseQuery = Query.lower();
91  if (Query.size() < 3) // short-query trigrams only
92  return {Token(Token::Kind::Trigram, LowercaseQuery)};
93 
94  // Apply fuzzy matching text segmentation.
95  std::vector<CharRole> Roles(Query.size());
96  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
97 
98  llvm::DenseSet<Token> UniqueTrigrams;
99  std::string Chars;
100  for (unsigned I = 0; I < Query.size(); ++I) {
101  if (Roles[I] != Head && Roles[I] != Tail)
102  continue; // Skip delimiters.
103  Chars.push_back(LowercaseQuery[I]);
104  if (Chars.size() > 3)
105  Chars.erase(Chars.begin());
106  if (Chars.size() == 3)
107  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
108  }
109 
110  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
111 }
112 
113 } // namespace dex
114 } // namespace clangd
115 } // namespace clang
std::vector< Token > generateIdentifierTrigrams(llvm::StringRef Identifier)
Returns list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:24
Represents trigram used for fuzzy search of unqualified symbol names.
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:41
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:87
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:155
//===-- Representation.cpp - ClangDoc Representation -----------*- C++ -*-===//
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
Token objects represent a characteristic of a symbol, which can be used to perform efficient search...