clang-tools  15.0.0git
Trigram.cpp
Go to the documentation of this file.
1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Trigram.h"
10 #include "FuzzyMatch.h"
11 #include "index/dex/Token.h"
12 #include "llvm/ADT/ArrayRef.h"
13 #include "llvm/ADT/DenseSet.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include <cctype>
19 #include <limits>
20 #include <queue>
21 #include <string>
22 #include <utility>
23 
24 namespace clang {
25 namespace clangd {
26 namespace dex {
27 
28 // Produce trigrams (including duplicates) and pass them to Out().
29 template <typename Func>
30 static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
31  assert(!Identifier.empty());
32  // Apply fuzzy matching text segmentation.
33  llvm::SmallVector<CharRole> Roles(Identifier.size());
35  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
36 
37  std::string LowercaseIdentifier = Identifier.lower();
38 
39  // For each character, store indices of the characters to which fuzzy matching
40  // algorithm can jump. There are 2 possible variants:
41  //
42  // * Next Tail - next character from the same segment
43  // * Next Head - front character of the next segment
44  //
45  // Next stores tuples of three indices in the presented order, if a variant is
46  // not available then 0 is stored.
47  llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
48  LowercaseIdentifier.size());
49  unsigned NextTail = 0, NextHead = 0;
50  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
51  Next[I] = {{NextTail, NextHead}};
52  NextTail = Roles[I] == Tail ? I : 0;
53  if (Roles[I] == Head) {
54  NextHead = I;
55  }
56  }
57 
58  // Iterate through valid sequences of three characters Fuzzy Matcher can
59  // process.
60  for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
61  // Skip delimiters.
62  if (Roles[I] != Head && Roles[I] != Tail)
63  continue;
64  for (unsigned J : Next[I]) {
65  if (J == 0)
66  continue;
67  for (unsigned K : Next[J]) {
68  if (K == 0)
69  continue;
70  Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
71  LowercaseIdentifier[K]));
72  }
73  }
74  }
75  // Short queries semantics are different. When the user dosn't type enough
76  // symbols to form trigrams, we still want to serve meaningful results. To
77  // achieve that, we form incomplete trigrams (bi- and unigrams) for the
78  // identifiers and also generate short trigrams on the query side from what
79  // is available. We allow a small number of short trigram types in order to
80  // prevent excessive memory usage and increase the quality of the search.
81  // Only the first few symbols are allowed to be used in incomplete trigrams.
82  //
83  // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
84  // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
85  for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
86  // The first symbol might be a separator, so the loop condition should be
87  // stopping as soon as there is no next head or we have seen two heads.
88  if (Roles[Position] == Head)
89  ++HeadsSeen;
90  Out(Trigram(LowercaseIdentifier[Position]));
91  for (unsigned I : Next[Position])
92  if (I != 0)
93  Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
94  Position = Next[Position][1];
95  if (Position == 0)
96  break;
97  }
98 }
99 
100 void generateIdentifierTrigrams(llvm::StringRef Identifier,
101  std::vector<Trigram> &Result) {
102  // Empirically, scanning for duplicates is faster with fewer trigrams, and
103  // a hashtable is faster with more. This is a hot path, so dispatch based on
104  // expected number of trigrams. Longer identifiers produce more trigrams.
105  // The magic number was tuned by running IndexBenchmark.DexBuild.
106  constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
107  Result.clear();
108  if (Identifier.empty())
109  return;
110 
111  if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
113  if (!llvm::is_contained(Result, T))
114  Result.push_back(T);
115  });
116  } else {
117  identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
118  llvm::sort(Result);
119  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
120  }
121 }
122 
123 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
124  if (Query.empty())
125  return {};
126 
127  // Apply fuzzy matching text segmentation.
128  llvm::SmallVector<CharRole> Roles(Query.size());
129  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
130 
131  std::string LowercaseQuery = Query.lower();
132 
133  llvm::DenseSet<Token> UniqueTrigrams;
134  std::string Chars;
135  for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
136  if (Roles[I] != Head && Roles[I] != Tail)
137  continue; // Skip delimiters.
138  Chars.push_back(LowercaseQuery[I]);
139  if (Chars.size() > 3)
140  Chars.erase(Chars.begin());
141  if (Chars.size() == 3)
142  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
143  }
144 
145  // For queries with very few letters, generateIdentifierTrigrams emulates
146  // outputs of this function to match the semantics.
147  if (UniqueTrigrams.empty()) {
148  // If bigram can't be formed out of letters/numbers, we prepend separator.
149  std::string Result(1, LowercaseQuery.front());
150  for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
151  if (Roles[I] == Head || Roles[I] == Tail)
152  Result += LowercaseQuery[I];
153  UniqueTrigrams.insert(
154  Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
155  }
156 
157  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
158 }
159 
160 } // namespace dex
161 } // namespace clangd
162 } // namespace clang
clang::clangd::Head
@ Head
Definition: FuzzyMatch.h:58
Trigram.h
clang::clangd::calculateRoles
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:154
clang::clangd::dex::Trigram
Definition: Trigram.h:38
clang::clangd::Tail
@ Tail
Definition: FuzzyMatch.h:57
clang::clangd::dex::generateQueryTrigrams
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:123
clang::clangd::Position
Definition: Protocol.h:153
FuzzyMatch.h
clang::clangd::dex::generateIdentifierTrigrams
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:100
clang::clangd::dex::Token
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:38
clang
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Definition: ApplyReplacements.h:27
Out
CompiledFragmentImpl & Out
Definition: ConfigCompile.cpp:99
Token.h
clang::clangd::SymbolOrigin::Identifier
@ Identifier
clang::clangd::dex::identifierTrigrams
static void identifierTrigrams(llvm::StringRef Identifier, Func Out)
Definition: Trigram.cpp:30
K
Kind K
Definition: Rename.cpp:436
clang::clangd::dex::Token::Kind::Trigram
@ Trigram
Represents trigram used for fuzzy search of unqualified symbol names.