clang-tools  12.0.0git
Trigram.cpp
Go to the documentation of this file.
1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Trigram.h"
10 #include "FuzzyMatch.h"
11 #include "Token.h"
12 #include "llvm/ADT/ArrayRef.h"
13 #include "llvm/ADT/DenseSet.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/StringExtras.h"
16 #include <cctype>
17 #include <queue>
18 #include <string>
19 
20 namespace clang {
21 namespace clangd {
22 namespace dex {
23 
24 // Produce trigrams (including duplicates) and pass them to Out().
25 template <typename Func>
26 static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
27  // Apply fuzzy matching text segmentation.
28  std::vector<CharRole> Roles(Identifier.size());
29  calculateRoles(Identifier,
30  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
31 
32  std::string LowercaseIdentifier = Identifier.lower();
33 
34  // For each character, store indices of the characters to which fuzzy matching
35  // algorithm can jump. There are 3 possible variants:
36  //
37  // * Next Tail - next character from the same segment
38  // * Next Head - front character of the next segment
39  //
40  // Next stores tuples of three indices in the presented order, if a variant is
41  // not available then 0 is stored.
42  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
43  unsigned NextTail = 0, NextHead = 0;
44  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
45  Next[I] = {{NextTail, NextHead}};
46  NextTail = Roles[I] == Tail ? I : 0;
47  if (Roles[I] == Head) {
48  NextHead = I;
49  }
50  }
51 
52  // Iterate through valid sequences of three characters Fuzzy Matcher can
53  // process.
54  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
55  // Skip delimiters.
56  if (Roles[I] != Head && Roles[I] != Tail)
57  continue;
58  for (const unsigned J : Next[I]) {
59  if (J == 0)
60  continue;
61  for (const unsigned K : Next[J]) {
62  if (K == 0)
63  continue;
64  Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
65  LowercaseIdentifier[K]));
66  }
67  }
68  }
69  // Emit short-query trigrams: FooBar -> f, fo, fb.
70  if (!LowercaseIdentifier.empty())
71  Out(Trigram(LowercaseIdentifier[0]));
72  if (LowercaseIdentifier.size() >= 2)
73  Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
74  for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
75  if (Roles[I] == Head) {
76  Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
77  break;
78  }
79 }
80 
81 void generateIdentifierTrigrams(llvm::StringRef Identifier,
82  std::vector<Trigram> &Result) {
83  // Empirically, scanning for duplicates is faster with fewer trigrams, and
84  // a hashtable is faster with more. This is a hot path, so dispatch based on
85  // expected number of trigrams. Longer identifiers produce more trigrams.
86  // The magic number was tuned by running IndexBenchmark.DexBuild.
87  constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
88  Result.clear();
89  if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
90  identifierTrigrams(Identifier, [&](Trigram T) {
91  if (!llvm::is_contained(Result, T))
92  Result.push_back(T);
93  });
94  } else {
95  identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
96  llvm::sort(Result);
97  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
98  }
99 }
100 
101 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
102  if (Query.empty())
103  return {};
104  std::string LowercaseQuery = Query.lower();
105  if (Query.size() < 3) // short-query trigrams only
106  return {Token(Token::Kind::Trigram, LowercaseQuery)};
107 
108  // Apply fuzzy matching text segmentation.
109  std::vector<CharRole> Roles(Query.size());
110  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
111 
112  llvm::DenseSet<Token> UniqueTrigrams;
113  std::string Chars;
114  for (unsigned I = 0; I < Query.size(); ++I) {
115  if (Roles[I] != Head && Roles[I] != Tail)
116  continue; // Skip delimiters.
117  Chars.push_back(LowercaseQuery[I]);
118  if (Chars.size() > 3)
119  Chars.erase(Chars.begin());
120  if (Chars.size() == 3)
121  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
122  }
123 
124  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
125 }
126 
127 } // namespace dex
128 } // namespace clangd
129 } // namespace clang
CompiledFragmentImpl & Out
Represents trigram used for fuzzy search of unqualified symbol names.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:101
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:81
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:154
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
Token objects represent a characteristic of a symbol, which can be used to perform efficient search...
static void identifierTrigrams(llvm::StringRef Identifier, Func Out)
Definition: Trigram.cpp:26