clang-tools 19.0.0git
Trigram.cpp
Go to the documentation of this file.
1//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "Trigram.h"
10#include "FuzzyMatch.h"
11#include "index/dex/Token.h"
12#include "llvm/ADT/ArrayRef.h"
13#include "llvm/ADT/DenseSet.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/StringExtras.h"
17#include "llvm/ADT/StringRef.h"
18#include <cctype>
19#include <limits>
20#include <queue>
21#include <string>
22#include <utility>
23
24namespace clang {
25namespace clangd {
26namespace dex {
27
28// Produce trigrams (including duplicates) and pass them to Out().
29template <typename Func>
30static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
31 assert(!Identifier.empty());
32 // Apply fuzzy matching text segmentation.
33 llvm::SmallVector<CharRole> Roles(Identifier.size());
35 llvm::MutableArrayRef(Roles.data(), Identifier.size()));
36
37 std::string LowercaseIdentifier = Identifier.lower();
38
39 // For each character, store indices of the characters to which fuzzy matching
40 // algorithm can jump. There are 2 possible variants:
41 //
42 // * Next Tail - next character from the same segment
43 // * Next Head - front character of the next segment
44 //
45 // Next stores tuples of three indices in the presented order, if a variant is
46 // not available then 0 is stored.
47 llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
48 LowercaseIdentifier.size());
49 unsigned NextTail = 0, NextHead = 0;
50 for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
51 Next[I] = {{NextTail, NextHead}};
52 NextTail = Roles[I] == Tail ? I : 0;
53 if (Roles[I] == Head) {
54 NextHead = I;
55 }
56 }
57
58 // Iterate through valid sequences of three characters Fuzzy Matcher can
59 // process.
60 for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
61 // Skip delimiters.
62 if (Roles[I] != Head && Roles[I] != Tail)
63 continue;
64 for (unsigned J : Next[I]) {
65 if (J == 0)
66 continue;
67 for (unsigned K : Next[J]) {
68 if (K == 0)
69 continue;
70 Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
71 LowercaseIdentifier[K]));
72 }
73 }
74 }
75 // Short queries semantics are different. When the user dosn't type enough
76 // symbols to form trigrams, we still want to serve meaningful results. To
77 // achieve that, we form incomplete trigrams (bi- and unigrams) for the
78 // identifiers and also generate short trigrams on the query side from what
79 // is available. We allow a small number of short trigram types in order to
80 // prevent excessive memory usage and increase the quality of the search.
81 // Only the first few symbols are allowed to be used in incomplete trigrams.
82 //
83 // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
84 // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
85 for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
86 // The first symbol might be a separator, so the loop condition should be
87 // stopping as soon as there is no next head or we have seen two heads.
88 if (Roles[Position] == Head)
89 ++HeadsSeen;
90 Out(Trigram(LowercaseIdentifier[Position]));
91 for (unsigned I : Next[Position])
92 if (I != 0)
93 Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
94 Position = Next[Position][1];
95 if (Position == 0)
96 break;
97 }
98}
99
101 std::vector<Trigram> &Result) {
102 // Empirically, scanning for duplicates is faster with fewer trigrams, and
103 // a hashtable is faster with more. This is a hot path, so dispatch based on
104 // expected number of trigrams. Longer identifiers produce more trigrams.
105 // The magic number was tuned by running IndexBenchmark.DexBuild.
106 constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
107 Result.clear();
108 if (Identifier.empty())
109 return;
110
111 if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
113 if (!llvm::is_contained(Result, T))
114 Result.push_back(T);
115 });
116 } else {
117 identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
118 llvm::sort(Result);
119 Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
120 }
121}
122
123std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
124 if (Query.empty())
125 return {};
126
127 // Apply fuzzy matching text segmentation.
128 llvm::SmallVector<CharRole> Roles(Query.size());
129 calculateRoles(Query, llvm::MutableArrayRef(Roles.data(), Query.size()));
130
131 std::string LowercaseQuery = Query.lower();
132
133 llvm::DenseSet<Token> UniqueTrigrams;
134 std::string Chars;
135 for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
136 if (Roles[I] != Head && Roles[I] != Tail)
137 continue; // Skip delimiters.
138 Chars.push_back(LowercaseQuery[I]);
139 if (Chars.size() > 3)
140 Chars.erase(Chars.begin());
141 if (Chars.size() == 3)
142 UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
143 }
144
145 // For queries with very few letters, generateIdentifierTrigrams emulates
146 // outputs of this function to match the semantics.
147 if (UniqueTrigrams.empty()) {
148 // If bigram can't be formed out of letters/numbers, we prepend separator.
149 std::string Result(1, LowercaseQuery.front());
150 for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
151 if (Roles[I] == Head || Roles[I] == Tail)
152 Result += LowercaseQuery[I];
153 UniqueTrigrams.insert(
154 Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
155 }
156
157 return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
158}
159
160} // namespace dex
161} // namespace clangd
162} // namespace clang
CompiledFragmentImpl & Out
Kind K
Definition: Rename.cpp:474
Token objects represent a characteristic of a symbol, which can be used to perform efficient search.
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:39
@ Trigram
Represents trigram used for fuzzy search of unqualified symbol names.
static void identifierTrigrams(llvm::StringRef Identifier, Func Out)
Definition: Trigram.cpp:30
void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector< Trigram > &Result)
Produces list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:100
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:123
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:154
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//