clang-tools  10.0.0svn
Trigram.cpp
Go to the documentation of this file.
1 //===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "Trigram.h"
10 #include "FuzzyMatch.h"
11 #include "Token.h"
12 #include "llvm/ADT/ArrayRef.h"
13 #include "llvm/ADT/DenseSet.h"
14 #include "llvm/ADT/StringExtras.h"
15 #include <cctype>
16 #include <queue>
17 #include <string>
18 
19 namespace clang {
20 namespace clangd {
21 namespace dex {
22 
23 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
24  // Apply fuzzy matching text segmentation.
25  std::vector<CharRole> Roles(Identifier.size());
26  calculateRoles(Identifier,
27  llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
28 
29  std::string LowercaseIdentifier = Identifier.lower();
30 
31  // For each character, store indices of the characters to which fuzzy matching
32  // algorithm can jump. There are 3 possible variants:
33  //
34  // * Next Tail - next character from the same segment
35  // * Next Head - front character of the next segment
36  //
37  // Next stores tuples of three indices in the presented order, if a variant is
38  // not available then 0 is stored.
39  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
40  unsigned NextTail = 0, NextHead = 0;
41  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
42  Next[I] = {{NextTail, NextHead}};
43  NextTail = Roles[I] == Tail ? I : 0;
44  if (Roles[I] == Head) {
45  NextHead = I;
46  }
47  }
48 
49  llvm::DenseSet<Token> UniqueTrigrams;
50 
51  auto Add = [&](std::string Chars) {
52  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
53  };
54 
55  // Iterate through valid sequneces of three characters Fuzzy Matcher can
56  // process.
57  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
58  // Skip delimiters.
59  if (Roles[I] != Head && Roles[I] != Tail)
60  continue;
61  for (const unsigned J : Next[I]) {
62  if (J == 0)
63  continue;
64  for (const unsigned K : Next[J]) {
65  if (K == 0)
66  continue;
67  Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
68  LowercaseIdentifier[K]}});
69  }
70  }
71  }
72  // Emit short-query trigrams: FooBar -> f, fo, fb.
73  if (!LowercaseIdentifier.empty())
74  Add({LowercaseIdentifier[0]});
75  if (LowercaseIdentifier.size() >= 2)
76  Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
77  for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
78  if (Roles[I] == Head) {
79  Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
80  break;
81  }
82 
83  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
84 }
85 
86 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
87  if (Query.empty())
88  return {};
89  std::string LowercaseQuery = Query.lower();
90  if (Query.size() < 3) // short-query trigrams only
91  return {Token(Token::Kind::Trigram, LowercaseQuery)};
92 
93  // Apply fuzzy matching text segmentation.
94  std::vector<CharRole> Roles(Query.size());
95  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
96 
97  llvm::DenseSet<Token> UniqueTrigrams;
98  std::string Chars;
99  for (unsigned I = 0; I < Query.size(); ++I) {
100  if (Roles[I] != Head && Roles[I] != Tail)
101  continue; // Skip delimiters.
102  Chars.push_back(LowercaseQuery[I]);
103  if (Chars.size() > 3)
104  Chars.erase(Chars.begin());
105  if (Chars.size() == 3)
106  UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
107  }
108 
109  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
110 }
111 
112 } // namespace dex
113 } // namespace clangd
114 } // namespace clang
std::vector< Token > generateIdentifierTrigrams(llvm::StringRef Identifier)
Returns list of unique fuzzy-search trigrams from unqualified symbol.
Definition: Trigram.cpp:23
Represents trigram used for fuzzy search of unqualified symbol names.
std::vector< Token > generateQueryTrigrams(llvm::StringRef Query)
Returns list of unique fuzzy-search trigrams given a query.
Definition: Trigram.cpp:86
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:154
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Trigrams are attributes of the symbol unqualified name used to effectively extract symbols which can ...
A Token represents an attribute of a symbol, such as a particular trigram present in the name (used f...
Definition: Token.h:40
Token objects represent a characteristic of a symbol, which can be used to perform efficient search...