clang-tools  14.0.0git
FuzzyMatch.cpp
Go to the documentation of this file.
1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
10 // we consider the possible partial match states:
11 //
12 // u n i q u e _ p t r
13 // +---------------------
14 // |A . . . . . . . . . .
15 // u|
16 // |. . . . . . . . . . .
17 // _|
18 // |. . . . . . . O . . .
19 // p|
20 // |. . . . . . . . . . B
21 //
22 // Each dot represents some prefix of the pattern being matched against some
23 // prefix of the word.
24 // - A is the initial state: '' matched against ''
25 // - O is an intermediate state: 'u_' matched against 'unique_'
26 // - B is the target state: 'u_p' matched against 'unique_ptr'
27 //
28 // We aim to find the best path from A->B.
29 // - Moving right (consuming a word character)
30 // Always legal: not all word characters must match.
31 // - Moving diagonally (consuming both a word and pattern character)
32 // Legal if the characters match.
33 // - Moving down (consuming a pattern character) is never legal.
34 // Never legal: all pattern characters must match something.
35 // Characters are matched case-insensitively.
36 // The first pattern character may only match the start of a word segment.
37 //
38 // The scoring is based on heuristics:
39 // - when matching a character, apply a bonus or penalty depending on the
40 // match quality (does case match, do word segments align, etc)
41 // - when skipping a character, apply a penalty if it hurts the match
42 // (it starts a word segment, or splits the matched region, etc)
43 //
44 // These heuristics require the ability to "look backward" one character, to
45 // see whether it was matched or not. Therefore the dynamic-programming matrix
46 // has an extra dimension (last character matched).
47 // Each entry also has an additional flag indicating whether the last-but-one
48 // character matched, which is needed to trace back through the scoring table
49 // and reconstruct the match.
50 //
51 // We treat strings as byte-sequences, so only ASCII has first-class support.
52 //
53 // This algorithm was inspired by VS code's client-side filtering, and aims
54 // to be mostly-compatible.
55 //
56 //===----------------------------------------------------------------------===//
57 
58 #include "FuzzyMatch.h"
59 #include "llvm/ADT/Optional.h"
60 #include "llvm/Support/Format.h"
61 
62 namespace clang {
63 namespace clangd {
64 
65 constexpr int FuzzyMatcher::MaxPat;
66 constexpr int FuzzyMatcher::MaxWord;
67 
68 static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
69 // A "negative infinity" score that won't overflow.
70 // We use this to mark unreachable states and forbidden solutions.
71 // Score field is 15 bits wide, min value is -2^14, we use half of that.
72 static constexpr int AwfulScore = -(1 << 13);
73 static bool isAwful(int S) { return S < AwfulScore / 2; }
74 static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score.
75 
76 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
77  : PatN(std::min<int>(MaxPat, Pattern.size())),
78  ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
79  std::copy(Pattern.begin(), Pattern.begin() + PatN, Pat);
80  for (int I = 0; I < PatN; ++I)
81  LowPat[I] = lower(Pat[I]);
82  Scores[0][0][Miss] = {0, Miss};
83  Scores[0][0][Match] = {AwfulScore, Miss};
84  for (int P = 0; P <= PatN; ++P)
85  for (int W = 0; W < P; ++W)
86  for (Action A : {Miss, Match})
87  Scores[P][W][A] = {AwfulScore, Miss};
88  PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
89  llvm::makeMutableArrayRef(PatRole, PatN));
90 }
91 
92 llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
93  if (!(WordContainsPattern = init(Word)))
94  return llvm::None;
95  if (!PatN)
96  return 1;
97  buildGraph();
98  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
99  Scores[PatN][WordN][Match].Score);
100  if (isAwful(Best))
101  return llvm::None;
102  float Score =
103  ScoreScale * std::min(PerfectBonus * PatN, std::max<int>(0, Best));
104  // If the pattern is as long as the word, we have an exact string match,
105  // since every pattern character must match something.
106  if (WordN == PatN)
107  Score *= 2; // May not be perfect 2 if case differs in a significant way.
108  return Score;
109 }
110 
111 // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
112 // The top 6 bits of the char select the byte, the bottom 2 select the offset.
113 // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
114 constexpr static uint8_t CharTypes[] = {
115  0x00, 0x00, 0x00, 0x00, // Control characters
116  0x00, 0x00, 0x00, 0x00, // Control characters
117  0xff, 0xff, 0xff, 0xff, // Punctuation
118  0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
119  0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
120  0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
121  0x57, 0x55, 0x55, 0x55, // ` and a-o
122  0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
123  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
124  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // (probably UTF-8).
125  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
126  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
127 };
128 
129 // The Role can be determined from the Type of a character and its neighbors:
130 //
131 // Example | Chars | Type | Role
132 // ---------+--------------+-----
133 // F(o)oBar | Foo | Ull | Tail
134 // Foo(B)ar | oBa | lUl | Head
135 // (f)oo | ^fo | Ell | Head
136 // H(T)TP | HTT | UUU | Tail
137 //
138 // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
139 // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
140 // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
141 constexpr static uint8_t CharRoles[] = {
142  // clang-format off
143  // Curr= Empty Lower Upper Separ
144  /* Prev=Empty */ 0x00, 0xaa, 0xaa, 0xff, // At start, Lower|Upper->Head
145  /* Prev=Lower */ 0x00, 0x55, 0xaa, 0xff, // In word, Upper->Head;Lower->Tail
146  /* Prev=Upper */ 0x00, 0x55, 0x59, 0xff, // Ditto, but U(U)U->Tail
147  /* Prev=Separ */ 0x00, 0xaa, 0xaa, 0xff, // After separator, like at start
148  // clang-format on
149 };
150 
151 template <typename T> static T packedLookup(const uint8_t *Data, int I) {
152  return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
153 }
154 CharTypeSet calculateRoles(llvm::StringRef Text,
155  llvm::MutableArrayRef<CharRole> Roles) {
156  assert(Text.size() == Roles.size());
157  if (Text.size() == 0)
158  return 0;
159  CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
160  CharTypeSet TypeSet = 1 << Type;
161  // Types holds a sliding window of (Prev, Curr, Next) types.
162  // Initial value is (Empty, Empty, type of Text[0]).
163  int Types = Type;
164  // Rotate slides in the type of the next character.
165  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
166  for (unsigned I = 0; I < Text.size() - 1; ++I) {
167  // For each character, rotate in the next, and look up the role.
168  Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
169  TypeSet |= 1 << Type;
170  Rotate(Type);
171  Roles[I] = packedLookup<CharRole>(CharRoles, Types);
172  }
173  // For the last character, the "next character" is Empty.
174  Rotate(Empty);
175  Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
176  return TypeSet;
177 }
178 
179 // Sets up the data structures matching Word.
180 // Returns false if we can cheaply determine that no match is possible.
181 bool FuzzyMatcher::init(llvm::StringRef NewWord) {
182  WordN = std::min<int>(MaxWord, NewWord.size());
183  if (PatN > WordN)
184  return false;
185  std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
186  if (PatN == 0)
187  return true;
188  for (int I = 0; I < WordN; ++I)
189  LowWord[I] = lower(Word[I]);
190 
191  // Cheap subsequence check.
192  for (int W = 0, P = 0; P != PatN; ++W) {
193  if (W == WordN)
194  return false;
195  if (LowWord[W] == LowPat[P])
196  ++P;
197  }
198 
199  // FIXME: some words are hard to tokenize algorithmically.
200  // e.g. vsprintf is V S Print F, and should match [pri] but not [int].
201  // We could add a tokenization dictionary for common stdlib names.
202  WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
203  llvm::makeMutableArrayRef(WordRole, WordN));
204  return true;
205 }
206 
207 // The forwards pass finds the mappings of Pattern onto Word.
208 // Score = best score achieved matching Word[..W] against Pat[..P].
209 // Unlike other tables, indices range from 0 to N *inclusive*
210 // Matched = whether we chose to match Word[W] with Pat[P] or not.
211 //
212 // Points are mostly assigned to matched characters, with 1 being a good score
213 // and 3 being a great one. So we treat the score range as [0, 3 * PatN].
214 // This range is not strict: we can apply larger bonuses/penalties, or penalize
215 // non-matched characters.
216 void FuzzyMatcher::buildGraph() {
217  for (int W = 0; W < WordN; ++W) {
218  Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
219  Miss};
220  Scores[0][W + 1][Match] = {AwfulScore, Miss};
221  }
222  for (int P = 0; P < PatN; ++P) {
223  for (int W = P; W < WordN; ++W) {
224  auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
225 
226  auto MatchMissScore = PreMiss[Match].Score;
227  auto MissMissScore = PreMiss[Miss].Score;
228  if (P < PatN - 1) { // Skipping trailing characters is always free.
229  MatchMissScore -= skipPenalty(W, Match);
230  MissMissScore -= skipPenalty(W, Miss);
231  }
232  Score[Miss] = (MatchMissScore > MissMissScore)
233  ? ScoreInfo{MatchMissScore, Match}
234  : ScoreInfo{MissMissScore, Miss};
235 
236  auto &PreMatch = Scores[P][W];
237  auto MatchMatchScore =
238  allowMatch(P, W, Match)
239  ? PreMatch[Match].Score + matchBonus(P, W, Match)
240  : AwfulScore;
241  auto MissMatchScore = allowMatch(P, W, Miss)
242  ? PreMatch[Miss].Score + matchBonus(P, W, Miss)
243  : AwfulScore;
244  Score[Match] = (MatchMatchScore > MissMatchScore)
245  ? ScoreInfo{MatchMatchScore, Match}
246  : ScoreInfo{MissMatchScore, Miss};
247  }
248  }
249 }
250 
251 bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
252  if (LowPat[P] != LowWord[W])
253  return false;
254  // We require a "strong" match:
255  // - for the first pattern character. [foo] !~ "barefoot"
256  // - after a gap. [pat] !~ "patnther"
257  if (Last == Miss) {
258  // We're banning matches outright, so conservatively accept some other cases
259  // where our segmentation might be wrong:
260  // - allow matching B in ABCDef (but not in NDEBUG)
261  // - we'd like to accept print in sprintf, but too many false positives
262  if (WordRole[W] == Tail &&
263  (Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
264  return false;
265  }
266  return true;
267 }
268 
269 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
270  if (W == 0) // Skipping the first character.
271  return 3;
272  if (WordRole[W] == Head) // Skipping a segment.
273  return 1; // We want to keep this lower than a consecutive match bonus.
274  // Instead of penalizing non-consecutive matches, we give a bonus to a
275  // consecutive match in matchBonus. This produces a better score distribution
276  // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
277  return 0;
278 }
279 
280 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
281  assert(LowPat[P] == LowWord[W]);
282  int S = 1;
283  bool IsPatSingleCase =
284  (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
285  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
286  // Single-case patterns lack segmentation signals and we assume any character
287  // can be a head of a segment.
288  if (Pat[P] == Word[W] ||
289  (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
290  ++S;
291  // Bonus: a consecutive match. First character match also gets a bonus to
292  // ensure prefix final match score normalizes to 1.0.
293  if (W == 0 || Last == Match)
294  S += 2;
295  // Penalty: matching inside a segment (and previous char wasn't matched).
296  if (WordRole[W] == Tail && P && Last == Miss)
297  S -= 3;
298  // Penalty: a Head in the pattern matches in the middle of a word segment.
299  if (PatRole[P] == Head && WordRole[W] == Tail)
300  --S;
301  // Penalty: matching the first pattern character in the middle of a segment.
302  if (P == 0 && WordRole[W] == Tail)
303  S -= 4;
304  assert(S <= PerfectBonus);
305  return S;
306 }
307 
308 llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
309  llvm::SmallString<256> Result;
310  OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
311  << llvm::StringRef(Pat, PatN) << "] ===\n";
312  if (PatN == 0) {
313  OS << "Pattern is empty: perfect match.\n";
314  return Result = llvm::StringRef(Word, WordN);
315  }
316  if (WordN == 0) {
317  OS << "Word is empty: no match.\n";
318  return Result;
319  }
320  if (!WordContainsPattern) {
321  OS << "Substring check failed.\n";
322  return Result;
323  } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
324  Scores[PatN][WordN][Miss].Score))) {
325  OS << "Substring check passed, but all matches are forbidden\n";
326  }
327  if (!(PatTypeSet & 1 << Upper))
328  OS << "Lowercase query, so scoring ignores case\n";
329 
330  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
331  // The Score table has cumulative scores, subtracting along this path gives
332  // us the per-letter scores.
333  Action Last =
334  (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
335  ? Match
336  : Miss;
337  int S[MaxWord];
338  Action A[MaxWord];
339  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
340  A[W] = Last;
341  const auto &Cell = Scores[P + 1][W + 1][Last];
342  if (Last == Match)
343  --P;
344  const auto &Prev = Scores[P + 1][W][Cell.Prev];
345  S[W] = Cell.Score - Prev.Score;
346  Last = Cell.Prev;
347  }
348  for (int I = 0; I < WordN; ++I) {
349  if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
350  Result.push_back('[');
351  if (A[I] == Miss && I > 0 && A[I - 1] == Match)
352  Result.push_back(']');
353  Result.push_back(Word[I]);
354  }
355  if (A[WordN - 1] == Match)
356  Result.push_back(']');
357 
358  for (char C : llvm::StringRef(Word, WordN))
359  OS << " " << C << " ";
360  OS << "\n";
361  for (int I = 0, J = 0; I < WordN; I++)
362  OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
363  OS << "\n";
364  for (int I = 0; I < WordN; I++)
365  OS << llvm::format("%2d ", S[I]);
366  OS << "\n";
367 
368  OS << "\nSegmentation:";
369  OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
370  for (int I = 0; I < WordN; ++I)
371  OS << "?-+ "[static_cast<int>(WordRole[I])];
372  OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
373  for (int I = 0; I < PatN; ++I)
374  OS << "?-+ "[static_cast<int>(PatRole[I])];
375  OS << "\n";
376 
377  OS << "\nScoring table (last-Miss, last-Match):\n";
378  OS << " | ";
379  for (char C : llvm::StringRef(Word, WordN))
380  OS << " " << C << " ";
381  OS << "\n";
382  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
383  for (int I = 0; I <= PatN; ++I) {
384  for (Action A : {Miss, Match}) {
385  OS << ((I && A == Miss) ? Pat[I - 1] : ' ') << "|";
386  for (int J = 0; J <= WordN; ++J) {
387  if (!isAwful(Scores[I][J][A].Score))
388  OS << llvm::format("%3d%c", Scores[I][J][A].Score,
389  Scores[I][J][A].Prev == Match ? '*' : ' ');
390  else
391  OS << " ";
392  }
393  OS << "\n";
394  }
395  }
396 
397  return Result;
398 }
399 
400 } // namespace clangd
401 } // namespace clang
clang::clangd::Head
@ Head
Definition: FuzzyMatch.h:58
Type
NodeType Type
Definition: HTMLGenerator.cpp:73
clang::clangd::FuzzyMatcher::match
llvm::Optional< float > match(llvm::StringRef Word)
Definition: FuzzyMatch.cpp:92
clang::clangd::calculateRoles
CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef< CharRole > Roles)
Definition: FuzzyMatch.cpp:154
clang::clangd::CharTypeSet
unsigned char CharTypeSet
Definition: FuzzyMatch.h:49
Text
std::string Text
Definition: HTMLGenerator.cpp:80
clang::clangd::packedLookup
static T packedLookup(const uint8_t *Data, int I)
Definition: FuzzyMatch.cpp:151
clang::clangd::lower
static char lower(char C)
Definition: FuzzyMatch.cpp:68
clang::clangd::AwfulScore
static constexpr int AwfulScore
Definition: FuzzyMatch.cpp:72
Action
llvm::unique_function< void()> Action
Definition: TUScheduler.cpp:600
clang::clangd::Tail
@ Tail
Definition: FuzzyMatch.h:57
clang::clangd::FuzzyMatcher::FuzzyMatcher
FuzzyMatcher(llvm::StringRef Pattern)
Definition: FuzzyMatch.cpp:76
clang::clangd::FuzzyMatcher::dumpLast
llvm::SmallString< 256 > dumpLast(llvm::raw_ostream &) const
Definition: FuzzyMatch.cpp:308
ns1::ns2::A
@ A
Definition: CategoricalFeature.h:3
FuzzyMatch.h
clang::clangd::CharTypes
constexpr static uint8_t CharTypes[]
Definition: FuzzyMatch.cpp:114
clang::clangd::Upper
@ Upper
Definition: FuzzyMatch.h:44
clang::clangd::Lower
@ Lower
Definition: FuzzyMatch.h:43
clang::clangd::PerfectBonus
static constexpr int PerfectBonus
Definition: FuzzyMatch.cpp:74
Word
std::string Word
Definition: FuzzyMatchTests.cpp:40
clang::clangd::CharType
CharType
Definition: FuzzyMatch.h:41
clang::clangd::isAwful
static bool isAwful(int S)
Definition: FuzzyMatch.cpp:73
clang::clangd::Empty
@ Empty
Definition: FuzzyMatch.h:42
Score
llvm::Optional< float > Score
Definition: FuzzyMatchTests.cpp:48
C
const Criteria C
Definition: FunctionCognitiveComplexityCheck.cpp:93
clang
===– Representation.cpp - ClangDoc Representation --------—*- C++ -*-===//
Definition: ApplyReplacements.h:27
OS
llvm::raw_string_ostream OS
Definition: TraceTests.cpp:163
clang::clangd::CharRoles
constexpr static uint8_t CharRoles[]
Definition: FuzzyMatch.cpp:141