clang 19.0.0git
InterpolatingCompilationDatabase.cpp
Go to the documentation of this file.
1//===- InterpolatingCompilationDatabase.cpp ---------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// InterpolatingCompilationDatabase wraps another CompilationDatabase and
10// attempts to heuristically determine appropriate compile commands for files
11// that are not included, such as headers or newly created files.
12//
13// Motivating cases include:
14// Header files that live next to their implementation files. These typically
15// share a base filename. (libclang/CXString.h, libclang/CXString.cpp).
16// Some projects separate headers from includes. Filenames still typically
17// match, maybe other path segments too. (include/llvm/IR/Use.h, lib/IR/Use.cc).
18// Matches are sometimes only approximate (Sema.h, SemaDecl.cpp). This goes
19// for directories too (Support/Unix/Process.inc, lib/Support/Process.cpp).
20// Even if we can't find a "right" compile command, even a random one from
21// the project will tend to get important flags like -I and -x right.
22//
23// We "borrow" the compile command for the closest available file:
24// - points are awarded if the filename matches (ignoring extension)
25// - points are awarded if the directory structure matches
26// - ties are broken by length of path prefix match
27//
28// The compile command is adjusted, replacing the filename and removing output
29// file arguments. The -x and -std flags may be affected too.
30//
31// Source language is a tricky issue: is it OK to use a .c file's command
32// for building a .cc file? What language is a .h file in?
33// - We only consider compile commands for c-family languages as candidates.
34// - For files whose language is implied by the filename (e.g. .m, .hpp)
35// we prefer candidates from the same language.
36// If we must cross languages, we drop any -x and -std flags.
37// - For .h files, candidates from any c-family language are acceptable.
38// We use the candidate's language, inserting e.g. -x c++-header.
39//
40// This class is only useful when wrapping databases that can enumerate all
41// their compile commands. If getAllFilenames() is empty, no inference occurs.
42//
43//===----------------------------------------------------------------------===//
44
46#include "clang/Driver/Driver.h"
48#include "clang/Driver/Types.h"
50#include "llvm/ADT/ArrayRef.h"
51#include "llvm/ADT/DenseMap.h"
52#include "llvm/ADT/StringExtras.h"
53#include "llvm/Option/ArgList.h"
54#include "llvm/Option/OptTable.h"
55#include "llvm/Support/Debug.h"
56#include "llvm/Support/Path.h"
57#include "llvm/Support/StringSaver.h"
58#include "llvm/Support/raw_ostream.h"
59#include <memory>
60#include <optional>
61
62namespace clang {
63namespace tooling {
64namespace {
65using namespace llvm;
66namespace types = clang::driver::types;
67namespace path = llvm::sys::path;
68
69// The length of the prefix these two strings have in common.
70size_t matchingPrefix(StringRef L, StringRef R) {
71 size_t Limit = std::min(L.size(), R.size());
72 for (size_t I = 0; I < Limit; ++I)
73 if (L[I] != R[I])
74 return I;
75 return Limit;
76}
77
78// A comparator for searching SubstringWithIndexes with std::equal_range etc.
79// Optionaly prefix semantics: compares equal if the key is a prefix.
80template <bool Prefix> struct Less {
81 bool operator()(StringRef Key, std::pair<StringRef, size_t> Value) const {
82 StringRef V = Prefix ? Value.first.substr(0, Key.size()) : Value.first;
83 return Key < V;
84 }
85 bool operator()(std::pair<StringRef, size_t> Value, StringRef Key) const {
86 StringRef V = Prefix ? Value.first.substr(0, Key.size()) : Value.first;
87 return V < Key;
88 }
89};
90
91// Infer type from filename. If we might have gotten it wrong, set *Certain.
92// *.h will be inferred as a C header, but not certain.
93types::ID guessType(StringRef Filename, bool *Certain = nullptr) {
94 // path::extension is ".cpp", lookupTypeForExtension wants "cpp".
95 auto Lang =
96 types::lookupTypeForExtension(path::extension(Filename).substr(1));
97 if (Certain)
98 *Certain = Lang != types::TY_CHeader && Lang != types::TY_INVALID;
99 return Lang;
100}
101
102// Return Lang as one of the canonical supported types.
103// e.g. c-header --> c; fortran --> TY_INVALID
104static types::ID foldType(types::ID Lang) {
105 switch (Lang) {
106 case types::TY_C:
107 case types::TY_CHeader:
108 return types::TY_C;
109 case types::TY_ObjC:
110 case types::TY_ObjCHeader:
111 return types::TY_ObjC;
112 case types::TY_CXX:
113 case types::TY_CXXHeader:
114 return types::TY_CXX;
115 case types::TY_ObjCXX:
116 case types::TY_ObjCXXHeader:
117 return types::TY_ObjCXX;
118 case types::TY_CUDA:
119 case types::TY_CUDA_DEVICE:
120 return types::TY_CUDA;
121 default:
122 return types::TY_INVALID;
123 }
124}
125
126// A CompileCommand that can be applied to another file.
127struct TransferableCommand {
128 // Flags that should not apply to all files are stripped from CommandLine.
129 CompileCommand Cmd;
130 // Language detected from -x or the filename. Never TY_INVALID.
131 std::optional<types::ID> Type;
132 // Standard specified by -std.
134 // Whether the command line is for the cl-compatible driver.
136
137 TransferableCommand(CompileCommand C)
138 : Cmd(std::move(C)), Type(guessType(Cmd.Filename)) {
139 std::vector<std::string> OldArgs = std::move(Cmd.CommandLine);
140 Cmd.CommandLine.clear();
141
142 // Wrap the old arguments in an InputArgList.
143 llvm::opt::InputArgList ArgList;
144 {
146 for (const std::string &S : OldArgs)
147 TmpArgv.push_back(S.c_str());
148 ClangCLMode = !TmpArgv.empty() &&
150 TmpArgv.front(), llvm::ArrayRef(TmpArgv).slice(1)));
151 ArgList = {TmpArgv.begin(), TmpArgv.end()};
152 }
153
154 // Parse the old args in order to strip out and record unwanted flags.
155 // We parse each argument individually so that we can retain the exact
156 // spelling of each argument; re-rendering is lossy for aliased flags.
157 // E.g. in CL mode, /W4 maps to -Wall.
158 auto &OptTable = clang::driver::getDriverOptTable();
159 if (!OldArgs.empty())
160 Cmd.CommandLine.emplace_back(OldArgs.front());
161 for (unsigned Pos = 1; Pos < OldArgs.size();) {
162 using namespace driver::options;
163
164 const unsigned OldPos = Pos;
165 std::unique_ptr<llvm::opt::Arg> Arg(OptTable.ParseOneArg(
166 ArgList, Pos,
167 llvm::opt::Visibility(ClangCLMode ? CLOption : ClangOption)));
168
169 if (!Arg)
170 continue;
171
172 const llvm::opt::Option &Opt = Arg->getOption();
173
174 // Strip input and output files.
175 if (Opt.matches(OPT_INPUT) || Opt.matches(OPT_o) ||
176 (ClangCLMode && (Opt.matches(OPT__SLASH_Fa) ||
177 Opt.matches(OPT__SLASH_Fe) ||
178 Opt.matches(OPT__SLASH_Fi) ||
179 Opt.matches(OPT__SLASH_Fo))))
180 continue;
181
182 // ...including when the inputs are passed after --.
183 if (Opt.matches(OPT__DASH_DASH))
184 break;
185
186 // Strip -x, but record the overridden language.
187 if (const auto GivenType = tryParseTypeArg(*Arg)) {
188 Type = *GivenType;
189 continue;
190 }
191
192 // Strip -std, but record the value.
193 if (const auto GivenStd = tryParseStdArg(*Arg)) {
194 if (*GivenStd != LangStandard::lang_unspecified)
195 Std = *GivenStd;
196 continue;
197 }
198
199 Cmd.CommandLine.insert(Cmd.CommandLine.end(),
200 OldArgs.data() + OldPos, OldArgs.data() + Pos);
201 }
202
203 // Make use of -std iff -x was missing.
205 Type = toType(LangStandard::getLangStandardForKind(Std).getLanguage());
206 Type = foldType(*Type);
207 // The contract is to store None instead of TY_INVALID.
208 if (Type == types::TY_INVALID)
209 Type = std::nullopt;
210 }
211
212 // Produce a CompileCommand for \p filename, based on this one.
213 // (This consumes the TransferableCommand just to avoid copying Cmd).
214 CompileCommand transferTo(StringRef Filename) && {
215 CompileCommand Result = std::move(Cmd);
216 Result.Heuristic = "inferred from " + Result.Filename;
217 Result.Filename = std::string(Filename);
218 bool TypeCertain;
219 auto TargetType = guessType(Filename, &TypeCertain);
220 // If the filename doesn't determine the language (.h), transfer with -x.
221 if ((!TargetType || !TypeCertain) && Type) {
222 // Use *Type, or its header variant if the file is a header.
223 // Treat no/invalid extension as header (e.g. C++ standard library).
224 TargetType =
225 (!TargetType || types::onlyPrecompileType(TargetType)) // header?
227 : *Type;
228 if (ClangCLMode) {
229 const StringRef Flag = toCLFlag(TargetType);
230 if (!Flag.empty())
231 Result.CommandLine.push_back(std::string(Flag));
232 } else {
233 Result.CommandLine.push_back("-x");
234 Result.CommandLine.push_back(types::getTypeName(TargetType));
235 }
236 }
237 // --std flag may only be transferred if the language is the same.
238 // We may consider "translating" these, e.g. c++11 -> c11.
239 if (Std != LangStandard::lang_unspecified && foldType(TargetType) == Type) {
240 Result.CommandLine.emplace_back((
241 llvm::Twine(ClangCLMode ? "/std:" : "-std=") +
243 }
244 Result.CommandLine.push_back("--");
245 Result.CommandLine.push_back(std::string(Filename));
246 return Result;
247 }
248
249private:
250 // Map the language from the --std flag to that of the -x flag.
251 static types::ID toType(Language Lang) {
252 switch (Lang) {
253 case Language::C:
254 return types::TY_C;
255 case Language::CXX:
256 return types::TY_CXX;
257 case Language::ObjC:
258 return types::TY_ObjC;
259 case Language::ObjCXX:
260 return types::TY_ObjCXX;
261 default:
262 return types::TY_INVALID;
263 }
264 }
265
266 // Convert a file type to the matching CL-style type flag.
267 static StringRef toCLFlag(types::ID Type) {
268 switch (Type) {
269 case types::TY_C:
270 case types::TY_CHeader:
271 return "/TC";
272 case types::TY_CXX:
273 case types::TY_CXXHeader:
274 return "/TP";
275 default:
276 return StringRef();
277 }
278 }
279
280 // Try to interpret the argument as a type specifier, e.g. '-x'.
281 std::optional<types::ID> tryParseTypeArg(const llvm::opt::Arg &Arg) {
282 const llvm::opt::Option &Opt = Arg.getOption();
283 using namespace driver::options;
284 if (ClangCLMode) {
285 if (Opt.matches(OPT__SLASH_TC) || Opt.matches(OPT__SLASH_Tc))
286 return types::TY_C;
287 if (Opt.matches(OPT__SLASH_TP) || Opt.matches(OPT__SLASH_Tp))
288 return types::TY_CXX;
289 } else {
290 if (Opt.matches(driver::options::OPT_x))
291 return types::lookupTypeForTypeSpecifier(Arg.getValue());
292 }
293 return std::nullopt;
294 }
295
296 // Try to interpret the argument as '-std='.
297 std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
298 using namespace driver::options;
299 if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ))
300 return LangStandard::getLangKind(Arg.getValue());
301 return std::nullopt;
302 }
303};
304
305// Given a filename, FileIndex picks the best matching file from the underlying
306// DB. This is the proxy file whose CompileCommand will be reused. The
307// heuristics incorporate file name, extension, and directory structure.
308// Strategy:
309// - Build indexes of each of the substrings we want to look up by.
310// These indexes are just sorted lists of the substrings.
311// - Each criterion corresponds to a range lookup into the index, so we only
312// need O(log N) string comparisons to determine scores.
313//
314// Apart from path proximity signals, also takes file extensions into account
315// when scoring the candidates.
316class FileIndex {
317public:
318 FileIndex(std::vector<std::string> Files)
319 : OriginalPaths(std::move(Files)), Strings(Arena) {
320 // Sort commands by filename for determinism (index is a tiebreaker later).
321 llvm::sort(OriginalPaths);
322 Paths.reserve(OriginalPaths.size());
323 Types.reserve(OriginalPaths.size());
324 Stems.reserve(OriginalPaths.size());
325 for (size_t I = 0; I < OriginalPaths.size(); ++I) {
326 StringRef Path = Strings.save(StringRef(OriginalPaths[I]).lower());
327
328 Paths.emplace_back(Path, I);
329 Types.push_back(foldType(guessType(OriginalPaths[I])));
330 Stems.emplace_back(sys::path::stem(Path), I);
331 auto Dir = ++sys::path::rbegin(Path), DirEnd = sys::path::rend(Path);
332 for (int J = 0; J < DirectorySegmentsIndexed && Dir != DirEnd; ++J, ++Dir)
333 if (Dir->size() > ShortDirectorySegment) // not trivial ones
334 Components.emplace_back(*Dir, I);
335 }
336 llvm::sort(Paths);
337 llvm::sort(Stems);
338 llvm::sort(Components);
339 }
340
341 bool empty() const { return Paths.empty(); }
342
343 // Returns the path for the file that best fits OriginalFilename.
344 // Candidates with extensions matching PreferLanguage will be chosen over
345 // others (unless it's TY_INVALID, or all candidates are bad).
346 StringRef chooseProxy(StringRef OriginalFilename,
347 types::ID PreferLanguage) const {
348 assert(!empty() && "need at least one candidate!");
349 std::string Filename = OriginalFilename.lower();
350 auto Candidates = scoreCandidates(Filename);
351 std::pair<size_t, int> Best =
352 pickWinner(Candidates, Filename, PreferLanguage);
353
354 DEBUG_WITH_TYPE(
355 "interpolate",
356 llvm::dbgs() << "interpolate: chose " << OriginalPaths[Best.first]
357 << " as proxy for " << OriginalFilename << " preferring "
358 << (PreferLanguage == types::TY_INVALID
359 ? "none"
360 : types::getTypeName(PreferLanguage))
361 << " score=" << Best.second << "\n");
362 return OriginalPaths[Best.first];
363 }
364
365private:
366 using SubstringAndIndex = std::pair<StringRef, size_t>;
367 // Directory matching parameters: we look at the last two segments of the
368 // parent directory (usually the semantically significant ones in practice).
369 // We search only the last four of each candidate (for efficiency).
370 constexpr static int DirectorySegmentsIndexed = 4;
371 constexpr static int DirectorySegmentsQueried = 2;
372 constexpr static int ShortDirectorySegment = 1; // Only look at longer names.
373
374 // Award points to candidate entries that should be considered for the file.
375 // Returned keys are indexes into paths, and the values are (nonzero) scores.
376 DenseMap<size_t, int> scoreCandidates(StringRef Filename) const {
377 // Decompose Filename into the parts we care about.
378 // /some/path/complicated/project/Interesting.h
379 // [-prefix--][---dir---] [-dir-] [--stem---]
380 StringRef Stem = sys::path::stem(Filename);
382 llvm::StringRef Prefix;
383 auto Dir = ++sys::path::rbegin(Filename),
384 DirEnd = sys::path::rend(Filename);
385 for (int I = 0; I < DirectorySegmentsQueried && Dir != DirEnd; ++I, ++Dir) {
386 if (Dir->size() > ShortDirectorySegment)
387 Dirs.push_back(*Dir);
388 Prefix = Filename.substr(0, Dir - DirEnd);
389 }
390
391 // Now award points based on lookups into our various indexes.
392 DenseMap<size_t, int> Candidates; // Index -> score.
393 auto Award = [&](int Points, ArrayRef<SubstringAndIndex> Range) {
394 for (const auto &Entry : Range)
395 Candidates[Entry.second] += Points;
396 };
397 // Award one point if the file's basename is a prefix of the candidate,
398 // and another if it's an exact match (so exact matches get two points).
399 Award(1, indexLookup</*Prefix=*/true>(Stem, Stems));
400 Award(1, indexLookup</*Prefix=*/false>(Stem, Stems));
401 // For each of the last few directories in the Filename, award a point
402 // if it's present in the candidate.
403 for (StringRef Dir : Dirs)
404 Award(1, indexLookup</*Prefix=*/false>(Dir, Components));
405 // Award one more point if the whole rest of the path matches.
406 if (sys::path::root_directory(Prefix) != Prefix)
407 Award(1, indexLookup</*Prefix=*/true>(Prefix, Paths));
408 return Candidates;
409 }
410
411 // Pick a single winner from the set of scored candidates.
412 // Returns (index, score).
413 std::pair<size_t, int> pickWinner(const DenseMap<size_t, int> &Candidates,
414 StringRef Filename,
415 types::ID PreferredLanguage) const {
416 struct ScoredCandidate {
417 size_t Index;
418 bool Preferred;
419 int Points;
420 size_t PrefixLength;
421 };
422 // Choose the best candidate by (preferred, points, prefix length, alpha).
423 ScoredCandidate Best = {size_t(-1), false, 0, 0};
424 for (const auto &Candidate : Candidates) {
425 ScoredCandidate S;
426 S.Index = Candidate.first;
427 S.Preferred = PreferredLanguage == types::TY_INVALID ||
428 PreferredLanguage == Types[S.Index];
429 S.Points = Candidate.second;
430 if (!S.Preferred && Best.Preferred)
431 continue;
432 if (S.Preferred == Best.Preferred) {
433 if (S.Points < Best.Points)
434 continue;
435 if (S.Points == Best.Points) {
436 S.PrefixLength = matchingPrefix(Filename, Paths[S.Index].first);
437 if (S.PrefixLength < Best.PrefixLength)
438 continue;
439 // hidden heuristics should at least be deterministic!
440 if (S.PrefixLength == Best.PrefixLength)
441 if (S.Index > Best.Index)
442 continue;
443 }
444 }
445 // PrefixLength was only set above if actually needed for a tiebreak.
446 // But it definitely needs to be set to break ties in the future.
447 S.PrefixLength = matchingPrefix(Filename, Paths[S.Index].first);
448 Best = S;
449 }
450 // Edge case: no candidate got any points.
451 // We ignore PreferredLanguage at this point (not ideal).
452 if (Best.Index == size_t(-1))
453 return {longestMatch(Filename, Paths).second, 0};
454 return {Best.Index, Best.Points};
455 }
456
457 // Returns the range within a sorted index that compares equal to Key.
458 // If Prefix is true, it's instead the range starting with Key.
459 template <bool Prefix>
461 indexLookup(StringRef Key, ArrayRef<SubstringAndIndex> Idx) const {
462 // Use pointers as iteratiors to ease conversion of result to ArrayRef.
463 auto Range = std::equal_range(Idx.data(), Idx.data() + Idx.size(), Key,
464 Less<Prefix>());
465 return {Range.first, Range.second};
466 }
467
468 // Performs a point lookup into a nonempty index, returning a longest match.
469 SubstringAndIndex longestMatch(StringRef Key,
470 ArrayRef<SubstringAndIndex> Idx) const {
471 assert(!Idx.empty());
472 // Longest substring match will be adjacent to a direct lookup.
473 auto It = llvm::lower_bound(Idx, SubstringAndIndex{Key, 0});
474 if (It == Idx.begin())
475 return *It;
476 if (It == Idx.end())
477 return *--It;
478 // Have to choose between It and It-1
479 size_t Prefix = matchingPrefix(Key, It->first);
480 size_t PrevPrefix = matchingPrefix(Key, (It - 1)->first);
481 return Prefix > PrevPrefix ? *It : *--It;
482 }
483
484 // Original paths, everything else is in lowercase.
485 std::vector<std::string> OriginalPaths;
486 BumpPtrAllocator Arena;
487 StringSaver Strings;
488 // Indexes of candidates by certain substrings.
489 // String is lowercase and sorted, index points into OriginalPaths.
490 std::vector<SubstringAndIndex> Paths; // Full path.
491 // Lang types obtained by guessing on the corresponding path. I-th element is
492 // a type for the I-th path.
493 std::vector<types::ID> Types;
494 std::vector<SubstringAndIndex> Stems; // Basename, without extension.
495 std::vector<SubstringAndIndex> Components; // Last path components.
496};
497
498// The actual CompilationDatabase wrapper delegates to its inner database.
499// If no match, looks up a proxy file in FileIndex and transfers its
500// command to the requested file.
501class InterpolatingCompilationDatabase : public CompilationDatabase {
502public:
503 InterpolatingCompilationDatabase(std::unique_ptr<CompilationDatabase> Inner)
504 : Inner(std::move(Inner)), Index(this->Inner->getAllFiles()) {}
505
506 std::vector<CompileCommand>
507 getCompileCommands(StringRef Filename) const override {
508 auto Known = Inner->getCompileCommands(Filename);
509 if (Index.empty() || !Known.empty())
510 return Known;
511 bool TypeCertain;
512 auto Lang = guessType(Filename, &TypeCertain);
513 if (!TypeCertain)
515 auto ProxyCommands =
516 Inner->getCompileCommands(Index.chooseProxy(Filename, foldType(Lang)));
517 if (ProxyCommands.empty())
518 return {};
519 return {transferCompileCommand(std::move(ProxyCommands.front()), Filename)};
520 }
521
522 std::vector<std::string> getAllFiles() const override {
523 return Inner->getAllFiles();
524 }
525
526 std::vector<CompileCommand> getAllCompileCommands() const override {
527 return Inner->getAllCompileCommands();
528 }
529
530private:
531 std::unique_ptr<CompilationDatabase> Inner;
532 FileIndex Index;
533};
534
535} // namespace
536
537std::unique_ptr<CompilationDatabase>
538inferMissingCompileCommands(std::unique_ptr<CompilationDatabase> Inner) {
539 return std::make_unique<InterpolatingCompilationDatabase>(std::move(Inner));
540}
541
543 StringRef Filename) {
544 return TransferableCommand(std::move(Cmd)).transferTo(Filename);
545}
546
547} // namespace tooling
548} // namespace clang
#define V(N, I)
Definition: ASTContext.h:3266
MatchType Type
StringRef Filename
Definition: Format.cpp:2969
CompileCommand Cmd
LangStandard::Kind Std
static std::string getName(const CallEvent &Call)
__SIZE_TYPE__ size_t
ID lookupTypeForTypeSpecifier(const char *Name)
lookupTypeForTypSpecifier - Lookup the type to use for a user specified type name.
Definition: Types.cpp:371
bool onlyPrecompileType(ID Id)
onlyPrecompileType - Should this type only be precompiled.
Definition: Types.cpp:100
const char * getTypeName(ID Id)
getTypeName - Return the name of the type for Id.
Definition: Types.cpp:52
ID lookupHeaderTypeForSourceType(ID Id)
Lookup header file input type that corresponds to given source file type (used for clang-cl emulation...
Definition: Types.cpp:418
ID lookupTypeForExtension(llvm::StringRef Ext)
lookupTypeForExtension - Lookup the type to use for the file extension Ext.
Definition: Types.cpp:298
llvm::StringRef getDriverMode(StringRef ProgName, ArrayRef< const char * > Args)
Returns the driver mode option's value, i.e.
Definition: Driver.cpp:6666
const llvm::opt::OptTable & getDriverOptTable()
bool IsClangCL(StringRef DriverMode)
Checks whether the value produced by getDriverMode is for CL mode.
Definition: Driver.cpp:6681
tooling::CompileCommand transferCompileCommand(tooling::CompileCommand, StringRef Filename)
Transforms a compile command so that it applies the same configuration to a different file.
std::unique_ptr< CompilationDatabase > inferMissingCompileCommands(std::unique_ptr< CompilationDatabase >)
Returns a wrapped CompilationDatabase that defers to the provided one, but getCompileCommands() will ...
The JSON file list parser is used to communicate input to InstallAPI.
Language
The language for the input, used to select and validate the language standard and possible actions.
Definition: LangStandard.h:23
@ C
Languages that the frontend can parse and compile.
@ Result
The result type of a method or function.
YAML serialization mapping.
Definition: Dominators.h:30
Definition: Format.h:5378
static const LangStandard & getLangStandardForKind(Kind K)
static Kind getLangKind(StringRef Name)
Specifies the working directory and command of a compilation.