clang  9.0.0svn
CloneDetection.h
Go to the documentation of this file.
1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file defines classes for searching and analyzing source code clones.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
15 #define LLVM_CLANG_AST_CLONEDETECTION_H
16 
17 #include "clang/AST/StmtVisitor.h"
18 #include "llvm/Support/Regex.h"
19 #include <vector>
20 
21 namespace clang {
22 
23 class Stmt;
24 class Decl;
25 class VarDecl;
26 class ASTContext;
27 class CompoundStmt;
28 
29 /// Identifies a list of statements.
30 ///
31 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
32 /// child statements inside a CompoundStmt or no statements at all.
33 class StmtSequence {
34  /// If this object identifies a sequence of statements inside a CompoundStmt,
35  /// S points to this CompoundStmt. If this object only identifies a single
36  /// Stmt, then S is a pointer to this Stmt.
37  const Stmt *S;
38 
39  /// The declaration that contains the statements.
40  const Decl *D;
41 
42  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
43  /// instance is representing the CompoundStmt children inside the array
44  /// [StartIndex, EndIndex).
45  unsigned StartIndex;
46  unsigned EndIndex;
47 
48 public:
49  /// Constructs a StmtSequence holding multiple statements.
50  ///
51  /// The resulting StmtSequence identifies a continuous sequence of statements
52  /// in the body of the given CompoundStmt. Which statements of the body should
53  /// be identified needs to be specified by providing a start and end index
54  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
55  ///
56  /// \param Stmt A CompoundStmt that contains all statements in its body.
57  /// \param D The Decl containing this Stmt.
58  /// \param StartIndex The inclusive start index in the children array of
59  /// \p Stmt
60  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
61  StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
62  unsigned EndIndex);
63 
64  /// Constructs a StmtSequence holding a single statement.
65  ///
66  /// \param Stmt An arbitrary Stmt.
67  /// \param D The Decl containing this Stmt.
68  StmtSequence(const Stmt *Stmt, const Decl *D);
69 
70  /// Constructs an empty StmtSequence.
71  StmtSequence();
72 
73  typedef const Stmt *const *iterator;
74 
75  /// Returns an iterator pointing to the first statement in this sequence.
76  iterator begin() const;
77 
78  /// Returns an iterator pointing behind the last statement in this sequence.
79  iterator end() const;
80 
81  /// Returns the first statement in this sequence.
82  ///
83  /// This method should only be called on a non-empty StmtSequence object.
84  const Stmt *front() const {
85  assert(!empty());
86  return begin()[0];
87  }
88 
89  /// Returns the last statement in this sequence.
90  ///
91  /// This method should only be called on a non-empty StmtSequence object.
92  const Stmt *back() const {
93  assert(!empty());
94  return begin()[size() - 1];
95  }
96 
97  /// Returns the number of statements this object holds.
98  unsigned size() const {
99  if (holdsSequence())
100  return EndIndex - StartIndex;
101  if (S == nullptr)
102  return 0;
103  return 1;
104  }
105 
106  /// Returns true if and only if this StmtSequence contains no statements.
107  bool empty() const { return size() == 0; }
108 
109  /// Returns the related ASTContext for the stored Stmts.
110  ASTContext &getASTContext() const;
111 
112  /// Returns the declaration that contains the stored Stmts.
113  const Decl *getContainingDecl() const {
114  assert(D);
115  return D;
116  }
117 
118  /// Returns true if this objects holds a list of statements.
119  bool holdsSequence() const { return EndIndex != 0; }
120 
121  /// Returns the start sourcelocation of the first statement in this sequence.
122  ///
123  /// This method should only be called on a non-empty StmtSequence object.
124  SourceLocation getBeginLoc() const;
125 
126  /// Returns the end sourcelocation of the last statement in this sequence.
127  ///
128  /// This method should only be called on a non-empty StmtSequence object.
129  SourceLocation getEndLoc() const;
130 
131  /// Returns the source range of the whole sequence - from the beginning
132  /// of the first statement to the end of the last statement.
133  SourceRange getSourceRange() const;
134 
135  bool operator==(const StmtSequence &Other) const {
136  return std::tie(S, StartIndex, EndIndex) ==
137  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
138  }
139 
140  bool operator!=(const StmtSequence &Other) const {
141  return std::tie(S, StartIndex, EndIndex) !=
142  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143  }
144 
145  /// Returns true if and only if this sequence covers a source range that
146  /// contains the source range of the given sequence \p Other.
147  ///
148  /// This method should only be called on a non-empty StmtSequence object
149  /// and passed a non-empty StmtSequence object.
150  bool contains(const StmtSequence &Other) const;
151 };
152 
153 /// Searches for similar subtrees in the AST.
154 ///
155 /// First, this class needs several declarations with statement bodies which
156 /// can be passed via analyzeCodeBody. Afterwards all statements can be
157 /// searched for clones by calling findClones with a given list of constraints
158 /// that should specify the wanted properties of the clones.
159 ///
160 /// The result of findClones can be further constrained with the constrainClones
161 /// method.
162 ///
163 /// This class only searches for clones in executable source code
164 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
165 /// are not supported.
167 
168 public:
169  /// A collection of StmtSequences that share an arbitrary property.
171 
172  /// Generates and stores search data for all statements in the body of
173  /// the given Decl.
174  void analyzeCodeBody(const Decl *D);
175 
176  /// Constrains the given list of clone groups with the given constraint.
177  ///
178  /// The constraint is expected to have a method with the signature
179  /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
180  /// as this is the interface that the CloneDetector uses for applying the
181  /// constraint. The constraint is supposed to directly modify the passed list
182  /// so that all clones in the list fulfill the specific property this
183  /// constraint ensures.
184  template <typename T>
185  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
186  C.constrain(CloneGroups);
187  }
188 
189  /// Constrains the given list of clone groups with the given list of
190  /// constraints.
191  ///
192  /// The constraints are applied in sequence in the order in which they are
193  /// passed to this function.
194  template <typename T1, typename... Ts>
195  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
196  Ts... ConstraintList) {
197  constrainClones(CloneGroups, C);
198  constrainClones(CloneGroups, ConstraintList...);
199  }
200 
201  /// Searches for clones in all previously passed statements.
202  /// \param Result Output parameter to which all created clone groups are
203  /// added.
204  /// \param ConstraintList The constraints that should be applied to the
205  // result.
206  template <typename... Ts>
207  void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
208  // The initial assumption is that there is only one clone group and every
209  // statement is a clone of the others. This clone group will then be
210  // split up with the help of the constraints.
211  CloneGroup AllClones;
212  AllClones.reserve(Sequences.size());
213  for (const auto &C : Sequences) {
214  AllClones.push_back(C);
215  }
216 
217  Result.push_back(AllClones);
218 
219  constrainClones(Result, ConstraintList...);
220  }
221 
222 private:
223  CloneGroup Sequences;
224 };
225 
226 /// This class is a utility class that contains utility functions for building
227 /// custom constraints.
229 public:
230  /// Removes all groups by using a filter function.
231  /// \param CloneGroups The list of CloneGroups that is supposed to be
232  /// filtered.
233  /// \param Filter The filter function that should return true for all groups
234  /// that should be removed from the list.
235  static void filterGroups(
236  std::vector<CloneDetector::CloneGroup> &CloneGroups,
237  llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
238  CloneGroups.erase(
239  std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
240  CloneGroups.end());
241  }
242 
243  /// Splits the given CloneGroups until the given Compare function returns true
244  /// for all clones in a single group.
245  /// \param CloneGroups A list of CloneGroups that should be modified.
246  /// \param Compare The comparison function that all clones are supposed to
247  /// pass. Should return true if and only if two clones belong
248  /// to the same CloneGroup.
249  static void splitCloneGroups(
250  std::vector<CloneDetector::CloneGroup> &CloneGroups,
251  llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
252  Compare);
253 };
254 
255 /// This constraint moves clones into clone groups of type II via hashing.
256 ///
257 /// Clones with different hash values are moved into separate clone groups.
258 /// Collisions are possible, and this constraint does nothing to address this
259 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
260 /// constraint chain, not necessarily immediately, to eliminate hash collisions
261 /// through a more detailed analysis.
263 public:
264  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
265 };
266 
267 /// This constraint moves clones into clone groups of type II by comparing them.
268 ///
269 /// Clones that aren't type II clones are moved into separate clone groups.
270 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
271 /// group are guaranteed to be be type II clones of each other, but it is too
272 /// slow to efficiently handle large amounts of clones.
274 public:
275  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
276 };
277 
278 /// Ensures that every clone has at least the given complexity.
279 ///
280 /// Complexity is here defined as the total amount of children of a statement.
281 /// This constraint assumes the first statement in the group is representative
282 /// for all other statements in the group in terms of complexity.
284  unsigned MinComplexity;
285 
286 public:
287  MinComplexityConstraint(unsigned MinComplexity)
288  : MinComplexity(MinComplexity) {}
289 
290  /// Calculates the complexity of the given StmtSequence.
291  /// \param Limit The limit of complexity we probe for. After reaching
292  /// this limit during calculation, this method is exiting
293  /// early to improve performance and returns this limit.
294  size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
295  const std::string &ParentMacroStack = "");
296 
297  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
299  CloneGroups, [this](const CloneDetector::CloneGroup &A) {
300  if (!A.empty())
301  return calculateStmtComplexity(A.front(), MinComplexity) <
302  MinComplexity;
303  else
304  return false;
305  });
306  }
307 };
308 
309 /// Ensures that all clone groups contain at least the given amount of clones.
311  unsigned MinGroupSize;
312 
313 public:
314  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
315  : MinGroupSize(MinGroupSize) {}
316 
317  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
318  CloneConstraint::filterGroups(CloneGroups,
319  [this](const CloneDetector::CloneGroup &A) {
320  return A.size() < MinGroupSize;
321  });
322  }
323 };
324 
325 /// Ensures that no clone group fully contains another clone group.
327  void constrain(std::vector<CloneDetector::CloneGroup> &Result);
328 };
329 
332  std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
333 
334  FilenamePatternConstraint(StringRef IgnoredFilesPattern)
335  : IgnoredFilesPattern(IgnoredFilesPattern) {
336  IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
337  IgnoredFilesPattern.str() + "$)");
338  }
339 
340  bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
341 
342  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
344  CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
345  return isAutoGenerated(Group);
346  });
347  }
348 };
349 
350 /// Analyzes the pattern of the referenced variables in a statement.
352 
353  /// Describes an occurrence of a variable reference in a statement.
354  struct VariableOccurence {
355  /// The index of the associated VarDecl in the Variables vector.
356  size_t KindID;
357  /// The statement in the code where the variable was referenced.
358  const Stmt *Mention;
359 
360  VariableOccurence(size_t KindID, const Stmt *Mention)
361  : KindID(KindID), Mention(Mention) {}
362  };
363 
364  /// All occurrences of referenced variables in the order of appearance.
365  std::vector<VariableOccurence> Occurences;
366  /// List of referenced variables in the order of appearance.
367  /// Every item in this list is unique.
368  std::vector<const VarDecl *> Variables;
369 
370  /// Adds a new variable referenced to this pattern.
371  /// \param VarDecl The declaration of the variable that is referenced.
372  /// \param Mention The SourceRange where this variable is referenced.
373  void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
374 
375  /// Adds each referenced variable from the given statement.
376  void addVariables(const Stmt *S);
377 
378 public:
379  /// Creates an VariablePattern object with information about the given
380  /// StmtSequence.
381  VariablePattern(const StmtSequence &Sequence) {
382  for (const Stmt *S : Sequence)
383  addVariables(S);
384  }
385 
386  /// Describes two clones that reference their variables in a different pattern
387  /// which could indicate a programming error.
389  /// Utility class holding the relevant information about a single
390  /// clone in this pair.
392  /// The variable which referencing in this clone was against the pattern.
393  const VarDecl *Variable;
394  /// Where the variable was referenced.
395  const Stmt *Mention;
396  /// The variable that should have been referenced to follow the pattern.
397  /// If Suggestion is a nullptr then it's not possible to fix the pattern
398  /// by referencing a different variable in this clone.
399  const VarDecl *Suggestion;
400  SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
401  const VarDecl *Suggestion)
402  : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
404  };
405  /// The first clone in the pair which always has a suggested variable.
407  /// This other clone in the pair which can have a suggested variable.
409  };
410 
411  /// Counts the differences between this pattern and the given one.
412  /// \param Other The given VariablePattern to compare with.
413  /// \param FirstMismatch Output parameter that will be filled with information
414  /// about the first difference between the two patterns. This parameter
415  /// can be a nullptr, in which case it will be ignored.
416  /// \return Returns the number of differences between the pattern this object
417  /// is following and the given VariablePattern.
418  ///
419  /// For example, the following statements all have the same pattern and this
420  /// function would return zero:
421  ///
422  /// if (a < b) return a; return b;
423  /// if (x < y) return x; return y;
424  /// if (u2 < u1) return u2; return u1;
425  ///
426  /// But the following statement has a different pattern (note the changed
427  /// variables in the return statements) and would have two differences when
428  /// compared with one of the statements above.
429  ///
430  /// if (a < b) return b; return a;
431  ///
432  /// This function should only be called if the related statements of the given
433  /// pattern and the statements of this objects are clones of each other.
434  unsigned countPatternDifferences(
435  const VariablePattern &Other,
436  VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
437 };
438 
439 /// Ensures that all clones reference variables in the same pattern.
441  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
442 };
443 
444 } // end namespace clang
445 
446 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
FilenamePatternConstraint(StringRef IgnoredFilesPattern)
SourceLocation getEndLoc() const
Returns the end sourcelocation of the last statement in this sequence.
__SIZE_TYPE__ size_t
The unsigned integer type of the result of the sizeof operator.
Definition: opencl-c.h:64
Stmt - This represents one statement.
Definition: Stmt.h:66
ASTContext & getASTContext() const
Returns the related ASTContext for the stored Stmts.
SourceRange getSourceRange() const
Returns the source range of the whole sequence - from the beginning of the first statement to the end...
Analyzes the pattern of the referenced variables in a statement.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:88
const Stmt * front() const
Returns the first statement in this sequence.
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
bool operator!=(const StmtSequence &Other) const
bool operator==(const StmtSequence &Other) const
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Represents a variable declaration or definition.
Definition: Decl.h:812
StmtSequence()
Constructs an empty StmtSequence.
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, const VarDecl *Suggestion)
iterator begin() const
Returns an iterator pointing to the first statement in this sequence.
Identifies a list of statements.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:154
This class is a utility class that contains utility functions for building custom constraints...
Ensures that all clones reference variables in the same pattern.
Ensures that all clone groups contain at least the given amount of clones.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T C)
Constrains the given list of clone groups with the given constraint.
VariablePattern(const StmtSequence &Sequence)
Creates an VariablePattern object with information about the given StmtSequence.
void findClones(std::vector< CloneGroup > &Result, Ts... ConstraintList)
Searches for clones in all previously passed statements.
iterator end() const
Returns an iterator pointing behind the last statement in this sequence.
Searches for similar subtrees in the AST.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
bool empty() const
Returns true if and only if this StmtSequence contains no statements.
Describes two clones that reference their variables in a different pattern which could indicate a pro...
CompoundStmt - This represents a group of statements like { stmt stmt }.
Definition: Stmt.h:1303
llvm::SmallVector< StmtSequence, 8 > CloneGroup
A collection of StmtSequences that share an arbitrary property.
This constraint moves clones into clone groups of type II via hashing.
SuspiciousCloneInfo SecondCloneInfo
This other clone in the pair which can have a suggested variable.
const Decl * getContainingDecl() const
Returns the declaration that contains the stored Stmts.
This constraint moves clones into clone groups of type II by comparing them.
Ensures that every clone has at least the given complexity.
const VarDecl * Suggestion
The variable that should have been referenced to follow the pattern.
The result type of a method or function.
std::shared_ptr< llvm::Regex > IgnoredFilesRegex
Utility class holding the relevant information about a single clone in this pair. ...
const Stmt * Mention
Where the variable was referenced.
llvm::cl::opt< std::string > Filter
unsigned size() const
Returns the number of statements this object holds.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T1 C, Ts... ConstraintList)
Constrains the given list of clone groups with the given list of constraints.
MinComplexityConstraint(unsigned MinComplexity)
Encodes a location in the source.
const Stmt *const * iterator
bool holdsSequence() const
Returns true if this objects holds a list of statements.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Dataflow Directional Tag Classes.
const Stmt * back() const
Returns the last statement in this sequence.
SourceLocation getBeginLoc() const
Returns the start sourcelocation of the first statement in this sequence.
MinGroupSizeConstraint(unsigned MinGroupSize=2)
bool contains(const StmtSequence &Other) const
Returns true if and only if this sequence covers a source range that contains the source range of the...
static void filterGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const CloneDetector::CloneGroup &)> Filter)
Removes all groups by using a filter function.
SuspiciousCloneInfo FirstCloneInfo
The first clone in the pair which always has a suggested variable.
A trivial tuple used to represent a source range.
const VarDecl * Variable
The variable which referencing in this clone was against the pattern.
Ensures that no clone group fully contains another clone group.