clang  8.0.0svn
CloneDetection.h
Go to the documentation of this file.
1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// This file defines classes for searching and analyzing source code clones.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
17 
18 #include "clang/AST/StmtVisitor.h"
19 #include "llvm/Support/Regex.h"
20 #include <vector>
21 
22 namespace clang {
23 
24 class Stmt;
25 class Decl;
26 class VarDecl;
27 class ASTContext;
28 class CompoundStmt;
29 
30 /// Identifies a list of statements.
31 ///
32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
33 /// child statements inside a CompoundStmt or no statements at all.
34 class StmtSequence {
35  /// If this object identifies a sequence of statements inside a CompoundStmt,
36  /// S points to this CompoundStmt. If this object only identifies a single
37  /// Stmt, then S is a pointer to this Stmt.
38  const Stmt *S;
39 
40  /// The declaration that contains the statements.
41  const Decl *D;
42 
43  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
44  /// instance is representing the CompoundStmt children inside the array
45  /// [StartIndex, EndIndex).
46  unsigned StartIndex;
47  unsigned EndIndex;
48 
49 public:
50  /// Constructs a StmtSequence holding multiple statements.
51  ///
52  /// The resulting StmtSequence identifies a continuous sequence of statements
53  /// in the body of the given CompoundStmt. Which statements of the body should
54  /// be identified needs to be specified by providing a start and end index
55  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
56  ///
57  /// \param Stmt A CompoundStmt that contains all statements in its body.
58  /// \param D The Decl containing this Stmt.
59  /// \param StartIndex The inclusive start index in the children array of
60  /// \p Stmt
61  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
62  StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
63  unsigned EndIndex);
64 
65  /// Constructs a StmtSequence holding a single statement.
66  ///
67  /// \param Stmt An arbitrary Stmt.
68  /// \param D The Decl containing this Stmt.
69  StmtSequence(const Stmt *Stmt, const Decl *D);
70 
71  /// Constructs an empty StmtSequence.
72  StmtSequence();
73 
74  typedef const Stmt *const *iterator;
75 
76  /// Returns an iterator pointing to the first statement in this sequence.
77  iterator begin() const;
78 
79  /// Returns an iterator pointing behind the last statement in this sequence.
80  iterator end() const;
81 
82  /// Returns the first statement in this sequence.
83  ///
84  /// This method should only be called on a non-empty StmtSequence object.
85  const Stmt *front() const {
86  assert(!empty());
87  return begin()[0];
88  }
89 
90  /// Returns the last statement in this sequence.
91  ///
92  /// This method should only be called on a non-empty StmtSequence object.
93  const Stmt *back() const {
94  assert(!empty());
95  return begin()[size() - 1];
96  }
97 
98  /// Returns the number of statements this object holds.
99  unsigned size() const {
100  if (holdsSequence())
101  return EndIndex - StartIndex;
102  if (S == nullptr)
103  return 0;
104  return 1;
105  }
106 
107  /// Returns true if and only if this StmtSequence contains no statements.
108  bool empty() const { return size() == 0; }
109 
110  /// Returns the related ASTContext for the stored Stmts.
111  ASTContext &getASTContext() const;
112 
113  /// Returns the declaration that contains the stored Stmts.
114  const Decl *getContainingDecl() const {
115  assert(D);
116  return D;
117  }
118 
119  /// Returns true if this objects holds a list of statements.
120  bool holdsSequence() const { return EndIndex != 0; }
121 
122  /// Returns the start sourcelocation of the first statement in this sequence.
123  ///
124  /// This method should only be called on a non-empty StmtSequence object.
125  LLVM_ATTRIBUTE_DEPRECATED(SourceLocation getStartLoc() const LLVM_READONLY,
126  "Use getBeginLoc instead") {
127  return getBeginLoc();
128  }
129  SourceLocation getBeginLoc() const;
130 
131  /// Returns the end sourcelocation of the last statement in this sequence.
132  ///
133  /// This method should only be called on a non-empty StmtSequence object.
134  SourceLocation getEndLoc() const;
135 
136  /// Returns the source range of the whole sequence - from the beginning
137  /// of the first statement to the end of the last statement.
138  SourceRange getSourceRange() const;
139 
140  bool operator==(const StmtSequence &Other) const {
141  return std::tie(S, StartIndex, EndIndex) ==
142  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
143  }
144 
145  bool operator!=(const StmtSequence &Other) const {
146  return std::tie(S, StartIndex, EndIndex) !=
147  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
148  }
149 
150  /// Returns true if and only if this sequence covers a source range that
151  /// contains the source range of the given sequence \p Other.
152  ///
153  /// This method should only be called on a non-empty StmtSequence object
154  /// and passed a non-empty StmtSequence object.
155  bool contains(const StmtSequence &Other) const;
156 };
157 
158 /// Searches for similar subtrees in the AST.
159 ///
160 /// First, this class needs several declarations with statement bodies which
161 /// can be passed via analyzeCodeBody. Afterwards all statements can be
162 /// searched for clones by calling findClones with a given list of constraints
163 /// that should specify the wanted properties of the clones.
164 ///
165 /// The result of findClones can be further constrained with the constrainClones
166 /// method.
167 ///
168 /// This class only searches for clones in executable source code
169 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
170 /// are not supported.
172 
173 public:
174  /// A collection of StmtSequences that share an arbitrary property.
176 
177  /// Generates and stores search data for all statements in the body of
178  /// the given Decl.
179  void analyzeCodeBody(const Decl *D);
180 
181  /// Constrains the given list of clone groups with the given constraint.
182  ///
183  /// The constraint is expected to have a method with the signature
184  /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
185  /// as this is the interface that the CloneDetector uses for applying the
186  /// constraint. The constraint is supposed to directly modify the passed list
187  /// so that all clones in the list fulfill the specific property this
188  /// constraint ensures.
189  template <typename T>
190  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
191  C.constrain(CloneGroups);
192  }
193 
194  /// Constrains the given list of clone groups with the given list of
195  /// constraints.
196  ///
197  /// The constraints are applied in sequence in the order in which they are
198  /// passed to this function.
199  template <typename T1, typename... Ts>
200  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
201  Ts... ConstraintList) {
202  constrainClones(CloneGroups, C);
203  constrainClones(CloneGroups, ConstraintList...);
204  }
205 
206  /// Searches for clones in all previously passed statements.
207  /// \param Result Output parameter to which all created clone groups are
208  /// added.
209  /// \param ConstraintList The constraints that should be applied to the
210  // result.
211  template <typename... Ts>
212  void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
213  // The initial assumption is that there is only one clone group and every
214  // statement is a clone of the others. This clone group will then be
215  // split up with the help of the constraints.
216  CloneGroup AllClones;
217  AllClones.reserve(Sequences.size());
218  for (const auto &C : Sequences) {
219  AllClones.push_back(C);
220  }
221 
222  Result.push_back(AllClones);
223 
224  constrainClones(Result, ConstraintList...);
225  }
226 
227 private:
228  CloneGroup Sequences;
229 };
230 
231 /// This class is a utility class that contains utility functions for building
232 /// custom constraints.
234 public:
235  /// Removes all groups by using a filter function.
236  /// \param CloneGroups The list of CloneGroups that is supposed to be
237  /// filtered.
238  /// \param Filter The filter function that should return true for all groups
239  /// that should be removed from the list.
240  static void filterGroups(
241  std::vector<CloneDetector::CloneGroup> &CloneGroups,
242  llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
243  CloneGroups.erase(
244  std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
245  CloneGroups.end());
246  }
247 
248  /// Splits the given CloneGroups until the given Compare function returns true
249  /// for all clones in a single group.
250  /// \param CloneGroups A list of CloneGroups that should be modified.
251  /// \param Compare The comparison function that all clones are supposed to
252  /// pass. Should return true if and only if two clones belong
253  /// to the same CloneGroup.
254  static void splitCloneGroups(
255  std::vector<CloneDetector::CloneGroup> &CloneGroups,
256  llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
257  Compare);
258 };
259 
260 /// This constraint moves clones into clone groups of type II via hashing.
261 ///
262 /// Clones with different hash values are moved into separate clone groups.
263 /// Collisions are possible, and this constraint does nothing to address this
264 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
265 /// constraint chain, not necessarily immediately, to eliminate hash collisions
266 /// through a more detailed analysis.
268 public:
269  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
270 };
271 
272 /// This constraint moves clones into clone groups of type II by comparing them.
273 ///
274 /// Clones that aren't type II clones are moved into separate clone groups.
275 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
276 /// group are guaranteed to be be type II clones of each other, but it is too
277 /// slow to efficiently handle large amounts of clones.
279 public:
280  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
281 };
282 
283 /// Ensures that every clone has at least the given complexity.
284 ///
285 /// Complexity is here defined as the total amount of children of a statement.
286 /// This constraint assumes the first statement in the group is representative
287 /// for all other statements in the group in terms of complexity.
289  unsigned MinComplexity;
290 
291 public:
292  MinComplexityConstraint(unsigned MinComplexity)
293  : MinComplexity(MinComplexity) {}
294 
295  /// Calculates the complexity of the given StmtSequence.
296  /// \param Limit The limit of complexity we probe for. After reaching
297  /// this limit during calculation, this method is exiting
298  /// early to improve performance and returns this limit.
299  size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
300  const std::string &ParentMacroStack = "");
301 
302  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
304  CloneGroups, [this](const CloneDetector::CloneGroup &A) {
305  if (!A.empty())
306  return calculateStmtComplexity(A.front(), MinComplexity) <
307  MinComplexity;
308  else
309  return false;
310  });
311  }
312 };
313 
314 /// Ensures that all clone groups contain at least the given amount of clones.
316  unsigned MinGroupSize;
317 
318 public:
319  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
320  : MinGroupSize(MinGroupSize) {}
321 
322  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
323  CloneConstraint::filterGroups(CloneGroups,
324  [this](const CloneDetector::CloneGroup &A) {
325  return A.size() < MinGroupSize;
326  });
327  }
328 };
329 
330 /// Ensures that no clone group fully contains another clone group.
332  void constrain(std::vector<CloneDetector::CloneGroup> &Result);
333 };
334 
337  std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
338 
339  FilenamePatternConstraint(StringRef IgnoredFilesPattern)
340  : IgnoredFilesPattern(IgnoredFilesPattern) {
341  IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
342  IgnoredFilesPattern.str() + "$)");
343  }
344 
345  bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
346 
347  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
349  CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
350  return isAutoGenerated(Group);
351  });
352  }
353 };
354 
355 /// Analyzes the pattern of the referenced variables in a statement.
357 
358  /// Describes an occurrence of a variable reference in a statement.
359  struct VariableOccurence {
360  /// The index of the associated VarDecl in the Variables vector.
361  size_t KindID;
362  /// The statement in the code where the variable was referenced.
363  const Stmt *Mention;
364 
365  VariableOccurence(size_t KindID, const Stmt *Mention)
366  : KindID(KindID), Mention(Mention) {}
367  };
368 
369  /// All occurrences of referenced variables in the order of appearance.
370  std::vector<VariableOccurence> Occurences;
371  /// List of referenced variables in the order of appearance.
372  /// Every item in this list is unique.
373  std::vector<const VarDecl *> Variables;
374 
375  /// Adds a new variable referenced to this pattern.
376  /// \param VarDecl The declaration of the variable that is referenced.
377  /// \param Mention The SourceRange where this variable is referenced.
378  void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
379 
380  /// Adds each referenced variable from the given statement.
381  void addVariables(const Stmt *S);
382 
383 public:
384  /// Creates an VariablePattern object with information about the given
385  /// StmtSequence.
386  VariablePattern(const StmtSequence &Sequence) {
387  for (const Stmt *S : Sequence)
388  addVariables(S);
389  }
390 
391  /// Describes two clones that reference their variables in a different pattern
392  /// which could indicate a programming error.
394  /// Utility class holding the relevant information about a single
395  /// clone in this pair.
397  /// The variable which referencing in this clone was against the pattern.
398  const VarDecl *Variable;
399  /// Where the variable was referenced.
400  const Stmt *Mention;
401  /// The variable that should have been referenced to follow the pattern.
402  /// If Suggestion is a nullptr then it's not possible to fix the pattern
403  /// by referencing a different variable in this clone.
404  const VarDecl *Suggestion;
405  SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
406  const VarDecl *Suggestion)
407  : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
409  };
410  /// The first clone in the pair which always has a suggested variable.
412  /// This other clone in the pair which can have a suggested variable.
414  };
415 
416  /// Counts the differences between this pattern and the given one.
417  /// \param Other The given VariablePattern to compare with.
418  /// \param FirstMismatch Output parameter that will be filled with information
419  /// about the first difference between the two patterns. This parameter
420  /// can be a nullptr, in which case it will be ignored.
421  /// \return Returns the number of differences between the pattern this object
422  /// is following and the given VariablePattern.
423  ///
424  /// For example, the following statements all have the same pattern and this
425  /// function would return zero:
426  ///
427  /// if (a < b) return a; return b;
428  /// if (x < y) return x; return y;
429  /// if (u2 < u1) return u2; return u1;
430  ///
431  /// But the following statement has a different pattern (note the changed
432  /// variables in the return statements) and would have two differences when
433  /// compared with one of the statements above.
434  ///
435  /// if (a < b) return b; return a;
436  ///
437  /// This function should only be called if the related statements of the given
438  /// pattern and the statements of this objects are clones of each other.
439  unsigned countPatternDifferences(
440  const VariablePattern &Other,
441  VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
442 };
443 
444 /// Ensures that all clones reference variables in the same pattern.
446  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
447 };
448 
449 } // end namespace clang
450 
451 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
FilenamePatternConstraint(StringRef IgnoredFilesPattern)
SourceLocation getEndLoc() const
Returns the end sourcelocation of the last statement in this sequence.
__SIZE_TYPE__ size_t
The unsigned integer type of the result of the sizeof operator.
Definition: opencl-c.h:60
Stmt - This represents one statement.
Definition: Stmt.h:66
ASTContext & getASTContext() const
Returns the related ASTContext for the stored Stmts.
SourceRange getSourceRange() const
Returns the source range of the whole sequence - from the beginning of the first statement to the end...
Analyzes the pattern of the referenced variables in a statement.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
const Stmt * front() const
Returns the first statement in this sequence.
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
bool operator!=(const StmtSequence &Other) const
bool operator==(const StmtSequence &Other) const
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Represents a variable declaration or definition.
Definition: Decl.h:820
StmtSequence()
Constructs an empty StmtSequence.
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, const VarDecl *Suggestion)
iterator begin() const
Returns an iterator pointing to the first statement in this sequence.
Identifies a list of statements.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:153
This class is a utility class that contains utility functions for building custom constraints...
Ensures that all clones reference variables in the same pattern.
Ensures that all clone groups contain at least the given amount of clones.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T C)
Constrains the given list of clone groups with the given constraint.
VariablePattern(const StmtSequence &Sequence)
Creates an VariablePattern object with information about the given StmtSequence.
void findClones(std::vector< CloneGroup > &Result, Ts... ConstraintList)
Searches for clones in all previously passed statements.
iterator end() const
Returns an iterator pointing behind the last statement in this sequence.
Searches for similar subtrees in the AST.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
bool empty() const
Returns true if and only if this StmtSequence contains no statements.
Describes two clones that reference their variables in a different pattern which could indicate a pro...
CompoundStmt - This represents a group of statements like { stmt stmt }.
Definition: Stmt.h:637
llvm::SmallVector< StmtSequence, 8 > CloneGroup
A collection of StmtSequences that share an arbitrary property.
LLVM_ATTRIBUTE_DEPRECATED(SourceLocation getStartLoc() const LLVM_READONLY, "Use getBeginLoc instead")
Returns the start sourcelocation of the first statement in this sequence.
This constraint moves clones into clone groups of type II via hashing.
SuspiciousCloneInfo SecondCloneInfo
This other clone in the pair which can have a suggested variable.
const Decl * getContainingDecl() const
Returns the declaration that contains the stored Stmts.
This constraint moves clones into clone groups of type II by comparing them.
Ensures that every clone has at least the given complexity.
const VarDecl * Suggestion
The variable that should have been referenced to follow the pattern.
The result type of a method or function.
std::shared_ptr< llvm::Regex > IgnoredFilesRegex
Utility class holding the relevant information about a single clone in this pair. ...
const Stmt * Mention
Where the variable was referenced.
unsigned size() const
Returns the number of statements this object holds.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T1 C, Ts... ConstraintList)
Constrains the given list of clone groups with the given list of constraints.
MinComplexityConstraint(unsigned MinComplexity)
Encodes a location in the source.
const Stmt *const * iterator
bool holdsSequence() const
Returns true if this objects holds a list of statements.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Dataflow Directional Tag Classes.
const Stmt * back() const
Returns the last statement in this sequence.
SourceLocation getBeginLoc() const
MinGroupSizeConstraint(unsigned MinGroupSize=2)
bool contains(const StmtSequence &Other) const
Returns true if and only if this sequence covers a source range that contains the source range of the...
static void filterGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const CloneDetector::CloneGroup &)> Filter)
Removes all groups by using a filter function.
SuspiciousCloneInfo FirstCloneInfo
The first clone in the pair which always has a suggested variable.
A trivial tuple used to represent a source range.
const VarDecl * Variable
The variable which referencing in this clone was against the pattern.
Ensures that no clone group fully contains another clone group.