clang  6.0.0svn
CloneDetection.h
Go to the documentation of this file.
1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// This file defines classes for searching and analyzing source code clones.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
16 #define LLVM_CLANG_AST_CLONEDETECTION_H
17 
18 #include "clang/AST/StmtVisitor.h"
19 #include "llvm/Support/Regex.h"
20 #include <vector>
21 
22 namespace clang {
23 
24 class Stmt;
25 class Decl;
26 class VarDecl;
27 class ASTContext;
28 class CompoundStmt;
29 
30 /// Identifies a list of statements.
31 ///
32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
33 /// child statements inside a CompoundStmt or no statements at all.
34 class StmtSequence {
35  /// If this object identifies a sequence of statements inside a CompoundStmt,
36  /// S points to this CompoundStmt. If this object only identifies a single
37  /// Stmt, then S is a pointer to this Stmt.
38  const Stmt *S;
39 
40  /// The declaration that contains the statements.
41  const Decl *D;
42 
43  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
44  /// instance is representing the CompoundStmt children inside the array
45  /// [StartIndex, EndIndex).
46  unsigned StartIndex;
47  unsigned EndIndex;
48 
49 public:
50  /// Constructs a StmtSequence holding multiple statements.
51  ///
52  /// The resulting StmtSequence identifies a continuous sequence of statements
53  /// in the body of the given CompoundStmt. Which statements of the body should
54  /// be identified needs to be specified by providing a start and end index
55  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
56  ///
57  /// \param Stmt A CompoundStmt that contains all statements in its body.
58  /// \param D The Decl containing this Stmt.
59  /// \param StartIndex The inclusive start index in the children array of
60  /// \p Stmt
61  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
62  StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
63  unsigned EndIndex);
64 
65  /// Constructs a StmtSequence holding a single statement.
66  ///
67  /// \param Stmt An arbitrary Stmt.
68  /// \param D The Decl containing this Stmt.
69  StmtSequence(const Stmt *Stmt, const Decl *D);
70 
71  /// Constructs an empty StmtSequence.
72  StmtSequence();
73 
74  typedef const Stmt *const *iterator;
75 
76  /// Returns an iterator pointing to the first statement in this sequence.
77  iterator begin() const;
78 
79  /// Returns an iterator pointing behind the last statement in this sequence.
80  iterator end() const;
81 
82  /// Returns the first statement in this sequence.
83  ///
84  /// This method should only be called on a non-empty StmtSequence object.
85  const Stmt *front() const {
86  assert(!empty());
87  return begin()[0];
88  }
89 
90  /// Returns the last statement in this sequence.
91  ///
92  /// This method should only be called on a non-empty StmtSequence object.
93  const Stmt *back() const {
94  assert(!empty());
95  return begin()[size() - 1];
96  }
97 
98  /// Returns the number of statements this object holds.
99  unsigned size() const {
100  if (holdsSequence())
101  return EndIndex - StartIndex;
102  if (S == nullptr)
103  return 0;
104  return 1;
105  }
106 
107  /// Returns true if and only if this StmtSequence contains no statements.
108  bool empty() const { return size() == 0; }
109 
110  /// Returns the related ASTContext for the stored Stmts.
111  ASTContext &getASTContext() const;
112 
113  /// Returns the declaration that contains the stored Stmts.
114  const Decl *getContainingDecl() const {
115  assert(D);
116  return D;
117  }
118 
119  /// Returns true if this objects holds a list of statements.
120  bool holdsSequence() const { return EndIndex != 0; }
121 
122  /// Returns the start sourcelocation of the first statement in this sequence.
123  ///
124  /// This method should only be called on a non-empty StmtSequence object.
125  SourceLocation getStartLoc() const;
126 
127  /// Returns the end sourcelocation of the last statement in this sequence.
128  ///
129  /// This method should only be called on a non-empty StmtSequence object.
130  SourceLocation getEndLoc() const;
131 
132  /// Returns the source range of the whole sequence - from the beginning
133  /// of the first statement to the end of the last statement.
134  SourceRange getSourceRange() const;
135 
136  bool operator==(const StmtSequence &Other) const {
137  return std::tie(S, StartIndex, EndIndex) ==
138  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
139  }
140 
141  bool operator!=(const StmtSequence &Other) const {
142  return std::tie(S, StartIndex, EndIndex) !=
143  std::tie(Other.S, Other.StartIndex, Other.EndIndex);
144  }
145 
146  /// Returns true if and only if this sequence covers a source range that
147  /// contains the source range of the given sequence \p Other.
148  ///
149  /// This method should only be called on a non-empty StmtSequence object
150  /// and passed a non-empty StmtSequence object.
151  bool contains(const StmtSequence &Other) const;
152 };
153 
154 /// Searches for similar subtrees in the AST.
155 ///
156 /// First, this class needs several declarations with statement bodies which
157 /// can be passed via analyzeCodeBody. Afterwards all statements can be
158 /// searched for clones by calling findClones with a given list of constraints
159 /// that should specify the wanted properties of the clones.
160 ///
161 /// The result of findClones can be further constrained with the constrainClones
162 /// method.
163 ///
164 /// This class only searches for clones in exectuable source code
165 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
166 /// are not supported.
168 
169 public:
170  /// A collection of StmtSequences that share an arbitrary property.
172 
173  /// Generates and stores search data for all statements in the body of
174  /// the given Decl.
175  void analyzeCodeBody(const Decl *D);
176 
177  /// Constrains the given list of clone groups with the given constraint.
178  ///
179  /// The constraint is expected to have a method with the signature
180  /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
181  /// as this is the interface that the CloneDetector uses for applying the
182  /// constraint. The constraint is supposed to directly modify the passed list
183  /// so that all clones in the list fulfill the specific property this
184  /// constraint ensures.
185  template <typename T>
186  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
187  C.constrain(CloneGroups);
188  }
189 
190  /// Constrains the given list of clone groups with the given list of
191  /// constraints.
192  ///
193  /// The constraints are applied in sequence in the order in which they are
194  /// passed to this function.
195  template <typename T1, typename... Ts>
196  static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
197  Ts... ConstraintList) {
198  constrainClones(CloneGroups, C);
199  constrainClones(CloneGroups, ConstraintList...);
200  }
201 
202  /// Searches for clones in all previously passed statements.
203  /// \param Result Output parameter to which all created clone groups are
204  /// added.
205  /// \param ConstraintList The constraints that should be applied to the
206  // result.
207  template <typename... Ts>
208  void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
209  // The initial assumption is that there is only one clone group and every
210  // statement is a clone of the others. This clone group will then be
211  // split up with the help of the constraints.
212  CloneGroup AllClones;
213  AllClones.reserve(Sequences.size());
214  for (const auto &C : Sequences) {
215  AllClones.push_back(C);
216  }
217 
218  Result.push_back(AllClones);
219 
220  constrainClones(Result, ConstraintList...);
221  }
222 
223 private:
224  CloneGroup Sequences;
225 };
226 
227 /// This class is a utility class that contains utility functions for building
228 /// custom constraints.
230 public:
231  /// Removes all groups by using a filter function.
232  /// \param CloneGroups The list of CloneGroups that is supposed to be
233  /// filtered.
234  /// \param Filter The filter function that should return true for all groups
235  /// that should be removed from the list.
236  static void filterGroups(
237  std::vector<CloneDetector::CloneGroup> &CloneGroups,
238  llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
239  CloneGroups.erase(
240  std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
241  CloneGroups.end());
242  }
243 
244  /// Splits the given CloneGroups until the given Compare function returns true
245  /// for all clones in a single group.
246  /// \param CloneGroups A list of CloneGroups that should be modified.
247  /// \param Compare The comparison function that all clones are supposed to
248  /// pass. Should return true if and only if two clones belong
249  /// to the same CloneGroup.
250  static void splitCloneGroups(
251  std::vector<CloneDetector::CloneGroup> &CloneGroups,
252  llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
253  Compare);
254 };
255 
256 /// This constraint moves clones into clone groups of type II via hashing.
257 ///
258 /// Clones with different hash values are moved into separate clone groups.
259 /// Collisions are possible, and this constraint does nothing to address this
260 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
261 /// constraint chain, not necessarily immediately, to eliminate hash collisions
262 /// through a more detailed analysis.
264 public:
265  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
266 };
267 
268 /// This constraint moves clones into clone groups of type II by comparing them.
269 ///
270 /// Clones that aren't type II clones are moved into separate clone groups.
271 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
272 /// group are guaranteed to be be type II clones of each other, but it is too
273 /// slow to efficiently handle large amounts of clones.
275 public:
276  void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
277 };
278 
279 /// Ensures that every clone has at least the given complexity.
280 ///
281 /// Complexity is here defined as the total amount of children of a statement.
282 /// This constraint assumes the first statement in the group is representative
283 /// for all other statements in the group in terms of complexity.
285  unsigned MinComplexity;
286 
287 public:
288  MinComplexityConstraint(unsigned MinComplexity)
289  : MinComplexity(MinComplexity) {}
290 
291  /// Calculates the complexity of the given StmtSequence.
292  /// \param Limit The limit of complexity we probe for. After reaching
293  /// this limit during calculation, this method is exiting
294  /// early to improve performance and returns this limit.
295  size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
296  const std::string &ParentMacroStack = "");
297 
298  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
300  CloneGroups, [this](const CloneDetector::CloneGroup &A) {
301  if (!A.empty())
302  return calculateStmtComplexity(A.front(), MinComplexity) <
303  MinComplexity;
304  else
305  return false;
306  });
307  }
308 };
309 
310 /// Ensures that all clone groups contain at least the given amount of clones.
312  unsigned MinGroupSize;
313 
314 public:
315  MinGroupSizeConstraint(unsigned MinGroupSize = 2)
316  : MinGroupSize(MinGroupSize) {}
317 
318  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
319  CloneConstraint::filterGroups(CloneGroups,
320  [this](const CloneDetector::CloneGroup &A) {
321  return A.size() < MinGroupSize;
322  });
323  }
324 };
325 
326 /// Ensures that no clone group fully contains another clone group.
328  void constrain(std::vector<CloneDetector::CloneGroup> &Result);
329 };
330 
333  std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
334 
335  FilenamePatternConstraint(StringRef IgnoredFilesPattern)
336  : IgnoredFilesPattern(IgnoredFilesPattern) {
337  IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
338  IgnoredFilesPattern.str() + "$)");
339  }
340 
341  bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
342 
343  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
345  CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
346  return isAutoGenerated(Group);
347  });
348  }
349 };
350 
351 /// Analyzes the pattern of the referenced variables in a statement.
353 
354  /// Describes an occurence of a variable reference in a statement.
355  struct VariableOccurence {
356  /// The index of the associated VarDecl in the Variables vector.
357  size_t KindID;
358  /// The statement in the code where the variable was referenced.
359  const Stmt *Mention;
360 
361  VariableOccurence(size_t KindID, const Stmt *Mention)
362  : KindID(KindID), Mention(Mention) {}
363  };
364 
365  /// All occurences of referenced variables in the order of appearance.
366  std::vector<VariableOccurence> Occurences;
367  /// List of referenced variables in the order of appearance.
368  /// Every item in this list is unique.
369  std::vector<const VarDecl *> Variables;
370 
371  /// Adds a new variable referenced to this pattern.
372  /// \param VarDecl The declaration of the variable that is referenced.
373  /// \param Mention The SourceRange where this variable is referenced.
374  void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
375 
376  /// Adds each referenced variable from the given statement.
377  void addVariables(const Stmt *S);
378 
379 public:
380  /// Creates an VariablePattern object with information about the given
381  /// StmtSequence.
382  VariablePattern(const StmtSequence &Sequence) {
383  for (const Stmt *S : Sequence)
384  addVariables(S);
385  }
386 
387  /// Describes two clones that reference their variables in a different pattern
388  /// which could indicate a programming error.
390  /// Utility class holding the relevant information about a single
391  /// clone in this pair.
393  /// The variable which referencing in this clone was against the pattern.
394  const VarDecl *Variable;
395  /// Where the variable was referenced.
396  const Stmt *Mention;
397  /// The variable that should have been referenced to follow the pattern.
398  /// If Suggestion is a nullptr then it's not possible to fix the pattern
399  /// by referencing a different variable in this clone.
400  const VarDecl *Suggestion;
401  SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
402  const VarDecl *Suggestion)
403  : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
405  };
406  /// The first clone in the pair which always has a suggested variable.
408  /// This other clone in the pair which can have a suggested variable.
410  };
411 
412  /// Counts the differences between this pattern and the given one.
413  /// \param Other The given VariablePattern to compare with.
414  /// \param FirstMismatch Output parameter that will be filled with information
415  /// about the first difference between the two patterns. This parameter
416  /// can be a nullptr, in which case it will be ignored.
417  /// \return Returns the number of differences between the pattern this object
418  /// is following and the given VariablePattern.
419  ///
420  /// For example, the following statements all have the same pattern and this
421  /// function would return zero:
422  ///
423  /// if (a < b) return a; return b;
424  /// if (x < y) return x; return y;
425  /// if (u2 < u1) return u2; return u1;
426  ///
427  /// But the following statement has a different pattern (note the changed
428  /// variables in the return statements) and would have two differences when
429  /// compared with one of the statements above.
430  ///
431  /// if (a < b) return b; return a;
432  ///
433  /// This function should only be called if the related statements of the given
434  /// pattern and the statements of this objects are clones of each other.
435  unsigned countPatternDifferences(
436  const VariablePattern &Other,
437  VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
438 };
439 
440 /// Ensures that all clones reference variables in the same pattern.
442  void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
443 };
444 
445 } // end namespace clang
446 
447 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
FilenamePatternConstraint(StringRef IgnoredFilesPattern)
SourceLocation getEndLoc() const
Returns the end sourcelocation of the last statement in this sequence.
__SIZE_TYPE__ size_t
The unsigned integer type of the result of the sizeof operator.
Definition: opencl-c.h:60
Stmt - This represents one statement.
Definition: Stmt.h:66
ASTContext & getASTContext() const
Returns the related ASTContext for the stored Stmts.
SourceRange getSourceRange() const
Returns the source range of the whole sequence - from the beginning of the first statement to the end...
Analyzes the pattern of the referenced variables in a statement.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
const Stmt * front() const
Returns the first statement in this sequence.
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
bool operator!=(const StmtSequence &Other) const
bool operator==(const StmtSequence &Other) const
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
VarDecl - An instance of this class is created to represent a variable declaration or definition...
Definition: Decl.h:807
StmtSequence()
Constructs an empty StmtSequence.
SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, const VarDecl *Suggestion)
iterator begin() const
Returns an iterator pointing to the first statement in this sequence.
Identifies a list of statements.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:149
This class is a utility class that contains utility functions for building custom constraints...
Ensures that all clones reference variables in the same pattern.
Ensures that all clone groups contain at least the given amount of clones.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T C)
Constrains the given list of clone groups with the given constraint.
VariablePattern(const StmtSequence &Sequence)
Creates an VariablePattern object with information about the given StmtSequence.
void findClones(std::vector< CloneGroup > &Result, Ts... ConstraintList)
Searches for clones in all previously passed statements.
iterator end() const
Returns an iterator pointing behind the last statement in this sequence.
Searches for similar subtrees in the AST.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
bool empty() const
Returns true if and only if this StmtSequence contains no statements.
Describes two clones that reference their variables in a different pattern which could indicate a pro...
CompoundStmt - This represents a group of statements like { stmt stmt }.
Definition: Stmt.h:595
llvm::SmallVector< StmtSequence, 8 > CloneGroup
A collection of StmtSequences that share an arbitrary property.
This constraint moves clones into clone groups of type II via hashing.
SuspiciousCloneInfo SecondCloneInfo
This other clone in the pair which can have a suggested variable.
const FunctionProtoType * T
const Decl * getContainingDecl() const
Returns the declaration that contains the stored Stmts.
This constraint moves clones into clone groups of type II by comparing them.
Ensures that every clone has at least the given complexity.
const VarDecl * Suggestion
The variable that should have been referenced to follow the pattern.
The result type of a method or function.
std::shared_ptr< llvm::Regex > IgnoredFilesRegex
Utility class holding the relevant information about a single clone in this pair. ...
const Stmt * Mention
Where the variable was referenced.
unsigned size() const
Returns the number of statements this object holds.
static void constrainClones(std::vector< CloneGroup > &CloneGroups, T1 C, Ts... ConstraintList)
Constrains the given list of clone groups with the given list of constraints.
MinComplexityConstraint(unsigned MinComplexity)
Encodes a location in the source.
const Stmt *const * iterator
bool holdsSequence() const
Returns true if this objects holds a list of statements.
void constrain(std::vector< CloneDetector::CloneGroup > &CloneGroups)
Dataflow Directional Tag Classes.
const Stmt * back() const
Returns the last statement in this sequence.
SourceLocation getStartLoc() const
Returns the start sourcelocation of the first statement in this sequence.
MinGroupSizeConstraint(unsigned MinGroupSize=2)
bool contains(const StmtSequence &Other) const
Returns true if and only if this sequence covers a source range that contains the source range of the...
static void filterGroups(std::vector< CloneDetector::CloneGroup > &CloneGroups, llvm::function_ref< bool(const CloneDetector::CloneGroup &)> Filter)
Removes all groups by using a filter function.
SuspiciousCloneInfo FirstCloneInfo
The first clone in the pair which always has a suggested variable.
A trivial tuple used to represent a source range.
const VarDecl * Variable
The variable which referencing in this clone was against the pattern.
Ensures that no clone group fully contains another clone group.