clang 20.0.0git
ArrayBoundCheckerV2.cpp
Go to the documentation of this file.
1//== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10// which looks for an out-of-bound array element access.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
25#include "llvm/ADT/SmallString.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Support/raw_ostream.h"
28#include <optional>
29
30using namespace clang;
31using namespace ento;
32using namespace taint;
33using llvm::formatv;
34
35namespace {
36/// If `E` is a "clean" array subscript expression, return the type of the
37/// accessed element. If the base of the subscript expression is modified by
38/// pointer arithmetic (and not the beginning of a "full" memory region), this
39/// always returns nullopt because that's the right (or the least bad) thing to
40/// do for the diagnostic output that's relying on this.
41static std::optional<QualType> determineElementType(const Expr *E,
42 const CheckerContext &C) {
43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44 if (!ASE)
45 return std::nullopt;
46
47 const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48 if (!SubscriptBaseReg)
49 return std::nullopt;
50
51 // The base of the subscript expression is affected by pointer arithmetics,
52 // so we want to report byte offsets instead of indices.
53 if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
54 return std::nullopt;
55
56 return ASE->getType();
57}
58
59static std::optional<int64_t>
60determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
61 if (!T)
62 return std::nullopt;
63 return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
64}
65
66class StateUpdateReporter {
67 const SubRegion *Reg;
68 const NonLoc ByteOffsetVal;
69 const std::optional<QualType> ElementType;
70 const std::optional<int64_t> ElementSize;
71 bool AssumedNonNegative = false;
72 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
73
74public:
75 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
77 : Reg(R), ByteOffsetVal(ByteOffsVal),
78 ElementType(determineElementType(E, C)),
79 ElementSize(determineElementSize(ElementType, C)) {}
80
81 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
82 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
83 AssumedUpperBound = UpperBoundVal;
84 }
85
86 bool assumedNonNegative() { return AssumedNonNegative; }
87
88 const NoteTag *createNoteTag(CheckerContext &C) const;
89
90private:
91 std::string getMessage(PathSensitiveBugReport &BR) const;
92
93 /// Return true if information about the value of `Sym` can put constraints
94 /// on some symbol which is interesting within the bug report `BR`.
95 /// In particular, this returns true when `Sym` is interesting within `BR`;
96 /// but it also returns true if `Sym` is an expression that contains integer
97 /// constants and a single symbolic operand which is interesting (in `BR`).
98 /// We need to use this instead of plain `BR.isInteresting()` because if we
99 /// are analyzing code like
100 /// int array[10];
101 /// int f(int arg) {
102 /// return array[arg] && array[arg + 10];
103 /// }
104 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
105 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
106 /// to detect this out of bounds access).
107 static bool providesInformationAboutInteresting(SymbolRef Sym,
109 static bool providesInformationAboutInteresting(SVal SV,
111 return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
112 }
113};
114
115struct Messages {
116 std::string Short, Full;
117};
118
119// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
120// instead of `PreStmt` because the current implementation passes the whole
121// expression to `CheckerContext::getSVal()` which only works after the
122// symbolic evaluation of the expression. (To turn them into `PreStmt`
123// callbacks, we'd need to duplicate the logic that evaluates these
124// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
125// defined as `PostStmt` for the sake of consistency with the other callbacks.
126class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
127 check::PostStmt<UnaryOperator>,
128 check::PostStmt<MemberExpr>> {
129 BugType BT{this, "Out-of-bound access"};
130 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
131
132 void performCheck(const Expr *E, CheckerContext &C) const;
133
134 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
135 NonLoc Offset, std::optional<NonLoc> Extent,
136 bool IsTaintBug = false) const;
137
138 static void markPartsInteresting(PathSensitiveBugReport &BR,
139 ProgramStateRef ErrorState, NonLoc Val,
140 bool MarkTaint);
141
142 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
143
144 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
145 NonLoc Offset, NonLoc Limit,
147 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
148
149public:
150 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
151 performCheck(E, C);
152 }
153 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
154 if (E->getOpcode() == UO_Deref)
155 performCheck(E, C);
156 }
157 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
158 if (E->isArrow())
159 performCheck(E->getBase(), C);
160 }
161};
162
163} // anonymous namespace
164
165/// For a given Location that can be represented as a symbolic expression
166/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
167/// Arr and the distance of Location from the beginning of Arr (expressed in a
168/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
169/// cannot be determined.
170static std::optional<std::pair<const SubRegion *, NonLoc>>
173 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
174 // We will use this utility to add and multiply values.
175 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
176 };
177
178 const SubRegion *OwnerRegion = nullptr;
179 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
180
181 const ElementRegion *CurRegion =
182 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
183
184 while (CurRegion) {
185 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
186 if (!Index)
187 return std::nullopt;
188
189 QualType ElemType = CurRegion->getElementType();
190
191 // FIXME: The following early return was presumably added to safeguard the
192 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
193 // it seems that `ElemType` cannot be incomplete at this point.
194 if (ElemType->isIncompleteType())
195 return std::nullopt;
196
197 // Calculate Delta = Index * sizeof(ElemType).
198 NonLoc Size = SVB.makeArrayIndex(
199 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
200 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
201 if (!Delta)
202 return std::nullopt;
203
204 // Perform Offset += Delta.
205 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
206 if (!Offset)
207 return std::nullopt;
208
209 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
210 // When this is just another ElementRegion layer, we need to continue the
211 // offset calculations:
212 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
213 }
214
215 if (OwnerRegion)
216 return std::make_pair(OwnerRegion, *Offset);
217
218 return std::nullopt;
219}
220
221// NOTE: This function is the "heart" of this checker. It simplifies
222// inequalities with transformations that are valid (and very elementary) in
223// pure mathematics, but become invalid if we use them in C++ number model
224// where the calculations may overflow.
225// Due to the overflow issues I think it's impossible (or at least not
226// practical) to integrate this kind of simplification into the resolution of
227// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
228// produces valid results when the calculations are handling memory offsets
229// and every value is well below SIZE_MAX.
230// TODO: This algorithm should be moved to a central location where it's
231// available for other checkers that need to compare memory offsets.
232// NOTE: the simplification preserves the order of the two operands in a
233// mathematical sense, but it may change the result produced by a C++
234// comparison operator (and the automatic type conversions).
235// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
236// size_t and the RHS is stored in an int. (As size_t is unsigned, this
237// comparison is false for all values of "X".) However, the simplification may
238// turn it into "X < -1", which is still always false in a mathematical sense,
239// but can produce a true result when evaluated by `evalBinOp` (which follows
240// the rules of C++ and casts -1 to SIZE_MAX).
241static std::pair<NonLoc, nonloc::ConcreteInt>
243 SValBuilder &svalBuilder) {
244 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
245 if (SymVal && SymVal->isExpression()) {
246 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
247 llvm::APSInt constant =
248 APSIntType(extent.getValue()).convert(SIE->getRHS());
249 switch (SIE->getOpcode()) {
250 case BO_Mul:
251 // The constant should never be 0 here, becasue multiplication by zero
252 // is simplified by the engine.
253 if ((extent.getValue() % constant) != 0)
254 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
255 else
257 nonloc::SymbolVal(SIE->getLHS()),
258 svalBuilder.makeIntVal(extent.getValue() / constant),
259 svalBuilder);
260 case BO_Add:
262 nonloc::SymbolVal(SIE->getLHS()),
263 svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
264 default:
265 break;
266 }
267 }
268 }
269
270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271}
272
274 const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
275 return MaxV && MaxV->isNegative();
276}
277
278static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
280 return T->isUnsignedIntegerType();
281}
282
283// Evaluate the comparison Value < Threshold with the help of the custom
284// simplification algorithm defined for this checker. Return a pair of states,
285// where the first one corresponds to "value below threshold" and the second
286// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
287// the case when the evaluation fails.
288// If the optional argument CheckEquality is true, then use BO_EQ instead of
289// the default BO_LT after consistently applying the same simplification steps.
290static std::pair<ProgramStateRef, ProgramStateRef>
292 SValBuilder &SVB, bool CheckEquality = false) {
293 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
294 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
295 }
296
297 // We want to perform a _mathematical_ comparison between the numbers `Value`
298 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
299 // perform automatic conversions. For example the number -1 is less than the
300 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
301 // -1 is converted to ULONGLONG_MAX.
302 // To avoid automatic conversions, we evaluate the "obvious" cases without
303 // calling `evalBinOpNN`:
304 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
305 if (CheckEquality) {
306 // negative_value == unsigned_threshold is always false
307 return {nullptr, State};
308 }
309 // negative_value < unsigned_threshold is always true
310 return {State, nullptr};
311 }
312 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
313 // unsigned_value == negative_threshold and
314 // unsigned_value < negative_threshold are both always false
315 return {nullptr, State};
316 }
317 // FIXME: These special cases are sufficient for handling real-world
318 // comparisons, but in theory there could be contrived situations where
319 // automatic conversion of a symbolic value (which can be negative and can be
320 // positive) leads to incorrect results.
321 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
322 // we want to ensure that assumptions coming from this precondition and
323 // assumptions coming from regular C/C++ operator calls are represented by
324 // constraints on the same symbolic expression. A solution that would
325 // evaluate these "mathematical" compariosns through a separate pathway would
326 // be a step backwards in this sense.
327
328 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
329 auto BelowThreshold =
330 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
331 .getAs<NonLoc>();
332
333 if (BelowThreshold)
334 return State->assume(*BelowThreshold);
335
336 return {nullptr, nullptr};
337}
338
339static std::string getRegionName(const SubRegion *Region) {
340 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
341 return RegName;
342
343 // Field regions only have descriptive names when their parent has a
344 // descriptive name; so we provide a fallback representation for them:
345 if (const auto *FR = Region->getAs<FieldRegion>()) {
346 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
347 return formatv("the field '{0}'", Name);
348 return "the unnamed field";
349 }
350
351 if (isa<AllocaRegion>(Region))
352 return "the memory returned by 'alloca'";
353
354 if (isa<SymbolicRegion>(Region) &&
355 isa<HeapSpaceRegion>(Region->getMemorySpace()))
356 return "the heap area";
357
358 if (isa<StringRegion>(Region))
359 return "the string literal";
360
361 return "the region";
362}
363
364static std::optional<int64_t> getConcreteValue(NonLoc SV) {
365 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
366 return ConcreteVal->getValue().tryExtValue();
367 }
368 return std::nullopt;
369}
370
371static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
372 return SV ? getConcreteValue(*SV) : std::nullopt;
373}
374
375static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
376 std::string RegName = getRegionName(Region), OffsetStr = "";
377
378 if (auto ConcreteOffset = getConcreteValue(Offset))
379 OffsetStr = formatv(" {0}", ConcreteOffset);
380
381 return {
382 formatv("Out of bound access to memory preceding {0}", RegName),
383 formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
384}
385
386/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
387/// it can be performed (`Divisor` is nonzero and there is no remainder). The
388/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
389/// division is considered to be successful.
390static bool tryDividePair(std::optional<int64_t> &Val1,
391 std::optional<int64_t> &Val2, int64_t Divisor) {
392 if (!Divisor)
393 return false;
394 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
395 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
396 if (!Val1HasRemainder && !Val2HasRemainder) {
397 if (Val1)
398 *Val1 /= Divisor;
399 if (Val2)
400 *Val2 /= Divisor;
401 return true;
402 }
403 return false;
404}
405
406static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
407 NonLoc Offset, NonLoc Extent, SVal Location,
408 bool AlsoMentionUnderflow) {
409 std::string RegName = getRegionName(Region);
410 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
411 assert(EReg && "this checker only handles element access");
412 QualType ElemType = EReg->getElementType();
413
414 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
415 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
416
417 int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
418
419 bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
420 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
421
423 llvm::raw_svector_ostream Out(Buf);
424 Out << "Access of ";
425 if (!ExtentN && !UseByteOffsets)
426 Out << "'" << ElemType.getAsString() << "' element in ";
427 Out << RegName << " at ";
428 if (AlsoMentionUnderflow) {
429 Out << "a negative or overflowing " << OffsetOrIndex;
430 } else if (OffsetN) {
431 Out << OffsetOrIndex << " " << *OffsetN;
432 } else {
433 Out << "an overflowing " << OffsetOrIndex;
434 }
435 if (ExtentN) {
436 Out << ", while it holds only ";
437 if (*ExtentN != 1)
438 Out << *ExtentN;
439 else
440 Out << "a single";
441 if (UseByteOffsets)
442 Out << " byte";
443 else
444 Out << " '" << ElemType.getAsString() << "' element";
445
446 if (*ExtentN > 1)
447 Out << "s";
448 }
449
450 return {formatv("Out of bound access to memory {0} {1}",
451 AlsoMentionUnderflow ? "around" : "after the end of",
452 RegName),
453 std::string(Buf)};
454}
455
456static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
457 bool AlsoMentionUnderflow) {
458 std::string RegName = getRegionName(Region);
459 return {formatv("Potential out of bound access to {0} with tainted {1}",
460 RegName, OffsetName),
461 formatv("Access of {0} with a tainted {1} that may be {2}too large",
462 RegName, OffsetName,
463 AlsoMentionUnderflow ? "negative or " : "")};
464}
465
466const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
467 // Don't create a note tag if we didn't assume anything:
468 if (!AssumedNonNegative && !AssumedUpperBound)
469 return nullptr;
470
471 return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
472 return getMessage(BR);
473 });
474}
475
476std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
477 bool ShouldReportNonNegative = AssumedNonNegative;
478 if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
479 if (AssumedUpperBound &&
480 providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
481 // Even if the byte offset isn't interesting (e.g. it's a constant value),
482 // the assumption can still be interesting if it provides information
483 // about an interesting symbolic upper bound.
484 ShouldReportNonNegative = false;
485 } else {
486 // We don't have anything interesting, don't report the assumption.
487 return "";
488 }
489 }
490
491 std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
492 std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
493
494 const bool UseIndex =
495 ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
496
498 llvm::raw_svector_ostream Out(Buf);
499 Out << "Assuming ";
500 if (UseIndex) {
501 Out << "index ";
502 if (OffsetN)
503 Out << "'" << OffsetN << "' ";
504 } else if (AssumedUpperBound) {
505 Out << "byte offset ";
506 if (OffsetN)
507 Out << "'" << OffsetN << "' ";
508 } else {
509 Out << "offset ";
510 }
511
512 Out << "is";
513 if (ShouldReportNonNegative) {
514 Out << " non-negative";
515 }
516 if (AssumedUpperBound) {
517 if (ShouldReportNonNegative)
518 Out << " and";
519 Out << " less than ";
520 if (ExtentN)
521 Out << *ExtentN << ", ";
522 if (UseIndex && ElementType)
523 Out << "the number of '" << ElementType->getAsString()
524 << "' elements in ";
525 else
526 Out << "the extent of ";
527 Out << getRegionName(Reg);
528 }
529 return std::string(Out.str());
530}
531
532bool StateUpdateReporter::providesInformationAboutInteresting(
534 if (!Sym)
535 return false;
536 for (SymbolRef PartSym : Sym->symbols()) {
537 // The interestingess mark may appear on any layer as we're stripping off
538 // the SymIntExpr, UnarySymExpr etc. layers...
539 if (BR.isInteresting(PartSym))
540 return true;
541 // ...but if both sides of the expression are symbolic, then there is no
542 // practical algorithm to produce separate constraints for the two
543 // operands (from the single combined result).
544 if (isa<SymSymExpr>(PartSym))
545 return false;
546 }
547 return false;
548}
549
550void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
551 const SVal Location = C.getSVal(E);
552
553 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
554 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
555 // and incomplete analysis of these leads to false positives. As even
556 // accurate reports would be confusing for the users, just disable reports
557 // from these macros:
558 if (isFromCtypeMacro(E, C.getASTContext()))
559 return;
560
561 ProgramStateRef State = C.getState();
562 SValBuilder &SVB = C.getSValBuilder();
563
564 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
565 computeOffset(State, SVB, Location);
566
567 if (!RawOffset)
568 return;
569
570 auto [Reg, ByteOffset] = *RawOffset;
571
572 // The state updates will be reported as a single note tag, which will be
573 // composed by this helper class.
574 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
575
576 // CHECK LOWER BOUND
577 const MemSpaceRegion *Space = Reg->getMemorySpace();
578 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
579 // A symbolic region in unknown space represents an unknown pointer that
580 // may point into the middle of an array, so we don't look for underflows.
581 // Both conditions are significant because we want to check underflows in
582 // symbolic regions on the heap (which may be introduced by checkers like
583 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
584 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
585 // unknown space.
586 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
587 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
588
589 if (PrecedesLowerBound) {
590 // The offset may be invalid (negative)...
591 if (!WithinLowerBound) {
592 // ...and it cannot be valid (>= 0), so report an error.
593 Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
594 reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
595 return;
596 }
597 // ...but it can be valid as well, so the checker will (optimistically)
598 // assume that it's valid and mention this in the note tag.
599 SUR.recordNonNegativeAssumption();
600 }
601
602 // Actually update the state. The "if" only fails in the extremely unlikely
603 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
604 // evalBinOpNN fails to evaluate the less-than operator.
605 if (WithinLowerBound)
606 State = WithinLowerBound;
607 }
608
609 // CHECK UPPER BOUND
610 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
611 if (auto KnownSize = Size.getAs<NonLoc>()) {
612 // In a situation where both underflow and overflow are possible (but the
613 // index is either tainted or known to be invalid), the logic of this
614 // checker will first assume that the offset is non-negative, and then
615 // (with this additional assumption) it will detect an overflow error.
616 // In this situation the warning message should mention both possibilities.
617 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
618
619 auto [WithinUpperBound, ExceedsUpperBound] =
620 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
621
622 if (ExceedsUpperBound) {
623 // The offset may be invalid (>= Size)...
624 if (!WithinUpperBound) {
625 // ...and it cannot be within bounds, so report an error, unless we can
626 // definitely determine that this is an idiomatic `&array[size]`
627 // expression that calculates the past-the-end pointer.
628 if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
629 *KnownSize, C)) {
630 C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
631 return;
632 }
633
634 Messages Msgs =
635 getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
636 Location, AlsoMentionUnderflow);
637 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
638 return;
639 }
640 // ...and it can be valid as well...
641 if (isTainted(State, ByteOffset)) {
642 // ...but it's tainted, so report an error.
643
644 // Diagnostic detail: saying "tainted offset" is always correct, but
645 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
646 // nicer to say "tainted index".
647 const char *OffsetName = "offset";
648 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
649 if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
650 OffsetName = "index";
651
652 Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
653 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
654 /*IsTaintBug=*/true);
655 return;
656 }
657 // ...and it isn't tainted, so the checker will (optimistically) assume
658 // that the offset is in bounds and mention this in the note tag.
659 SUR.recordUpperBoundAssumption(*KnownSize);
660 }
661
662 // Actually update the state. The "if" only fails in the extremely unlikely
663 // case when compareValueToThreshold returns {nullptr, nullptr} becasue
664 // evalBinOpNN fails to evaluate the less-than operator.
665 if (WithinUpperBound)
666 State = WithinUpperBound;
667 }
668
669 // Add a transition, reporting the state updates that we accumulated.
670 C.addTransition(State, SUR.createNoteTag(C));
671}
672
673void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
674 ProgramStateRef ErrorState,
675 NonLoc Val, bool MarkTaint) {
676 if (SymbolRef Sym = Val.getAsSymbol()) {
677 // If the offset is a symbolic value, iterate over its "parts" with
678 // `SymExpr::symbols()` and mark each of them as interesting.
679 // For example, if the offset is `x*4 + y` then we put interestingness onto
680 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
681 // `x` and `y`.
682 for (SymbolRef PartSym : Sym->symbols())
683 BR.markInteresting(PartSym);
684 }
685
686 if (MarkTaint) {
687 // If the issue that we're reporting depends on the taintedness of the
688 // offset, then put interestingness onto symbols that could be the origin
689 // of the taint. Note that this may find symbols that did not appear in
690 // `Sym->symbols()` (because they're only loosely connected to `Val`).
691 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
692 BR.markInteresting(Sym);
693 }
694}
695
696void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
697 ProgramStateRef ErrorState, Messages Msgs,
698 NonLoc Offset, std::optional<NonLoc> Extent,
699 bool IsTaintBug /*=false*/) const {
700
701 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
702 if (!ErrorNode)
703 return;
704
705 auto BR = std::make_unique<PathSensitiveBugReport>(
706 IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
707
708 // FIXME: ideally we would just call trackExpressionValue() and that would
709 // "do the right thing": mark the relevant symbols as interesting, track the
710 // control dependencies and statements storing the relevant values and add
711 // helpful diagnostic pieces. However, right now trackExpressionValue() is
712 // a heap of unreliable heuristics, so it would cause several issues:
713 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
714 // causes an overflow, then `x` is not marked as interesting.
715 // - We get irrelevant diagnostic pieces, e.g. in the code
716 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
717 // it places a "Storing uninitialized value" note on the `malloc` call
718 // (which is technically true, but irrelevant).
719 // If trackExpressionValue() becomes reliable, it should be applied instead
720 // of this custom markPartsInteresting().
721 markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
722 if (Extent)
723 markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
724
725 C.emitReport(std::move(BR));
726}
727
728bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
729 SourceLocation Loc = S->getBeginLoc();
730 if (!Loc.isMacroID())
731 return false;
732
733 StringRef MacroName = Lexer::getImmediateMacroName(
734 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
735
736 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
737 return false;
738
739 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
740 (MacroName == "isblank") || (MacroName == "isdigit") ||
741 (MacroName == "isgraph") || (MacroName == "islower") ||
742 (MacroName == "isnctrl") || (MacroName == "isprint") ||
743 (MacroName == "ispunct") || (MacroName == "isspace") ||
744 (MacroName == "isupper") || (MacroName == "isxdigit"));
745}
746
747bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
748 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
749 do {
750 const DynTypedNodeList Parents = ParentCtx.getParents(*S);
751 if (Parents.empty())
752 return false;
753 S = Parents[0].get<Stmt>();
754 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
755 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
756 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
757}
758
759bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
760 ProgramStateRef State,
761 NonLoc Offset, NonLoc Limit,
762 CheckerContext &C) {
763 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
764 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
765 State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
766 return EqualsToThreshold && !NotEqualToThreshold;
767 }
768 return false;
769}
770
771void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
772 mgr.registerChecker<ArrayBoundCheckerV2>();
773}
774
775bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
776 return true;
777}
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality=false)
static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, NonLoc Offset, NonLoc Extent, SVal Location, bool AlsoMentionUnderflow)
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression Arr[Idx] (or perhaps Arr[Idx1][...
static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName, bool AlsoMentionUnderflow)
static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value)
static std::string getRegionName(const SubRegion *Region)
static std::optional< int64_t > getConcreteValue(NonLoc SV)
static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset)
static bool isUnsigned(SValBuilder &SVB, NonLoc Value)
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static bool tryDividePair(std::optional< int64_t > &Val1, std::optional< int64_t > &Val2, int64_t Divisor)
Try to divide Val1 and Val2 (in place) by Divisor and return true if it can be performed (Divisor is ...
Expr * E
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:187
SourceManager & getSourceManager()
Definition: ASTContext.h:721
ParentMapContext & getParentMapContext()
Returns the dynamic AST node parent map context.
Definition: ASTContext.cpp:853
const LangOptions & getLangOpts() const
Definition: ASTContext.h:797
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
ArraySubscriptExpr - [C99 6.5.2.1] Array Subscripting.
Definition: Expr.h:2674
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
Container for either a single DynTypedNode or for an ArrayRef to DynTypedNode.
This represents one expression.
Definition: Expr.h:110
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1060
MemberExpr - [C99 6.5.2.3] Structure and Union Members.
Definition: Expr.h:3187
DynTypedNodeList getParents(const NodeT &Node)
Returns the parents of the given node (within the traversal scope).
A (possibly-)qualified type.
Definition: Type.h:941
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition: Type.h:1339
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition: Type.cpp:2362
bool isUnsignedIntegerType() const
Return true if this is an integer type that is unsigned, according to C99 6.2.5p6 [which returns true...
Definition: Type.cpp:2196
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2188
QualType getType() const
Definition: Value.cpp:234
A record of the "type" of an APSInt, used for conversions.
Definition: APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition: APSIntType.h:48
Template implementation for all binary symbolic expressions.
CHECKER * registerChecker(AT &&... Args)
Used to register checkers.
ElementRegion is used to represent both array elements and casts.
Definition: MemRegion.h:1199
QualType getElementType() const
Definition: MemRegion.h:1223
NonLoc getIndex() const
Definition: MemRegion.h:1219
MemRegion - The root abstract class for all memory regions.
Definition: MemRegion.h:97
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemSpaceRegion * getMemorySpace() const
Definition: MemRegion.cpp:1328
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * StripCasts(bool StripBaseAndDerivedCasts=true) const
Definition: MemRegion.cpp:1389
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
Definition: MemRegion.cpp:718
const RegionTy * getAs() const
Definition: MemRegion.h:1388
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition: MemRegion.h:208
The tag upon which the TagVisitor reacts.
Definition: BugReporter.h:779
void markInteresting(SymbolRef sym, bugreporter::TrackingKind TKind=bugreporter::TrackingKind::Thorough)
Marks a symbol as interesting.
bool isInteresting(SymbolRef sym) const
NonLoc makeArrayIndex(uint64_t idx)
Definition: SValBuilder.h:284
ASTContext & getContext()
Definition: SValBuilder.h:148
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
Definition: SValBuilder.h:290
QualType getArrayIndexType() const
Definition: SValBuilder.h:157
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
Definition: SValBuilder.h:153
virtual const llvm::APSInt * getMaxValue(ProgramStateRef state, SVal val)=0
Tries to get the maximal possible (integer) value of a given SVal.
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition: SVals.h:55
SymbolRef getAsSymbol(bool IncludeBaseRegions=false) const
If this SVal wraps a symbol return that SymbolRef.
Definition: SVals.cpp:104
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition: SVals.h:86
const MemRegion * getAsRegion() const
Definition: SVals.cpp:120
SubRegion - A region that subsets another larger region.
Definition: MemRegion.h:446
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition: MemRegion.h:459
Symbolic value.
Definition: SymExpr.h:30
llvm::iterator_range< symbol_iterator > symbols() const
Definition: SymExpr.h:87
Value representing integer constant.
Definition: SVals.h:297
const llvm::APSInt & getValue() const
Definition: SVals.h:301
Represents symbolic expression that isn't a location.
Definition: SVals.h:276
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition: Taint.cpp:169
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition: Taint.cpp:147
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
@ Full
This outputs the full clang module dependency graph suitable for use for explicitly building modules.
The JSON file list parser is used to communicate input to InstallAPI.
BinaryOperatorKind
const FunctionProtoType * T