clang 22.0.0git
ArrayBoundChecker.cpp
Go to the documentation of this file.
1//== ArrayBoundChecker.cpp -------------------------------------------------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines security.ArrayBound, which is a path-sensitive checker
10// that looks for out of bounds access of memory regions.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
25#include "llvm/ADT/APSInt.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Support/raw_ostream.h"
28#include <optional>
29
30using namespace clang;
31using namespace ento;
32using namespace taint;
33using llvm::formatv;
34
35namespace {
36/// If `E` is an array subscript expression with a base that is "clean" (= not
37/// modified by pointer arithmetic = the beginning of a memory region), return
38/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
39/// This helper function is used by two separate heuristics that are only valid
40/// in these "clean" cases.
41static const ArraySubscriptExpr *
42getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) {
43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44 if (!ASE)
45 return nullptr;
46
47 const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48 if (!SubscriptBaseReg)
49 return nullptr;
50
51 // The base of the subscript expression is affected by pointer arithmetics,
52 // so we want to report byte offsets instead of indices and we don't want to
53 // activate the "index is unsigned -> cannot be negative" shortcut.
54 if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
55 return nullptr;
56
57 return ASE;
58}
59
60/// If `E` is a "clean" array subscript expression, return the type of the
61/// accessed element; otherwise return std::nullopt because that's the best (or
62/// least bad) option for the diagnostic generation that relies on this.
63static std::optional<QualType> determineElementType(const Expr *E,
64 const CheckerContext &C) {
65 const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
66 if (!ASE)
67 return std::nullopt;
68
69 return ASE->getType();
70}
71
72static std::optional<int64_t>
73determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
74 if (!T)
75 return std::nullopt;
76 return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
77}
78
79class StateUpdateReporter {
80 const MemSpaceRegion *Space;
81 const SubRegion *Reg;
82 const NonLoc ByteOffsetVal;
83 const std::optional<QualType> ElementType;
84 const std::optional<int64_t> ElementSize;
85 bool AssumedNonNegative = false;
86 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
87
88public:
89 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
90 CheckerContext &C)
91 : Space(R->getMemorySpace(C.getState())), Reg(R),
92 ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)),
93 ElementSize(determineElementSize(ElementType, C)) {}
94
95 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
96 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
97 AssumedUpperBound = UpperBoundVal;
98 }
99
100 bool assumedNonNegative() { return AssumedNonNegative; }
101
102 const NoteTag *createNoteTag(CheckerContext &C) const;
103
104private:
105 std::string getMessage(PathSensitiveBugReport &BR) const;
106
107 /// Return true if information about the value of `Sym` can put constraints
108 /// on some symbol which is interesting within the bug report `BR`.
109 /// In particular, this returns true when `Sym` is interesting within `BR`;
110 /// but it also returns true if `Sym` is an expression that contains integer
111 /// constants and a single symbolic operand which is interesting (in `BR`).
112 /// We need to use this instead of plain `BR.isInteresting()` because if we
113 /// are analyzing code like
114 /// int array[10];
115 /// int f(int arg) {
116 /// return array[arg] && array[arg + 10];
117 /// }
118 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
119 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
120 /// to detect this out of bounds access).
121 static bool providesInformationAboutInteresting(SymbolRef Sym,
122 PathSensitiveBugReport &BR);
123 static bool providesInformationAboutInteresting(SVal SV,
124 PathSensitiveBugReport &BR) {
125 return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
126 }
127};
128
129struct Messages {
130 std::string Short, Full;
131};
132
133enum class BadOffsetKind { Negative, Overflowing, Indeterminate };
134
135constexpr llvm::StringLiteral Adjectives[] = {"a negative", "an overflowing",
136 "a negative or overflowing"};
137static StringRef asAdjective(BadOffsetKind Problem) {
138 return Adjectives[static_cast<int>(Problem)];
139}
140
141constexpr llvm::StringLiteral Prepositions[] = {"preceding", "after the end of",
142 "around"};
143static StringRef asPreposition(BadOffsetKind Problem) {
144 return Prepositions[static_cast<int>(Problem)];
145}
146
147// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
148// instead of `PreStmt` because the current implementation passes the whole
149// expression to `CheckerContext::getSVal()` which only works after the
150// symbolic evaluation of the expression. (To turn them into `PreStmt`
151// callbacks, we'd need to duplicate the logic that evaluates these
152// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
153// defined as `PostStmt` for the sake of consistency with the other callbacks.
154class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
155 check::PostStmt<UnaryOperator>,
156 check::PostStmt<MemberExpr>> {
157 BugType BT{this, "Out-of-bound access"};
158 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
159
160 void performCheck(const Expr *E, CheckerContext &C) const;
161
162 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
163 NonLoc Offset, std::optional<NonLoc> Extent,
164 bool IsTaintBug = false) const;
165
166 static void markPartsInteresting(PathSensitiveBugReport &BR,
167 ProgramStateRef ErrorState, NonLoc Val,
168 bool MarkTaint);
169
170 static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
171
172 static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
173
174 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
175 NonLoc Offset, NonLoc Limit,
176 CheckerContext &C);
177 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
178
179public:
180 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
181 performCheck(E, C);
182 }
183 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
184 if (E->getOpcode() == UO_Deref)
185 performCheck(E, C);
186 }
187 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
188 if (E->isArrow())
189 performCheck(E->getBase(), C);
190 }
191};
192
193} // anonymous namespace
194
195/// For a given Location that can be represented as a symbolic expression
196/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
197/// Arr and the distance of Location from the beginning of Arr (expressed in a
198/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
199/// cannot be determined.
200static std::optional<std::pair<const SubRegion *, NonLoc>>
203 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
204 // We will use this utility to add and multiply values.
205 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
206 };
207
208 const SubRegion *OwnerRegion = nullptr;
209 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
210
211 const ElementRegion *CurRegion =
212 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
213
214 while (CurRegion) {
215 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
216 if (!Index)
217 return std::nullopt;
218
219 QualType ElemType = CurRegion->getElementType();
220
221 // FIXME: The following early return was presumably added to safeguard the
222 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
223 // it seems that `ElemType` cannot be incomplete at this point.
224 if (ElemType->isIncompleteType())
225 return std::nullopt;
226
227 // Calculate Delta = Index * sizeof(ElemType).
228 NonLoc Size = SVB.makeArrayIndex(
229 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
230 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
231 if (!Delta)
232 return std::nullopt;
233
234 // Perform Offset += Delta.
235 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
236 if (!Offset)
237 return std::nullopt;
238
239 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
240 // When this is just another ElementRegion layer, we need to continue the
241 // offset calculations:
242 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
243 }
244
245 if (OwnerRegion)
246 return std::make_pair(OwnerRegion, *Offset);
247
248 return std::nullopt;
249}
250
251// NOTE: This function is the "heart" of this checker. It simplifies
252// inequalities with transformations that are valid (and very elementary) in
253// pure mathematics, but become invalid if we use them in C++ number model
254// where the calculations may overflow.
255// Due to the overflow issues I think it's impossible (or at least not
256// practical) to integrate this kind of simplification into the resolution of
257// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
258// produces valid results when the calculations are handling memory offsets
259// and every value is well below SIZE_MAX.
260// TODO: This algorithm should be moved to a central location where it's
261// available for other checkers that need to compare memory offsets.
262// NOTE: the simplification preserves the order of the two operands in a
263// mathematical sense, but it may change the result produced by a C++
264// comparison operator (and the automatic type conversions).
265// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
266// size_t and the RHS is stored in an int. (As size_t is unsigned, this
267// comparison is false for all values of "X".) However, the simplification may
268// turn it into "X < -1", which is still always false in a mathematical sense,
269// but can produce a true result when evaluated by `evalBinOp` (which follows
270// the rules of C++ and casts -1 to SIZE_MAX).
271static std::pair<NonLoc, nonloc::ConcreteInt>
273 SValBuilder &svalBuilder) {
274 const llvm::APSInt &extentVal = extent.getValue();
275 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
276 if (SymVal && SymVal->isExpression()) {
277 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
278 llvm::APSInt constant = APSIntType(extentVal).convert(SIE->getRHS());
279 switch (SIE->getOpcode()) {
280 case BO_Mul:
281 // The constant should never be 0 here, becasue multiplication by zero
282 // is simplified by the engine.
283 if ((extentVal % constant) != 0)
284 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
285 else
287 nonloc::SymbolVal(SIE->getLHS()),
288 svalBuilder.makeIntVal(extentVal / constant), svalBuilder);
289 case BO_Add:
291 nonloc::SymbolVal(SIE->getLHS()),
292 svalBuilder.makeIntVal(extentVal - constant), svalBuilder);
293 default:
294 break;
295 }
296 }
297 }
298
299 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
300}
301
303 const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
304 return MaxV && MaxV->isNegative();
305}
306
307static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
309 return T->isUnsignedIntegerType();
310}
311
312// Evaluate the comparison Value < Threshold with the help of the custom
313// simplification algorithm defined for this checker. Return a pair of states,
314// where the first one corresponds to "value below threshold" and the second
315// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
316// the case when the evaluation fails.
317// If the optional argument CheckEquality is true, then use BO_EQ instead of
318// the default BO_LT after consistently applying the same simplification steps.
319static std::pair<ProgramStateRef, ProgramStateRef>
321 SValBuilder &SVB, bool CheckEquality = false) {
322 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
323 std::tie(Value, Threshold) =
324 getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
325 }
326
327 // We want to perform a _mathematical_ comparison between the numbers `Value`
328 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
329 // perform automatic conversions. For example the number -1 is less than the
330 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
331 // -1 is converted to ULONGLONG_MAX.
332 // To avoid automatic conversions, we evaluate the "obvious" cases without
333 // calling `evalBinOpNN`:
334 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
335 if (CheckEquality) {
336 // negative_value == unsigned_threshold is always false
337 return {nullptr, State};
338 }
339 // negative_value < unsigned_threshold is always true
340 return {State, nullptr};
341 }
342 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
343 // unsigned_value == negative_threshold and
344 // unsigned_value < negative_threshold are both always false
345 return {nullptr, State};
346 }
347 // FIXME: These special cases are sufficient for handling real-world
348 // comparisons, but in theory there could be contrived situations where
349 // automatic conversion of a symbolic value (which can be negative and can be
350 // positive) leads to incorrect results.
351 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
352 // we want to ensure that assumptions coming from this precondition and
353 // assumptions coming from regular C/C++ operator calls are represented by
354 // constraints on the same symbolic expression. A solution that would
355 // evaluate these "mathematical" comparisons through a separate pathway would
356 // be a step backwards in this sense.
357
358 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
359 auto BelowThreshold =
360 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
361 .getAs<NonLoc>();
362
363 if (BelowThreshold)
364 return State->assume(*BelowThreshold);
365
366 return {nullptr, nullptr};
367}
368
369static std::string getRegionName(const MemSpaceRegion *Space,
370 const SubRegion *Region) {
371 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
372 return RegName;
373
374 // Field regions only have descriptive names when their parent has a
375 // descriptive name; so we provide a fallback representation for them:
376 if (const auto *FR = Region->getAs<FieldRegion>()) {
377 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
378 return formatv("the field '{0}'", Name);
379 return "the unnamed field";
380 }
381
382 if (isa<AllocaRegion>(Region))
383 return "the memory returned by 'alloca'";
384
385 if (isa<SymbolicRegion>(Region) && isa<HeapSpaceRegion>(Space))
386 return "the heap area";
387
388 if (isa<StringRegion>(Region))
389 return "the string literal";
390
391 return "the region";
392}
393
394static std::optional<int64_t> getConcreteValue(NonLoc SV) {
395 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
396 return ConcreteVal->getValue()->tryExtValue();
397 }
398 return std::nullopt;
399}
400
401static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
402 return SV ? getConcreteValue(*SV) : std::nullopt;
403}
404
405/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
406/// it can be performed (`Divisor` is nonzero and there is no remainder). The
407/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
408/// division is considered to be successful.
409static bool tryDividePair(std::optional<int64_t> &Val1,
410 std::optional<int64_t> &Val2, int64_t Divisor) {
411 if (!Divisor)
412 return false;
413 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
414 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
415 if (Val1HasRemainder || Val2HasRemainder)
416 return false;
417 if (Val1)
418 *Val1 /= Divisor;
419 if (Val2)
420 *Val2 /= Divisor;
421 return true;
422}
423
424static Messages getNonTaintMsgs(const ASTContext &ACtx,
425 const MemSpaceRegion *Space,
426 const SubRegion *Region, NonLoc Offset,
427 std::optional<NonLoc> Extent, SVal Location,
428 BadOffsetKind Problem) {
429 std::string RegName = getRegionName(Space, Region);
430 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
431 assert(EReg && "this checker only handles element access");
432 QualType ElemType = EReg->getElementType();
433
434 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
435 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
436
437 int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
438
439 bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
440 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
441
443 llvm::raw_svector_ostream Out(Buf);
444 Out << "Access of ";
445 if (OffsetN && !ExtentN && !UseByteOffsets) {
446 // If the offset is reported as an index, then the report must mention the
447 // element type (because it is not always clear from the code). It's more
448 // natural to mention the element type later where the extent is described,
449 // but if the extent is unknown/irrelevant, then the element type can be
450 // inserted into the message at this point.
451 Out << "'" << ElemType.getAsString() << "' element in ";
452 }
453 Out << RegName << " at ";
454 if (OffsetN) {
455 if (Problem == BadOffsetKind::Negative)
456 Out << "negative ";
457 Out << OffsetOrIndex << " " << *OffsetN;
458 } else {
459 Out << asAdjective(Problem) << " " << OffsetOrIndex;
460 }
461 if (ExtentN) {
462 Out << ", while it holds only ";
463 if (*ExtentN != 1)
464 Out << *ExtentN;
465 else
466 Out << "a single";
467 if (UseByteOffsets)
468 Out << " byte";
469 else
470 Out << " '" << ElemType.getAsString() << "' element";
471
472 if (*ExtentN > 1)
473 Out << "s";
474 }
475
476 return {formatv("Out of bound access to memory {0} {1}",
477 asPreposition(Problem), RegName),
478 std::string(Buf)};
479}
480
481static Messages getTaintMsgs(const MemSpaceRegion *Space,
482 const SubRegion *Region, const char *OffsetName,
483 bool AlsoMentionUnderflow) {
484 std::string RegName = getRegionName(Space, Region);
485 return {formatv("Potential out of bound access to {0} with tainted {1}",
486 RegName, OffsetName),
487 formatv("Access of {0} with a tainted {1} that may be {2}too large",
488 RegName, OffsetName,
489 AlsoMentionUnderflow ? "negative or " : "")};
490}
491
492const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
493 // Don't create a note tag if we didn't assume anything:
494 if (!AssumedNonNegative && !AssumedUpperBound)
495 return nullptr;
496
497 return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
498 return getMessage(BR);
499 });
500}
501
502std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
503 bool ShouldReportNonNegative = AssumedNonNegative;
504 if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
505 if (AssumedUpperBound &&
506 providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
507 // Even if the byte offset isn't interesting (e.g. it's a constant value),
508 // the assumption can still be interesting if it provides information
509 // about an interesting symbolic upper bound.
510 ShouldReportNonNegative = false;
511 } else {
512 // We don't have anything interesting, don't report the assumption.
513 return "";
514 }
515 }
516
517 std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
518 std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
519
520 const bool UseIndex =
521 ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
522
523 SmallString<256> Buf;
524 llvm::raw_svector_ostream Out(Buf);
525 Out << "Assuming ";
526 if (UseIndex) {
527 Out << "index ";
528 if (OffsetN)
529 Out << "'" << OffsetN << "' ";
530 } else if (AssumedUpperBound) {
531 Out << "byte offset ";
532 if (OffsetN)
533 Out << "'" << OffsetN << "' ";
534 } else {
535 Out << "offset ";
536 }
537
538 Out << "is";
539 if (ShouldReportNonNegative) {
540 Out << " non-negative";
541 }
542 if (AssumedUpperBound) {
543 if (ShouldReportNonNegative)
544 Out << " and";
545 Out << " less than ";
546 if (ExtentN)
547 Out << *ExtentN << ", ";
548 if (UseIndex && ElementType)
549 Out << "the number of '" << ElementType->getAsString()
550 << "' elements in ";
551 else
552 Out << "the extent of ";
553 Out << getRegionName(Space, Reg);
554 }
555 return std::string(Out.str());
556}
557
558bool StateUpdateReporter::providesInformationAboutInteresting(
559 SymbolRef Sym, PathSensitiveBugReport &BR) {
560 if (!Sym)
561 return false;
562 for (SymbolRef PartSym : Sym->symbols()) {
563 // The interestingess mark may appear on any layer as we're stripping off
564 // the SymIntExpr, UnarySymExpr etc. layers...
565 if (BR.isInteresting(PartSym))
566 return true;
567 // ...but if both sides of the expression are symbolic, then there is no
568 // practical algorithm to produce separate constraints for the two
569 // operands (from the single combined result).
570 if (isa<SymSymExpr>(PartSym))
571 return false;
572 }
573 return false;
574}
575
576void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
577 const SVal Location = C.getSVal(E);
578
579 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
580 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
581 // and incomplete analysis of these leads to false positives. As even
582 // accurate reports would be confusing for the users, just disable reports
583 // from these macros:
584 if (isFromCtypeMacro(E, C.getASTContext()))
585 return;
586
587 ProgramStateRef State = C.getState();
588 SValBuilder &SVB = C.getSValBuilder();
589
590 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
591 computeOffset(State, SVB, Location);
592
593 if (!RawOffset)
594 return;
595
596 auto [Reg, ByteOffset] = *RawOffset;
597
598 // The state updates will be reported as a single note tag, which will be
599 // composed by this helper class.
600 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
601
602 // CHECK LOWER BOUND
603 const MemSpaceRegion *Space = Reg->getMemorySpace(State);
604 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
605 // A symbolic region in unknown space represents an unknown pointer that
606 // may point into the middle of an array, so we don't look for underflows.
607 // Both conditions are significant because we want to check underflows in
608 // symbolic regions on the heap (which may be introduced by checkers like
609 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
610 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
611 // unknown space.
612 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
613 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
614
615 if (PrecedesLowerBound) {
616 // The analyzer thinks that the offset may be invalid (negative)...
617
618 if (isOffsetObviouslyNonnegative(E, C)) {
619 // ...but the offset is obviously non-negative (clear array subscript
620 // with an unsigned index), so we're in a buggy situation.
621
622 // TODO: Currently the analyzer ignores many casts (e.g. signed ->
623 // unsigned casts), so it can easily reach states where it will load a
624 // signed (and negative) value from an unsigned variable. This sanity
625 // check is a duct tape "solution" that silences most of the ugly false
626 // positives that are caused by this buggy behavior. Note that this is
627 // not a complete solution: this cannot silence reports where pointer
628 // arithmetic complicates the picture and cannot ensure modeling of the
629 // "unsigned index is positive with highest bit set" cases which are
630 // "usurped" by the nonsense "unsigned index is negative" case.
631 // For more information about this topic, see the umbrella ticket
632 // https://github.com/llvm/llvm-project/issues/39492
633 // TODO: Remove this hack once 'SymbolCast's are modeled properly.
634
635 if (!WithinLowerBound) {
636 // The state is completely nonsense -- let's just sink it!
637 C.addSink();
638 return;
639 }
640 // Otherwise continue on the 'WithinLowerBound' branch where the
641 // unsigned index _is_ non-negative. Don't mention this assumption as a
642 // note tag, because it would just confuse the users!
643 } else {
644 if (!WithinLowerBound) {
645 // ...and it cannot be valid (>= 0), so report an error.
646 Messages Msgs = getNonTaintMsgs(C.getASTContext(), Space, Reg,
647 ByteOffset, /*Extent=*/std::nullopt,
648 Location, BadOffsetKind::Negative);
649 reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
650 return;
651 }
652 // ...but it can be valid as well, so the checker will (optimistically)
653 // assume that it's valid and mention this in the note tag.
654 SUR.recordNonNegativeAssumption();
655 }
656 }
657
658 // Actually update the state. The "if" only fails in the extremely unlikely
659 // case when compareValueToThreshold returns {nullptr, nullptr} because
660 // evalBinOpNN fails to evaluate the less-than operator.
661 if (WithinLowerBound)
662 State = WithinLowerBound;
663 }
664
665 // CHECK UPPER BOUND
666 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
667 if (auto KnownSize = Size.getAs<NonLoc>()) {
668 // In a situation where both underflow and overflow are possible (but the
669 // index is either tainted or known to be invalid), the logic of this
670 // checker will first assume that the offset is non-negative, and then
671 // (with this additional assumption) it will detect an overflow error.
672 // In this situation the warning message should mention both possibilities.
673 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
674
675 auto [WithinUpperBound, ExceedsUpperBound] =
676 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
677
678 if (ExceedsUpperBound) {
679 // The offset may be invalid (>= Size)...
680 if (!WithinUpperBound) {
681 // ...and it cannot be within bounds, so report an error, unless we can
682 // definitely determine that this is an idiomatic `&array[size]`
683 // expression that calculates the past-the-end pointer.
684 if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
685 *KnownSize, C)) {
686 C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
687 return;
688 }
689
690 BadOffsetKind Problem = AlsoMentionUnderflow
691 ? BadOffsetKind::Indeterminate
692 : BadOffsetKind::Overflowing;
693 Messages Msgs =
694 getNonTaintMsgs(C.getASTContext(), Space, Reg, ByteOffset,
695 *KnownSize, Location, Problem);
696 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
697 return;
698 }
699 // ...and it can be valid as well...
700 if (isTainted(State, ByteOffset)) {
701 // ...but it's tainted, so report an error.
702
703 // Diagnostic detail: saying "tainted offset" is always correct, but
704 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
705 // nicer to say "tainted index".
706 const char *OffsetName = "offset";
707 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
708 if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
709 OffsetName = "index";
710
711 Messages Msgs =
712 getTaintMsgs(Space, Reg, OffsetName, AlsoMentionUnderflow);
713 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
714 /*IsTaintBug=*/true);
715 return;
716 }
717 // ...and it isn't tainted, so the checker will (optimistically) assume
718 // that the offset is in bounds and mention this in the note tag.
719 SUR.recordUpperBoundAssumption(*KnownSize);
720 }
721
722 // Actually update the state. The "if" only fails in the extremely unlikely
723 // case when compareValueToThreshold returns {nullptr, nullptr} because
724 // evalBinOpNN fails to evaluate the less-than operator.
725 if (WithinUpperBound)
726 State = WithinUpperBound;
727 }
728
729 // Add a transition, reporting the state updates that we accumulated.
730 C.addTransition(State, SUR.createNoteTag(C));
731}
732
733void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR,
734 ProgramStateRef ErrorState,
735 NonLoc Val, bool MarkTaint) {
736 if (SymbolRef Sym = Val.getAsSymbol()) {
737 // If the offset is a symbolic value, iterate over its "parts" with
738 // `SymExpr::symbols()` and mark each of them as interesting.
739 // For example, if the offset is `x*4 + y` then we put interestingness onto
740 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
741 // `x` and `y`.
742 for (SymbolRef PartSym : Sym->symbols())
743 BR.markInteresting(PartSym);
744 }
745
746 if (MarkTaint) {
747 // If the issue that we're reporting depends on the taintedness of the
748 // offset, then put interestingness onto symbols that could be the origin
749 // of the taint. Note that this may find symbols that did not appear in
750 // `Sym->symbols()` (because they're only loosely connected to `Val`).
751 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
752 BR.markInteresting(Sym);
753 }
754}
755
756void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
757 Messages Msgs, NonLoc Offset,
758 std::optional<NonLoc> Extent,
759 bool IsTaintBug /*=false*/) const {
760
761 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
762 if (!ErrorNode)
763 return;
764
765 auto BR = std::make_unique<PathSensitiveBugReport>(
766 IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
767
768 // FIXME: ideally we would just call trackExpressionValue() and that would
769 // "do the right thing": mark the relevant symbols as interesting, track the
770 // control dependencies and statements storing the relevant values and add
771 // helpful diagnostic pieces. However, right now trackExpressionValue() is
772 // a heap of unreliable heuristics, so it would cause several issues:
773 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
774 // causes an overflow, then `x` is not marked as interesting.
775 // - We get irrelevant diagnostic pieces, e.g. in the code
776 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
777 // it places a "Storing uninitialized value" note on the `malloc` call
778 // (which is technically true, but irrelevant).
779 // If trackExpressionValue() becomes reliable, it should be applied instead
780 // of this custom markPartsInteresting().
781 markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
782 if (Extent)
783 markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
784
785 C.emitReport(std::move(BR));
786}
787
788bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
789 SourceLocation Loc = E->getBeginLoc();
790 if (!Loc.isMacroID())
791 return false;
792
793 StringRef MacroName = Lexer::getImmediateMacroName(
794 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
795
796 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
797 return false;
798
799 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
800 (MacroName == "isblank") || (MacroName == "isdigit") ||
801 (MacroName == "isgraph") || (MacroName == "islower") ||
802 (MacroName == "isnctrl") || (MacroName == "isprint") ||
803 (MacroName == "ispunct") || (MacroName == "isspace") ||
804 (MacroName == "isupper") || (MacroName == "isxdigit"));
805}
806
807bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
808 CheckerContext &C) {
809 const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
810 if (!ASE)
811 return false;
813}
814
815bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
816 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
817 do {
818 const DynTypedNodeList Parents = ParentCtx.getParents(*S);
819 if (Parents.empty())
820 return false;
821 S = Parents[0].get<Stmt>();
822 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
823 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
824 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
825}
826
827bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E,
828 ProgramStateRef State,
829 NonLoc Offset, NonLoc Limit,
830 CheckerContext &C) {
831 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
832 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
833 State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
834 return EqualsToThreshold && !NotEqualToThreshold;
835 }
836 return false;
837}
838
839void ento::registerArrayBoundChecker(CheckerManager &mgr) {
840 mgr.registerChecker<ArrayBoundChecker>();
841}
842
843bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) {
844 return true;
845}
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality=false)
static std::string getRegionName(const MemSpaceRegion *Space, const SubRegion *Region)
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression Arr[Idx] (or perhaps Arr[Idx1][...
static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value)
static std::optional< int64_t > getConcreteValue(NonLoc SV)
static Messages getTaintMsgs(const MemSpaceRegion *Space, const SubRegion *Region, const char *OffsetName, bool AlsoMentionUnderflow)
static bool isUnsigned(SValBuilder &SVB, NonLoc Value)
static Messages getNonTaintMsgs(const ASTContext &ACtx, const MemSpaceRegion *Space, const SubRegion *Region, NonLoc Offset, std::optional< NonLoc > Extent, SVal Location, BadOffsetKind Problem)
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static bool tryDividePair(std::optional< int64_t > &Val1, std::optional< int64_t > &Val2, int64_t Divisor)
Try to divide Val1 and Val2 (in place) by Divisor and return true if it can be performed (Divisor is ...
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition ASTContext.h:188
SourceManager & getSourceManager()
Definition ASTContext.h:798
ParentMapContext & getParentMapContext()
Returns the dynamic AST node parent map context.
const LangOptions & getLangOpts() const
Definition ASTContext.h:891
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
ArraySubscriptExpr - [C99 6.5.2.1] Array Subscripting.
Definition Expr.h:2721
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition CharUnits.h:185
This represents one expression.
Definition Expr.h:112
QualType getType() const
Definition Expr.h:144
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition Lexer.cpp:1056
Expr * getBase() const
Definition Expr.h:3375
bool isArrow() const
Definition Expr.h:3482
DynTypedNodeList getParents(const NodeT &Node)
Returns the parents of the given node (within the traversal scope).
A (possibly-)qualified type.
Definition TypeBase.h:937
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition TypeBase.h:1332
SourceLocation getBeginLoc() const LLVM_READONLY
Definition Stmt.cpp:346
bool isUnsignedIntegerOrEnumerationType() const
Determines whether this is an integer type that is unsigned or an enumeration types whose underlying ...
Definition Type.cpp:2273
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition Type.cpp:2436
Opcode getOpcode() const
Definition Expr.h:2280
QualType getType() const
Definition Value.cpp:237
A record of the "type" of an APSInt, used for conversions.
Definition APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition APSIntType.h:48
CHECKER * registerChecker(AT &&...Args)
Register a single-part checker (derived from Checker): construct its singleton instance,...
Simple checker classes that implement one frontend (i.e.
Definition Checker.h:553
ElementRegion is used to represent both array elements and casts.
Definition MemRegion.h:1227
QualType getElementType() const
Definition MemRegion.h:1251
MemRegion - The root abstract class for all memory regions.
Definition MemRegion.h:98
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * StripCasts(bool StripBaseAndDerivedCasts=true) const
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
const RegionTy * getAs() const
Definition MemRegion.h:1416
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition MemRegion.h:236
The tag upon which the TagVisitor reacts.
void markInteresting(SymbolRef sym, bugreporter::TrackingKind TKind=bugreporter::TrackingKind::Thorough)
Marks a symbol as interesting.
bool isInteresting(SymbolRef sym) const
NonLoc makeArrayIndex(uint64_t idx)
ASTContext & getContext()
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
QualType getArrayIndexType() const
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
virtual const llvm::APSInt * getMaxValue(ProgramStateRef state, SVal val)=0
Tries to get the maximal possible (integer) value of a given SVal.
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition SVals.h:56
SymbolRef getAsSymbol(bool IncludeBaseRegions=false) const
If this SVal wraps a symbol return that SymbolRef.
Definition SVals.cpp:103
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition SVals.h:87
SubRegion - A region that subsets another larger region.
Definition MemRegion.h:474
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition MemRegion.h:487
llvm::iterator_range< symbol_iterator > symbols() const
Definition SymExpr.h:107
Value representing integer constant.
Definition SVals.h:300
APSIntPtr getValue() const
Definition SVals.h:304
Represents symbolic expression that isn't a location.
Definition SVals.h:279
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition Taint.cpp:170
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition Taint.cpp:148
IntrusiveRefCntPtr< const ProgramState > ProgramStateRef
const SymExpr * SymbolRef
Definition SymExpr.h:133
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
BinarySymExprImpl< const SymExpr *, APSIntPtr, SymExpr::Kind::SymIntExprKind > SymIntExpr
Represents a symbolic expression like 'x' + 3.
The JSON file list parser is used to communicate input to InstallAPI.
bool isa(CodeGen::Address addr)
Definition Address.h:330
const FunctionProtoType * T