clang 18.0.0git
ArrayBoundCheckerV2.cpp
Go to the documentation of this file.
1//== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10// which looks for an out-of-bound array element access.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
24#include "llvm/ADT/SmallString.h"
25#include "llvm/Support/FormatVariadic.h"
26#include "llvm/Support/raw_ostream.h"
27#include <optional>
28
29using namespace clang;
30using namespace ento;
31using namespace taint;
32using llvm::formatv;
33
34namespace {
35enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
36
37class ArrayBoundCheckerV2 :
38 public Checker<check::Location> {
39 BugType BT{this, "Out-of-bound access"};
40 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
41
42 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
43 NonLoc Offset, std::string RegName, std::string Msg) const;
44
45 static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
46
47public:
48 void checkLocation(SVal l, bool isLoad, const Stmt *S,
49 CheckerContext &C) const;
50};
51} // anonymous namespace
52
53/// For a given Location that can be represented as a symbolic expression
54/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
55/// Arr and the distance of Location from the beginning of Arr (expressed in a
56/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
57/// cannot be determined.
58static std::optional<std::pair<const SubRegion *, NonLoc>>
61 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
62 // We will use this utility to add and multiply values.
63 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
64 };
65
66 const SubRegion *OwnerRegion = nullptr;
67 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
68
69 const ElementRegion *CurRegion =
70 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
71
72 while (CurRegion) {
73 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
74 if (!Index)
75 return std::nullopt;
76
77 QualType ElemType = CurRegion->getElementType();
78
79 // FIXME: The following early return was presumably added to safeguard the
80 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
81 // it seems that `ElemType` cannot be incomplete at this point.
82 if (ElemType->isIncompleteType())
83 return std::nullopt;
84
85 // Calculate Delta = Index * sizeof(ElemType).
86 NonLoc Size = SVB.makeArrayIndex(
87 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
88 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
89 if (!Delta)
90 return std::nullopt;
91
92 // Perform Offset += Delta.
93 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
94 if (!Offset)
95 return std::nullopt;
96
97 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
98 // When this is just another ElementRegion layer, we need to continue the
99 // offset calculations:
100 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
101 }
102
103 if (OwnerRegion)
104 return std::make_pair(OwnerRegion, *Offset);
105
106 return std::nullopt;
107}
108
109// TODO: once the constraint manager is smart enough to handle non simplified
110// symbolic expressions remove this function. Note that this can not be used in
111// the constraint manager as is, since this does not handle overflows. It is
112// safe to assume, however, that memory offsets will not overflow.
113// NOTE: callers of this function need to be aware of the effects of overflows
114// and signed<->unsigned conversions!
115static std::pair<NonLoc, nonloc::ConcreteInt>
117 SValBuilder &svalBuilder) {
118 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
119 if (SymVal && SymVal->isExpression()) {
120 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
121 llvm::APSInt constant =
122 APSIntType(extent.getValue()).convert(SIE->getRHS());
123 switch (SIE->getOpcode()) {
124 case BO_Mul:
125 // The constant should never be 0 here, becasue multiplication by zero
126 // is simplified by the engine.
127 if ((extent.getValue() % constant) != 0)
128 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
129 else
131 nonloc::SymbolVal(SIE->getLHS()),
132 svalBuilder.makeIntVal(extent.getValue() / constant),
133 svalBuilder);
134 case BO_Add:
136 nonloc::SymbolVal(SIE->getLHS()),
137 svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
138 default:
139 break;
140 }
141 }
142 }
143
144 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
145}
146
147// Evaluate the comparison Value < Threshold with the help of the custom
148// simplification algorithm defined for this checker. Return a pair of states,
149// where the first one corresponds to "value below threshold" and the second
150// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
151// the case when the evaluation fails.
152static std::pair<ProgramStateRef, ProgramStateRef>
154 SValBuilder &SVB) {
155 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
156 std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
157 }
158 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
159 QualType T = Value.getType(SVB.getContext());
160 if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) {
161 // In this case we reduced the bound check to a comparison of the form
162 // (symbol or value with unsigned type) < (negative number)
163 // which is always false. We are handling these cases separately because
164 // evalBinOpNN can perform a signed->unsigned conversion that turns the
165 // negative number into a huge positive value and leads to wildly
166 // inaccurate conclusions.
167 return {nullptr, State};
168 }
169 }
170 auto BelowThreshold =
171 SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>();
172
173 if (BelowThreshold)
174 return State->assume(*BelowThreshold);
175
176 return {nullptr, nullptr};
177}
178
179static std::string getRegionName(const SubRegion *Region) {
180 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
181 return RegName;
182
183 // Field regions only have descriptive names when their parent has a
184 // descriptive name; so we provide a fallback representation for them:
185 if (const auto *FR = Region->getAs<FieldRegion>()) {
186 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
187 return formatv("the field '{0}'", Name);
188 return "the unnamed field";
189 }
190
191 if (isa<AllocaRegion>(Region))
192 return "the memory returned by 'alloca'";
193
194 if (isa<SymbolicRegion>(Region) &&
195 isa<HeapSpaceRegion>(Region->getMemorySpace()))
196 return "the heap area";
197
198 if (isa<StringRegion>(Region))
199 return "the string literal";
200
201 return "the region";
202}
203
204static std::optional<int64_t> getConcreteValue(NonLoc SV) {
205 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
206 return ConcreteVal->getValue().tryExtValue();
207 }
208 return std::nullopt;
209}
210
211static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
212 static const char *ShortMsgTemplates[] = {
213 "Out of bound access to memory preceding {0}",
214 "Out of bound access to memory after the end of {0}",
215 "Potential out of bound access to {0} with tainted offset"};
216
217 return formatv(ShortMsgTemplates[Kind], RegName);
218}
219
220static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
222 llvm::raw_svector_ostream Out(Buf);
223 Out << "Access of " << RegName << " at negative byte offset";
224 if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
225 Out << ' ' << ConcreteIdx->getValue();
226 return std::string(Buf);
227}
228static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
229 NonLoc Offset, NonLoc Extent, SVal Location) {
230 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
231 assert(EReg && "this checker only handles element access");
232 QualType ElemType = EReg->getElementType();
233
234 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
235 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
236
237 bool UseByteOffsets = true;
238 if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
239 const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
240 const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
241 if (!OffsetHasRemainder && !ExtentHasRemainder) {
242 UseByteOffsets = false;
243 if (OffsetN)
244 *OffsetN /= ElemSize;
245 if (ExtentN)
246 *ExtentN /= ElemSize;
247 }
248 }
249
251 llvm::raw_svector_ostream Out(Buf);
252 Out << "Access of ";
253 if (!ExtentN && !UseByteOffsets)
254 Out << "'" << ElemType.getAsString() << "' element in ";
255 Out << RegName << " at ";
256 if (OffsetN) {
257 Out << (UseByteOffsets ? "byte offset " : "index ") << *OffsetN;
258 } else {
259 Out << "an overflowing " << (UseByteOffsets ? "byte offset" : "index");
260 }
261 if (ExtentN) {
262 Out << ", while it holds only ";
263 if (*ExtentN != 1)
264 Out << *ExtentN;
265 else
266 Out << "a single";
267 if (UseByteOffsets)
268 Out << " byte";
269 else
270 Out << " '" << ElemType.getAsString() << "' element";
271
272 if (*ExtentN > 1)
273 Out << "s";
274 }
275
276 return std::string(Buf);
277}
278static std::string getTaintMsg(std::string RegName) {
280 llvm::raw_svector_ostream Out(Buf);
281 Out << "Access of " << RegName
282 << " with a tainted offset that may be too large";
283 return std::string(Buf);
284}
285
286void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
287 const Stmt *LoadS,
288 CheckerContext &C) const {
289
290 // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
291 // some new logic here that reasons directly about memory region extents.
292 // Once that logic is more mature, we can bring it back to assumeInBound()
293 // for all clients to use.
294 //
295 // The algorithm we are using here for bounds checking is to see if the
296 // memory access is within the extent of the base region. Since we
297 // have some flexibility in defining the base region, we can achieve
298 // various levels of conservatism in our buffer overflow checking.
299
300 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
301 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
302 // and incomplete analysis of these leads to false positives. As even
303 // accurate reports would be confusing for the users, just disable reports
304 // from these macros:
305 if (isFromCtypeMacro(LoadS, C.getASTContext()))
306 return;
307
308 ProgramStateRef State = C.getState();
309 SValBuilder &SVB = C.getSValBuilder();
310
311 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
312 computeOffset(State, SVB, Location);
313
314 if (!RawOffset)
315 return;
316
317 auto [Reg, ByteOffset] = *RawOffset;
318
319 // CHECK LOWER BOUND
320 const MemSpaceRegion *Space = Reg->getMemorySpace();
321 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
322 // A symbolic region in unknown space represents an unknown pointer that
323 // may point into the middle of an array, so we don't look for underflows.
324 // Both conditions are significant because we want to check underflows in
325 // symbolic regions on the heap (which may be introduced by checkers like
326 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
327 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
328 // unknown space.
329 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
330 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
331
332 if (PrecedesLowerBound && !WithinLowerBound) {
333 // We know that the index definitely precedes the lower bound.
334 std::string RegName = getRegionName(Reg);
335 std::string Msg = getPrecedesMsg(RegName, ByteOffset);
336 reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
337 return;
338 }
339
340 if (WithinLowerBound)
341 State = WithinLowerBound;
342 }
343
344 // CHECK UPPER BOUND
345 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
346 if (auto KnownSize = Size.getAs<NonLoc>()) {
347 auto [WithinUpperBound, ExceedsUpperBound] =
348 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
349
350 if (ExceedsUpperBound) {
351 if (!WithinUpperBound) {
352 // We know that the index definitely exceeds the upper bound.
353 std::string RegName = getRegionName(Reg);
354 std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
355 *KnownSize, Location);
356 reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
357 return;
358 }
359 if (isTainted(State, ByteOffset)) {
360 // Both cases are possible, but the index is tainted, so report.
361 std::string RegName = getRegionName(Reg);
362 std::string Msg = getTaintMsg(RegName);
363 reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
364 return;
365 }
366 }
367
368 if (WithinUpperBound)
369 State = WithinUpperBound;
370 }
371
372 C.addTransition(State);
373}
374
375void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
376 ProgramStateRef ErrorState, OOB_Kind Kind,
377 NonLoc Offset, std::string RegName,
378 std::string Msg) const {
379
380 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
381 if (!ErrorNode)
382 return;
383
384 std::string ShortMsg = getShortMsg(Kind, RegName);
385
386 auto BR = std::make_unique<PathSensitiveBugReport>(
387 Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
388
389 // Track back the propagation of taintedness.
390 if (Kind == OOB_Taint)
391 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
392 BR->markInteresting(Sym);
393
394 C.emitReport(std::move(BR));
395}
396
397bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
398 SourceLocation Loc = S->getBeginLoc();
399 if (!Loc.isMacroID())
400 return false;
401
402 StringRef MacroName = Lexer::getImmediateMacroName(
403 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
404
405 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
406 return false;
407
408 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
409 (MacroName == "isblank") || (MacroName == "isdigit") ||
410 (MacroName == "isgraph") || (MacroName == "islower") ||
411 (MacroName == "isnctrl") || (MacroName == "isprint") ||
412 (MacroName == "ispunct") || (MacroName == "isspace") ||
413 (MacroName == "isupper") || (MacroName == "isxdigit"));
414}
415
416void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
417 mgr.registerChecker<ArrayBoundCheckerV2>();
418}
419
420bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
421 return true;
422}
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression Arr[Idx] (or perhaps Arr[Idx1][...
static std::string getPrecedesMsg(std::string RegName, NonLoc Offset)
static std::string getRegionName(const SubRegion *Region)
static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName, NonLoc Offset, NonLoc Extent, SVal Location)
static std::optional< int64_t > getConcreteValue(NonLoc SV)
static std::string getTaintMsg(std::string RegName)
static std::string getShortMsg(OOB_Kind Kind, std::string RegName)
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB)
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:182
SourceManager & getSourceManager()
Definition: ASTContext.h:697
const LangOptions & getLangOpts() const
Definition: ASTContext.h:767
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1015
A (possibly-)qualified type.
Definition: Type.h:736
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition: Type.h:1120
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition: Type.cpp:2299
bool isUnsignedIntegerType() const
Return true if this is an integer type that is unsigned, according to C99 6.2.5p6 [which returns true...
Definition: Type.cpp:2133
QualType getType() const
Definition: Value.cpp:234
A record of the "type" of an APSInt, used for conversions.
Definition: APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition: APSIntType.h:48
Template implementation for all binary symbolic expressions.
CHECKER * registerChecker(AT &&... Args)
Used to register checkers.
ElementRegion is used to represent both array elements and casts.
Definition: MemRegion.h:1194
QualType getElementType() const
Definition: MemRegion.h:1218
NonLoc getIndex() const
Definition: MemRegion.h:1214
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemSpaceRegion * getMemorySpace() const
Definition: MemRegion.cpp:1309
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
Definition: MemRegion.cpp:707
const RegionTy * getAs() const
Definition: MemRegion.h:1383
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition: MemRegion.h:203
NonLoc makeArrayIndex(uint64_t idx)
Definition: SValBuilder.h:272
ASTContext & getContext()
Definition: SValBuilder.h:136
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
Definition: SValBuilder.h:278
QualType getArrayIndexType() const
Definition: SValBuilder.h:145
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
Definition: SValBuilder.h:141
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition: SVals.h:55
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition: SVals.h:86
const MemRegion * getAsRegion() const
Definition: SVals.cpp:120
SubRegion - A region that subsets another larger region.
Definition: MemRegion.h:441
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition: MemRegion.h:454
Symbolic value.
Definition: SymExpr.h:30
Value representing integer constant.
Definition: SVals.h:305
const llvm::APSInt & getValue() const
Definition: SVals.h:309
Represents symbolic expression that isn't a location.
Definition: SVals.h:284
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition: Taint.cpp:169
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition: Taint.cpp:147
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
BinaryOperatorKind