CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/OpenMPClause.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(RefExpr);
107 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108}
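// For illustration: given a private item written as 'a[i][j]' or as an array
// section 'a[0:n]', getPrivateItem above peels off the subscript/section
// layers and returns the canonical declaration of 'a'; a member reference
// such as 's.x' yields the canonical declaration of the field 'x'.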
109
110static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124
125 // Build struct _globalized_locals_ty {
126 // /* globalized vars */[WarpSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(VD)) {
143 Field = FieldDecl::Create(
144 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(*I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
159 ArraySizeModifier::Normal, 0);
160 }
161 Field = FieldDecl::Create(
162 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(AlignedAttr::CreateImplicit(
169 C, /*IsAlignmentExpr=*/true,
170 ConstantExpr::Create(C, IntegerLiteral::Create(C, Align,
171 C.getIntTypeForBitwidth(32, /*Signed=*/0),
172 SourceLocation())),
173 {}, AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(Field);
176 MappedDeclsFields.try_emplace(VD, Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180}
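// Sketch of the resulting record: a parallel-level escaped 'int x' with
// BufSize > 1 becomes a field 'int x[BufSize]' carrying an implicit aligned
// attribute derived from the declaration's alignment, while team-level
// (SingleEscaped) variables keep their scalar type and inherit any existing
// AlignedAttr from the declaration.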
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208 // Check if we need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(VD);
242 else
243 DelayedVariableLengthDecls.insert(VD);
244 } else
245 EscapedDecls.insert(VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and is
465 /// used instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504/// Get the id of the warp in the block.
505/// We assume that the warp size is 32, which is always the case
506/// on the NVPTX device, to generate more efficient code.
507static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
508 CGBuilderTy &Bld = CGF.Builder;
509 unsigned LaneIDBits =
510 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
511 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
512 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
513}
514
515/// Get the id of the current lane in the Warp.
516/// We assume that the warp size is 32, which is always the case
517/// on the NVPTX device, to generate more efficient code.
518static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
519 CGBuilderTy &Bld = CGF.Builder;
520 unsigned LaneIDBits =
521 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
522 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
523 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
524 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
525 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
526 "nvptx_lane_id");
527}
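// With the default warp size of 32 (LaneIDBits == 5), the two helpers above
// reduce to: warp_id = thread_id >> 5 and lane_id = thread_id & 31.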
528
529CGOpenMPRuntimeGPU::ExecutionMode
530CGOpenMPRuntimeGPU::getExecutionMode() const {
531 return CurrentExecutionMode;
532}
533
534CGOpenMPRuntimeGPU::DataSharingMode
535CGOpenMPRuntimeGPU::getDataSharingMode() const {
536 return CurrentDataSharingMode;
537}
538
539/// Check for inner (nested) SPMD construct, if any
540static bool hasNestedSPMDDirective(ASTContext &Ctx,
541 const OMPExecutableDirective &D) {
542 const auto *CS = D.getInnermostCapturedStmt();
543 const auto *Body =
544 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546
547 if (const auto *NestedDir =
548 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
549 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550 switch (D.getDirectiveKind()) {
551 case OMPD_target:
552 if (isOpenMPParallelDirective(DKind))
553 return true;
554 if (DKind == OMPD_teams) {
555 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556 /*IgnoreCaptured=*/true);
557 if (!Body)
558 return false;
559 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560 if (const auto *NND =
561 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
562 DKind = NND->getDirectiveKind();
563 if (isOpenMPParallelDirective(DKind))
564 return true;
565 }
566 }
567 return false;
568 case OMPD_target_teams:
569 return isOpenMPParallelDirective(DKind);
570 case OMPD_target_simd:
571 case OMPD_target_parallel:
572 case OMPD_target_parallel_for:
573 case OMPD_target_parallel_for_simd:
574 case OMPD_target_teams_distribute:
575 case OMPD_target_teams_distribute_simd:
576 case OMPD_target_teams_distribute_parallel_for:
577 case OMPD_target_teams_distribute_parallel_for_simd:
578 case OMPD_parallel:
579 case OMPD_for:
580 case OMPD_parallel_for:
581 case OMPD_parallel_master:
582 case OMPD_parallel_sections:
583 case OMPD_for_simd:
584 case OMPD_parallel_for_simd:
585 case OMPD_cancel:
586 case OMPD_cancellation_point:
587 case OMPD_ordered:
588 case OMPD_threadprivate:
589 case OMPD_allocate:
590 case OMPD_task:
591 case OMPD_simd:
592 case OMPD_sections:
593 case OMPD_section:
594 case OMPD_single:
595 case OMPD_master:
596 case OMPD_critical:
597 case OMPD_taskyield:
598 case OMPD_barrier:
599 case OMPD_taskwait:
600 case OMPD_taskgroup:
601 case OMPD_atomic:
602 case OMPD_flush:
603 case OMPD_depobj:
604 case OMPD_scan:
605 case OMPD_teams:
606 case OMPD_target_data:
607 case OMPD_target_exit_data:
608 case OMPD_target_enter_data:
609 case OMPD_distribute:
610 case OMPD_distribute_simd:
611 case OMPD_distribute_parallel_for:
612 case OMPD_distribute_parallel_for_simd:
613 case OMPD_teams_distribute:
614 case OMPD_teams_distribute_simd:
615 case OMPD_teams_distribute_parallel_for:
616 case OMPD_teams_distribute_parallel_for_simd:
617 case OMPD_target_update:
618 case OMPD_declare_simd:
619 case OMPD_declare_variant:
620 case OMPD_begin_declare_variant:
621 case OMPD_end_declare_variant:
622 case OMPD_declare_target:
623 case OMPD_end_declare_target:
624 case OMPD_declare_reduction:
625 case OMPD_declare_mapper:
626 case OMPD_taskloop:
627 case OMPD_taskloop_simd:
628 case OMPD_master_taskloop:
629 case OMPD_master_taskloop_simd:
630 case OMPD_parallel_master_taskloop:
631 case OMPD_parallel_master_taskloop_simd:
632 case OMPD_requires:
633 case OMPD_unknown:
634 default:
635 llvm_unreachable("Unexpected directive.");
636 }
637 }
638
639 return false;
640}
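// For example, a region of the form
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
// is reported as having a nested SPMD (parallel) construct, whereas a
// 'target' or 'target teams' region with no nested 'parallel' is not.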
641
642static bool supportsSPMDExecutionMode(ASTContext &Ctx,
643 const OMPExecutableDirective &D) {
644 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
645 switch (DirectiveKind) {
646 case OMPD_target:
647 case OMPD_target_teams:
648 return hasNestedSPMDDirective(Ctx, D);
649 case OMPD_target_teams_loop:
650 case OMPD_target_parallel_loop:
651 case OMPD_target_parallel:
652 case OMPD_target_parallel_for:
653 case OMPD_target_parallel_for_simd:
654 case OMPD_target_teams_distribute_parallel_for:
655 case OMPD_target_teams_distribute_parallel_for_simd:
656 case OMPD_target_simd:
657 case OMPD_target_teams_distribute_simd:
658 return true;
659 case OMPD_target_teams_distribute:
660 return false;
661 case OMPD_parallel:
662 case OMPD_for:
663 case OMPD_parallel_for:
664 case OMPD_parallel_master:
665 case OMPD_parallel_sections:
666 case OMPD_for_simd:
667 case OMPD_parallel_for_simd:
668 case OMPD_cancel:
669 case OMPD_cancellation_point:
670 case OMPD_ordered:
671 case OMPD_threadprivate:
672 case OMPD_allocate:
673 case OMPD_task:
674 case OMPD_simd:
675 case OMPD_sections:
676 case OMPD_section:
677 case OMPD_single:
678 case OMPD_master:
679 case OMPD_critical:
680 case OMPD_taskyield:
681 case OMPD_barrier:
682 case OMPD_taskwait:
683 case OMPD_taskgroup:
684 case OMPD_atomic:
685 case OMPD_flush:
686 case OMPD_depobj:
687 case OMPD_scan:
688 case OMPD_teams:
689 case OMPD_target_data:
690 case OMPD_target_exit_data:
691 case OMPD_target_enter_data:
692 case OMPD_distribute:
693 case OMPD_distribute_simd:
694 case OMPD_distribute_parallel_for:
695 case OMPD_distribute_parallel_for_simd:
696 case OMPD_teams_distribute:
697 case OMPD_teams_distribute_simd:
698 case OMPD_teams_distribute_parallel_for:
699 case OMPD_teams_distribute_parallel_for_simd:
700 case OMPD_target_update:
701 case OMPD_declare_simd:
702 case OMPD_declare_variant:
703 case OMPD_begin_declare_variant:
704 case OMPD_end_declare_variant:
705 case OMPD_declare_target:
706 case OMPD_end_declare_target:
707 case OMPD_declare_reduction:
708 case OMPD_declare_mapper:
709 case OMPD_taskloop:
710 case OMPD_taskloop_simd:
711 case OMPD_master_taskloop:
712 case OMPD_master_taskloop_simd:
713 case OMPD_parallel_master_taskloop:
714 case OMPD_parallel_master_taskloop_simd:
715 case OMPD_requires:
716 case OMPD_unknown:
717 default:
718 break;
719 }
720 llvm_unreachable(
721 "Unknown programming model for OpenMP directive on NVPTX target.");
722}
723
724void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
725 StringRef ParentName,
726 llvm::Function *&OutlinedFn,
727 llvm::Constant *&OutlinedFnID,
728 bool IsOffloadEntry,
729 const RegionCodeGenTy &CodeGen) {
730 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
731 EntryFunctionState EST;
732 WrapperFunctionsMap.clear();
733
734 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
735 assert(!IsBareKernel && "bare kernel should not be at generic mode");
736
737 // Emit target region as a standalone region.
738 class NVPTXPrePostActionTy : public PrePostActionTy {
739 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
740 const OMPExecutableDirective &D;
741
742 public:
743 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
744 const OMPExecutableDirective &D)
745 : EST(EST), D(D) {}
746 void Enter(CodeGenFunction &CGF) override {
747 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
748 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
749 // Skip target region initialization.
750 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
751 }
752 void Exit(CodeGenFunction &CGF) override {
753 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
754 RT.clearLocThreadIdInsertPt(CGF);
755 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
756 }
757 } Action(EST, D);
758 CodeGen.setAction(Action);
759 IsInTTDRegion = true;
760 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
761 IsOffloadEntry, CodeGen);
762 IsInTTDRegion = false;
763}
764
765void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
766 CodeGenFunction &CGF,
767 EntryFunctionState &EST, bool IsSPMD) {
768 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
769 MaxTeamsVal = -1;
770 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
771 MinTeamsVal, MaxTeamsVal);
772
773 CGBuilderTy &Bld = CGF.Builder;
774 Bld.restoreIP(OMPBuilder.createTargetInit(
775 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
776 if (!IsSPMD)
777 emitGenericVarsProlog(CGF, EST.Loc);
778}
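// Roughly speaking, createTargetInit above materializes the kernel's
// __kmpc_target_init entry sequence using the computed thread/team bounds;
// in generic (non-SPMD) mode the extra emitGenericVarsProlog call then sets
// up storage for globalized locals.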
779
780void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
781 EntryFunctionState &EST,
782 bool IsSPMD) {
783 if (!IsSPMD)
784 emitGenericVarsEpilog(CGF);
785
786 // This is temporary until we remove the fixed-size buffer.
787 ASTContext &C = CGM.getContext();
788 RecordDecl *StaticRD = C.buildImplicitRecord(
789 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
790 StaticRD->startDefinition();
791 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
792 QualType RecTy = C.getRecordType(TeamReductionRec);
793 auto *Field = FieldDecl::Create(
794 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
795 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
796 /*BW=*/nullptr, /*Mutable=*/false,
797 /*InitStyle=*/ICIS_NoInit);
798 Field->setAccess(AS_public);
799 StaticRD->addDecl(Field);
800 }
801 StaticRD->completeDefinition();
802 QualType StaticTy = C.getRecordType(StaticRD);
803 llvm::Type *LLVMReductionsBufferTy =
804 CGM.getTypes().ConvertTypeForMem(StaticTy);
805 const auto &DL = CGM.getModule().getDataLayout();
806 uint64_t ReductionDataSize =
807 TeamsReductions.empty()
808 ? 0
809 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
810 CGBuilderTy &Bld = CGF.Builder;
811 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
812 C.getLangOpts().OpenMPCUDAReductionBufNum);
813 TeamsReductions.clear();
814}
815
816void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
817 StringRef ParentName,
818 llvm::Function *&OutlinedFn,
819 llvm::Constant *&OutlinedFnID,
820 bool IsOffloadEntry,
821 const RegionCodeGenTy &CodeGen) {
822 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
823 EntryFunctionState EST;
824
825 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
826
827 // Emit target region as a standalone region.
828 class NVPTXPrePostActionTy : public PrePostActionTy {
829 CGOpenMPRuntimeGPU &RT;
830 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
831 bool IsBareKernel;
832 DataSharingMode Mode;
833 const OMPExecutableDirective &D;
834
835 public:
836 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
837 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
838 bool IsBareKernel, const OMPExecutableDirective &D)
839 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
840 Mode(RT.CurrentDataSharingMode), D(D) {}
841 void Enter(CodeGenFunction &CGF) override {
842 if (IsBareKernel) {
843 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
844 return;
845 }
846 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
847 // Skip target region initialization.
848 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
849 }
850 void Exit(CodeGenFunction &CGF) override {
851 if (IsBareKernel) {
852 RT.CurrentDataSharingMode = Mode;
853 return;
854 }
855 RT.clearLocThreadIdInsertPt(CGF);
856 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
857 }
858 } Action(*this, EST, IsBareKernel, D);
859 CodeGen.setAction(Action);
860 IsInTTDRegion = true;
861 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
862 IsOffloadEntry, CodeGen);
863 IsInTTDRegion = false;
864}
865
866void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
867 const OMPExecutableDirective &D, StringRef ParentName,
868 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
869 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
870 if (!IsOffloadEntry) // Nothing to do.
871 return;
872
873 assert(!ParentName.empty() && "Invalid target region parent name!");
874
875 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
876 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
877 if (Mode || IsBareKernel)
878 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
879 CodeGen);
880 else
881 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
882 CodeGen);
883}
884
885CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
886 : CGOpenMPRuntime(CGM) {
887 llvm::OpenMPIRBuilderConfig Config(
888 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
889 CGM.getLangOpts().OpenMPOffloadMandatory,
890 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
891 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
892 OMPBuilder.setConfig(Config);
893
894 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
895 llvm_unreachable("OpenMP can only handle device code.");
896
897 if (CGM.getLangOpts().OpenMPCUDAMode)
898 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
899
900 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
901 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
902 return;
903
904 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
905 "__omp_rtl_debug_kind");
906 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
907 "__omp_rtl_assume_teams_oversubscription");
908 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
909 "__omp_rtl_assume_threads_oversubscription");
910 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
911 "__omp_rtl_assume_no_thread_state");
912 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
913 "__omp_rtl_assume_no_nested_parallelism");
914}
915
916void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
917 ProcBindKind ProcBind,
918 SourceLocation Loc) {
919 // Nothing to do.
920}
921
922void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
923 llvm::Value *NumThreads,
924 SourceLocation Loc) {
925 // Nothing to do.
926}
927
928void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
929 const Expr *NumTeams,
930 const Expr *ThreadLimit,
931 SourceLocation Loc) {}
932
933llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
934 CodeGenFunction &CGF, const OMPExecutableDirective &D,
935 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
936 const RegionCodeGenTy &CodeGen) {
937 // Emit target region as a standalone region.
938 bool PrevIsInTTDRegion = IsInTTDRegion;
939 IsInTTDRegion = false;
940 auto *OutlinedFun =
941 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
942 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
943 IsInTTDRegion = PrevIsInTTDRegion;
944 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
945 llvm::Function *WrapperFun =
946 createParallelDataSharingWrapper(OutlinedFun, D);
947 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
948 }
949
950 return OutlinedFun;
951}
952
953/// Get list of lastprivate variables from the teams distribute ... or
954/// teams {distribute ...} directives.
955static void
956getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
957 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
958 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
959 "expected teams directive.");
960 const OMPExecutableDirective *Dir = &D;
961 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
962 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
963 Ctx,
964 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
965 /*IgnoreCaptured=*/true))) {
966 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
967 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
968 Dir = nullptr;
969 }
970 }
971 if (!Dir)
972 return;
973 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
974 for (const Expr *E : C->getVarRefs())
975 Vars.push_back(getPrivateItem(E));
976 }
977}
978
979/// Get list of reduction variables from the teams ... directives.
980static void
981getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
982 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
983 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
984 "expected teams directive.");
985 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
986 for (const Expr *E : C->privates())
987 Vars.push_back(getPrivateItem(E));
988 }
989}
990
991llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
992 CodeGenFunction &CGF, const OMPExecutableDirective &D,
993 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
994 const RegionCodeGenTy &CodeGen) {
995 SourceLocation Loc = D.getBeginLoc();
996
997 const RecordDecl *GlobalizedRD = nullptr;
998 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
999 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1000 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1001 // Globalize team reductions variable unconditionally in all modes.
1002 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1003 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1004 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1005 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1006 if (!LastPrivatesReductions.empty()) {
1007 GlobalizedRD = ::buildRecordForGlobalizedVars(
1008 CGM.getContext(), std::nullopt, LastPrivatesReductions,
1009 MappedDeclsFields, WarpSize);
1010 }
1011 } else if (!LastPrivatesReductions.empty()) {
1012 assert(!TeamAndReductions.first &&
1013 "Previous team declaration is not expected.");
1014 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1015 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1016 }
1017
1018 // Emit target region as a standalone region.
1019 class NVPTXPrePostActionTy : public PrePostActionTy {
1020 SourceLocation &Loc;
1021 const RecordDecl *GlobalizedRD;
1022 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1023 &MappedDeclsFields;
1024
1025 public:
1026 NVPTXPrePostActionTy(
1027 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1028 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1029 &MappedDeclsFields)
1030 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1031 MappedDeclsFields(MappedDeclsFields) {}
1032 void Enter(CodeGenFunction &CGF) override {
1033 auto &Rt =
1034 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1035 if (GlobalizedRD) {
1036 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1037 I->getSecond().MappedParams =
1038 std::make_unique<CodeGenFunction::OMPMapVars>();
1039 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1040 for (const auto &Pair : MappedDeclsFields) {
1041 assert(Pair.getFirst()->isCanonicalDecl() &&
1042 "Expected canonical declaration");
1043 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1044 }
1045 }
1046 Rt.emitGenericVarsProlog(CGF, Loc);
1047 }
1048 void Exit(CodeGenFunction &CGF) override {
1049 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1050 .emitGenericVarsEpilog(CGF);
1051 }
1052 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1053 CodeGen.setAction(Action);
1054 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1055 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1056
1057 return OutlinedFun;
1058}
1059
1060void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1061 SourceLocation Loc) {
1062 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1063 return;
1064
1065 CGBuilderTy &Bld = CGF.Builder;
1066
1067 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1068 if (I == FunctionGlobalizedDecls.end())
1069 return;
1070
1071 for (auto &Rec : I->getSecond().LocalVarData) {
1072 const auto *VD = cast<VarDecl>(Rec.first);
1073 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1074 QualType VarTy = VD->getType();
1075
1076 // Get the local allocation of a firstprivate variable before sharing
1077 llvm::Value *ParValue;
1078 if (EscapedParam) {
1079 LValue ParLVal =
1080 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1081 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1082 }
1083
1084 // Allocate space for the variable to be globalized
1085 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1086 llvm::CallBase *VoidPtr =
1087 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1088 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1089 AllocArgs, VD->getName());
1090 // FIXME: We should use the variables actual alignment as an argument.
1091 VoidPtr->addRetAttr(llvm::Attribute::get(
1092 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1093 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1094
1095 // Cast the void pointer and get the address of the globalized variable.
1096 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1097 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1098 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1099 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1100 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1101 Rec.second.GlobalizedVal = VoidPtr;
1102
1103 // Assign the local allocation to the newly globalized location.
1104 if (EscapedParam) {
1105 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1106 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1107 }
1108 if (auto *DI = CGF.getDebugInfo())
1109 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1110 }
1111
1112 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1113 const auto *VD = cast<VarDecl>(ValueD);
1114 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1115 getKmpcAllocShared(CGF, VD);
1116 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1117 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1118 CGM.getContext().getDeclAlign(VD),
1119 AlignmentSource::Decl);
1120 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
1121 }
1122 I->getSecond().MappedParams->apply(CGF);
1123}
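// For a 4-byte escaped scalar, the prolog above (and the matching epilog
// below) lowers to roughly:
//   %x = call ptr @__kmpc_alloc_shared(i64 4)
//   ...uses of %x...
//   call void @__kmpc_free_shared(ptr %x, i64 4)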
1124
1125bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1126 const VarDecl *VD) const {
1127 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1128 if (I == FunctionGlobalizedDecls.end())
1129 return false;
1130
1131 // Check variable declaration is delayed:
1132 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1133}
1134
1135std::pair<llvm::Value *, llvm::Value *>
1136CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1137 const VarDecl *VD) {
1138 CGBuilderTy &Bld = CGF.Builder;
1139
1140 // Compute size and alignment.
1141 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1142 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1143 Size = Bld.CreateNUWAdd(
1144 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1145 llvm::Value *AlignVal =
1146 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1147 Size = Bld.CreateUDiv(Size, AlignVal);
1148 Size = Bld.CreateNUWMul(Size, AlignVal);
1149
1150 // Allocate space for this VLA object to be globalized.
1151 llvm::Value *AllocArgs[] = {Size};
1152 llvm::CallBase *VoidPtr =
1153 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1154 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1155 AllocArgs, VD->getName());
1156 VoidPtr->addRetAttr(llvm::Attribute::get(
1157 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1158
1159 return std::make_pair(VoidPtr, Size);
1160}
1161
1162void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1163 CodeGenFunction &CGF,
1164 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1165 // Deallocate the memory for each globalized VLA object
1166 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1167 CGM.getModule(), OMPRTL___kmpc_free_shared),
1168 {AddrSizePair.first, AddrSizePair.second});
1169}
1170
1171void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1172 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1173 return;
1174
1175 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1176 if (I != FunctionGlobalizedDecls.end()) {
1177 // Deallocate the memory for each globalized VLA object that was
1178 // globalized in the prolog (i.e. emitGenericVarsProlog).
1179 for (const auto &AddrSizePair :
1180 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1181 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1182 CGM.getModule(), OMPRTL___kmpc_free_shared),
1183 {AddrSizePair.first, AddrSizePair.second});
1184 }
1185 // Deallocate the memory for each globalized value
1186 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1187 const auto *VD = cast<VarDecl>(Rec.first);
1188 I->getSecond().MappedParams->restore(CGF);
1189
1190 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1191 CGF.getTypeSize(VD->getType())};
1192 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1193 CGM.getModule(), OMPRTL___kmpc_free_shared),
1194 FreeArgs);
1195 }
1196 }
1197}
1198
1199void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1200 const OMPExecutableDirective &D,
1201 SourceLocation Loc,
1202 llvm::Function *OutlinedFn,
1203 ArrayRef<llvm::Value *> CapturedVars) {
1204 if (!CGF.HaveInsertPoint())
1205 return;
1206
1207 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1208
1209 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1210 /*Name=*/".zero.addr");
1211 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1212 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1213 // We don't emit any thread id function call in a bare kernel, but because the
1214 // outlined function has a pointer argument, we emit a nullptr here.
1215 if (IsBareKernel)
1216 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1217 else
1218 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1219 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1220 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1221 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1222}
1223
1224void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1225 SourceLocation Loc,
1226 llvm::Function *OutlinedFn,
1227 ArrayRef<llvm::Value *> CapturedVars,
1228 const Expr *IfCond,
1229 llvm::Value *NumThreads) {
1230 if (!CGF.HaveInsertPoint())
1231 return;
1232
1233 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1234 NumThreads](CodeGenFunction &CGF,
1235 PrePostActionTy &Action) {
1236 CGBuilderTy &Bld = CGF.Builder;
1237 llvm::Value *NumThreadsVal = NumThreads;
1238 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1239 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1240 if (WFn)
1241 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1242 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1243
1244 // Create a private scope that will globalize the arguments
1245 // passed from the outside of the target region.
1246 // TODO: Is that needed?
1247 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1248
1249 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1250 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1251 "captured_vars_addrs");
1252 // There's something to share.
1253 if (!CapturedVars.empty()) {
1254 // Prepare for parallel region. Indicate the outlined function.
1255 ASTContext &Ctx = CGF.getContext();
1256 unsigned Idx = 0;
1257 for (llvm::Value *V : CapturedVars) {
1258 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1259 llvm::Value *PtrV;
1260 if (V->getType()->isIntegerTy())
1261 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1262 else
1263 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1264 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1265 Ctx.getPointerType(Ctx.VoidPtrTy));
1266 ++Idx;
1267 }
1268 }
1269
1270 llvm::Value *IfCondVal = nullptr;
1271 if (IfCond)
1272 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1273 /* isSigned */ false);
1274 else
1275 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1276
1277 if (!NumThreadsVal)
1278 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1279 else
1280 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1281
1282 assert(IfCondVal && "Expected a value");
1283 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1284 llvm::Value *Args[] = {
1285 RTLoc,
1286 getThreadID(CGF, Loc),
1287 IfCondVal,
1288 NumThreadsVal,
1289 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1290 FnPtr,
1291 ID,
1292 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1293 CGF.VoidPtrPtrTy),
1294 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1295 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1296 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1297 Args);
1298 };
1299
1300 RegionCodeGenTy RCG(ParallelGen);
1301 RCG(CGF);
1302}
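// The Args array above follows the __kmpc_parallel_51 entry point: source
// location, global thread id, if-condition, requested num_threads (-1 when
// absent), proc_bind (-1), the outlined function, its data-sharing wrapper
// (null when no wrapper is registered, e.g. in SPMD mode), the array of
// captured-variable addresses, and the number of captured variables.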
1303
1304void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1305 // Always emit simple barriers!
1306 if (!CGF.HaveInsertPoint())
1307 return;
1308 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1309 // This function does not use parameters, so we can emit just default values.
1310 llvm::Value *Args[] = {
1311 llvm::ConstantPointerNull::get(
1312 cast<llvm::PointerType>(getIdentTyPointerTy())),
1313 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1314 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1315 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1316 Args);
1317}
1318
1319void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1320 SourceLocation Loc,
1321 OpenMPDirectiveKind Kind, bool,
1322 bool) {
1323 // Always emit simple barriers!
1324 if (!CGF.HaveInsertPoint())
1325 return;
1326 // Build call __kmpc_cancel_barrier(loc, thread_id);
1327 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1328 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1329 getThreadID(CGF, Loc)};
1330
1331 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1332 CGM.getModule(), OMPRTL___kmpc_barrier),
1333 Args);
1334}
1335
1336void CGOpenMPRuntimeGPU::emitCriticalRegion(
1337 CodeGenFunction &CGF, StringRef CriticalName,
1338 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1339 const Expr *Hint) {
1340 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1341 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1342 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1343 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1344 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1345
1346 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1347
1348 // Get the mask of active threads in the warp.
1349 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1350 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1351 // Fetch team-local id of the thread.
1352 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1353
1354 // Get the width of the team.
1355 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1356
1357 // Initialize the counter variable for the loop.
1358 QualType Int32Ty =
1359 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1360 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1361 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1362 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1363 /*isInit=*/true);
1364
1365 // Block checks if loop counter exceeds upper bound.
1366 CGF.EmitBlock(LoopBB);
1367 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1368 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1369 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1370
1371 // Block tests which single thread should execute region, and which threads
1372 // should go straight to synchronisation point.
1373 CGF.EmitBlock(TestBB);
1374 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1375 llvm::Value *CmpThreadToCounter =
1376 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1377 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1378
1379 // Block emits the body of the critical region.
1380 CGF.EmitBlock(BodyBB);
1381
1382 // Output the critical statement.
1383 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1384 Hint);
1385
1386 // After the body surrounded by the critical region, the single executing
1387 // thread will jump to the synchronisation point.
1388 // Block waits for all threads in current team to finish then increments the
1389 // counter variable and returns to the loop.
1390 CGF.EmitBlock(SyncBB);
1391 // Reconverge active threads in the warp.
1392 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1393 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1394 Mask);
1395
1396 llvm::Value *IncCounterVal =
1397 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1398 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1399 CGF.EmitBranch(LoopBB);
1400
1401 // Block that is reached when all threads in the team complete the region.
1402 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1403}
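// In pseudocode, the blocks emitted above implement:
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;      // still guarded by the base runtime's critical
//     __kmpc_syncwarp(mask);  // reconverge the warp before the next round
//   }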
1404
1405/// Cast value to the specified type.
1406static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1407 QualType ValTy, QualType CastTy,
1408 SourceLocation Loc) {
1409 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1410 "Cast type must be sized.");
1411 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1412 "Val type must be sized.");
1413 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1414 if (ValTy == CastTy)
1415 return Val;
1416 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1417 CGF.getContext().getTypeSizeInChars(CastTy))
1418 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1419 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1420 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1421 CastTy->hasSignedIntegerRepresentation());
1422 Address CastItem = CGF.CreateMemTemp(CastTy);
1423 Address ValCastItem = CastItem.withElementType(Val->getType());
1424 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1425 LValueBaseInfo(AlignmentSource::Type),
1426 TBAAAccessInfo());
1427 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1428 LValueBaseInfo(AlignmentSource::Type),
1429 TBAAAccessInfo());
1430}
1431
1432/// This function creates calls to one of two shuffle functions to copy
1433/// variables between lanes in a warp.
1434static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1435 llvm::Value *Elem,
1436 QualType ElemType,
1437 llvm::Value *Offset,
1438 SourceLocation Loc) {
1439 CodeGenModule &CGM = CGF.CGM;
1440 CGBuilderTy &Bld = CGF.Builder;
1441 CGOpenMPRuntimeGPU &RT =
1442 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1443 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1444
1445 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1446 assert(Size.getQuantity() <= 8 &&
1447 "Unsupported bitwidth in shuffle instruction.");
1448
1449 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1450 ? OMPRTL___kmpc_shuffle_int32
1451 : OMPRTL___kmpc_shuffle_int64;
1452
1453 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1454 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1455 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1456 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1457 llvm::Value *WarpSize =
1458 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1459
1460 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1461 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1462 {ElemCast, Offset, WarpSize});
1463
1464 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1465}
1466
1467static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1468 Address DestAddr, QualType ElemType,
1469 llvm::Value *Offset, SourceLocation Loc) {
1470 CGBuilderTy &Bld = CGF.Builder;
1471
1472 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1473 // Create the loop over the big sized data.
1474 // ptr = (void*)Elem;
1475 // ptrEnd = (void*) Elem + 1;
1476 // Step = 8;
1477 // while (ptr + Step < ptrEnd)
1478 // shuffle((int64_t)*ptr);
1479 // Step = 4;
1480 // while (ptr + Step < ptrEnd)
1481 // shuffle((int32_t)*ptr);
1482 // ...
1483 Address ElemPtr = DestAddr;
1484 Address Ptr = SrcAddr;
1485 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1486 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1487 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1488 if (Size < CharUnits::fromQuantity(IntSize))
1489 continue;
1490 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1491 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1492 /*Signed=*/1);
1493 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1494 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1495 IntTy);
1496 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1497 ElemPtr, IntTy->getPointerTo(), IntTy);
1498 if (Size.getQuantity() / IntSize > 1) {
1499 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1500 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1501 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1502 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1503 CGF.EmitBlock(PreCondBB);
1504 llvm::PHINode *PhiSrc =
1505 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1506 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1507 llvm::PHINode *PhiDest =
1508 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1509 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1510 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1511 ElemPtr =
1512 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1513 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1514 CGF.Int8Ty, PtrEnd.getPointer(),
1515 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1516 CGF.VoidPtrTy));
1517 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1518 ThenBB, ExitBB);
1519 CGF.EmitBlock(ThenBB);
1520 llvm::Value *Res = createRuntimeShuffleFunction(
1521 CGF,
1522 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1523 LValueBaseInfo(AlignmentSource::Type),
1524 TBAAAccessInfo()),
1525 IntType, Offset, Loc);
1526 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1527 LValueBaseInfo(AlignmentSource::Type),
1528 TBAAAccessInfo());
1529 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1530 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1531 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1532 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1533 CGF.EmitBranch(PreCondBB);
1534 CGF.EmitBlock(ExitBB);
1535 } else {
1536 llvm::Value *Res = createRuntimeShuffleFunction(
1537 CGF,
1538 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1539 LValueBaseInfo(AlignmentSource::Type),
1540 TBAAAccessInfo()),
1541 IntType, Offset, Loc);
1542 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1543 LValueBaseInfo(AlignmentSource::Type),
1544 TBAAAccessInfo());
1545 Ptr = Bld.CreateConstGEP(Ptr, 1);
1546 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1547 }
1548 Size = Size % IntSize;
1549 }
1550}
1551
1552namespace {
1553enum CopyAction : unsigned {
1554 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1555 // the warp using shuffle instructions.
1556 RemoteLaneToThread,
1557 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1558 ThreadCopy,
1559};
1560} // namespace
1561
1562struct CopyOptionsTy {
1563 llvm::Value *RemoteLaneOffset;
1564 llvm::Value *ScratchpadIndex;
1565 llvm::Value *ScratchpadWidth;
1566};
1567
1568/// Emit instructions to copy a Reduce list, which contains partially
1569/// aggregated values, in the specified direction.
1570static void emitReductionListCopy(
1571 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1572 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1573 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1574
1575 CodeGenModule &CGM = CGF.CGM;
1576 ASTContext &C = CGM.getContext();
1577 CGBuilderTy &Bld = CGF.Builder;
1578
1579 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1580
1581 // Iterates, element-by-element, through the source Reduce list and
1582 // makes a copy.
1583 unsigned Idx = 0;
1584 for (const Expr *Private : Privates) {
1585 Address SrcElementAddr = Address::invalid();
1586 Address DestElementAddr = Address::invalid();
1587 Address DestElementPtrAddr = Address::invalid();
1588 // Should we shuffle in an element from a remote lane?
1589 bool ShuffleInElement = false;
1590 // Set to true to update the pointer in the dest Reduce list to a
1591 // newly created element.
1592 bool UpdateDestListPtr = false;
1593 QualType PrivatePtrType = C.getPointerType(Private->getType());
1594 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1595
1596 switch (Action) {
1597 case RemoteLaneToThread: {
1598 // Step 1.1: Get the address for the src element in the Reduce list.
1599 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1600 SrcElementAddr = CGF.EmitLoadOfPointer(
1601 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1602 PrivatePtrType->castAs<PointerType>());
1603
1604 // Step 1.2: Create a temporary to store the element in the destination
1605 // Reduce list.
1606 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1607 DestElementAddr =
1608 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1609 ShuffleInElement = true;
1610 UpdateDestListPtr = true;
1611 break;
1612 }
1613 case ThreadCopy: {
1614 // Step 1.1: Get the address for the src element in the Reduce list.
1615 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1616 SrcElementAddr = CGF.EmitLoadOfPointer(
1617 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1618 PrivatePtrType->castAs<PointerType>());
1619
1620 // Step 1.2: Get the address for dest element. The destination
1621 // element has already been created on the thread's stack.
1622 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1623 DestElementAddr = CGF.EmitLoadOfPointer(
1624 DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
1625 PrivatePtrType->castAs<PointerType>());
1626 break;
1627 }
1628 }
1629
1630 // Regardless of the source and destination of the copy, we emit the load
1631 // of the src element, as it is required in all cases.
1632 SrcElementAddr = SrcElementAddr.withElementType(
1633 CGF.ConvertTypeForMem(Private->getType()));
1634 DestElementAddr =
1635 DestElementAddr.withElementType(SrcElementAddr.getElementType());
1636
1637 // Now that all active lanes have read the element in the
1638 // Reduce list, shuffle over the value from the remote lane.
1639 if (ShuffleInElement) {
1640 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1641 RemoteLaneOffset, Private->getExprLoc());
1642 } else {
1643 switch (CGF.getEvaluationKind(Private->getType())) {
1644 case TEK_Scalar: {
1645 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1646 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1647 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1648 TBAAAccessInfo());
1649 // Store the source element value to the dest element address.
1650 CGF.EmitStoreOfScalar(
1651 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1652 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1653 break;
1654 }
1655 case TEK_Complex: {
1656 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1657 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1658 Private->getExprLoc());
1659 CGF.EmitStoreOfComplex(
1660 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1661 /*isInit=*/false);
1662 break;
1663 }
1664 case TEK_Aggregate:
1665 CGF.EmitAggregateCopy(
1666 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1667 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1668 Private->getType(), AggValueSlot::DoesNotOverlap);
1669 break;
1670 }
1671 }
1672
1673 // Step 3.1: Modify reference in dest Reduce list as needed.
1674 // Modifying the reference in Reduce list to point to the newly
1675 // created element. The element is live in the current function
1676 // scope and that of functions it invokes (i.e., reduce_function).
1677 // RemoteReduceData[i] = (void*)&RemoteElem
1678 if (UpdateDestListPtr) {
1679 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1680 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1681 DestElementPtrAddr, /*Volatile=*/false,
1682 C.VoidPtrTy);
1683 }
1684
1685 ++Idx;
1686 }
1687}
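// Illustrative sketch (not part of the original source): the effect of the
// copy emitted above for a reduce list of N void* element pointers, in plain
// C++. For RemoteLaneToThread the bytes really come from a remote lane via
// shuffleAndStore; memcpy stands in for that here.
#include <cstdlib>
#include <cstring>

struct ElemDescModel { std::size_t Size; };

static void reductionListCopyModel(bool RemoteLaneToThread, void **Src,
                                   void **Dst, const ElemDescModel *Elems,
                                   unsigned N) {
  for (unsigned I = 0; I < N; ++I) {
    if (RemoteLaneToThread) {
      // Create a fresh temporary, fill it, and repoint the dest list at it:
      // RemoteReduceData[i] = (void*)&RemoteElem.
      void *Tmp = std::malloc(Elems[I].Size);
      std::memcpy(Tmp, Src[I], Elems[I].Size);
      Dst[I] = Tmp;
    } else {
      // ThreadCopy: the dest element already exists; copy the value in place.
      std::memcpy(Dst[I], Src[I], Elems[I].Size);
    }
  }
}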
1688
1689/// This function emits a helper that gathers Reduce lists from the first
1690/// lane of every active warp to lanes in the first warp.
1691///
1692/// void inter_warp_copy_func(void* reduce_data, num_warps)
1693/// shared smem[warp_size];
1694/// For all data entries D in reduce_data:
1695/// sync
1696/// If (I am the first lane in each warp)
1697/// Copy my local D to smem[warp_id]
1698/// sync
1699/// if (I am the first warp)
1700/// Copy smem[thread_id] to my local D
1701static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1702 ArrayRef<const Expr *> Privates,
1703 QualType ReductionArrayTy,
1704 SourceLocation Loc) {
1705 ASTContext &C = CGM.getContext();
1706 llvm::Module &M = CGM.getModule();
1707
1708 // ReduceList: thread local Reduce list.
1709 // At the stage of the computation when this function is called, partially
1710 // aggregated values reside in the first lane of every active warp.
1711 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1712 C.VoidPtrTy, ImplicitParamKind::Other);
1713 // NumWarps: number of warps active in the parallel region. This could
1714 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1715 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1716 C.getIntTypeForBitwidth(32, /* Signed */ true),
1717 ImplicitParamKind::Other);
1718 FunctionArgList Args;
1719 Args.push_back(&ReduceListArg);
1720 Args.push_back(&NumWarpsArg);
1721
1722 const CGFunctionInfo &CGFI =
1723 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1724 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1725 llvm::GlobalValue::InternalLinkage,
1726 "_omp_reduction_inter_warp_copy_func", &M);
1728 Fn->setDoesNotRecurse();
1729 CodeGenFunction CGF(CGM);
1730 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1731
1732 CGBuilderTy &Bld = CGF.Builder;
1733
1734 // This array is used as a medium to transfer, one reduce element at a time,
1735 // the data from the first lane of every warp to lanes in the first warp
1736 // in order to perform the final step of a reduction in a parallel region
1737 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1738 // for reduced latency, as well as to have a distinct copy for concurrently
1739 // executing target regions. The array is declared with weak linkage so
1740 // as to be shared across compilation units.
1741 StringRef TransferMediumName =
1742 "__openmp_nvptx_data_transfer_temporary_storage";
1743 llvm::GlobalVariable *TransferMedium =
1744 M.getGlobalVariable(TransferMediumName);
1745 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1746 if (!TransferMedium) {
1747 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1748 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1749 TransferMedium = new llvm::GlobalVariable(
1750 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1751 llvm::UndefValue::get(Ty), TransferMediumName,
1752 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1753 SharedAddressSpace);
1754 CGM.addCompilerUsedGlobal(TransferMedium);
1755 }
1756
1757 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1758 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1759 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1760 // nvptx_lane_id = nvptx_id % warpsize
1761 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1762 // nvptx_warp_id = nvptx_id / warpsize
1763 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1764
1765 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1766 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1767 Address LocalReduceList(
1768 Bld.CreatePointerBitCastOrAddrSpaceCast(
1769 CGF.EmitLoadOfScalar(
1770 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1771 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1772 ElemTy->getPointerTo()),
1773 ElemTy, CGF.getPointerAlign());
1774
1775 unsigned Idx = 0;
1776 for (const Expr *Private : Privates) {
1777 //
1778 // Warp master copies reduce element to transfer medium in __shared__
1779 // memory.
1780 //
1781 unsigned RealTySize =
1782 C.getTypeSizeInChars(Private->getType())
1783 .alignTo(C.getTypeAlignInChars(Private->getType()))
1784 .getQuantity();
1785 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1786 unsigned NumIters = RealTySize / TySize;
1787 if (NumIters == 0)
1788 continue;
1789 QualType CType = C.getIntTypeForBitwidth(
1790 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1791 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1792 CharUnits Align = CharUnits::fromQuantity(TySize);
1793 llvm::Value *Cnt = nullptr;
1794 Address CntAddr = Address::invalid();
1795 llvm::BasicBlock *PrecondBB = nullptr;
1796 llvm::BasicBlock *ExitBB = nullptr;
1797 if (NumIters > 1) {
1798 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1799 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1800 /*Volatile=*/false, C.IntTy);
1801 PrecondBB = CGF.createBasicBlock("precond");
1802 ExitBB = CGF.createBasicBlock("exit");
1803 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1804 // There is no need to emit line number for unconditional branch.
1806 CGF.EmitBlock(PrecondBB);
1807 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1808 llvm::Value *Cmp =
1809 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1810 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1811 CGF.EmitBlock(BodyBB);
1812 }
1813 // kmpc_barrier.
1814 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1815 /*EmitChecks=*/false,
1816 /*ForceSimpleCall=*/true);
1817 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1818 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1819 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1820
1821 // if (lane_id == 0)
1822 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1823 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1824 CGF.EmitBlock(ThenBB);
1825
1826 // Reduce element = LocalReduceList[i]
1827 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1828 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1829 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1830 // elemptr = ((CopyType*)(elemptrptr)) + I
1831 Address ElemPtr(ElemPtrPtr, CopyType, Align);
1832 if (NumIters > 1)
1833 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1834
1835 // Get pointer to location in transfer medium.
1836 // MediumPtr = &medium[warp_id]
1837 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1838 TransferMedium->getValueType(), TransferMedium,
1839 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1840 // Casting to actual data type.
1841 // MediumPtr = (CopyType*)MediumPtrAddr;
1842 Address MediumPtr(MediumPtrVal, CopyType, Align);
1843
1844 // elem = *elemptr
1845 //*MediumPtr = elem
1846 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1847 ElemPtr, /*Volatile=*/false, CType, Loc,
1848 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1849 // Store the source element value to the dest element address.
1850 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1851 LValueBaseInfo(AlignmentSource::Type),
1852 TBAAAccessInfo());
1853
1854 Bld.CreateBr(MergeBB);
1855
1856 CGF.EmitBlock(ElseBB);
1857 Bld.CreateBr(MergeBB);
1858
1859 CGF.EmitBlock(MergeBB);
1860
1861 // kmpc_barrier.
1862 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1863 /*EmitChecks=*/false,
1864 /*ForceSimpleCall=*/true);
1865
1866 //
1867 // Warp 0 copies reduce element from transfer medium.
1868 //
1869 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1870 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1871 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1872
1873 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1874 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1875 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1876
1877 // Up to 32 threads in warp 0 are active.
1878 llvm::Value *IsActiveThread =
1879 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1880 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1881
1882 CGF.EmitBlock(W0ThenBB);
1883
1884 // SrcMediumPtr = &medium[tid]
1885 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1886 TransferMedium->getValueType(), TransferMedium,
1887 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1888 // SrcMediumVal = *SrcMediumPtr;
1889 Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1890
1891 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1892 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1893 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1894 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1895 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1896 if (NumIters > 1)
1897 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1898
1899 // *TargetElemPtr = SrcMediumVal;
1900 llvm::Value *SrcMediumValue =
1901 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1902 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1903 CType);
1904 Bld.CreateBr(W0MergeBB);
1905
1906 CGF.EmitBlock(W0ElseBB);
1907 Bld.CreateBr(W0MergeBB);
1908
1909 CGF.EmitBlock(W0MergeBB);
1910
1911 if (NumIters > 1) {
1912 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1913 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1914 CGF.EmitBranch(PrecondBB);
1916 CGF.EmitBlock(ExitBB);
1917 }
1918 RealTySize %= TySize;
1919 }
1920 ++Idx;
1921 }
1922
1923 CGF.FinishFunction();
1924 return Fn;
1925}
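// Illustrative sketch (not part of the original source): the chunking policy
// the helper above uses when staging one reduce element through the 32-bit
// shared-memory transfer array, written as plain C++.
#include <cstdio>

static void printTransferPlan(unsigned RealTySize) {
  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
    unsigned NumIters = RealTySize / TySize;
    if (NumIters == 0)
      continue;
    std::printf("stage %u chunk(s) of %u byte(s) through the medium\n",
                NumIters, TySize);
    RealTySize %= TySize;
  }
}
// For a 7-byte element this prints one 4-byte, one 2-byte and one 1-byte
// round, each round synchronized with the barriers emitted above.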
1926
1927/// Emit a helper that reduces data across two OpenMP threads (lanes)
1928/// in the same warp. It uses shuffle instructions to copy over data from
1929/// a remote lane's stack. The reduction algorithm performed is specified
1930/// by the fourth parameter.
1931///
1932/// Algorithm Versions.
1933/// Full Warp Reduce (argument value 0):
1934/// This algorithm assumes that all 32 lanes are active and gathers
1935/// data from these 32 lanes, producing a single resultant value.
1936/// Contiguous Partial Warp Reduce (argument value 1):
1937/// This algorithm assumes that only a *contiguous* subset of lanes
1938/// are active. This happens for the last warp in a parallel region
1939/// when the user specified num_threads is not an integer multiple of
1940/// 32. This contiguous subset always starts with the zeroth lane.
1941/// Partial Warp Reduce (argument value 2):
1942/// This algorithm gathers data from any number of lanes at any position.
1943/// All reduced values are stored in the lowest possible lane. The set
1944/// of problems every algorithm addresses is a superset of those
1945/// addressable by algorithms with a lower version number. Overhead
1946/// increases as algorithm version increases.
1947///
1948/// Terminology
1949/// Reduce element:
1950/// Reduce element refers to the individual data field with primitive
1951/// data types to be combined and reduced across threads.
1952/// Reduce list:
1953/// Reduce list refers to a collection of local, thread-private
1954/// reduce elements.
1955/// Remote Reduce list:
1956/// Remote Reduce list refers to a collection of remote (relative to
1957/// the current thread) reduce elements.
1958///
1959/// We distinguish between three states of threads that are important to
1960/// the implementation of this function.
1961/// Alive threads:
1962/// Threads in a warp executing the SIMT instruction, as distinguished from
1963/// threads that are inactive due to divergent control flow.
1964/// Active threads:
1965/// The minimal set of threads that has to be alive upon entry to this
1966/// function. The computation is correct iff active threads are alive.
1967/// Some threads are alive but they are not active because they do not
1968/// contribute to the computation in any useful manner. Turning them off
1969/// may introduce control flow overheads without any tangible benefits.
1970/// Effective threads:
1971/// In order to comply with the argument requirements of the shuffle
1972/// function, we must keep all lanes holding data alive. But at most
1973/// half of them perform value aggregation; we refer to this half of
1974/// threads as effective. The other half is simply handing off their
1975/// data.
1976///
1977/// Procedure
1978/// Value shuffle:
1979/// In this step active threads transfer data from higher lane positions
1980/// in the warp to lower lane positions, creating Remote Reduce list.
1981/// Value aggregation:
1982/// In this step, effective threads combine their thread local Reduce list
1983/// with Remote Reduce list and store the result in the thread local
1984/// Reduce list.
1985/// Value copy:
1986/// In this step, we deal with the assumption made by algorithm 2
1987/// (i.e. contiguity assumption). When we have an odd number of lanes
1988/// active, say 2k+1, only k threads will be effective and therefore k
1989/// new values will be produced. However, the Reduce list owned by the
1990/// (2k+1)th thread is ignored in the value aggregation. Therefore
1991/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1992/// that the contiguity assumption still holds.
1993static llvm::Function *emitShuffleAndReduceFunction(
1994 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
1995 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
1996 ASTContext &C = CGM.getContext();
1997
1998 // Thread local Reduce list used to host the values of data to be reduced.
1999 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2000 C.VoidPtrTy, ImplicitParamKind::Other);
2001 // Current lane id; could be logical.
2002 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2003 ImplicitParamKind::Other);
2004 // Offset of the remote source lane relative to the current lane.
2005 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2006 C.ShortTy, ImplicitParamKind::Other);
2007 // Algorithm version. This is expected to be known at compile time.
2008 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2009 C.ShortTy, ImplicitParamKind::Other);
2010 FunctionArgList Args;
2011 Args.push_back(&ReduceListArg);
2012 Args.push_back(&LaneIDArg);
2013 Args.push_back(&RemoteLaneOffsetArg);
2014 Args.push_back(&AlgoVerArg);
2015
2016 const CGFunctionInfo &CGFI =
2017 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2018 auto *Fn = llvm::Function::Create(
2019 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2020 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2022 Fn->setDoesNotRecurse();
2023
2024 CodeGenFunction CGF(CGM);
2025 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2026
2027 CGBuilderTy &Bld = CGF.Builder;
2028
2029 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2030 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2031 Address LocalReduceList(
2032 Bld.CreatePointerBitCastOrAddrSpaceCast(
2033 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2034 C.VoidPtrTy, SourceLocation()),
2035 ElemTy->getPointerTo()),
2036 ElemTy, CGF.getPointerAlign());
2037
2038 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2039 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2040 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2041
2042 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2043 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2044 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2045
2046 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2047 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2048 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2049
2050 // Create a local thread-private variable to host the Reduce list
2051 // from a remote lane.
2052 Address RemoteReduceList =
2053 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2054
2055 // This loop iterates through the list of reduce elements and copies,
2056 // element by element, from a remote lane in the warp to RemoteReduceList,
2057 // hosted on the thread's stack.
2058 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2059 LocalReduceList, RemoteReduceList,
2060 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2061 /*ScratchpadIndex=*/nullptr,
2062 /*ScratchpadWidth=*/nullptr});
2063
2064 // The actions to be performed on the Remote Reduce list are dependent
2065 // on the algorithm version.
2066 //
2067 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2068 // LaneId % 2 == 0 && Offset > 0):
2069 // do the reduction value aggregation
2070 //
2071 // The thread local variable Reduce list is mutated in place to host the
2072 // reduced data, which is the aggregated value produced from local and
2073 // remote lanes.
2074 //
2075 // Note that AlgoVer is expected to be a constant integer known at compile
2076 // time.
2077 // When AlgoVer==0, the first conjunction evaluates to true, making
2078 // the entire predicate true during compile time.
2079 // When AlgoVer==1, the second conjunction has only the second part to be
2080 // evaluated during runtime. The other conjunctions evaluate to false
2081 // during compile time.
2082 // When AlgoVer==2, the third conjunction has only the second part to be
2083 // evaluated during runtime. The other conjunctions evaluate to false
2084 // during compile time.
2085 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2086
2087 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2088 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2089 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2090
2091 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2092 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2093 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2094 CondAlgo2 = Bld.CreateAnd(
2095 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2096
2097 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2098 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2099
2100 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2101 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2102 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2103 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2104
2105 CGF.EmitBlock(ThenBB);
2106 // reduce_function(LocalReduceList, RemoteReduceList)
2107 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2108 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2109 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2110 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2111 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2112 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2113 Bld.CreateBr(MergeBB);
2114
2115 CGF.EmitBlock(ElseBB);
2116 Bld.CreateBr(MergeBB);
2117
2118 CGF.EmitBlock(MergeBB);
2119
2120 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2121 // Reduce list.
2122 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2123 llvm::Value *CondCopy = Bld.CreateAnd(
2124 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2125
2126 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2127 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2128 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2129 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2130
2131 CGF.EmitBlock(CpyThenBB);
2132 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2133 RemoteReduceList, LocalReduceList);
2134 Bld.CreateBr(CpyMergeBB);
2135
2136 CGF.EmitBlock(CpyElseBB);
2137 Bld.CreateBr(CpyMergeBB);
2138
2139 CGF.EmitBlock(CpyMergeBB);
2140
2141 CGF.FinishFunction();
2142 return Fn;
2143}
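// Illustrative sketch (not part of the original source): the two predicates
// emitted above, written out as plain C++ boolean expressions.
static bool shouldReduce(int AlgoVer, unsigned LaneId, int RemoteLaneOffset) {
  return AlgoVer == 0 ||                                          // full warp reduce
         (AlgoVer == 1 && LaneId < (unsigned)RemoteLaneOffset) || // contiguous
         (AlgoVer == 2 && LaneId % 2 == 0 && RemoteLaneOffset > 0); // dispersed
}

static bool shouldCopyRemoteToLocal(int AlgoVer, unsigned LaneId,
                                    int RemoteLaneOffset) {
  // Algorithm 1 only: lanes at or past the offset adopt the remote Reduce
  // list so the contiguity assumption keeps holding for the next round.
  return AlgoVer == 1 && LaneId >= (unsigned)RemoteLaneOffset;
}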
2144
2145/// This function emits a helper that copies all the reduction variables from
2146/// the team into the provided global buffer for the reduction variables.
2147///
2148/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2149/// For all data entries D in reduce_data:
2150/// Copy local D to buffer.D[Idx]
2151static llvm::Value *emitListToGlobalCopyFunction(
2152 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2153 QualType ReductionArrayTy, SourceLocation Loc,
2154 const RecordDecl *TeamReductionRec,
2155 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2156 &VarFieldMap) {
2157 ASTContext &C = CGM.getContext();
2158
2159 // Buffer: global reduction buffer.
2160 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2161 C.VoidPtrTy, ImplicitParamKind::Other);
2162 // Idx: index of the buffer.
2163 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2164 ImplicitParamKind::Other);
2165 // ReduceList: thread local Reduce list.
2166 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2167 C.VoidPtrTy, ImplicitParamKind::Other);
2168 FunctionArgList Args;
2169 Args.push_back(&BufferArg);
2170 Args.push_back(&IdxArg);
2171 Args.push_back(&ReduceListArg);
2172
2173 const CGFunctionInfo &CGFI =
2174 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2175 auto *Fn = llvm::Function::Create(
2176 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2177 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2179 Fn->setDoesNotRecurse();
2180 CodeGenFunction CGF(CGM);
2181 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2182
2183 CGBuilderTy &Bld = CGF.Builder;
2184
2185 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2186 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2187 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2188 Address LocalReduceList(
2189 Bld.CreatePointerBitCastOrAddrSpaceCast(
2190 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2191 C.VoidPtrTy, Loc),
2192 ElemTy->getPointerTo()),
2193 ElemTy, CGF.getPointerAlign());
2194 QualType StaticTy = C.getRecordType(TeamReductionRec);
2195 llvm::Type *LLVMReductionsBufferTy =
2196 CGM.getTypes().ConvertTypeForMem(StaticTy);
2197 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2198 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2199 LLVMReductionsBufferTy->getPointerTo());
2200 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2201 /*Volatile=*/false, C.IntTy,
2202 Loc)};
2203 unsigned Idx = 0;
2204 for (const Expr *Private : Privates) {
2205 // Reduce element = LocalReduceList[i]
2206 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2207 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2208 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2209 // elemptr = ((CopyType*)(elemptrptr)) + I
2210 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2211 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2212 ElemPtrPtr, ElemTy->getPointerTo());
2213 Address ElemPtr =
2214 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2215 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2216 // Global = Buffer.VD[Idx];
2217 const FieldDecl *FD = VarFieldMap.lookup(VD);
2218 llvm::Value *BufferPtr =
2219 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2220 LValue GlobLVal = CGF.EmitLValueForField(
2221 CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
2222 Address GlobAddr = GlobLVal.getAddress(CGF);
2223 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2224 CGF.ConvertTypeForMem(Private->getType()),
2225 GlobAddr.getAlignment()));
2226 switch (CGF.getEvaluationKind(Private->getType())) {
2227 case TEK_Scalar: {
2228 llvm::Value *V = CGF.EmitLoadOfScalar(
2229 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2230 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2231 CGF.EmitStoreOfScalar(V, GlobLVal);
2232 break;
2233 }
2234 case TEK_Complex: {
2235 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2236 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2237 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2238 break;
2239 }
2240 case TEK_Aggregate:
2241 CGF.EmitAggregateCopy(GlobLVal,
2242 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2243 Private->getType(), AggValueSlot::DoesNotOverlap);
2244 break;
2245 }
2246 ++Idx;
2247 }
2248
2249 CGF.FinishFunction();
2250 return Fn;
2251}
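// Illustrative sketch (not part of the original source): the shape of the
// generated list_to_global_copy_func for a single double-typed reduction
// variable. BufferSlotModel models one row of the team reduction record.
struct BufferSlotModel { double D0; };

static void listToGlobalCopyModel(BufferSlotModel *Buffer, int Idx,
                                  void **ReduceList) {
  // Copy local D to buffer.D[Idx].
  Buffer[Idx].D0 = *static_cast<double *>(ReduceList[0]);
}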
2252
2253/// This function emits a helper that reduces all the reduction variables from
2254/// the team into the provided global buffer for the reduction variables.
2255///
2256/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2257/// void *GlobPtrs[];
2258/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2259/// ...
2260/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2261/// reduce_function(GlobPtrs, reduce_data);
2262static llvm::Value *emitListToGlobalReduceFunction(
2263 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2264 QualType ReductionArrayTy, SourceLocation Loc,
2265 const RecordDecl *TeamReductionRec,
2266 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2267 &VarFieldMap,
2268 llvm::Function *ReduceFn) {
2269 ASTContext &C = CGM.getContext();
2270
2271 // Buffer: global reduction buffer.
2272 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2273 C.VoidPtrTy, ImplicitParamKind::Other);
2274 // Idx: index of the buffer.
2275 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2276 ImplicitParamKind::Other);
2277 // ReduceList: thread local Reduce list.
2278 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2279 C.VoidPtrTy, ImplicitParamKind::Other);
2280 FunctionArgList Args;
2281 Args.push_back(&BufferArg);
2282 Args.push_back(&IdxArg);
2283 Args.push_back(&ReduceListArg);
2284
2285 const CGFunctionInfo &CGFI =
2286 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2287 auto *Fn = llvm::Function::Create(
2288 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2289 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2291 Fn->setDoesNotRecurse();
2292 CodeGenFunction CGF(CGM);
2293 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2294
2295 CGBuilderTy &Bld = CGF.Builder;
2296
2297 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2298 QualType StaticTy = C.getRecordType(TeamReductionRec);
2299 llvm::Type *LLVMReductionsBufferTy =
2300 CGM.getTypes().ConvertTypeForMem(StaticTy);
2301 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2302 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2303 LLVMReductionsBufferTy->getPointerTo());
2304
2305 // 1. Build a list of reduction variables.
2306 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2307 Address ReductionList =
2308 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2309 auto IPriv = Privates.begin();
2310 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2311 /*Volatile=*/false, C.IntTy,
2312 Loc)};
2313 unsigned Idx = 0;
2314 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2315 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2316 // Global = Buffer.VD[Idx];
2317 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2318 const FieldDecl *FD = VarFieldMap.lookup(VD);
2319 llvm::Value *BufferPtr =
2320 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2321 LValue GlobLVal = CGF.EmitLValueForField(
2322 CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
2323 Address GlobAddr = GlobLVal.getAddress(CGF);
2324 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2325 C.VoidPtrTy);
2326 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2327 // Store array size.
2328 ++Idx;
2329 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2330 llvm::Value *Size = CGF.Builder.CreateIntCast(
2331 CGF.getVLASize(
2332 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2333 .NumElts,
2334 CGF.SizeTy, /*isSigned=*/false);
2335 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2336 Elem);
2337 }
2338 }
2339
2340 // Call reduce_function(GlobalReduceList, ReduceList)
2341 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2342 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2343 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2344 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2345 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2346 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2347 CGF.FinishFunction();
2348 return Fn;
2349}
2350
2351/// This function emits a helper that copies the reduction variables from the
2352/// provided global buffer back into the team's local reduce list.
2353///
2354/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2355/// For all data entries D in reduce_data:
2356/// Copy buffer.D[Idx] to local D;
2357static llvm::Value *emitGlobalToListCopyFunction(
2358 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2359 QualType ReductionArrayTy, SourceLocation Loc,
2360 const RecordDecl *TeamReductionRec,
2361 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2362 &VarFieldMap) {
2363 ASTContext &C = CGM.getContext();
2364
2365 // Buffer: global reduction buffer.
2366 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2367 C.VoidPtrTy, ImplicitParamKind::Other);
2368 // Idx: index of the buffer.
2369 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2370 ImplicitParamKind::Other);
2371 // ReduceList: thread local Reduce list.
2372 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2373 C.VoidPtrTy, ImplicitParamKind::Other);
2374 FunctionArgList Args;
2375 Args.push_back(&BufferArg);
2376 Args.push_back(&IdxArg);
2377 Args.push_back(&ReduceListArg);
2378
2379 const CGFunctionInfo &CGFI =
2380 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2381 auto *Fn = llvm::Function::Create(
2382 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2383 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2385 Fn->setDoesNotRecurse();
2386 CodeGenFunction CGF(CGM);
2387 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2388
2389 CGBuilderTy &Bld = CGF.Builder;
2390
2391 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2392 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2393 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2394 Address LocalReduceList(
2395 Bld.CreatePointerBitCastOrAddrSpaceCast(
2396 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2397 C.VoidPtrTy, Loc),
2398 ElemTy->getPointerTo()),
2399 ElemTy, CGF.getPointerAlign());
2400 QualType StaticTy = C.getRecordType(TeamReductionRec);
2401 llvm::Type *LLVMReductionsBufferTy =
2402 CGM.getTypes().ConvertTypeForMem(StaticTy);
2403 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2404 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2405 LLVMReductionsBufferTy->getPointerTo());
2406
2407 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2408 /*Volatile=*/false, C.IntTy,
2409 Loc)};
2410 unsigned Idx = 0;
2411 for (const Expr *Private : Privates) {
2412 // Reduce element = LocalReduceList[i]
2413 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2414 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2415 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2416 // elemptr = ((CopyType*)(elemptrptr)) + I
2417 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2418 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2419 ElemPtrPtr, ElemTy->getPointerTo());
2420 Address ElemPtr =
2421 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2422 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2423 // Global = Buffer.VD[Idx];
2424 const FieldDecl *FD = VarFieldMap.lookup(VD);
2425 llvm::Value *BufferPtr =
2426 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2427 LValue GlobLVal = CGF.EmitLValueForField(
2428 CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
2429 Address GlobAddr = GlobLVal.getAddress(CGF);
2430 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2431 CGF.ConvertTypeForMem(Private->getType()),
2432 GlobAddr.getAlignment()));
2433 switch (CGF.getEvaluationKind(Private->getType())) {
2434 case TEK_Scalar: {
2435 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2436 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2437 LValueBaseInfo(AlignmentSource::Type),
2438 TBAAAccessInfo());
2439 break;
2440 }
2441 case TEK_Complex: {
2442 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2443 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2444 /*isInit=*/false);
2445 break;
2446 }
2447 case TEK_Aggregate:
2448 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2449 GlobLVal, Private->getType(),
2450 AggValueSlot::DoesNotOverlap);
2451 break;
2452 }
2453 ++Idx;
2454 }
2455
2456 CGF.FinishFunction();
2457 return Fn;
2458}
2459
2460/// This function emits a helper that reduces the data in the provided global
2461/// buffer into the team's thread-local reduce list.
2462///
2463/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2464/// void *GlobPtrs[];
2465/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2466/// ...
2467/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2468/// reduce_function(reduce_data, GlobPtrs);
2469static llvm::Value *emitGlobalToListReduceFunction(
2470 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2471 QualType ReductionArrayTy, SourceLocation Loc,
2472 const RecordDecl *TeamReductionRec,
2473 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2474 &VarFieldMap,
2475 llvm::Function *ReduceFn) {
2476 ASTContext &C = CGM.getContext();
2477
2478 // Buffer: global reduction buffer.
2479 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2480 C.VoidPtrTy, ImplicitParamKind::Other);
2481 // Idx: index of the buffer.
2482 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2483 ImplicitParamKind::Other);
2484 // ReduceList: thread local Reduce list.
2485 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2486 C.VoidPtrTy, ImplicitParamKind::Other);
2487 FunctionArgList Args;
2488 Args.push_back(&BufferArg);
2489 Args.push_back(&IdxArg);
2490 Args.push_back(&ReduceListArg);
2491
2492 const CGFunctionInfo &CGFI =
2493 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2494 auto *Fn = llvm::Function::Create(
2495 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2496 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2498 Fn->setDoesNotRecurse();
2499 CodeGenFunction CGF(CGM);
2500 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2501
2502 CGBuilderTy &Bld = CGF.Builder;
2503
2504 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2505 QualType StaticTy = C.getRecordType(TeamReductionRec);
2506 llvm::Type *LLVMReductionsBufferTy =
2507 CGM.getTypes().ConvertTypeForMem(StaticTy);
2508 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2509 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2510 LLVMReductionsBufferTy->getPointerTo());
2511
2512 // 1. Build a list of reduction variables.
2513 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2514 Address ReductionList =
2515 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2516 auto IPriv = Privates.begin();
2517 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2518 /*Volatile=*/false, C.IntTy,
2519 Loc)};
2520 unsigned Idx = 0;
2521 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2522 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2523 // Global = Buffer.VD[Idx];
2524 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2525 const FieldDecl *FD = VarFieldMap.lookup(VD);
2526 llvm::Value *BufferPtr =
2527 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2528 LValue GlobLVal = CGF.EmitLValueForField(
2529 CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
2530 Address GlobAddr = GlobLVal.getAddress(CGF);
2531 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2532 C.VoidPtrTy);
2533 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2534 // Store array size.
2535 ++Idx;
2536 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2537 llvm::Value *Size = CGF.Builder.CreateIntCast(
2538 CGF.getVLASize(
2539 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2540 .NumElts,
2541 CGF.SizeTy, /*isSigned=*/false);
2542 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2543 Elem);
2544 }
2545 }
2546
2547 // Call reduce_function(ReduceList, GlobalReduceList)
2548 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2549 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2550 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2551 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2552 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2553 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2554 CGF.FinishFunction();
2555 return Fn;
2556}
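// Illustrative sketch (not part of the original source): the shape of the
// generated global_to_list_reduce_func for a single double reduced with '+'.
// reduceFunctionModel stands in for the compiler-generated combiner, whose
// first argument is the LHS that receives the result.
struct TeamBufferSlotModel { double D0; };

static void reduceFunctionModel(void **LHS, void **RHS) {
  *static_cast<double *>(LHS[0]) += *static_cast<double *>(RHS[0]);
}

static void globalToListReduceModel(TeamBufferSlotModel *Buffer, int Idx,
                                    void **ReduceList) {
  void *GlobPtrs[1];
  GlobPtrs[0] = &Buffer[Idx].D0;               // GlobPtrs[0] = &buffer.D0[Idx]
  reduceFunctionModel(ReduceList, GlobPtrs);   // reduce_function(reduce_data, GlobPtrs)
}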
2557
2558///
2559/// Design of OpenMP reductions on the GPU
2560///
2561/// Consider a typical OpenMP program with one or more reduction
2562/// clauses:
2563///
2564/// float foo;
2565/// double bar;
2566/// #pragma omp target teams distribute parallel for \
2567/// reduction(+:foo) reduction(*:bar)
2568/// for (int i = 0; i < N; i++) {
2569/// foo += A[i]; bar *= B[i];
2570/// }
2571///
2572/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2573/// all teams. In our OpenMP implementation on the NVPTX device an
2574/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2575/// within a team are mapped to CUDA threads within a threadblock.
2576/// Our goal is to efficiently aggregate values across all OpenMP
2577/// threads such that:
2578///
2579/// - the compiler and runtime are logically concise, and
2580/// - the reduction is performed efficiently in a hierarchical
2581/// manner as follows: within OpenMP threads in the same warp,
2582/// across warps in a threadblock, and finally across teams on
2583/// the NVPTX device.
2584///
2585/// Introduction to Decoupling
2586///
2587/// We would like to decouple the compiler and the runtime so that the
2588/// latter is ignorant of the reduction variables (number, data types)
2589/// and the reduction operators. This allows a simpler interface
2590/// and implementation while still attaining good performance.
2591///
2592/// Pseudocode for the aforementioned OpenMP program generated by the
2593/// compiler is as follows:
2594///
2595/// 1. Create private copies of reduction variables on each OpenMP
2596/// thread: 'foo_private', 'bar_private'
2597/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2598/// to it and writes the result in 'foo_private' and 'bar_private'
2599/// respectively.
2600/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2601/// and store the result on the team master:
2602///
2603/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2604/// reduceData, shuffleReduceFn, interWarpCpyFn)
2605///
2606/// where:
2607/// struct ReduceData {
2608/// double *foo;
2609/// double *bar;
2610/// } reduceData
2611/// reduceData.foo = &foo_private
2612/// reduceData.bar = &bar_private
2613///
2614/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2615/// auxiliary functions generated by the compiler that operate on
2616/// variables of type 'ReduceData'. They aid the runtime in performing
2617/// algorithmic steps in a data-agnostic manner.
2618///
2619/// 'shuffleReduceFn' is a pointer to a function that reduces data
2620/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2621/// same warp. It takes the following arguments as input:
2622///
2623/// a. variable of type 'ReduceData' on the calling lane,
2624/// b. its lane_id,
2625/// c. an offset relative to the current lane_id to generate a
2626/// remote_lane_id. The remote lane contains the second
2627/// variable of type 'ReduceData' that is to be reduced.
2628/// d. an algorithm version parameter determining which reduction
2629/// algorithm to use.
2630///
2631/// 'shuffleReduceFn' retrieves data from the remote lane using
2632/// efficient GPU shuffle intrinsics and reduces, using the
2633/// algorithm specified by the 4th parameter, the two operands
2634/// element-wise. The result is written to the first operand.
2635///
2636/// Different reduction algorithms are implemented in different
2637/// runtime functions, all calling 'shuffleReduceFn' to perform
2638/// the essential reduction step. Therefore, based on the 4th
2639/// parameter, this function behaves slightly differently to
2640/// cooperate with the runtime to ensure correctness under
2641/// different circumstances.
2642///
2643/// 'InterWarpCpyFn' is a pointer to a function that transfers
2644/// reduced variables across warps. It tunnels, through CUDA
2645/// shared memory, the thread-private data of type 'ReduceData'
2646/// from lane 0 of each warp to a lane in the first warp.
2647/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2648/// The last team writes the global reduced value to memory.
2649///
2650/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2651/// reduceData, shuffleReduceFn, interWarpCpyFn,
2652/// scratchpadCopyFn, loadAndReduceFn)
2653///
2654/// 'scratchpadCopyFn' is a helper that stores reduced
2655/// data from the team master to a scratchpad array in
2656/// global memory.
2657///
2658/// 'loadAndReduceFn' is a helper that loads data from
2659/// the scratchpad array and reduces it with the input
2660/// operand.
2661///
2662/// These compiler generated functions hide address
2663/// calculation and alignment information from the runtime.
2664/// 5. if ret == 1:
2665/// The team master of the last team stores the reduced
2666/// result to the globals in memory.
2667/// foo += reduceData.foo; bar *= reduceData.bar
2668///
2669///
2670/// Warp Reduction Algorithms
2671///
2672/// On the warp level, we have three algorithms implemented in the
2673/// OpenMP runtime depending on the number of active lanes:
2674///
2675/// Full Warp Reduction
2676///
2677/// The reduce algorithm within a warp where all lanes are active
2678/// is implemented in the runtime as follows:
2679///
2680/// full_warp_reduce(void *reduce_data,
2681/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2682/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2683/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2684/// }
2685///
2686/// The algorithm completes in log(2, WARPSIZE) steps.
2687///
2688/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2689/// not used; we therefore save instructions by not retrieving lane_id
2690/// from the corresponding special registers. The 4th parameter, which
2691/// represents the version of the algorithm being used, is set to 0 to
2692/// signify full warp reduction.
2693///
2694/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2695///
2696/// #reduce_elem refers to an element in the local lane's data structure
2697/// #remote_elem is retrieved from a remote lane
2698/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2699/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2700///
2701/// Contiguous Partial Warp Reduction
2702///
2703/// This reduce algorithm is used within a warp where only the first
2704/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2705/// number of OpenMP threads in a parallel region is not a multiple of
2706/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2707///
2708/// void
2709/// contiguous_partial_reduce(void *reduce_data,
2710/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2711/// int size, int lane_id) {
2712/// int curr_size;
2713/// int offset;
2714/// curr_size = size;
2715/// offset = curr_size/2;
2716/// while (offset>0) {
2717/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2718/// curr_size = (curr_size+1)/2;
2719/// offset = curr_size/2;
2720/// }
2721/// }
2722///
2723/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2724///
2725/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2726/// if (lane_id < offset)
2727/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2728/// else
2729/// reduce_elem = remote_elem
2730///
2731/// This algorithm assumes that the data to be reduced are located in a
2732/// contiguous subset of lanes starting from the first. When there is
2733/// an odd number of active lanes, the data in the last lane is not
2734/// aggregated with any other lane's data but is instead copied over.
2735///
2736/// Dispersed Partial Warp Reduction
2737///
2738/// This algorithm is used within a warp when any discontiguous subset of
2739/// lanes are active. It is used to implement the reduction operation
2740/// across lanes in an OpenMP simd region or in a nested parallel region.
2741///
2742/// void
2743/// dispersed_partial_reduce(void *reduce_data,
2744/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2745/// int size, remote_id;
2746/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2747/// do {
2748/// remote_id = next_active_lane_id_right_after_me();
2749/// # the above function returns 0 if no active lane
2750/// # is present right after the current lane.
2751/// size = number_of_active_lanes_in_this_warp();
2752/// logical_lane_id /= 2;
2753/// ShuffleReduceFn(reduce_data, logical_lane_id,
2754/// remote_id-1-threadIdx.x, 2);
2755/// } while (logical_lane_id % 2 == 0 && size > 1);
2756/// }
2757///
2758/// There is no assumption made about the initial state of the reduction.
2759/// Any number of lanes (>=1) could be active at any position. The reduction
2760/// result is returned in the first active lane.
2761///
2762/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2763///
2764/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2765/// if (lane_id % 2 == 0 && offset > 0)
2766/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2767/// else
2768/// reduce_elem = remote_elem
2769///
2770///
2771/// Intra-Team Reduction
2772///
2773/// This function, as implemented in the runtime call
2774/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2775/// threads in a team. It first reduces within a warp using the
2776/// aforementioned algorithms. We then proceed to gather all such
2777/// reduced values at the first warp.
2778///
2779/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2780/// data from each of the "warp masters" (zeroth lane of each warp, where
2781/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2782/// a mathematical sense) the problem of reduction across warp masters in
2783/// a block to the problem of warp reduction.
2784///
2785///
2786/// Inter-Team Reduction
2787///
2788/// Once a team has reduced its data to a single value, it is stored in
2789/// a global scratchpad array. Since each team has a distinct slot, this
2790/// can be done without locking.
2791///
2792/// The last team to write to the scratchpad array proceeds to reduce the
2793/// scratchpad array. One or more workers in the last team use the helper
2794/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2795/// the k'th worker reduces every k'th element.
2796///
2797/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2798/// reduce across workers and compute a globally reduced value.
2799///
2800void CGOpenMPRuntimeGPU::emitReduction(
2801 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2802 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2803 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2804 if (!CGF.HaveInsertPoint())
2805 return;
2806
2807 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2808#ifndef NDEBUG
2809 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2810#endif
2811
2812 if (Options.SimpleReduction) {
2813 assert(!TeamsReduction && !ParallelReduction &&
2814 "Invalid reduction selection in emitReduction.");
2815 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2816 ReductionOps, Options);
2817 return;
2818 }
2819
2820 assert((TeamsReduction || ParallelReduction) &&
2821 "Invalid reduction selection in emitReduction.");
2822
2823 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2824 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2825 int Cnt = 0;
2826 for (const Expr *DRE : Privates) {
2827 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2828 ++Cnt;
2829 }
2830
2831 ASTContext &C = CGM.getContext();
2832 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2833 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
2834
2835 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2836 // RedList, shuffle_reduce_func, interwarp_copy_func);
2837 // or
2838 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2839 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2840
2841 llvm::Value *Res;
2842 // 1. Build a list of reduction variables.
2843 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2844 auto Size = RHSExprs.size();
2845 for (const Expr *E : Privates) {
2846 if (E->getType()->isVariablyModifiedType())
2847 // Reserve place for array size.
2848 ++Size;
2849 }
2850 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2851 QualType ReductionArrayTy = C.getConstantArrayType(
2852 C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
2853 /*IndexTypeQuals=*/0);
2854 Address ReductionList =
2855 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2856 auto IPriv = Privates.begin();
2857 unsigned Idx = 0;
2858 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2859 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2860 CGF.Builder.CreateStore(
2861 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2862 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2863 Elem);
2864 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2865 // Store array size.
2866 ++Idx;
2867 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2868 llvm::Value *Size = CGF.Builder.CreateIntCast(
2869 CGF.getVLASize(
2870 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2871 .NumElts,
2872 CGF.SizeTy, /*isSigned=*/false);
2873 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2874 Elem);
2875 }
2876 }
2877
2878 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2879 ReductionList.getPointer(), CGF.VoidPtrTy);
2880 llvm::Function *ReductionFn = emitReductionFunction(
2881 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2882 Privates, LHSExprs, RHSExprs, ReductionOps);
2883 llvm::Value *ReductionDataSize =
2884 CGF.getTypeSize(C.getRecordType(ReductionRec));
2885 ReductionDataSize =
2886 CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
2887 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2888 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2889 llvm::Value *InterWarpCopyFn =
2890 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2891
2892 if (ParallelReduction) {
2893 llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2894 InterWarpCopyFn};
2895
2896 Res = CGF.EmitRuntimeCall(
2897 OMPBuilder.getOrCreateRuntimeFunction(
2898 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2899 Args);
2900 } else {
2901 assert(TeamsReduction && "expected teams reduction.");
2902 TeamsReductions.push_back(ReductionRec);
2903 auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2904 OMPBuilder.getOrCreateRuntimeFunction(
2905 CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
2906 {}, "_openmp_teams_reductions_buffer_$_$ptr");
2907 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2908 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2909 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2910 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2911 ReductionFn);
2912 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2913 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2914 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2915 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2916 ReductionFn);
2917
2918 llvm::Value *Args[] = {
2919 RTLoc,
2920 KernelTeamsReductionPtr,
2921 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2922 ReductionDataSize,
2923 RL,
2924 ShuffleAndReduceFn,
2925 InterWarpCopyFn,
2926 GlobalToBufferCpyFn,
2927 GlobalToBufferRedFn,
2928 BufferToGlobalCpyFn,
2929 BufferToGlobalRedFn};
2930
2931 Res = CGF.EmitRuntimeCall(
2932 OMPBuilder.getOrCreateRuntimeFunction(
2933 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2934 Args);
2935 }
2936
2937 // 5. Build if (res == 1)
2938 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
2939 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
2940 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2941 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
2942 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
2943
2944 // 6. Build then branch: where we have reduced values in the master
2945 // thread in each team.
2946 // __kmpc_end_reduce{_nowait}(<gtid>);
2947 // break;
2948 CGF.EmitBlock(ThenBB);
2949
2950 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2951 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2952 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2953 auto IPriv = Privates.begin();
2954 auto ILHS = LHSExprs.begin();
2955 auto IRHS = RHSExprs.begin();
2956 for (const Expr *E : ReductionOps) {
2957 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2958 cast<DeclRefExpr>(*IRHS));
2959 ++IPriv;
2960 ++ILHS;
2961 ++IRHS;
2962 }
2963 };
2964 RegionCodeGenTy RCG(CodeGen);
2965 RCG(CGF);
2966 // There is no need to emit line number for unconditional branch.
2968 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2969}
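// Illustrative sketch (not part of the original source): the control flow
// emitReduction generates around the runtime call for a parallel reduction of
// one '+' variable, modelled in plain C++. parallelReduceNowaitModel stands in
// for the __kmpc_nvptx_parallel_reduce_nowait_v2 entry point used above.
static int parallelReduceNowaitModel(void ** /*RedList*/, long /*Size*/) {
  return 1; // the runtime returns 1 on the thread that must commit the result
}

static void emittedReductionShapeModel(double &FooLHS, double &FooRHS) {
  void *RedList[1] = {&FooRHS};                        // RedList[0] = &<RHS copy>
  int Res = parallelReduceNowaitModel(RedList, sizeof(double));
  if (Res == 1) {
    // Then-branch: one combiner per reduction clause (emitSingleReductionCombiner).
    FooLHS = FooLHS + FooRHS;
  }
  // Fallthrough to .omp.reduction.done.
}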
2970
2971const VarDecl *
2972CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2973 const VarDecl *NativeParam) const {
2974 if (!NativeParam->getType()->isReferenceType())
2975 return NativeParam;
2976 QualType ArgType = NativeParam->getType();
2977 QualifierCollector QC;
2978 const Type *NonQualTy = QC.strip(ArgType);
2979 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2980 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2981 if (Attr->getCaptureKind() == OMPC_map) {
2982 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2984 }
2985 }
2986 ArgType = CGM.getContext().getPointerType(PointeeTy);
2987 QC.addRestrict();
2988 enum { NVPTX_local_addr = 5 };
2989 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
2990 ArgType = QC.apply(CGM.getContext(), ArgType);
2991 if (isa<ImplicitParamDecl>(NativeParam))
2992 return ImplicitParamDecl::Create(
2993 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
2994 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
2995 return ParmVarDecl::Create(
2996 CGM.getContext(),
2997 const_cast<DeclContext *>(NativeParam->getDeclContext()),
2998 NativeParam->getBeginLoc(), NativeParam->getLocation(),
2999 NativeParam->getIdentifier(), ArgType,
3000 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3001}
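// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: hypothetical user code
// in which 's' is captured by reference for the map clause. translateParameter
// above then rewrites the outlined target function's native 'S &' parameter
// into a restrict-qualified pointer parameter in the NVPTX local address
// space, with the pointee qualified for the device's global address space.
struct S { int v; };
void touch(S &s) {
#pragma omp target map(tofrom : s)
  s.v += 1;
}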
3002
3003Address
3004CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3005 const VarDecl *NativeParam,
3006 const VarDecl *TargetParam) const {
3007 assert(NativeParam != TargetParam &&
3008 NativeParam->getType()->isReferenceType() &&
3009 "Native arg must not be the same as target arg.");
3010 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3011 QualType NativeParamType = NativeParam->getType();
3012 QualifierCollector QC;
3013 const Type *NonQualTy = QC.strip(NativeParamType);
3014 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3015 unsigned NativePointeeAddrSpace =
3016 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3017 QualType TargetTy = TargetParam->getType();
3018 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
3019 TargetTy, SourceLocation());
3020 // First cast to generic.
3021 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3022 TargetAddr,
3023 llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace=*/0));
3024 // Cast from generic to native address space.
3025 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3026 TargetAddr,
3027 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
3028 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3029 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3030 NativeParamType);
3031 return NativeParamAddr;
3032}
3033
3034void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3035 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3036 ArrayRef<llvm::Value *> Args) const {
3037 SmallVector<llvm::Value *, 4> TargetArgs;
3038 TargetArgs.reserve(Args.size());
3039 auto *FnType = OutlinedFn.getFunctionType();
3040 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3041 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3042 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3043 break;
3044 }
3045 llvm::Type *TargetType = FnType->getParamType(I);
3046 llvm::Value *NativeArg = Args[I];
3047 if (!TargetType->isPointerTy()) {
3048 TargetArgs.emplace_back(NativeArg);
3049 continue;
3050 }
3051 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3052 NativeArg,
3053 llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace*/ 0));
3054 TargetArgs.emplace_back(
3055 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3056 }
3057 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3058}
3059
3060/// Emit function which wraps the outline parallel region
3061/// and controls the arguments which are passed to this function.
3062/// The wrapper ensures that the outlined function is called
3063/// with the correct arguments when data is shared.
3064llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3065 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3066 ASTContext &Ctx = CGM.getContext();
3067 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3068
3069 // Create a function that takes as argument the source thread.
3070 FunctionArgList WrapperArgs;
3071 QualType Int16QTy =
3072 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3073 QualType Int32QTy =
3074 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3075 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3076 /*Id=*/nullptr, Int16QTy,
3077 ImplicitParamKind::Other);
3078 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3079 /*Id=*/nullptr, Int32QTy,
3080 ImplicitParamKind::Other);
3081 WrapperArgs.emplace_back(&ParallelLevelArg);
3082 WrapperArgs.emplace_back(&WrapperArg);
3083
3084 const CGFunctionInfo &CGFI =
3085 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3086
3087 auto *Fn = llvm::Function::Create(
3088 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3089 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3090
3091 // Ensure we do not inline the function. This is trivially true for the ones
3092 // passed to __kmpc_fork_call but the ones called in serialized regions
3093 // could be inlined. This is not perfect but it is closer to the invariant
3094 // we want, namely, every data environment starts with a new function.
3095 // TODO: We should pass the if condition to the runtime function and do the
3096 // handling there. Much cleaner code.
3097 Fn->addFnAttr(llvm::Attribute::NoInline);
3098
3099 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3100 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3101 Fn->setDoesNotRecurse();
3102
3103 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3104 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3105 D.getBeginLoc(), D.getBeginLoc());
3106
3107 const auto *RD = CS.getCapturedRecordDecl();
3108 auto CurField = RD->field_begin();
3109
3110 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3111 /*Name=*/".zero.addr");
3112 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3113 // Get the array of arguments.
3114 SmallVector<llvm::Value *, 20> Args;
3115
3116 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3117 Args.emplace_back(ZeroAddr.getPointer());
3118
3119 CGBuilderTy &Bld = CGF.Builder;
3120 auto CI = CS.capture_begin();
3121
3122 // Use global memory for data sharing.
3123 // Handle passing of global args to workers.
3124 Address GlobalArgs =
3125 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3126 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3127 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3128 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3129 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3130 DataSharingArgs);
3131
3132 // Retrieve the shared variables from the list of references returned
3133 // by the runtime. Pass the variables to the outlined function.
3134 Address SharedArgListAddress = Address::invalid();
3135 if (CS.capture_size() > 0 ||
3136 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137 SharedArgListAddress = CGF.EmitLoadOfPointer(
3138 GlobalArgs, CGF.getContext()
3139 .getPointerType(CGF.getContext().VoidPtrTy)
3140 .castAs<PointerType>());
3141 }
3142 unsigned Idx = 0;
3143 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3144 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3145 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3146 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3147 llvm::Value *LB = CGF.EmitLoadOfScalar(
3148 TypedAddress,
3149 /*Volatile=*/false,
3150 CGF.getContext().getSizeType(),
3151 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3152 Args.emplace_back(LB);
3153 ++Idx;
3154 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3155 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3156 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3157 llvm::Value *UB = CGF.EmitLoadOfScalar(
3158 TypedAddress,
3159 /*Volatile=*/false,
3160 CGF.getContext().getSizeType(),
3161 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3162 Args.emplace_back(UB);
3163 ++Idx;
3164 }
3165 if (CS.capture_size() > 0) {
3166 ASTContext &CGFContext = CGF.getContext();
3167 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3168 QualType ElemTy = CurField->getType();
3169 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3170 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3171 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3172 CGF.ConvertTypeForMem(ElemTy));
3173 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3174 /*Volatile=*/false,
3175 CGFContext.getPointerType(ElemTy),
3176 CI->getLocation());
3177 if (CI->capturesVariableByCopy() &&
3178 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3179 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3180 CI->getLocation());
3181 }
3182 Args.emplace_back(Arg);
3183 }
3184 }
3185
3186 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3187 CGF.FinishFunction();
3188 return Fn;
3189}
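// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: hypothetical user code
// with a 'parallel' region nested in a generic-mode target region. The
// outlined parallel body is entered through a '<outlined-name>_wrapper'
// function like the one built above, which fetches the shared locals via
// __kmpc_get_shared_variables before forwarding them as arguments.
void scale(double *a, int n, double f) {
#pragma omp target map(tofrom : a[0 : n]) firstprivate(f)
  {
    double factor = f * 2.0; // shared with the worker threads via the wrapper
#pragma omp parallel for
    for (int i = 0; i < n; ++i)
      a[i] *= factor;
  }
}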
3190
3191void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3192 const Decl *D) {
3193 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3194 return;
3195
3196 assert(D && "Expected function or captured|block decl.");
3197 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3198 "Function is registered already.");
3199 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3200 "Team is set but not processed.");
3201 const Stmt *Body = nullptr;
3202 bool NeedToDelayGlobalization = false;
3203 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3204 Body = FD->getBody();
3205 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3206 Body = BD->getBody();
3207 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3208 Body = CD->getBody();
3209 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3210 if (NeedToDelayGlobalization &&
3211 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3212 return;
3213 }
3214 if (!Body)
3215 return;
3216 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3217 VarChecker.Visit(Body);
3218 const RecordDecl *GlobalizedVarsRecord =
3219 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3220 TeamAndReductions.first = nullptr;
3221 TeamAndReductions.second.clear();
3222 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3223 VarChecker.getEscapedVariableLengthDecls();
3224 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3225 VarChecker.getDelayedVariableLengthDecls();
3226 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3227 DelayedVariableLengthDecls.empty())
3228 return;
3229 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3230 I->getSecond().MappedParams =
3231 std::make_unique<CodeGenFunction::OMPMapVars>();
3232 I->getSecond().EscapedParameters.insert(
3233 VarChecker.getEscapedParameters().begin(),
3234 VarChecker.getEscapedParameters().end());
3235 I->getSecond().EscapedVariableLengthDecls.append(
3236 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3237 I->getSecond().DelayedVariableLengthDecls.append(
3238 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
3239 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3240 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3241 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3242 Data.insert(std::make_pair(VD, MappedVarData()));
3243 }
3244 if (!NeedToDelayGlobalization) {
3245 emitGenericVarsProlog(CGF, D->getBeginLoc());
3246 struct GlobalizationScope final : EHScopeStack::Cleanup {
3247 GlobalizationScope() = default;
3248
3249 void Emit(CodeGenFunction &CGF, Flags flags) override {
3250 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3251 .emitGenericVarsEpilog(CGF);
3252 }
3253 };
3254 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3255 }
3256}
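// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: hypothetical user code
// where 'count' is declared by the target region's initial thread but written
// inside the nested parallel region. In generic data-sharing mode it escapes
// its declaring frame, so the prolog above globalizes it rather than leaving
// it in thread-private stack storage.
void count_evens(const int *a, int n, int *out) {
#pragma omp target map(to : a[0 : n]) map(from : out[0 : 1])
  {
    int count = 0;
#pragma omp parallel for shared(count)
    for (int i = 0; i < n; ++i) {
      if (a[i] % 2 == 0) {
#pragma omp atomic update
        ++count;
      }
    }
    *out = count;
  }
}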
3257
3258Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3259 const VarDecl *VD) {
3260 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3261 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3262 auto AS = LangAS::Default;
3263 switch (A->getAllocatorType()) {
3264 // Use the default allocator here as by default local vars are
3265 // threadlocal.
3266 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3267 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3268 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3269 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3270 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3271 // Follow the user decision - use default allocation.
3272 return Address::invalid();
3273 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3274 // TODO: implement support for user-defined allocators.
3275 return Address::invalid();
3276 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3277 AS = LangAS::cuda_constant;
3278 break;
3279 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3280 AS = LangAS::cuda_shared;
3281 break;
3282 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3283 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3284 break;
3285 }
3286 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3287 auto *GV = new llvm::GlobalVariable(
3288 CGM.getModule(), VarTy, /*isConstant=*/false,
3289 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3290 VD->getName(),
3291 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3292 CGM.getContext().getTargetAddressSpace(AS));
3293 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3294 GV->setAlignment(Align.getAsAlign());
3295 return Address(
3296 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3297 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3298 VD->getType().getAddressSpace()))),
3299 VarTy, Align);
3300 }
3301
3302 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3303 return Address::invalid();
3304
3305 VD = VD->getCanonicalDecl();
3306 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3307 if (I == FunctionGlobalizedDecls.end())
3308 return Address::invalid();
3309 auto VDI = I->getSecond().LocalVarData.find(VD);
3310 if (VDI != I->getSecond().LocalVarData.end())
3311 return VDI->second.PrivateAddr;
3312 if (VD->hasAttrs()) {
3313 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3314 E(VD->attr_end());
3315 IT != E; ++IT) {
3316 auto VDI = I->getSecond().LocalVarData.find(
3317 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3318 ->getCanonicalDecl());
3319 if (VDI != I->getSecond().LocalVarData.end())
3320 return VDI->second.PrivateAddr;
3321 }
3322 }
3323
3324 return Address::invalid();
3325}
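// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: hypothetical user code
// placing a target-region local under the predefined omp_pteam_mem_alloc
// allocator. Per the switch above, such a variable is backed by an internal
// global in the shared (cuda_shared) address space instead of a stack slot.
void team_scratch(float *out) {
#pragma omp target map(from : out[0 : 1])
  {
    float scratch;
#pragma omp allocate(scratch) allocator(omp_pteam_mem_alloc)
    scratch = 3.0f;
    out[0] = scratch;
  }
}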
3326
3327void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3328 FunctionGlobalizedDecls.erase(CGF.CurFn);
3329 CGOpenMPRuntime::functionFinished(CGF);
3330}
3331
3332void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3333 CodeGenFunction &CGF, const OMPLoopDirective &S,
3334 OpenMPDistScheduleClauseKind &ScheduleKind,
3335 llvm::Value *&Chunk) const {
3336 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3337 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3338 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3339 Chunk = CGF.EmitScalarConversion(
3340 RT.getGPUNumThreads(CGF),
3341 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3342 S.getIterationVariable()->getType(), S.getBeginLoc());
3343 return;
3344 }
3345 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3346 CGF, S, ScheduleKind, Chunk);
3347}
3348
3349void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3350 CodeGenFunction &CGF, const OMPLoopDirective &S,
3351 OpenMPScheduleClauseKind &ScheduleKind,
3352 const Expr *&ChunkExpr) const {
3353 ScheduleKind = OMPC_SCHEDULE_static;
3354 // Chunk size is 1 in this case.
3355 llvm::APInt ChunkSize(32, 1);
3356 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3357 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3358 SourceLocation());
3359}
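// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: for a hypothetical
// offloaded loop with no schedule clauses, the two overrides above choose the
// GPU defaults: dist_schedule(static, <threads-per-team>) in SPMD mode and
// schedule(static, 1) for the worksharing loop.
void axpy(float *y, const float *x, float a, int n) {
#pragma omp target teams distribute parallel for \
    map(to : x[0 : n]) map(tofrom : y[0 : n])
  for (int i = 0; i < n; ++i)
    y[i] += a * x[i];
}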
3360
3361void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3362 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3363 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3364 " Expected target-based directive.");
3365 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3366 for (const CapturedStmt::Capture &C : CS->captures()) {
3367 // Capture variables captured by reference in lambdas for target-based
3368 // directives.
3369 if (!C.capturesVariable())
3370 continue;
3371 const VarDecl *VD = C.getCapturedVar();
3372 const auto *RD = VD->getType()
3373 .getCanonicalType()
3374 .getNonReferenceType()
3375 ->getAsCXXRecordDecl();
3376 if (!RD || !RD->isLambda())
3377 continue;
3378 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3379 LValue VDLVal;
3380 if (VD->getType().getCanonicalType()->isReferenceType())
3381 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3382 else
3383 VDLVal = CGF.MakeAddrLValue(
3384 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3385 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3386 FieldDecl *ThisCapture = nullptr;
3387 RD->getCaptureFields(Captures, ThisCapture);
3388 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3389 LValue ThisLVal =
3390 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3391 llvm::Value *CXXThis = CGF.LoadCXXThis();
3392 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3393 }
3394 for (const LambdaCapture &LC : RD->captures()) {
3395 if (LC.getCaptureKind() != LCK_ByRef)
3396 continue;
3397 const ValueDecl *VD = LC.getCapturedVar();
3398 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3399 // capturing structured bindings in lambdas yet.
3400 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3401 continue;
3402 auto It = Captures.find(VD);
3403 assert(It != Captures.end() && "Found lambda capture without field.");
3404 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3405 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3406 if (VD->getType().getCanonicalType()->isReferenceType())
3407 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3408 VD->getType().getCanonicalType())
3409 .getAddress(CGF);
3410 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3411 }
3412 }
3413}
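// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: hypothetical user code
// invoking a lambda with a by-reference capture inside a target region. The
// adjustment above repoints the lambda's reference field for 'total' at the
// device copy of the captured variable.
void bump(int &total) {
  auto add_five = [&total]() { total += 5; };
#pragma omp target map(tofrom : total)
  add_five();
}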
3414
3415bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3416 LangAS &AS) {
3417 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3418 return false;
3419 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3420 switch(A->getAllocatorType()) {
3421 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3422 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3423 // Not supported, fallback to the default mem space.
3424 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3425 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3426 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3427 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3428 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3429 AS = LangAS::Default;
3430 return true;
3431 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3432 AS = LangAS::cuda_constant;
3433 return true;
3434 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3435 AS = LangAS::cuda_shared;
3436 return true;
3437 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3438 llvm_unreachable("Expected predefined allocator for the variables with the "
3439 "static storage.");
3440 }
3441 return false;
3442}
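// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: a hypothetical device
// global placed under a predefined allocator; the switch above maps
// omp_const_mem_alloc to the constant address space for such variables.
#pragma omp declare target
static const float kGain = 0.5f;
#pragma omp allocate(kGain) allocator(omp_const_mem_alloc)
#pragma omp end declare target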
3443
3444// Get current CudaArch and ignore any unknown values
3445static CudaArch getCudaArch(CodeGenModule &CGM) {
3446 if (!CGM.getTarget().hasFeature("ptx"))
3447 return CudaArch::UNKNOWN;
3448 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3449 if (Feature.getValue()) {
3450 CudaArch Arch = StringToCudaArch(Feature.getKey());
3451 if (Arch != CudaArch::UNKNOWN)
3452 return Arch;
3453 }
3454 }
3455 return CudaArch::UNKNOWN;
3456}
3457
3458/// Check to see if target architecture supports unified addressing which is
3459/// a restriction for OpenMP requires clause "unified_shared_memory".
3460void CGOpenMPRuntimeGPU::processRequiresDirective(
3461 const OMPRequiresDecl *D) {
3462 for (const OMPClause *Clause : D->clauselists()) {
3463 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3464 CudaArch Arch = getCudaArch(CGM);
3465 switch (Arch) {
3466 case CudaArch::SM_20:
3467 case CudaArch::SM_21:
3468 case CudaArch::SM_30:
3469 case CudaArch::SM_32:
3470 case CudaArch::SM_35:
3471 case CudaArch::SM_37:
3472 case CudaArch::SM_50:
3473 case CudaArch::SM_52:
3474 case CudaArch::SM_53: {
3475 SmallString<256> Buffer;
3476 llvm::raw_svector_ostream Out(Buffer);
3477 Out << "Target architecture " << CudaArchToString(Arch)
3478 << " does not support unified addressing";
3479 CGM.Error(Clause->getBeginLoc(), Out.str());
3480 return;
3481 }
3482 case CudaArch::SM_60:
3483 case CudaArch::SM_61:
3484 case CudaArch::SM_62:
3485 case CudaArch::SM_70:
3486 case CudaArch::SM_72:
3487 case CudaArch::SM_75:
3488 case CudaArch::SM_80:
3489 case CudaArch::SM_86:
3490 case CudaArch::SM_87:
3491 case CudaArch::SM_89:
3492 case CudaArch::SM_90:
3493 case CudaArch::GFX600:
3494 case CudaArch::GFX601:
3495 case CudaArch::GFX602:
3496 case CudaArch::GFX700:
3497 case CudaArch::GFX701:
3498 case CudaArch::GFX702:
3499 case CudaArch::GFX703:
3500 case CudaArch::GFX704:
3501 case CudaArch::GFX705:
3502 case CudaArch::GFX801:
3503 case CudaArch::GFX802:
3504 case CudaArch::GFX803:
3505 case CudaArch::GFX805:
3506 case CudaArch::GFX810:
3507 case CudaArch::GFX900:
3508 case CudaArch::GFX902:
3509 case CudaArch::GFX904:
3510 case CudaArch::GFX906:
3511 case CudaArch::GFX908:
3512 case CudaArch::GFX909:
3513 case CudaArch::GFX90a:
3514 case CudaArch::GFX90c:
3515 case CudaArch::GFX940:
3516 case CudaArch::GFX941:
3517 case CudaArch::GFX942:
3518 case CudaArch::GFX1010:
3519 case CudaArch::GFX1011:
3520 case CudaArch::GFX1012:
3521 case CudaArch::GFX1013:
3522 case CudaArch::GFX1030:
3523 case CudaArch::GFX1031:
3524 case CudaArch::GFX1032:
3525 case CudaArch::GFX1033:
3526 case CudaArch::GFX1034:
3527 case CudaArch::GFX1035:
3528 case CudaArch::GFX1036:
3529 case CudaArch::GFX1100:
3530 case CudaArch::GFX1101:
3531 case CudaArch::GFX1102:
3532 case CudaArch::GFX1103:
3533 case CudaArch::GFX1150:
3534 case CudaArch::GFX1151:
3535 case CudaArch::GFX1200:
3536 case CudaArch::GFX1201:
3537 case CudaArch::Generic:
3538 case CudaArch::UNUSED:
3539 case CudaArch::UNKNOWN:
3540 break;
3541 case CudaArch::LAST:
3542 llvm_unreachable("Unexpected Cuda arch.");
3543 }
3544 }
3545 }
3546 CGOpenMPRuntime::processRequiresDirective(D);
3547}
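// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: a translation unit
// containing this requires directive trips the check above; compiling it for
// one of the listed pre-sm_60 architectures produces the "does not support
// unified addressing" diagnostic.
#pragma omp requires unified_shared_memory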
3548
3549llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3550 CGBuilderTy &Bld = CGF.Builder;
3551 llvm::Module *M = &CGF.CGM.getModule();
3552 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3553 llvm::Function *F = M->getFunction(LocSize);
3554 if (!F) {
3555 F = llvm::Function::Create(
3556 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3557 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3558 }
3559 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3560}
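// Editorial sketch, not part of CGOpenMPRuntimeGPU.cpp: the rough shape of the
// declaration synthesized above when the symbol is not already in the module,
// inferred from the FunctionType::get(CGF.Int32Ty, ...) call rather than
// quoted from the device runtime headers.
extern "C" int __kmpc_get_hardware_num_threads_in_block(void);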
3561
3562llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3563 ArrayRef<llvm::Value *> Args{};
3564 return CGF.EmitRuntimeCall(
3565 OMPBuilder.getOrCreateRuntimeFunction(
3566 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3567 Args);
3568}
3569
3570llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3571 ArrayRef<llvm::Value *> Args{};
3572 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3573 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3574 Args);
3575}
#define V(N, I)
Definition: ASTContext.h:3241
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static CudaArch getCudaArch(CodeGenModule &CGM)
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ... or teams {distribute ....
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
This file defines OpenMP nodes for declarative directives.
This file defines OpenMP AST classes for clauses.
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
const char * Data
This file defines OpenMP AST classes for executable directives and clauses.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:182
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
Definition: ASTContext.h:1110
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
Definition: ASTContext.h:1083
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2731
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:749
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:41
A class which contains all the information about a particular captured value.
Definition: Decl.h:4469
ArrayRef< Capture > captures() const
Definition: Decl.h:4590
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6184
const BlockDecl * getBlockDecl() const
Definition: Expr.h:6196
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2847
Expr * getCallee()
Definition: Expr.h:2997
arg_range arguments()
Definition: Expr.h:3086
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3764
This captures a statement into a function.
Definition: Stmt.h:3751
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1405
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3855
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1429
capture_range captures()
Definition: Stmt.h:3889
CastKind getCastKind() const
Definition: Expr.h:3561
Expr * getSubExpr()
Definition: Expr.h:3567
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
An aligned address.
Definition: Address.h:29
static Address invalid()
Definition: Address.h:46
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:78
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:62
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition: Address.h:100
llvm::Value * getPointer() const
Definition: Address.h:51
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:57
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:886
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:97
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:159
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ... produce name = getelementptr inbounds addr, i64 0, i64 index where i64 is a...
Definition: CGBuilder.h:196
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:233
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:216
Address CreateGEP(Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:249
CGFunctionInfo - Class to encapsulate the information about a function definition.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ DS_CUDA
CUDA data sharing mode.
@ DS_Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
bool isGPU() const override
Returns true if the current target is a GPU.
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, CodeGenFunction &CGF, int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, int32_t &MaxTeamsVal)
Helper to determine the min/max number of threads/teams for D.
virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen)
Helper to emit outlined function for 'target' directive.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Function * emitReductionFunction(StringRef ReducerName, SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
CGCapturedStmtInfo * CapturedStmtInfo
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
LValue EmitLValue(const Expr *E, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::Type * ConvertTypeForMem(QualType T)
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
const TargetInfo & getTarget() const
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
llvm::Type * ConvertType(QualType T)
CodeGenTypes & getTypes() const
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
llvm::LLVMContext & getLLVMContext()
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
This class organizes the cross-function state that is used while generating LLVM code.
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
llvm::Module & getModule() const
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
const LangOptions & getLangOpts() const
const TargetInfo & getTarget() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
ASTContext & getContext() const
llvm::LLVMContext & getLLVMContext()
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1623
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:670
unsigned getTargetAddressSpace(QualType T) const
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:351
LValue - This represents an lvalue references.
Definition: CGValue.h:171
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:350
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:346
QualType getType() const
Definition: CGValue.h:279
void setAddress(Address address)
Definition: CGValue.h:354
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:194
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1435
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1711
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1248
ValueDecl * getDecl()
Definition: Expr.h:1316
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1493
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:85
T * getAttr() const
Definition: DeclBase.h:577
bool hasAttrs() const
Definition: DeclBase.h:523
attr_iterator attr_end() const
Definition: DeclBase.h:547
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:973
attr_iterator attr_begin() const
Definition: DeclBase.h:544
SourceLocation getLocation() const
Definition: DeclBase.h:444
DeclContext * getDeclContext()
Definition: DeclBase.h:453
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:436
AttrVec & getAttrs()
Definition: DeclBase.h:529
bool hasAttr() const
Definition: DeclBase.h:581
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:967
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:820
This represents one expression.
Definition: Expr.h:110
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3031
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3027
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:271
QualType getType() const
Definition: Expr.h:142
Represents a member of a struct/union/class.
Definition: Decl.h:3015
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4483
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3677
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5303
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:947
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1938
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1290
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1303
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:471
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:269
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:275
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:556
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:531
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:547
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:569
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:534
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:502
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:586
static const SpecificClause * getSingleClause(ArrayRef< OMPClause * > Clauses)
Gets a single clause of the specified kind associated with the current directive iff there is only on...
Definition: StmtOpenMP.h:477
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:459
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1018
This represents clause 'reduction' in the '#pragma omp ...' directives.
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:416
clauselist_range clauselists()
Definition: DeclOpenMP.h:441
This represents 'ompx_bare' clause in the '#pragma omp target teams ...' directive.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2901
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2896
A (possibly-)qualified type.
Definition: Type.h:736
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:6906
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6981
QualType getCanonicalType() const
Definition: Type.h:6833
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6721
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6728
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:4075
void addAddressSpace(LangAS space)
Definition: Type.h:403
void addRestrict()
Definition: Type.h:286
Represents a struct/union/class.
Definition: Decl.h:4117
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:5018
Encodes a location in the source.
RetTy Visit(PTR(Stmt) S, ParamTys... P)
Definition: StmtVisitor.h:43
Stmt - This represents one statement.
Definition: Stmt.h:84
child_range children()
Definition: Stmt.cpp:286
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top,...
Definition: Stmt.cpp:196
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4675
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is g...
Definition: TargetInfo.h:716
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:304
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1563
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1410
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
The base class of the type hierarchy.
Definition: Type.h:1602
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1819
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7384
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:7625
bool isReferenceType() const
Definition: Type.h:7045
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:651
bool isLValueReferenceType() const
Definition: Type.h:7049
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g....
Definition: Type.cpp:2123
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2439
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2210
Expr * getSubExpr() const
Definition: Expr.h:2255
Opcode getOpcode() const
Definition: Expr.h:2250
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:704
QualType getType() const
Definition: Decl.h:715
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5295
Represents a variable declaration or definition.
Definition: Decl.h:916
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2250
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1553
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:33
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
CudaArch
Definition: Cuda.h:51
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:267
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:165
@ CR_OpenMP
Definition: CapturedStmt.h:19
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
@ SC_None
Definition: Specifiers.h:245
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
@ CXXThis
Parameter for C++ 'this' argument.
@ Other
Other implicit parameter.
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:147
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
@ AS_public
Definition: Specifiers.h:119
unsigned long uint64_t
llvm::Value * ScratchpadIndex
llvm::Value * ScratchpadWidth
llvm::Value * RemoteLaneOffset
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * IntTy
int