CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/OpenMPClause.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88/// GPU Configuration: This information can be derived from cuda registers,
89/// however, providing compile time constants helps generate more efficient
90/// code. For all practical purposes this is fine because the configuration
91/// is the same for all known NVPTX architectures.
92enum MachineConfiguration : unsigned {
93 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
94 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
95
96 /// Global memory alignment for performance.
97 GlobalMemoryAlignment = 128,
98};
99
100static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
101 RefExpr = RefExpr->IgnoreParens();
102 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
103 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
104 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
105 Base = TempASE->getBase()->IgnoreParenImpCasts();
106 RefExpr = Base;
107 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
108 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
109 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
110 Base = TempOASE->getBase()->IgnoreParenImpCasts();
111 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
112 Base = TempASE->getBase()->IgnoreParenImpCasts();
113 RefExpr = Base;
114 }
115 RefExpr = RefExpr->IgnoreParenImpCasts();
116 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
117 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
118 const auto *ME = cast<MemberExpr>(RefExpr);
119 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
120}
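// Illustrative note (not from the original source): for a clause item written
// as `a[i][j]` or an array section `a[0:n]`, the loops above peel off the
// subscript/section expressions so getPrivateItem() returns the canonical
// ValueDecl of `a`; for a member reference it returns the canonical member
// declaration instead.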
121
122
123static RecordDecl *buildRecordForGlobalizedVars(
124 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
125 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
126 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
127 &MappedDeclsFields, int BufSize) {
128 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
129 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
130 return nullptr;
131 SmallVector<VarsDataTy, 4> GlobalizedVars;
132 for (const ValueDecl *D : EscapedDecls)
133 GlobalizedVars.emplace_back(
134 CharUnits::fromQuantity(std::max(
135 C.getDeclAlign(D).getQuantity(),
136 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
137 D);
138 for (const ValueDecl *D : EscapedDeclsForTeams)
139 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
140 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
141 return L.first > R.first;
142 });
143
144 // Build struct _globalized_locals_ty {
145 // /* globalized vars */[WarpSize] align (max(decl_align,
146 // GlobalMemoryAlignment))
147 // /* globalized vars */ for EscapedDeclsForTeams
148 // };
149 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
150 GlobalizedRD->startDefinition();
151 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
152 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
153 for (const auto &Pair : GlobalizedVars) {
154 const ValueDecl *VD = Pair.second;
155 QualType Type = VD->getType();
156 if (Type->isLValueReferenceType())
157 Type = C.getPointerType(Type.getNonReferenceType());
158 else
159 Type = Type.getNonReferenceType();
160 SourceLocation Loc = VD->getLocation();
161 FieldDecl *Field;
162 if (SingleEscaped.count(VD)) {
163 Field = FieldDecl::Create(
164 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
165 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
166 /*BW=*/nullptr, /*Mutable=*/false,
167 /*InitStyle=*/ICIS_NoInit);
168 Field->setAccess(AS_public);
169 if (VD->hasAttrs()) {
170 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
171 E(VD->getAttrs().end());
172 I != E; ++I)
173 Field->addAttr(*I);
174 }
175 } else {
176 llvm::APInt ArraySize(32, BufSize);
177 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
178 0);
179 Field = FieldDecl::Create(
180 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
181 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
182 /*BW=*/nullptr, /*Mutable=*/false,
183 /*InitStyle=*/ICIS_NoInit);
184 Field->setAccess(AS_public);
185 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
186 static_cast<CharUnits::QuantityType>(
187 GlobalMemoryAlignment)));
188 Field->addAttr(AlignedAttr::CreateImplicit(
189 C, /*IsAlignmentExpr=*/true,
190 IntegerLiteral::Create(C, Align,
191 C.getIntTypeForBitwidth(32, /*Signed=*/0),
192 SourceLocation()),
193 {}, AlignedAttr::GNU_aligned));
194 }
195 GlobalizedRD->addDecl(Field);
196 MappedDeclsFields.try_emplace(VD, Field);
197 }
198 GlobalizedRD->completeDefinition();
199 return GlobalizedRD;
200}
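// Illustrative sketch (not part of the original source): for a variable
// `int x` escaping a parallel region with BufSize == WarpSize == 32, the
// record built above is roughly
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128)));
//   };
// whereas a variable escaping only a teams region becomes a single, non-array
// field that keeps the attributes of its original declaration.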
201
202/// Get the list of variables that can escape their declaration context.
203class CheckVarsEscapingDeclContext final
204 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
205 CodeGenFunction &CGF;
206 llvm::SetVector<const ValueDecl *> EscapedDecls;
207 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
208 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
209 RecordDecl *GlobalizedRD = nullptr;
210 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
211 bool AllEscaped = false;
212 bool IsForCombinedParallelRegion = false;
213
214 void markAsEscaped(const ValueDecl *VD) {
215 // Do not globalize declare target variables.
216 if (!isa<VarDecl>(VD) ||
217 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
218 return;
219 VD = cast<ValueDecl>(VD->getCanonicalDecl());
220 // Use user-specified allocation.
221 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
222 return;
223 // Variables captured by value must be globalized.
224 if (auto *CSI = CGF.CapturedStmtInfo) {
225 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
226 // Check if need to capture the variable that was already captured by
227 // value in the outer region.
228 if (!IsForCombinedParallelRegion) {
229 if (!FD->hasAttrs())
230 return;
231 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
232 if (!Attr)
233 return;
234 if (((Attr->getCaptureKind() != OMPC_map) &&
235 !isOpenMPPrivate(Attr->getCaptureKind())) ||
236 ((Attr->getCaptureKind() == OMPC_map) &&
237 !FD->getType()->isAnyPointerType()))
238 return;
239 }
240 if (!FD->getType()->isReferenceType()) {
241 assert(!VD->getType()->isVariablyModifiedType() &&
242 "Parameter captured by value with variably modified type");
243 EscapedParameters.insert(VD);
244 } else if (!IsForCombinedParallelRegion) {
245 return;
246 }
247 }
248 }
249 if ((!CGF.CapturedStmtInfo ||
250 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
251 VD->getType()->isReferenceType())
252 // Do not globalize variables with reference type.
253 return;
254 if (VD->getType()->isVariablyModifiedType())
255 EscapedVariableLengthDecls.insert(VD);
256 else
257 EscapedDecls.insert(VD);
258 }
259
260 void VisitValueDecl(const ValueDecl *VD) {
261 if (VD->getType()->isLValueReferenceType())
262 markAsEscaped(VD);
263 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
264 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
265 const bool SavedAllEscaped = AllEscaped;
266 AllEscaped = VD->getType()->isLValueReferenceType();
267 Visit(VarD->getInit());
268 AllEscaped = SavedAllEscaped;
269 }
270 }
271 }
272 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
273 ArrayRef<OMPClause *> Clauses,
274 bool IsCombinedParallelRegion) {
275 if (!S)
276 return;
277 for (const CapturedStmt::Capture &C : S->captures()) {
278 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
279 const ValueDecl *VD = C.getCapturedVar();
280 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
281 if (IsCombinedParallelRegion) {
282 // Check if the variable is privatized in the combined construct and
283 // those private copies must be shared in the inner parallel
284 // directive.
285 IsForCombinedParallelRegion = false;
286 for (const OMPClause *C : Clauses) {
287 if (!isOpenMPPrivate(C->getClauseKind()) ||
288 C->getClauseKind() == OMPC_reduction ||
289 C->getClauseKind() == OMPC_linear ||
290 C->getClauseKind() == OMPC_private)
291 continue;
292 ArrayRef<const Expr *> Vars;
293 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
294 Vars = PC->getVarRefs();
295 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
296 Vars = PC->getVarRefs();
297 else
298 llvm_unreachable("Unexpected clause.");
299 for (const auto *E : Vars) {
300 const Decl *D =
301 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
302 if (D == VD->getCanonicalDecl()) {
303 IsForCombinedParallelRegion = true;
304 break;
305 }
306 }
307 if (IsForCombinedParallelRegion)
308 break;
309 }
310 }
311 markAsEscaped(VD);
312 if (isa<OMPCapturedExprDecl>(VD))
313 VisitValueDecl(VD);
314 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
315 }
316 }
317 }
318
319 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
320 assert(!GlobalizedRD &&
321 "Record for globalized variables is built already.");
322 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
323 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
324 if (IsInTTDRegion)
325 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
326 else
327 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
328 GlobalizedRD = ::buildRecordForGlobalizedVars(
329 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
330 MappedDeclsFields, WarpSize);
331 }
332
333public:
334 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
335 ArrayRef<const ValueDecl *> TeamsReductions)
336 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
337 }
338 virtual ~CheckVarsEscapingDeclContext() = default;
339 void VisitDeclStmt(const DeclStmt *S) {
340 if (!S)
341 return;
342 for (const Decl *D : S->decls())
343 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
344 VisitValueDecl(VD);
345 }
346 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
347 if (!D)
348 return;
349 if (!D->hasAssociatedStmt())
350 return;
351 if (const auto *S =
352 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
353 // Do not analyze directives that do not actually require capturing,
354 // like `omp for` or `omp simd` directives.
355 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
356 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
357 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
358 VisitStmt(S->getCapturedStmt());
359 return;
360 }
361 VisitOpenMPCapturedStmt(
362 S, D->clauses(),
363 CaptureRegions.back() == OMPD_parallel &&
364 isOpenMPDistributeDirective(D->getDirectiveKind()));
365 }
366 }
367 void VisitCapturedStmt(const CapturedStmt *S) {
368 if (!S)
369 return;
370 for (const CapturedStmt::Capture &C : S->captures()) {
371 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
372 const ValueDecl *VD = C.getCapturedVar();
373 markAsEscaped(VD);
374 if (isa<OMPCapturedExprDecl>(VD))
375 VisitValueDecl(VD);
376 }
377 }
378 }
379 void VisitLambdaExpr(const LambdaExpr *E) {
380 if (!E)
381 return;
382 for (const LambdaCapture &C : E->captures()) {
383 if (C.capturesVariable()) {
384 if (C.getCaptureKind() == LCK_ByRef) {
385 const ValueDecl *VD = C.getCapturedVar();
386 markAsEscaped(VD);
387 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
388 VisitValueDecl(VD);
389 }
390 }
391 }
392 }
393 void VisitBlockExpr(const BlockExpr *E) {
394 if (!E)
395 return;
396 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
397 if (C.isByRef()) {
398 const VarDecl *VD = C.getVariable();
399 markAsEscaped(VD);
400 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
401 VisitValueDecl(VD);
402 }
403 }
404 }
405 void VisitCallExpr(const CallExpr *E) {
406 if (!E)
407 return;
408 for (const Expr *Arg : E->arguments()) {
409 if (!Arg)
410 continue;
411 if (Arg->isLValue()) {
412 const bool SavedAllEscaped = AllEscaped;
413 AllEscaped = true;
414 Visit(Arg);
415 AllEscaped = SavedAllEscaped;
416 } else {
417 Visit(Arg);
418 }
419 }
420 Visit(E->getCallee());
421 }
422 void VisitDeclRefExpr(const DeclRefExpr *E) {
423 if (!E)
424 return;
425 const ValueDecl *VD = E->getDecl();
426 if (AllEscaped)
427 markAsEscaped(VD);
428 if (isa<OMPCapturedExprDecl>(VD))
429 VisitValueDecl(VD);
430 else if (VD->isInitCapture())
431 VisitValueDecl(VD);
432 }
433 void VisitUnaryOperator(const UnaryOperator *E) {
434 if (!E)
435 return;
436 if (E->getOpcode() == UO_AddrOf) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
446 if (!E)
447 return;
448 if (E->getCastKind() == CK_ArrayToPointerDecay) {
449 const bool SavedAllEscaped = AllEscaped;
450 AllEscaped = true;
451 Visit(E->getSubExpr());
452 AllEscaped = SavedAllEscaped;
453 } else {
454 Visit(E->getSubExpr());
455 }
456 }
457 void VisitExpr(const Expr *E) {
458 if (!E)
459 return;
460 bool SavedAllEscaped = AllEscaped;
461 if (!E->isLValue())
462 AllEscaped = false;
463 for (const Stmt *Child : E->children())
464 if (Child)
465 Visit(Child);
466 AllEscaped = SavedAllEscaped;
467 }
468 void VisitStmt(const Stmt *S) {
469 if (!S)
470 return;
471 for (const Stmt *Child : S->children())
472 if (Child)
473 Visit(Child);
474 }
475
476 /// Returns the record that handles all the escaped local variables and used
477 /// instead of their original storage.
478 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
479 if (!GlobalizedRD)
480 buildRecordForGlobalizedVars(IsInTTDRegion);
481 return GlobalizedRD;
482 }
483
484 /// Returns the field in the globalized record for the escaped variable.
485 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
486 assert(GlobalizedRD &&
487 "Record for globalized variables must be generated already.");
488 auto I = MappedDeclsFields.find(VD);
489 if (I == MappedDeclsFields.end())
490 return nullptr;
491 return I->getSecond();
492 }
493
494 /// Returns the list of the escaped local variables/parameters.
495 ArrayRef<const ValueDecl *> getEscapedDecls() const {
496 return EscapedDecls.getArrayRef();
497 }
498
499 /// Checks if the escaped local variable is actually a parameter passed by
500 /// value.
501 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
502 return EscapedParameters;
503 }
504
505 /// Returns the list of the escaped variables with the variably modified
506 /// types.
507 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
508 return EscapedVariableLengthDecls.getArrayRef();
509 }
510};
511} // anonymous namespace
512
513/// Get the id of the warp in the block.
514/// We assume that the warp size is 32, which is always the case
515/// on the NVPTX device, to generate more efficient code.
516static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
517 CGBuilderTy &Bld = CGF.Builder;
518 unsigned LaneIDBits =
519 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
520 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
521 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
522}
523
524/// Get the id of the current lane in the Warp.
525/// We assume that the warp size is 32, which is always the case
526/// on the NVPTX device, to generate more efficient code.
527static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
528 CGBuilderTy &Bld = CGF.Builder;
529 unsigned LaneIDBits =
530 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
531 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
532 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
533 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
534 "nvptx_lane_id");
535}
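// Worked example (illustrative): with GV_Warp_Size == 32, LaneIDBits == 5 and
// LaneIDMask == 0x1f, so GPU thread id 37 yields
// nvptx_warp_id = 37 >> 5 = 1 and nvptx_lane_id = 37 & 0x1f = 5.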
536
537CGOpenMPRuntimeGPU::ExecutionMode
538CGOpenMPRuntimeGPU::getExecutionMode() const {
539 return CurrentExecutionMode;
540}
541
542static CGOpenMPRuntimeGPU::DataSharingMode
543getDataSharingMode(CodeGenModule &CGM) {
544 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
545 : CGOpenMPRuntimeGPU::Generic;
546}
547
548/// Check for inner (nested) SPMD construct, if any
549static bool hasNestedSPMDDirective(ASTContext &Ctx,
550 const OMPExecutableDirective &D) {
551 const auto *CS = D.getInnermostCapturedStmt();
552 const auto *Body =
553 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
554 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
555
556 if (const auto *NestedDir =
557 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
558 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
559 switch (D.getDirectiveKind()) {
560 case OMPD_target:
561 if (isOpenMPParallelDirective(DKind))
562 return true;
563 if (DKind == OMPD_teams) {
564 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
565 /*IgnoreCaptured=*/true);
566 if (!Body)
567 return false;
568 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
569 if (const auto *NND =
570 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
571 DKind = NND->getDirectiveKind();
572 if (isOpenMPParallelDirective(DKind))
573 return true;
574 }
575 }
576 return false;
577 case OMPD_target_teams:
578 return isOpenMPParallelDirective(DKind);
579 case OMPD_target_simd:
580 case OMPD_target_parallel:
581 case OMPD_target_parallel_for:
582 case OMPD_target_parallel_for_simd:
583 case OMPD_target_teams_distribute:
584 case OMPD_target_teams_distribute_simd:
585 case OMPD_target_teams_distribute_parallel_for:
586 case OMPD_target_teams_distribute_parallel_for_simd:
587 case OMPD_parallel:
588 case OMPD_for:
589 case OMPD_parallel_for:
590 case OMPD_parallel_master:
591 case OMPD_parallel_sections:
592 case OMPD_for_simd:
593 case OMPD_parallel_for_simd:
594 case OMPD_cancel:
595 case OMPD_cancellation_point:
596 case OMPD_ordered:
597 case OMPD_threadprivate:
598 case OMPD_allocate:
599 case OMPD_task:
600 case OMPD_simd:
601 case OMPD_sections:
602 case OMPD_section:
603 case OMPD_single:
604 case OMPD_master:
605 case OMPD_critical:
606 case OMPD_taskyield:
607 case OMPD_barrier:
608 case OMPD_taskwait:
609 case OMPD_taskgroup:
610 case OMPD_atomic:
611 case OMPD_flush:
612 case OMPD_depobj:
613 case OMPD_scan:
614 case OMPD_teams:
615 case OMPD_target_data:
616 case OMPD_target_exit_data:
617 case OMPD_target_enter_data:
618 case OMPD_distribute:
619 case OMPD_distribute_simd:
620 case OMPD_distribute_parallel_for:
621 case OMPD_distribute_parallel_for_simd:
622 case OMPD_teams_distribute:
623 case OMPD_teams_distribute_simd:
624 case OMPD_teams_distribute_parallel_for:
625 case OMPD_teams_distribute_parallel_for_simd:
626 case OMPD_target_update:
627 case OMPD_declare_simd:
628 case OMPD_declare_variant:
629 case OMPD_begin_declare_variant:
630 case OMPD_end_declare_variant:
631 case OMPD_declare_target:
632 case OMPD_end_declare_target:
633 case OMPD_declare_reduction:
634 case OMPD_declare_mapper:
635 case OMPD_taskloop:
636 case OMPD_taskloop_simd:
637 case OMPD_master_taskloop:
638 case OMPD_master_taskloop_simd:
639 case OMPD_parallel_master_taskloop:
640 case OMPD_parallel_master_taskloop_simd:
641 case OMPD_requires:
642 case OMPD_unknown:
643 default:
644 llvm_unreachable("Unexpected directive.");
645 }
646 }
647
648 return false;
649}
650
651static bool supportsSPMDExecutionMode(ASTContext &Ctx,
652 const OMPExecutableDirective &D) {
653 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
654 switch (DirectiveKind) {
655 case OMPD_target:
656 case OMPD_target_teams:
657 return hasNestedSPMDDirective(Ctx, D);
658 case OMPD_target_parallel:
659 case OMPD_target_parallel_for:
660 case OMPD_target_parallel_for_simd:
661 case OMPD_target_teams_distribute_parallel_for:
662 case OMPD_target_teams_distribute_parallel_for_simd:
663 case OMPD_target_simd:
664 case OMPD_target_teams_distribute_simd:
665 return true;
666 case OMPD_target_teams_distribute:
667 return false;
668 case OMPD_parallel:
669 case OMPD_for:
670 case OMPD_parallel_for:
671 case OMPD_parallel_master:
672 case OMPD_parallel_sections:
673 case OMPD_for_simd:
674 case OMPD_parallel_for_simd:
675 case OMPD_cancel:
676 case OMPD_cancellation_point:
677 case OMPD_ordered:
678 case OMPD_threadprivate:
679 case OMPD_allocate:
680 case OMPD_task:
681 case OMPD_simd:
682 case OMPD_sections:
683 case OMPD_section:
684 case OMPD_single:
685 case OMPD_master:
686 case OMPD_critical:
687 case OMPD_taskyield:
688 case OMPD_barrier:
689 case OMPD_taskwait:
690 case OMPD_taskgroup:
691 case OMPD_atomic:
692 case OMPD_flush:
693 case OMPD_depobj:
694 case OMPD_scan:
695 case OMPD_teams:
696 case OMPD_target_data:
697 case OMPD_target_exit_data:
698 case OMPD_target_enter_data:
699 case OMPD_distribute:
700 case OMPD_distribute_simd:
701 case OMPD_distribute_parallel_for:
702 case OMPD_distribute_parallel_for_simd:
703 case OMPD_teams_distribute:
704 case OMPD_teams_distribute_simd:
705 case OMPD_teams_distribute_parallel_for:
706 case OMPD_teams_distribute_parallel_for_simd:
707 case OMPD_target_update:
708 case OMPD_declare_simd:
709 case OMPD_declare_variant:
710 case OMPD_begin_declare_variant:
711 case OMPD_end_declare_variant:
712 case OMPD_declare_target:
713 case OMPD_end_declare_target:
714 case OMPD_declare_reduction:
715 case OMPD_declare_mapper:
716 case OMPD_taskloop:
717 case OMPD_taskloop_simd:
718 case OMPD_master_taskloop:
719 case OMPD_master_taskloop_simd:
720 case OMPD_parallel_master_taskloop:
721 case OMPD_parallel_master_taskloop_simd:
722 case OMPD_requires:
723 case OMPD_unknown:
724 default:
725 break;
726 }
727 llvm_unreachable(
728 "Unknown programming model for OpenMP directive on NVPTX target.");
729}
730
731void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
732 StringRef ParentName,
733 llvm::Function *&OutlinedFn,
734 llvm::Constant *&OutlinedFnID,
735 bool IsOffloadEntry,
736 const RegionCodeGenTy &CodeGen) {
737 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
738 EntryFunctionState EST;
739 WrapperFunctionsMap.clear();
740
741 // Emit target region as a standalone region.
742 class NVPTXPrePostActionTy : public PrePostActionTy {
743 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
744
745 public:
746 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
747 : EST(EST) {}
748 void Enter(CodeGenFunction &CGF) override {
749 auto &RT =
750 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
751 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
752 // Skip target region initialization.
753 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
754 }
755 void Exit(CodeGenFunction &CGF) override {
756 auto &RT =
757 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
758 RT.clearLocThreadIdInsertPt(CGF);
759 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
760 }
761 } Action(EST);
762 CodeGen.setAction(Action);
763 IsInTTDRegion = true;
764 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
765 IsOffloadEntry, CodeGen);
766 IsInTTDRegion = false;
767}
768
769void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
770 EntryFunctionState &EST, bool IsSPMD) {
771 CGBuilderTy &Bld = CGF.Builder;
772 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
773 if (!IsSPMD)
774 emitGenericVarsProlog(CGF, EST.Loc);
775}
776
777void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
778 EntryFunctionState &EST,
779 bool IsSPMD) {
780 if (!IsSPMD)
781 emitGenericVarsEpilog(CGF);
782
783 CGBuilderTy &Bld = CGF.Builder;
784 OMPBuilder.createTargetDeinit(Bld, IsSPMD);
785}
786
787void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
788 StringRef ParentName,
789 llvm::Function *&OutlinedFn,
790 llvm::Constant *&OutlinedFnID,
791 bool IsOffloadEntry,
792 const RegionCodeGenTy &CodeGen) {
793 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
794 EntryFunctionState EST;
795
796 // Emit target region as a standalone region.
797 class NVPTXPrePostActionTy : public PrePostActionTy {
798 CGOpenMPRuntimeGPU &RT;
799 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
800
801 public:
802 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
803 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
804 : RT(RT), EST(EST) {}
805 void Enter(CodeGenFunction &CGF) override {
806 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
807 // Skip target region initialization.
808 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
809 }
810 void Exit(CodeGenFunction &CGF) override {
811 RT.clearLocThreadIdInsertPt(CGF);
812 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
813 }
814 } Action(*this, EST);
815 CodeGen.setAction(Action);
816 IsInTTDRegion = true;
817 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
818 IsOffloadEntry, CodeGen);
819 IsInTTDRegion = false;
820}
821
822// Create a unique global variable to indicate the execution mode of this target
823// region. The execution mode is either 'generic', or 'spmd' depending on the
824// target directive. This variable is picked up by the offload library to setup
825// the device appropriately before kernel launch. If the execution mode is
826// 'generic', the runtime reserves one warp for the master, otherwise, all
827// warps participate in parallel work.
828static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
829 bool Mode) {
830 auto *GVMode = new llvm::GlobalVariable(
831 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
832 llvm::GlobalValue::WeakAnyLinkage,
833 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
834 : OMP_TGT_EXEC_MODE_GENERIC),
835 Twine(Name, "_exec_mode"));
836 GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
837 CGM.addCompilerUsedGlobal(GVMode);
838}
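// For illustration only (the exact symbol name depends on the offload entry
// mangling of the outlined kernel): an SPMD target region produces roughly
//   @__omp_offloading_<...>_foo_l1_exec_mode = weak protected constant i8 2
// where 1 (OMP_TGT_EXEC_MODE_GENERIC) and 2 (OMP_TGT_EXEC_MODE_SPMD) follow
// the llvm::omp::OMPTgtExecModeFlags encoding consumed by the offload runtime.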
839
840void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
841 const OMPExecutableDirective &D, StringRef ParentName,
842 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
843 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
844 if (!IsOffloadEntry) // Nothing to do.
845 return;
846
847 assert(!ParentName.empty() && "Invalid target region parent name!");
848
849 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
850 if (Mode)
851 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
852 CodeGen);
853 else
854 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
855 CodeGen);
856
857 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
858}
859
860CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
861 : CGOpenMPRuntime(CGM) {
862 llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
863 hasRequiresUnifiedSharedMemory(),
864 CGM.getLangOpts().OpenMPOffloadMandatory);
865 OMPBuilder.setConfig(Config);
866
867 if (!CGM.getLangOpts().OpenMPIsDevice)
868 llvm_unreachable("OpenMP can only handle device code.");
869
870 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
871 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
872 return;
873
874 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
875 "__omp_rtl_debug_kind");
876 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
877 "__omp_rtl_assume_teams_oversubscription");
878 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
879 "__omp_rtl_assume_threads_oversubscription");
880 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
881 "__omp_rtl_assume_no_thread_state");
882 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
883 "__omp_rtl_assume_no_nested_parallelism");
884}
885
886void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
887 ProcBindKind ProcBind,
888 SourceLocation Loc) {
889 // Do nothing in case of SPMD mode and L0 parallel.
890 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
891 return;
892
893 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
894}
895
896void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
897 llvm::Value *NumThreads,
898 SourceLocation Loc) {
899 // Nothing to do.
900}
901
902void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
903 const Expr *NumTeams,
904 const Expr *ThreadLimit,
905 SourceLocation Loc) {}
906
907llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
908 CodeGenFunction &CGF, const OMPExecutableDirective &D,
909 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
910 const RegionCodeGenTy &CodeGen) {
911 // Emit target region as a standalone region.
912 bool PrevIsInTTDRegion = IsInTTDRegion;
913 IsInTTDRegion = false;
914 auto *OutlinedFun =
915 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
916 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
917 IsInTTDRegion = PrevIsInTTDRegion;
918 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
919 llvm::Function *WrapperFun =
920 createParallelDataSharingWrapper(OutlinedFun, D);
921 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
922 }
923
924 return OutlinedFun;
925}
926
927/// Get list of lastprivate variables from the teams distribute ... or
928/// teams {distribute ...} directives.
929static void
930getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
931 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
932 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
933 "expected teams directive.");
934 const OMPExecutableDirective *Dir = &D;
935 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
936 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
937 Ctx,
938 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
939 /*IgnoreCaptured=*/true))) {
940 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
941 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
942 Dir = nullptr;
943 }
944 }
945 if (!Dir)
946 return;
947 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
948 for (const Expr *E : C->getVarRefs())
949 Vars.push_back(getPrivateItem(E));
950 }
951}
952
953/// Get list of reduction variables from the teams ... directives.
954static void
955getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
956 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
957 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
958 "expected teams directive.");
959 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
960 for (const Expr *E : C->privates())
961 Vars.push_back(getPrivateItem(E));
962 }
963}
964
965llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
966 CodeGenFunction &CGF, const OMPExecutableDirective &D,
967 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
968 const RegionCodeGenTy &CodeGen) {
969 SourceLocation Loc = D.getBeginLoc();
970
971 const RecordDecl *GlobalizedRD = nullptr;
972 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
973 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
974 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
975 // Globalize team reductions variable unconditionally in all modes.
976 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
977 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
978 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
979 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
980 if (!LastPrivatesReductions.empty()) {
981 GlobalizedRD = ::buildRecordForGlobalizedVars(
982 CGM.getContext(), std::nullopt, LastPrivatesReductions,
983 MappedDeclsFields, WarpSize);
984 }
985 } else if (!LastPrivatesReductions.empty()) {
986 assert(!TeamAndReductions.first &&
987 "Previous team declaration is not expected.");
988 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
989 std::swap(TeamAndReductions.second, LastPrivatesReductions);
990 }
991
992 // Emit target region as a standalone region.
993 class NVPTXPrePostActionTy : public PrePostActionTy {
994 SourceLocation &Loc;
995 const RecordDecl *GlobalizedRD;
996 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
997 &MappedDeclsFields;
998
999 public:
1000 NVPTXPrePostActionTy(
1001 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 &MappedDeclsFields)
1004 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1005 MappedDeclsFields(MappedDeclsFields) {}
1006 void Enter(CodeGenFunction &CGF) override {
1007 auto &Rt =
1008 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1009 if (GlobalizedRD) {
1010 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1011 I->getSecond().MappedParams =
1012 std::make_unique<CodeGenFunction::OMPMapVars>();
1013 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1014 for (const auto &Pair : MappedDeclsFields) {
1015 assert(Pair.getFirst()->isCanonicalDecl() &&
1016 "Expected canonical declaration");
1017 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1018 }
1019 }
1020 Rt.emitGenericVarsProlog(CGF, Loc);
1021 }
1022 void Exit(CodeGenFunction &CGF) override {
1023 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1024 .emitGenericVarsEpilog(CGF);
1025 }
1026 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1027 CodeGen.setAction(Action);
1028 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1029 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1030
1031 return OutlinedFun;
1032}
1033
1034void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1035 SourceLocation Loc,
1036 bool WithSPMDCheck) {
1037 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1038 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1039 return;
1040
1041 CGBuilderTy &Bld = CGF.Builder;
1042
1043 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1044 if (I == FunctionGlobalizedDecls.end())
1045 return;
1046
1047 for (auto &Rec : I->getSecond().LocalVarData) {
1048 const auto *VD = cast<VarDecl>(Rec.first);
1049 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1050 QualType VarTy = VD->getType();
1051
1052 // Get the local allocation of a firstprivate variable before sharing
1053 llvm::Value *ParValue;
1054 if (EscapedParam) {
1055 LValue ParLVal =
1056 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1057 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1058 }
1059
1060 // Allocate space for the variable to be globalized
1061 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1062 llvm::CallBase *VoidPtr =
1063 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1064 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1065 AllocArgs, VD->getName());
1066 // FIXME: We should use the variables actual alignment as an argument.
1067 VoidPtr->addRetAttr(llvm::Attribute::get(
1068 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1069 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1070
1071 // Cast the void pointer and get the address of the globalized variable.
1072 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1073 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1074 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1075 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1076 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1077 Rec.second.GlobalizedVal = VoidPtr;
1078
1079 // Assign the local allocation to the newly globalized location.
1080 if (EscapedParam) {
1081 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1082 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1083 }
1084 if (auto *DI = CGF.getDebugInfo())
1085 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1086 }
1087 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1088 // Use actual memory size of the VLA object including the padding
1089 // for alignment purposes.
1090 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1091 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1092 Size = Bld.CreateNUWAdd(
1093 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1094 llvm::Value *AlignVal =
1095 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1096
1097 Size = Bld.CreateUDiv(Size, AlignVal);
1098 Size = Bld.CreateNUWMul(Size, AlignVal);
1099
1100 // Allocate space for this VLA object to be globalized.
1101 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1102 llvm::CallBase *VoidPtr =
1103 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1104 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1105 AllocArgs, VD->getName());
1106 VoidPtr->addRetAttr(
1107 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1108 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1109
1110 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1111 std::pair<llvm::Value *, llvm::Value *>(
1112 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1113 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1114 CGM.getContext().getDeclAlign(VD),
1115 AlignmentSource::Decl);
1116 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1117 Base.getAddress(CGF));
1118 }
1119 I->getSecond().MappedParams->apply(CGF);
1120}
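// Illustrative IR shape (an assumption-laden sketch, not from the source): for
// a 4-byte escaped variable `x` and opaque pointers, the prolog above emits
// roughly
//   %x = call align <TargetNewAlign/8> ptr @__kmpc_alloc_shared(i64 4)
// and emitGenericVarsEpilog() below releases it with a matching
// __kmpc_free_shared(ptr, i64) call, walking the globalized values in reverse.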
1121
1122void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1123 bool WithSPMDCheck) {
1124 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1125 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1126 return;
1127
1128 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1129 if (I != FunctionGlobalizedDecls.end()) {
1130 // Deallocate the memory for each globalized VLA object
1131 for (const auto &AddrSizePair :
1132 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1133 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1134 CGM.getModule(), OMPRTL___kmpc_free_shared),
1135 {AddrSizePair.first, AddrSizePair.second});
1136 }
1137 // Deallocate the memory for each globalized value
1138 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1139 const auto *VD = cast<VarDecl>(Rec.first);
1140 I->getSecond().MappedParams->restore(CGF);
1141
1142 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1143 CGF.getTypeSize(VD->getType())};
1144 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1145 CGM.getModule(), OMPRTL___kmpc_free_shared),
1146 FreeArgs);
1147 }
1148 }
1149}
1150
1151void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1152 const OMPExecutableDirective &D,
1153 SourceLocation Loc,
1154 llvm::Function *OutlinedFn,
1155 ArrayRef<llvm::Value *> CapturedVars) {
1156 if (!CGF.HaveInsertPoint())
1157 return;
1158
1159 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1160 /*Name=*/".zero.addr");
1161 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1162 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1163 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1164 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1165 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1166 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1167}
1168
1169void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1170 SourceLocation Loc,
1171 llvm::Function *OutlinedFn,
1172 ArrayRef<llvm::Value *> CapturedVars,
1173 const Expr *IfCond,
1174 llvm::Value *NumThreads) {
1175 if (!CGF.HaveInsertPoint())
1176 return;
1177
1178 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1179 NumThreads](CodeGenFunction &CGF,
1180 PrePostActionTy &Action) {
1181 CGBuilderTy &Bld = CGF.Builder;
1182 llvm::Value *NumThreadsVal = NumThreads;
1183 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1184 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1185 if (WFn)
1186 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1187 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1188
1189 // Create a private scope that will globalize the arguments
1190 // passed from the outside of the target region.
1191 // TODO: Is that needed?
1192 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1193
1194 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1195 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1196 "captured_vars_addrs");
1197 // There's something to share.
1198 if (!CapturedVars.empty()) {
1199 // Prepare for parallel region. Indicate the outlined function.
1200 ASTContext &Ctx = CGF.getContext();
1201 unsigned Idx = 0;
1202 for (llvm::Value *V : CapturedVars) {
1203 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1204 llvm::Value *PtrV;
1205 if (V->getType()->isIntegerTy())
1206 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1207 else
1208 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1209 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1210 Ctx.getPointerType(Ctx.VoidPtrTy));
1211 ++Idx;
1212 }
1213 }
1214
1215 llvm::Value *IfCondVal = nullptr;
1216 if (IfCond)
1217 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1218 /* isSigned */ false);
1219 else
1220 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1221
1222 if (!NumThreadsVal)
1223 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1224 else
1225 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1226
1227 assert(IfCondVal && "Expected a value");
1228 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1229 llvm::Value *Args[] = {
1230 RTLoc,
1231 getThreadID(CGF, Loc),
1232 IfCondVal,
1233 NumThreadsVal,
1234 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1235 FnPtr,
1236 ID,
1237 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1238 CGF.VoidPtrPtrTy),
1239 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1240 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1241 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1242 Args);
1243 };
1244
1245 RegionCodeGenTy RCG(ParallelGen);
1246 RCG(CGF);
1247}
1248
1249void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1250 // Always emit simple barriers!
1251 if (!CGF.HaveInsertPoint())
1252 return;
1253 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1254 // This function does not use parameters, so we can emit just default values.
1255 llvm::Value *Args[] = {
1256 llvm::ConstantPointerNull::get(
1257 cast<llvm::PointerType>(getIdentTyPointerTy())),
1258 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1259 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1260 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1261 Args);
1262}
1263
1264void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1265 SourceLocation Loc,
1266 OpenMPDirectiveKind Kind, bool,
1267 bool) {
1268 // Always emit simple barriers!
1269 if (!CGF.HaveInsertPoint())
1270 return;
1271 // Build call __kmpc_cancel_barrier(loc, thread_id);
1272 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1273 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1274 getThreadID(CGF, Loc)};
1275
1276 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277 CGM.getModule(), OMPRTL___kmpc_barrier),
1278 Args);
1279}
1280
1281void CGOpenMPRuntimeGPU::emitCriticalRegion(
1282 CodeGenFunction &CGF, StringRef CriticalName,
1283 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1284 const Expr *Hint) {
1285 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1286 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1287 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1288 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1289 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1290
1291 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1292
1293 // Get the mask of active threads in the warp.
1294 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1295 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1296 // Fetch team-local id of the thread.
1297 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1298
1299 // Get the width of the team.
1300 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1301
1302 // Initialize the counter variable for the loop.
1303 QualType Int32Ty =
1304 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1305 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1306 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1307 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1308 /*isInit=*/true);
1309
1310 // Block checks if loop counter exceeds upper bound.
1311 CGF.EmitBlock(LoopBB);
1312 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1313 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1314 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1315
1316 // Block tests which single thread should execute region, and which threads
1317 // should go straight to synchronisation point.
1318 CGF.EmitBlock(TestBB);
1319 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1320 llvm::Value *CmpThreadToCounter =
1321 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1322 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1323
1324 // Block emits the body of the critical region.
1325 CGF.EmitBlock(BodyBB);
1326
1327 // Output the critical statement.
1328 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1329 Hint);
1330
1331 // After the body surrounded by the critical region, the single executing
1332 // thread will jump to the synchronisation point.
1333 // Block waits for all threads in current team to finish then increments the
1334 // counter variable and returns to the loop.
1335 CGF.EmitBlock(SyncBB);
1336 // Reconverge active threads in the warp.
1337 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1339 Mask);
1340
1341 llvm::Value *IncCounterVal =
1342 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1343 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1344 CGF.EmitBranch(LoopBB);
1345
1346 // Block that is reached when all threads in the team complete the region.
1347 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1348}
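// Summary of the generated control flow (illustrative pseudo-code, not part of
// the original source):
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);   // reconverge before the next iteration
//   }
// i.e. the team's threads take turns executing the critical body one at a time.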
1349
1350/// Cast value to the specified type.
1351static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1352 QualType ValTy, QualType CastTy,
1353 SourceLocation Loc) {
1354 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1355 "Cast type must sized.");
1356 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1357 "Val type must sized.");
1358 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1359 if (ValTy == CastTy)
1360 return Val;
1361 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1362 CGF.getContext().getTypeSizeInChars(CastTy))
1363 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1364 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1365 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1366 CastTy->hasSignedIntegerRepresentation());
1367 Address CastItem = CGF.CreateMemTemp(CastTy);
1368 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1369 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1370 Val->getType());
1371 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1372 LValueBaseInfo(AlignmentSource::Type),
1373 TBAAAccessInfo());
1374 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1375 LValueBaseInfo(AlignmentSource::Type),
1376 TBAAAccessInfo());
1377}
1378
1379/// This function creates calls to one of two shuffle functions to copy
1380/// variables between lanes in a warp.
1381static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1382 llvm::Value *Elem,
1383 QualType ElemType,
1384 llvm::Value *Offset,
1385 SourceLocation Loc) {
1386 CodeGenModule &CGM = CGF.CGM;
1387 CGBuilderTy &Bld = CGF.Builder;
1388 CGOpenMPRuntimeGPU &RT =
1389 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1390 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1391
1392 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1393 assert(Size.getQuantity() <= 8 &&
1394 "Unsupported bitwidth in shuffle instruction.");
1395
1396 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1397 ? OMPRTL___kmpc_shuffle_int32
1398 : OMPRTL___kmpc_shuffle_int64;
1399
1400 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1401 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1402 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1403 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1404 llvm::Value *WarpSize =
1405 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1406
1407 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1408 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1409 {ElemCast, Offset, WarpSize});
1410
1411 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1412}
1413
1414static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1415 Address DestAddr, QualType ElemType,
1416 llvm::Value *Offset, SourceLocation Loc) {
1417 CGBuilderTy &Bld = CGF.Builder;
1418
1419 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1420 // Create the loop over the big sized data.
1421 // ptr = (void*)Elem;
1422 // ptrEnd = (void*) Elem + 1;
1423 // Step = 8;
1424 // while (ptr + Step < ptrEnd)
1425 // shuffle((int64_t)*ptr);
1426 // Step = 4;
1427 // while (ptr + Step < ptrEnd)
1428 // shuffle((int32_t)*ptr);
1429 // ...
1430 Address ElemPtr = DestAddr;
1431 Address Ptr = SrcAddr;
1432 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1433 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1434 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1435 if (Size < CharUnits::fromQuantity(IntSize))
1436 continue;
1437 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1438 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1439 /*Signed=*/1);
1440 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1441 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1442 IntTy);
1443 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1444 ElemPtr, IntTy->getPointerTo(), IntTy);
1445 if (Size.getQuantity() / IntSize > 1) {
1446 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1447 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1448 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1449 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1450 CGF.EmitBlock(PreCondBB);
1451 llvm::PHINode *PhiSrc =
1452 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1453 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1454 llvm::PHINode *PhiDest =
1455 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1456 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1457 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1458 ElemPtr =
1459 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1460 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1461 CGF.Int8Ty, PtrEnd.getPointer(),
1462 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1463 CGF.VoidPtrTy));
1464 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1465 ThenBB, ExitBB);
1466 CGF.EmitBlock(ThenBB);
1467 llvm::Value *Res = createRuntimeShuffleFunction(
1468 CGF,
1469 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1470 LValueBaseInfo(AlignmentSource::Type),
1471 TBAAAccessInfo()),
1472 IntType, Offset, Loc);
1473 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1474 LValueBaseInfo(AlignmentSource::Type),
1475 TBAAAccessInfo());
1476 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1477 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1478 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1479 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1480 CGF.EmitBranch(PreCondBB);
1481 CGF.EmitBlock(ExitBB);
1482 } else {
1483 llvm::Value *Res = createRuntimeShuffleFunction(
1484 CGF,
1485 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1486 LValueBaseInfo(AlignmentSource::Type),
1487 TBAAAccessInfo()),
1488 IntType, Offset, Loc);
1489 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1490 LValueBaseInfo(AlignmentSource::Type),
1491 TBAAAccessInfo());
1492 Ptr = Bld.CreateConstGEP(Ptr, 1);
1493 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1494 }
1495 Size = Size % IntSize;
1496 }
1497}
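// Worked example (illustrative): a 7-byte element is moved with one 4-byte
// shuffle, then one 2-byte shuffle, then one 1-byte shuffle, advancing the
// source and destination pointers after each step; the PHI-based inner loop is
// only used when more than one chunk of a given size is needed (e.g. a 16-byte
// element needs two 8-byte shuffles).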
1498
1499namespace {
1500enum CopyAction : unsigned {
1501 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1502 // the warp using shuffle instructions.
1503 RemoteLaneToThread,
1504 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1505 ThreadCopy,
1506 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1507 ThreadToScratchpad,
1508 // ScratchpadToThread: Copy from a scratchpad array in global memory
1509 // containing team-reduced data to a thread's stack.
1510 ScratchpadToThread,
1511};
1512} // namespace
1513
1514struct CopyOptionsTy {
1515 llvm::Value *RemoteLaneOffset;
1516 llvm::Value *ScratchpadIndex;
1517 llvm::Value *ScratchpadWidth;
1518};
1519
1520/// Emit instructions to copy a Reduce list, which contains partially
1521/// aggregated values, in the specified direction.
1522static void emitReductionListCopy(
1523 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1524 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1525 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1526
1527 CodeGenModule &CGM = CGF.CGM;
1528 ASTContext &C = CGM.getContext();
1529 CGBuilderTy &Bld = CGF.Builder;
1530
1531 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1532 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1533 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1534
1535 // Iterates, element-by-element, through the source Reduce list and
1536 // make a copy.
1537 unsigned Idx = 0;
1538 unsigned Size = Privates.size();
1539 for (const Expr *Private : Privates) {
1540 Address SrcElementAddr = Address::invalid();
1541 Address DestElementAddr = Address::invalid();
1542 Address DestElementPtrAddr = Address::invalid();
1543 // Should we shuffle in an element from a remote lane?
1544 bool ShuffleInElement = false;
1545 // Set to true to update the pointer in the dest Reduce list to a
1546 // newly created element.
1547 bool UpdateDestListPtr = false;
1548 // Increment the src or dest pointer to the scratchpad, for each
1549 // new element.
1550 bool IncrScratchpadSrc = false;
1551 bool IncrScratchpadDest = false;
1552 QualType PrivatePtrType = C.getPointerType(Private->getType());
1553 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1554
1555 switch (Action) {
1556 case RemoteLaneToThread: {
1557 // Step 1.1: Get the address for the src element in the Reduce list.
1558 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1559 SrcElementAddr =
1560 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1561 SrcElementPtrAddr, PrivateLlvmPtrType),
1562 PrivatePtrType->castAs<PointerType>());
1563
1564 // Step 1.2: Create a temporary to store the element in the destination
1565 // Reduce list.
1566 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1567 DestElementAddr =
1568 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1569 ShuffleInElement = true;
1570 UpdateDestListPtr = true;
1571 break;
1572 }
1573 case ThreadCopy: {
1574 // Step 1.1: Get the address for the src element in the Reduce list.
1575 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1576 SrcElementAddr =
1577 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1578 SrcElementPtrAddr, PrivateLlvmPtrType),
1579 PrivatePtrType->castAs<PointerType>());
1580
1581 // Step 1.2: Get the address for dest element. The destination
1582 // element has already been created on the thread's stack.
1583 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1584 DestElementAddr =
1585 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1586 DestElementPtrAddr, PrivateLlvmPtrType),
1587 PrivatePtrType->castAs<PointerType>());
1588 break;
1589 }
1590 case ThreadToScratchpad: {
1591 // Step 1.1: Get the address for the src element in the Reduce list.
1592 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1593 SrcElementAddr =
1594 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1595 SrcElementPtrAddr, PrivateLlvmPtrType),
1596 PrivatePtrType->castAs<PointerType>());
1597
1598 // Step 1.2: Get the address for dest element:
1599 // address = base + index * ElementSizeInChars.
1600 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1601 llvm::Value *CurrentOffset =
1602 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1603 llvm::Value *ScratchPadElemAbsolutePtrVal =
1604 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1605 ScratchPadElemAbsolutePtrVal =
1606 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1607 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1608 C.getTypeAlignInChars(Private->getType()));
1609 IncrScratchpadDest = true;
1610 break;
1611 }
1612 case ScratchpadToThread: {
1613 // Step 1.1: Get the address for the src element in the scratchpad.
1614 // address = base + index * ElementSizeInChars.
1615 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1616 llvm::Value *CurrentOffset =
1617 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1618 llvm::Value *ScratchPadElemAbsolutePtrVal =
1619 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1620 ScratchPadElemAbsolutePtrVal =
1621 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1622 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1623 C.getTypeAlignInChars(Private->getType()));
1624 IncrScratchpadSrc = true;
1625
1626 // Step 1.2: Create a temporary to store the element in the destination
1627 // Reduce list.
1628 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1629 DestElementAddr =
1630 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1631 UpdateDestListPtr = true;
1632 break;
1633 }
1634 }
1635
1636 // Regardless of src and dest of copy, we emit the load of src
1637 // element as this is required in all directions
1638 SrcElementAddr = Bld.CreateElementBitCast(
1639 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1640 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1641 SrcElementAddr.getElementType());
1642
1643 // Now that all active lanes have read the element in the
1644 // Reduce list, shuffle over the value from the remote lane.
1645 if (ShuffleInElement) {
1646 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1647 RemoteLaneOffset, Private->getExprLoc());
1648 } else {
1649 switch (CGF.getEvaluationKind(Private->getType())) {
1650 case TEK_Scalar: {
1651 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1652 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1653 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1654 TBAAAccessInfo());
1655 // Store the source element value to the dest element address.
1656 CGF.EmitStoreOfScalar(
1657 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1658 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1659 break;
1660 }
1661 case TEK_Complex: {
1662 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1663 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1664 Private->getExprLoc());
1665 CGF.EmitStoreOfComplex(
1666 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1667 /*isInit=*/false);
1668 break;
1669 }
1670 case TEK_Aggregate:
1671 CGF.EmitAggregateCopy(
1672 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1673 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1674 Private->getType(), AggValueSlot::DoesNotOverlap);
1675 break;
1676 }
1677 }
1678
1679 // Step 3.1: Modify reference in dest Reduce list as needed.
1680 // Modifying the reference in Reduce list to point to the newly
1681 // created element. The element is live in the current function
1682 // scope and that of functions it invokes (i.e., reduce_function).
1683 // RemoteReduceData[i] = (void*)&RemoteElem
1684 if (UpdateDestListPtr) {
1685 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1686 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1687 DestElementPtrAddr, /*Volatile=*/false,
1688 C.VoidPtrTy);
1689 }
1690
1691 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1692 // address of the next element in scratchpad memory, unless we're currently
1693 // processing the last one. Memory alignment is also taken care of here.
1694 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1695 // FIXME: This code doesn't make any sense, it's trying to perform
1696 // integer arithmetic on pointers.
1697 llvm::Value *ScratchpadBasePtr =
1698 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1699 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1700 ScratchpadBasePtr = Bld.CreateNUWAdd(
1701 ScratchpadBasePtr,
1702 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1703
1704 // Take care of global memory alignment for performance
1705 ScratchpadBasePtr = Bld.CreateNUWSub(
1706 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1707 ScratchpadBasePtr = Bld.CreateUDiv(
1708 ScratchpadBasePtr,
1709 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1710 ScratchpadBasePtr = Bld.CreateNUWAdd(
1711 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1712 ScratchpadBasePtr = Bld.CreateNUWMul(
1713 ScratchpadBasePtr,
1714 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1715
1716 if (IncrScratchpadDest)
1717 DestBase =
1718 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1719 else /* IncrScratchpadSrc = true */
1720 SrcBase =
1721 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1722 }
1723
1724 ++Idx;
1725 }
1726}
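// Editor's sketch (not part of the original file): plain host C++ showing the
// round-up-to-alignment arithmetic that the NUWSub/UDiv/NUWAdd/NUWMul sequence
// above emits when advancing the scratchpad base past one row of elements.
// The names roundUpTo and nextScratchpadBase are illustrative assumptions.
#include <cstdint>

namespace sketch {
constexpr uint64_t GlobalMemoryAlignment = 128;

// ((X - 1) / A + 1) * A rounds X up to the next multiple of A.
inline uint64_t roundUpTo(uint64_t X, uint64_t A) {
  return ((X - 1) / A + 1) * A;
}

// Advance a scratchpad base "pointer" (treated as an integer, as the FIXME
// above notes) past Width elements of ElementSize bytes, then re-align it.
inline uint64_t nextScratchpadBase(uint64_t Base, uint64_t Width,
                                   uint64_t ElementSize) {
  return roundUpTo(Base + Width * ElementSize, GlobalMemoryAlignment);
}
} // namespace sketch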
1727
1728/// This function emits a helper that gathers Reduce lists from the first
1729/// lane of every active warp to lanes in the first warp.
1730///
1731/// void inter_warp_copy_func(void* reduce_data, num_warps)
1732/// shared smem[warp_size];
1733/// For all data entries D in reduce_data:
1734/// sync
1735/// If (I am the first lane in each warp)
1736/// Copy my local D to smem[warp_id]
1737/// sync
1738/// if (I am the first warp)
1739/// Copy smem[thread_id] to my local D
1740static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1741 ArrayRef<const Expr *> Privates,
1742 QualType ReductionArrayTy,
1743 SourceLocation Loc) {
1744 ASTContext &C = CGM.getContext();
1745 llvm::Module &M = CGM.getModule();
1746
1747 // ReduceList: thread local Reduce list.
1748 // At the stage of the computation when this function is called, partially
1749 // aggregated values reside in the first lane of every active warp.
1750 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1751 C.VoidPtrTy, ImplicitParamDecl::Other);
1752 // NumWarps: number of warps active in the parallel region. This could
1753 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1754 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1755 C.getIntTypeForBitwidth(32, /* Signed */ true),
1756 ImplicitParamDecl::Other);
1757 FunctionArgList Args;
1758 Args.push_back(&ReduceListArg);
1759 Args.push_back(&NumWarpsArg);
1760
1761 const CGFunctionInfo &CGFI =
1762 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1763 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1764 llvm::GlobalValue::InternalLinkage,
1765 "_omp_reduction_inter_warp_copy_func", &M);
1766 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1767 Fn->setDoesNotRecurse();
1768 CodeGenFunction CGF(CGM);
1769 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1770
1771 CGBuilderTy &Bld = CGF.Builder;
1772
1773 // This array is used as a medium to transfer, one reduce element at a time,
1774 // the data from the first lane of every warp to lanes in the first warp
1775 // in order to perform the final step of a reduction in a parallel region
1776 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1777 // for reduced latency, as well as to have a distinct copy for concurrently
1778 // executing target regions. The array is declared with common linkage so
1779 // as to be shared across compilation units.
1780 StringRef TransferMediumName =
1781 "__openmp_nvptx_data_transfer_temporary_storage";
1782 llvm::GlobalVariable *TransferMedium =
1783 M.getGlobalVariable(TransferMediumName);
1784 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1785 if (!TransferMedium) {
1786 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1787 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1788 TransferMedium = new llvm::GlobalVariable(
1789 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1790 llvm::UndefValue::get(Ty), TransferMediumName,
1791 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1792 SharedAddressSpace);
1793 CGM.addCompilerUsedGlobal(TransferMedium);
1794 }
1795
1796 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1797 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1798 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1799 // nvptx_lane_id = nvptx_id % warpsize
1800 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1801 // nvptx_warp_id = nvptx_id / warpsize
1802 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1803
1804 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1805 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1806 Address LocalReduceList(
1807 Bld.CreatePointerBitCastOrAddrSpaceCast(
1808 CGF.EmitLoadOfScalar(
1809 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1810 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1811 ElemTy->getPointerTo()),
1812 ElemTy, CGF.getPointerAlign());
1813
1814 unsigned Idx = 0;
1815 for (const Expr *Private : Privates) {
1816 //
1817 // Warp master copies reduce element to transfer medium in __shared__
1818 // memory.
1819 //
1820 unsigned RealTySize =
1821 C.getTypeSizeInChars(Private->getType())
1822 .alignTo(C.getTypeAlignInChars(Private->getType()))
1823 .getQuantity();
1824 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1825 unsigned NumIters = RealTySize / TySize;
1826 if (NumIters == 0)
1827 continue;
1828 QualType CType = C.getIntTypeForBitwidth(
1829 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1830 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1831 CharUnits Align = CharUnits::fromQuantity(TySize);
1832 llvm::Value *Cnt = nullptr;
1833 Address CntAddr = Address::invalid();
1834 llvm::BasicBlock *PrecondBB = nullptr;
1835 llvm::BasicBlock *ExitBB = nullptr;
1836 if (NumIters > 1) {
1837 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1838 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1839 /*Volatile=*/false, C.IntTy);
1840 PrecondBB = CGF.createBasicBlock("precond");
1841 ExitBB = CGF.createBasicBlock("exit");
1842 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1843 // There is no need to emit line number for unconditional branch.
1844 (void)ApplyDebugLocation::CreateEmpty(CGF);
1845 CGF.EmitBlock(PrecondBB);
1846 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1847 llvm::Value *Cmp =
1848 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1849 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1850 CGF.EmitBlock(BodyBB);
1851 }
1852 // kmpc_barrier.
1853 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1854 /*EmitChecks=*/false,
1855 /*ForceSimpleCall=*/true);
1856 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1857 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1858 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1859
1860 // if (lane_id == 0)
1861 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1862 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1863 CGF.EmitBlock(ThenBB);
1864
1865 // Reduce element = LocalReduceList[i]
1866 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1867 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1868 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1869 // elemptr = ((CopyType*)(elemptrptr)) + I
1870 Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1871 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1872 if (NumIters > 1)
1873 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1874
1875 // Get pointer to location in transfer medium.
1876 // MediumPtr = &medium[warp_id]
1877 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1878 TransferMedium->getValueType(), TransferMedium,
1879 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1880 // Casting to actual data type.
1881 // MediumPtr = (CopyType*)MediumPtrAddr;
1882 Address MediumPtr(
1883 Bld.CreateBitCast(
1884 MediumPtrVal,
1885 CopyType->getPointerTo(
1886 MediumPtrVal->getType()->getPointerAddressSpace())),
1887 CopyType, Align);
1888
1889 // elem = *elemptr
1890 //*MediumPtr = elem
1891 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1892 ElemPtr, /*Volatile=*/false, CType, Loc,
1894 // Store the source element value to the dest element address.
1895 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1896 LValueBaseInfo(AlignmentSource::Type),
1897 TBAAAccessInfo());
1898
1899 Bld.CreateBr(MergeBB);
1900
1901 CGF.EmitBlock(ElseBB);
1902 Bld.CreateBr(MergeBB);
1903
1904 CGF.EmitBlock(MergeBB);
1905
1906 // kmpc_barrier.
1907 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1908 /*EmitChecks=*/false,
1909 /*ForceSimpleCall=*/true);
1910
1911 //
1912 // Warp 0 copies reduce element from transfer medium.
1913 //
1914 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1915 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1916 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1917
1918 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1919 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1920 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1921
1922 // Up to 32 threads in warp 0 are active.
1923 llvm::Value *IsActiveThread =
1924 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1925 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1926
1927 CGF.EmitBlock(W0ThenBB);
1928
1929 // SrcMediumPtr = &medium[tid]
1930 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1931 TransferMedium->getValueType(), TransferMedium,
1932 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1933 // SrcMediumVal = *SrcMediumPtr;
1934 Address SrcMediumPtr(
1935 Bld.CreateBitCast(
1936 SrcMediumPtrVal,
1937 CopyType->getPointerTo(
1938 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1939 CopyType, Align);
1940
1941 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1942 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1943 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1944 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1945 Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
1946 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
1947 if (NumIters > 1)
1948 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1949
1950 // *TargetElemPtr = SrcMediumVal;
1951 llvm::Value *SrcMediumValue =
1952 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1953 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1954 CType);
1955 Bld.CreateBr(W0MergeBB);
1956
1957 CGF.EmitBlock(W0ElseBB);
1958 Bld.CreateBr(W0MergeBB);
1959
1960 CGF.EmitBlock(W0MergeBB);
1961
1962 if (NumIters > 1) {
1963 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1964 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1965 CGF.EmitBranch(PrecondBB);
1966 (void)ApplyDebugLocation::CreateEmpty(CGF);
1967 CGF.EmitBlock(ExitBB);
1968 }
1969 RealTySize %= TySize;
1970 }
1971 ++Idx;
1972 }
1973
1974 CGF.FinishFunction();
1975 return Fn;
1976}
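// Editor's sketch (illustrative, not generated code): the loop above moves each
// reduce element through the 32-bit shared-memory medium in 4-, then 2-, then
// 1-byte pieces. This host C++ helper (assumed name printTransferPlan) shows
// how RealTySize is decomposed; for example, a 7-byte element needs one 4-byte,
// one 2-byte and one 1-byte transfer, each in its own barrier-guarded round.
#include <cstdio>

inline void printTransferPlan(unsigned RealTySize) {
  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
    unsigned NumIters = RealTySize / TySize; // chunks of this width
    if (NumIters == 0)
      continue;
    std::printf("%u transfer(s) of %u byte(s)\n", NumIters, TySize);
    RealTySize %= TySize;                    // leftover for narrower widths
  }
}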
1977
1978/// Emit a helper that reduces data across two OpenMP threads (lanes)
1979/// in the same warp. It uses shuffle instructions to copy over data from
1980/// a remote lane's stack. The reduction algorithm performed is specified
1981/// by the fourth parameter.
1982///
1983/// Algorithm Versions.
1984/// Full Warp Reduce (argument value 0):
1985/// This algorithm assumes that all 32 lanes are active and gathers
1986/// data from these 32 lanes, producing a single resultant value.
1987/// Contiguous Partial Warp Reduce (argument value 1):
1988/// This algorithm assumes that only a *contiguous* subset of lanes
1989/// are active. This happens for the last warp in a parallel region
1990/// when the user specified num_threads is not an integer multiple of
1991/// 32. This contiguous subset always starts with the zeroth lane.
1992/// Partial Warp Reduce (argument value 2):
1993/// This algorithm gathers data from any number of lanes at any position.
1994/// All reduced values are stored in the lowest possible lane. The set
1995/// of problems every algorithm addresses is a super set of those
1996/// addressable by algorithms with a lower version number. Overhead
1997/// increases as algorithm version increases.
1998///
1999/// Terminology
2000/// Reduce element:
2001/// Reduce element refers to the individual data field with primitive
2002/// data types to be combined and reduced across threads.
2003/// Reduce list:
2004/// Reduce list refers to a collection of local, thread-private
2005/// reduce elements.
2006/// Remote Reduce list:
2007/// Remote Reduce list refers to a collection of remote (relative to
2008/// the current thread) reduce elements.
2009///
2010/// We distinguish between three states of threads that are important to
2011/// the implementation of this function.
2012/// Alive threads:
2013/// Threads in a warp executing the SIMT instruction, as distinguished from
2014/// threads that are inactive due to divergent control flow.
2015/// Active threads:
2016/// The minimal set of threads that has to be alive upon entry to this
2017/// function. The computation is correct iff active threads are alive.
2018/// Some threads are alive but they are not active because they do not
2019/// contribute to the computation in any useful manner. Turning them off
2020/// may introduce control flow overheads without any tangible benefits.
2021/// Effective threads:
2022/// In order to comply with the argument requirements of the shuffle
2023/// function, we must keep all lanes holding data alive. But at most
2024/// half of them perform value aggregation; we refer to this half of
2025/// threads as effective. The other half is simply handing off their
2026/// data.
2027///
2028/// Procedure
2029/// Value shuffle:
2030/// In this step active threads transfer data from higher lane positions
2031/// in the warp to lower lane positions, creating Remote Reduce list.
2032/// Value aggregation:
2033/// In this step, effective threads combine their thread local Reduce list
2034/// with Remote Reduce list and store the result in the thread local
2035/// Reduce list.
2036/// Value copy:
2037/// In this step, we deal with the assumption made by algorithm 2
2038/// (i.e. contiguity assumption). When we have an odd number of lanes
2039/// active, say 2k+1, only k threads will be effective and therefore k
2040/// new values will be produced. However, the Reduce list owned by the
2041/// (2k+1)th thread is ignored in the value aggregation. Therefore
2042/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2043/// that the contiguity assumption still holds.
2044static llvm::Function *emitShuffleAndReduceFunction(
2045 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2046 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2047 ASTContext &C = CGM.getContext();
2048
2049 // Thread local Reduce list used to host the values of data to be reduced.
2050 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2051 C.VoidPtrTy, ImplicitParamDecl::Other);
2052 // Current lane id; could be logical.
2053 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2054 ImplicitParamDecl::Other);
2055 // Offset of the remote source lane relative to the current lane.
2056 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2057 C.ShortTy, ImplicitParamDecl::Other);
2058 // Algorithm version. This is expected to be known at compile time.
2059 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2060 C.ShortTy, ImplicitParamDecl::Other);
2061 FunctionArgList Args;
2062 Args.push_back(&ReduceListArg);
2063 Args.push_back(&LaneIDArg);
2064 Args.push_back(&RemoteLaneOffsetArg);
2065 Args.push_back(&AlgoVerArg);
2066
2067 const CGFunctionInfo &CGFI =
2068 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2069 auto *Fn = llvm::Function::Create(
2070 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2071 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2072 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2073 Fn->setDoesNotRecurse();
2074
2075 CodeGenFunction CGF(CGM);
2076 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2077
2078 CGBuilderTy &Bld = CGF.Builder;
2079
2080 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2081 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2082 Address LocalReduceList(
2083 Bld.CreatePointerBitCastOrAddrSpaceCast(
2084 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2085 C.VoidPtrTy, SourceLocation()),
2086 ElemTy->getPointerTo()),
2087 ElemTy, CGF.getPointerAlign());
2088
2089 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2090 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2091 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2092
2093 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2094 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2095 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2096
2097 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2098 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2099 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2100
2101 // Create a local thread-private variable to host the Reduce list
2102 // from a remote lane.
2103 Address RemoteReduceList =
2104 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2105
2106 // This loop iterates through the list of reduce elements and copies,
2107 // element by element, from a remote lane in the warp to RemoteReduceList,
2108 // hosted on the thread's stack.
2109 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2110 LocalReduceList, RemoteReduceList,
2111 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2112 /*ScratchpadIndex=*/nullptr,
2113 /*ScratchpadWidth=*/nullptr});
2114
2115 // The actions to be performed on the Remote Reduce list are dependent
2116 // on the algorithm version.
2117 //
2118 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2119 // LaneId % 2 == 0 && Offset > 0):
2120 // do the reduction value aggregation
2121 //
2122 // The thread local variable Reduce list is mutated in place to host the
2123 // reduced data, which is the aggregated value produced from local and
2124 // remote lanes.
2125 //
2126 // Note that AlgoVer is expected to be a constant integer known at compile
2127 // time.
2128 // When AlgoVer==0, the first conjunction evaluates to true, making
2129 // the entire predicate true during compile time.
2130 // When AlgoVer==1, the second conjunction has only the second part to be
2131 // evaluated at runtime. The other conjunctions evaluate to false
2132 // during compile time.
2133 // When AlgoVer==2, the third conjunction has only the second part to be
2134 // evaluated at runtime. The other conjunctions evaluate to false
2135 // during compile time.
2136 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2137
2138 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2139 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2140 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2141
2142 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2143 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2144 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2145 CondAlgo2 = Bld.CreateAnd(
2146 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2147
2148 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2149 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2150
2151 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2152 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2153 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2154 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2155
2156 CGF.EmitBlock(ThenBB);
2157 // reduce_function(LocalReduceList, RemoteReduceList)
2158 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2159 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2160 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2161 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2162 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2163 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2164 Bld.CreateBr(MergeBB);
2165
2166 CGF.EmitBlock(ElseBB);
2167 Bld.CreateBr(MergeBB);
2168
2169 CGF.EmitBlock(MergeBB);
2170
2171 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2172 // Reduce list.
2173 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2174 llvm::Value *CondCopy = Bld.CreateAnd(
2175 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2176
2177 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2178 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2179 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2180 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2181
2182 CGF.EmitBlock(CpyThenBB);
2183 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2184 RemoteReduceList, LocalReduceList);
2185 Bld.CreateBr(CpyMergeBB);
2186
2187 CGF.EmitBlock(CpyElseBB);
2188 Bld.CreateBr(CpyMergeBB);
2189
2190 CGF.EmitBlock(CpyMergeBB);
2191
2192 CGF.FinishFunction();
2193 return Fn;
2194}
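// Editor's sketch (plain C++, names assumed): the reduce/copy predicates that
// the generated shuffle-and-reduce helper evaluates from its AlgoVer, LaneId
// and RemoteLaneOffset arguments, written out as ordinary boolean logic.
inline bool shouldReduce(unsigned AlgoVer, unsigned LaneId, int Offset) {
  bool CondAlgo0 = AlgoVer == 0;                                  // full warp
  bool CondAlgo1 = AlgoVer == 1 && LaneId < static_cast<unsigned>(Offset);
  bool CondAlgo2 = AlgoVer == 2 && LaneId % 2 == 0 && Offset > 0; // dispersed
  return CondAlgo0 || CondAlgo1 || CondAlgo2;
}

// With algorithm 1, lanes that do not aggregate instead adopt the remote
// lane's Reduce list, preserving the contiguity assumption.
inline bool shouldCopyRemoteToLocal(unsigned AlgoVer, unsigned LaneId,
                                    int Offset) {
  return AlgoVer == 1 && LaneId >= static_cast<unsigned>(Offset);
}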
2195
2196/// This function emits a helper that copies all the reduction variables from
2197/// the team into the provided global buffer for the reduction variables.
2198///
2199/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2200/// For all data entries D in reduce_data:
2201/// Copy local D to buffer.D[Idx]
2202static llvm::Value *emitListToGlobalCopyFunction(
2203 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2204 QualType ReductionArrayTy, SourceLocation Loc,
2205 const RecordDecl *TeamReductionRec,
2206 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2207 &VarFieldMap) {
2208 ASTContext &C = CGM.getContext();
2209
2210 // Buffer: global reduction buffer.
2211 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2212 C.VoidPtrTy, ImplicitParamDecl::Other);
2213 // Idx: index of the buffer.
2214 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2215 ImplicitParamDecl::Other);
2216 // ReduceList: thread local Reduce list.
2217 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2218 C.VoidPtrTy, ImplicitParamDecl::Other);
2219 FunctionArgList Args;
2220 Args.push_back(&BufferArg);
2221 Args.push_back(&IdxArg);
2222 Args.push_back(&ReduceListArg);
2223
2224 const CGFunctionInfo &CGFI =
2225 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2226 auto *Fn = llvm::Function::Create(
2227 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2228 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2229 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2230 Fn->setDoesNotRecurse();
2231 CodeGenFunction CGF(CGM);
2232 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2233
2234 CGBuilderTy &Bld = CGF.Builder;
2235
2236 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2237 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2238 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2239 Address LocalReduceList(
2240 Bld.CreatePointerBitCastOrAddrSpaceCast(
2241 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2242 C.VoidPtrTy, Loc),
2243 ElemTy->getPointerTo()),
2244 ElemTy, CGF.getPointerAlign());
2245 QualType StaticTy = C.getRecordType(TeamReductionRec);
2246 llvm::Type *LLVMReductionsBufferTy =
2247 CGM.getTypes().ConvertTypeForMem(StaticTy);
2248 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2249 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2250 LLVMReductionsBufferTy->getPointerTo());
2251 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2252 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2253 /*Volatile=*/false, C.IntTy,
2254 Loc)};
2255 unsigned Idx = 0;
2256 for (const Expr *Private : Privates) {
2257 // Reduce element = LocalReduceList[i]
2258 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2259 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2260 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2261 // elemptr = ((CopyType*)(elemptrptr)) + I
2262 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2263 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2264 ElemPtrPtr, ElemTy->getPointerTo());
2265 Address ElemPtr =
2266 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2267 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2268 // Global = Buffer.VD[Idx];
2269 const FieldDecl *FD = VarFieldMap.lookup(VD);
2270 LValue GlobLVal = CGF.EmitLValueForField(
2271 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2272 Address GlobAddr = GlobLVal.getAddress(CGF);
2273 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2274 GlobAddr.getPointer(), Idxs);
2275 GlobLVal.setAddress(Address(BufferPtr,
2276 CGF.ConvertTypeForMem(Private->getType()),
2277 GlobAddr.getAlignment()));
2278 switch (CGF.getEvaluationKind(Private->getType())) {
2279 case TEK_Scalar: {
2280 llvm::Value *V = CGF.EmitLoadOfScalar(
2281 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2282 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2283 CGF.EmitStoreOfScalar(V, GlobLVal);
2284 break;
2285 }
2286 case TEK_Complex: {
2287 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2288 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2289 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2290 break;
2291 }
2292 case TEK_Aggregate:
2293 CGF.EmitAggregateCopy(GlobLVal,
2294 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2295 Private->getType(), AggValueSlot::DoesNotOverlap);
2296 break;
2297 }
2298 ++Idx;
2299 }
2300
2301 CGF.FinishFunction();
2302 return Fn;
2303}
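// Editor's sketch (assumed field names and slot count): a simplified picture of
// the record built by buildRecordForGlobalizedVars and of what the generated
// list_to_global_copy helper does for two reduction variables 'foo' and 'bar'.
// NumSlots stands in for the configured teams-reduction buffer size.
namespace sketch {
constexpr int NumSlots = 1024;

struct TeamReductionBuffer {
  float foo[NumSlots];   // one slot per participating team
  double bar[NumSlots];
};

// Copy this team's private copies into its slot of the global buffer.
inline void listToGlobalCopy(TeamReductionBuffer *Buffer, int Idx,
                             const float *FooPrivate,
                             const double *BarPrivate) {
  Buffer->foo[Idx] = *FooPrivate;
  Buffer->bar[Idx] = *BarPrivate;
}
} // namespace sketch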
2304
2305/// This function emits a helper that reduces all the reduction variables from
2306/// the team into the provided global buffer for the reduction variables.
2307///
2308/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2309/// void *GlobPtrs[];
2310/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2311/// ...
2312/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2313/// reduce_function(GlobPtrs, reduce_data);
2314static llvm::Value *emitListToGlobalReduceFunction(
2315 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2316 QualType ReductionArrayTy, SourceLocation Loc,
2317 const RecordDecl *TeamReductionRec,
2318 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2319 &VarFieldMap,
2320 llvm::Function *ReduceFn) {
2321 ASTContext &C = CGM.getContext();
2322
2323 // Buffer: global reduction buffer.
2324 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2325 C.VoidPtrTy, ImplicitParamDecl::Other);
2326 // Idx: index of the buffer.
2327 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2328 ImplicitParamDecl::Other);
2329 // ReduceList: thread local Reduce list.
2330 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2331 C.VoidPtrTy, ImplicitParamDecl::Other);
2332 FunctionArgList Args;
2333 Args.push_back(&BufferArg);
2334 Args.push_back(&IdxArg);
2335 Args.push_back(&ReduceListArg);
2336
2337 const CGFunctionInfo &CGFI =
2338 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2339 auto *Fn = llvm::Function::Create(
2340 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2341 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2342 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2343 Fn->setDoesNotRecurse();
2344 CodeGenFunction CGF(CGM);
2345 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2346
2347 CGBuilderTy &Bld = CGF.Builder;
2348
2349 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2350 QualType StaticTy = C.getRecordType(TeamReductionRec);
2351 llvm::Type *LLVMReductionsBufferTy =
2352 CGM.getTypes().ConvertTypeForMem(StaticTy);
2353 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2354 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2355 LLVMReductionsBufferTy->getPointerTo());
2356
2357 // 1. Build a list of reduction variables.
2358 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2359 Address ReductionList =
2360 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2361 auto IPriv = Privates.begin();
2362 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2363 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2364 /*Volatile=*/false, C.IntTy,
2365 Loc)};
2366 unsigned Idx = 0;
2367 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2368 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2369 // Global = Buffer.VD[Idx];
2370 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2371 const FieldDecl *FD = VarFieldMap.lookup(VD);
2372 LValue GlobLVal = CGF.EmitLValueForField(
2373 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2374 Address GlobAddr = GlobLVal.getAddress(CGF);
2375 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2376 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2377 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2378 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2379 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2380 // Store array size.
2381 ++Idx;
2382 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2383 llvm::Value *Size = CGF.Builder.CreateIntCast(
2384 CGF.getVLASize(
2385 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2386 .NumElts,
2387 CGF.SizeTy, /*isSigned=*/false);
2388 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2389 Elem);
2390 }
2391 }
2392
2393 // Call reduce_function(GlobalReduceList, ReduceList)
2394 llvm::Value *GlobalReduceList =
2395 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2396 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2397 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2398 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2399 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2400 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2401 CGF.FinishFunction();
2402 return Fn;
2403}
2404
2405/// This function emits a helper that copies all the reduction variables from
2406/// the provided global buffer into the team's local reduce list.
2407///
2408/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2409/// For all data entries D in reduce_data:
2410/// Copy buffer.D[Idx] to local D;
2411static llvm::Value *emitGlobalToListCopyFunction(
2412 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2413 QualType ReductionArrayTy, SourceLocation Loc,
2414 const RecordDecl *TeamReductionRec,
2415 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2416 &VarFieldMap) {
2417 ASTContext &C = CGM.getContext();
2418
2419 // Buffer: global reduction buffer.
2420 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2421 C.VoidPtrTy, ImplicitParamDecl::Other);
2422 // Idx: index of the buffer.
2423 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2425 // ReduceList: thread local Reduce list.
2426 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2427 C.VoidPtrTy, ImplicitParamDecl::Other);
2428 FunctionArgList Args;
2429 Args.push_back(&BufferArg);
2430 Args.push_back(&IdxArg);
2431 Args.push_back(&ReduceListArg);
2432
2433 const CGFunctionInfo &CGFI =
2434 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2435 auto *Fn = llvm::Function::Create(
2436 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2437 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2438 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2439 Fn->setDoesNotRecurse();
2440 CodeGenFunction CGF(CGM);
2441 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2442
2443 CGBuilderTy &Bld = CGF.Builder;
2444
2445 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2446 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2447 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2448 Address LocalReduceList(
2449 Bld.CreatePointerBitCastOrAddrSpaceCast(
2450 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2451 C.VoidPtrTy, Loc),
2452 ElemTy->getPointerTo()),
2453 ElemTy, CGF.getPointerAlign());
2454 QualType StaticTy = C.getRecordType(TeamReductionRec);
2455 llvm::Type *LLVMReductionsBufferTy =
2456 CGM.getTypes().ConvertTypeForMem(StaticTy);
2457 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2458 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2459 LLVMReductionsBufferTy->getPointerTo());
2460
2461 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2462 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2463 /*Volatile=*/false, C.IntTy,
2464 Loc)};
2465 unsigned Idx = 0;
2466 for (const Expr *Private : Privates) {
2467 // Reduce element = LocalReduceList[i]
2468 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2469 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2470 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2471 // elemptr = ((CopyType*)(elemptrptr)) + I
2472 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2473 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2474 ElemPtrPtr, ElemTy->getPointerTo());
2475 Address ElemPtr =
2476 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2477 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2478 // Global = Buffer.VD[Idx];
2479 const FieldDecl *FD = VarFieldMap.lookup(VD);
2480 LValue GlobLVal = CGF.EmitLValueForField(
2481 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2482 Address GlobAddr = GlobLVal.getAddress(CGF);
2483 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2484 GlobAddr.getPointer(), Idxs);
2485 GlobLVal.setAddress(Address(BufferPtr,
2486 CGF.ConvertTypeForMem(Private->getType()),
2487 GlobAddr.getAlignment()));
2488 switch (CGF.getEvaluationKind(Private->getType())) {
2489 case TEK_Scalar: {
2490 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2491 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2493 TBAAAccessInfo());
2494 break;
2495 }
2496 case TEK_Complex: {
2497 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2498 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2499 /*isInit=*/false);
2500 break;
2501 }
2502 case TEK_Aggregate:
2503 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2504 GlobLVal, Private->getType(),
2505 AggValueSlot::DoesNotOverlap);
2506 break;
2507 }
2508 ++Idx;
2509 }
2510
2511 CGF.FinishFunction();
2512 return Fn;
2513}
2514
2515/// This function emits a helper that reduces the reduction variables in the
2516/// provided global buffer into the team's local reduce list.
2517///
2518/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2519/// void *GlobPtrs[];
2520/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2521/// ...
2522/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2523/// reduce_function(reduce_data, GlobPtrs);
2524static llvm::Value *emitGlobalToListReduceFunction(
2525 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2526 QualType ReductionArrayTy, SourceLocation Loc,
2527 const RecordDecl *TeamReductionRec,
2528 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2529 &VarFieldMap,
2530 llvm::Function *ReduceFn) {
2531 ASTContext &C = CGM.getContext();
2532
2533 // Buffer: global reduction buffer.
2534 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2535 C.VoidPtrTy, ImplicitParamDecl::Other);
2536 // Idx: index of the buffer.
2537 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2539 // ReduceList: thread local Reduce list.
2540 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2541 C.VoidPtrTy, ImplicitParamDecl::Other);
2542 FunctionArgList Args;
2543 Args.push_back(&BufferArg);
2544 Args.push_back(&IdxArg);
2545 Args.push_back(&ReduceListArg);
2546
2547 const CGFunctionInfo &CGFI =
2548 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2549 auto *Fn = llvm::Function::Create(
2550 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2551 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2552 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2553 Fn->setDoesNotRecurse();
2554 CodeGenFunction CGF(CGM);
2555 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2556
2557 CGBuilderTy &Bld = CGF.Builder;
2558
2559 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2560 QualType StaticTy = C.getRecordType(TeamReductionRec);
2561 llvm::Type *LLVMReductionsBufferTy =
2562 CGM.getTypes().ConvertTypeForMem(StaticTy);
2563 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2564 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2565 LLVMReductionsBufferTy->getPointerTo());
2566
2567 // 1. Build a list of reduction variables.
2568 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2569 Address ReductionList =
2570 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2571 auto IPriv = Privates.begin();
2572 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2573 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2574 /*Volatile=*/false, C.IntTy,
2575 Loc)};
2576 unsigned Idx = 0;
2577 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2578 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2579 // Global = Buffer.VD[Idx];
2580 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2581 const FieldDecl *FD = VarFieldMap.lookup(VD);
2582 LValue GlobLVal = CGF.EmitLValueForField(
2583 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2584 Address GlobAddr = GlobLVal.getAddress(CGF);
2585 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2586 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2587 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2588 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2589 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2590 // Store array size.
2591 ++Idx;
2592 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2593 llvm::Value *Size = CGF.Builder.CreateIntCast(
2594 CGF.getVLASize(
2595 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2596 .NumElts,
2597 CGF.SizeTy, /*isSigned=*/false);
2598 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2599 Elem);
2600 }
2601 }
2602
2603 // Call reduce_function(ReduceList, GlobalReduceList)
2604 llvm::Value *GlobalReduceList =
2605 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2606 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2607 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2608 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2609 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2610 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2611 CGF.FinishFunction();
2612 return Fn;
2613}
2614
2615///
2616/// Design of OpenMP reductions on the GPU
2617///
2618/// Consider a typical OpenMP program with one or more reduction
2619/// clauses:
2620///
2621/// float foo;
2622/// double bar;
2623/// #pragma omp target teams distribute parallel for \
2624/// reduction(+:foo) reduction(*:bar)
2625/// for (int i = 0; i < N; i++) {
2626/// foo += A[i]; bar *= B[i];
2627/// }
2628///
2629/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2630/// all teams. In our OpenMP implementation on the NVPTX device an
2631/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2632/// within a team are mapped to CUDA threads within a threadblock.
2633/// Our goal is to efficiently aggregate values across all OpenMP
2634/// threads such that:
2635///
2636/// - the compiler and runtime are logically concise, and
2637/// - the reduction is performed efficiently in a hierarchical
2638/// manner as follows: within OpenMP threads in the same warp,
2639/// across warps in a threadblock, and finally across teams on
2640/// the NVPTX device.
2641///
2642/// Introduction to Decoupling
2643///
2644/// We would like to decouple the compiler and the runtime so that the
2645/// latter is ignorant of the reduction variables (number, data types)
2646/// and the reduction operators. This allows a simpler interface
2647/// and implementation while still attaining good performance.
2648///
2649/// Pseudocode for the aforementioned OpenMP program generated by the
2650/// compiler is as follows:
2651///
2652/// 1. Create private copies of reduction variables on each OpenMP
2653/// thread: 'foo_private', 'bar_private'
2654/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2655/// to it and writes the result in 'foo_private' and 'bar_private'
2656/// respectively.
2657/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2658/// and store the result on the team master:
2659///
2660/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2661/// reduceData, shuffleReduceFn, interWarpCpyFn)
2662///
2663/// where:
2664/// struct ReduceData {
2665/// double *foo;
2666/// double *bar;
2667/// } reduceData
2668/// reduceData.foo = &foo_private
2669/// reduceData.bar = &bar_private
2670///
2671/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2672/// auxiliary functions generated by the compiler that operate on
2673/// variables of type 'ReduceData'. They help the runtime perform the
2674/// algorithmic steps in a data-agnostic manner.
2675///
2676/// 'shuffleReduceFn' is a pointer to a function that reduces data
2677/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2678/// same warp. It takes the following arguments as input:
2679///
2680/// a. variable of type 'ReduceData' on the calling lane,
2681/// b. its lane_id,
2682/// c. an offset relative to the current lane_id to generate a
2683/// remote_lane_id. The remote lane contains the second
2684/// variable of type 'ReduceData' that is to be reduced.
2685/// d. an algorithm version parameter determining which reduction
2686/// algorithm to use.
2687///
2688/// 'shuffleReduceFn' retrieves data from the remote lane using
2689/// efficient GPU shuffle intrinsics and reduces, using the
2690/// algorithm specified by the 4th parameter, the two operands
2691/// element-wise. The result is written to the first operand.
2692///
2693/// Different reduction algorithms are implemented in different
2694/// runtime functions, all calling 'shuffleReduceFn' to perform
2695/// the essential reduction step. Therefore, based on the 4th
2696/// parameter, this function behaves slightly differently to
2697/// cooperate with the runtime to ensure correctness under
2698/// different circumstances.
2699///
2700/// 'InterWarpCpyFn' is a pointer to a function that transfers
2701/// reduced variables across warps. It tunnels, through CUDA
2702/// shared memory, the thread-private data of type 'ReduceData'
2703/// from lane 0 of each warp to a lane in the first warp.
2704/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2705/// The last team writes the global reduced value to memory.
2706///
2707/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2708/// reduceData, shuffleReduceFn, interWarpCpyFn,
2709/// scratchpadCopyFn, loadAndReduceFn)
2710///
2711/// 'scratchpadCopyFn' is a helper that stores reduced
2712/// data from the team master to a scratchpad array in
2713/// global memory.
2714///
2715/// 'loadAndReduceFn' is a helper that loads data from
2716/// the scratchpad array and reduces it with the input
2717/// operand.
2718///
2719/// These compiler generated functions hide address
2720/// calculation and alignment information from the runtime.
2721/// 5. if ret == 1:
2722/// The team master of the last team stores the reduced
2723/// result to the globals in memory.
2724/// foo += reduceData.foo; bar *= reduceData.bar
2725///
2726///
2727/// Warp Reduction Algorithms
2728///
2729/// On the warp level, we have three algorithms implemented in the
2730/// OpenMP runtime depending on the number of active lanes:
2731///
2732/// Full Warp Reduction
2733///
2734/// The reduce algorithm within a warp where all lanes are active
2735/// is implemented in the runtime as follows:
2736///
2737/// full_warp_reduce(void *reduce_data,
2738/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2739/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2740/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2741/// }
2742///
2743/// The algorithm completes in log(2, WARPSIZE) steps.
2744///
2745/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2746/// not used; we therefore save instructions by not retrieving lane_id
2747/// from the corresponding special registers. The 4th parameter, which
2748/// represents the version of the algorithm being used, is set to 0 to
2749/// signify full warp reduction.
2750///
2751/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2752///
2753/// #reduce_elem refers to an element in the local lane's data structure
2754/// #remote_elem is retrieved from a remote lane
2755/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2756/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2757///
2758/// Contiguous Partial Warp Reduction
2759///
2760/// This reduce algorithm is used within a warp where only the first
2761/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2762/// number of OpenMP threads in a parallel region is not a multiple of
2763/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2764///
2765/// void
2766/// contiguous_partial_reduce(void *reduce_data,
2767/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2768/// int size, int lane_id) {
2769/// int curr_size;
2770/// int offset;
2771/// curr_size = size;
2772/// offset = curr_size/2;
2773/// while (offset>0) {
2774/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2775/// curr_size = (curr_size+1)/2;
2776/// offset = curr_size/2;
2777/// }
2778/// }
2779///
2780/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2781///
2782/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2783/// if (lane_id < offset)
2784/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2785/// else
2786/// reduce_elem = remote_elem
2787///
2788/// This algorithm assumes that the data to be reduced are located in a
2789/// contiguous subset of lanes starting from the first. When there is
2790/// an odd number of active lanes, the data in the last lane is not
2791/// aggregated with any other lane's data but is instead copied over.
2792///
2793/// Dispersed Partial Warp Reduction
2794///
2795/// This algorithm is used within a warp when any discontiguous subset of
2796/// lanes are active. It is used to implement the reduction operation
2797/// across lanes in an OpenMP simd region or in a nested parallel region.
2798///
2799/// void
2800/// dispersed_partial_reduce(void *reduce_data,
2801/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2802/// int size, remote_id;
2803/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2804/// do {
2805/// remote_id = next_active_lane_id_right_after_me();
2806/// # the above function returns 0 if no active lane
2807/// # is present right after the current lane.
2808/// size = number_of_active_lanes_in_this_warp();
2809/// logical_lane_id /= 2;
2810/// ShuffleReduceFn(reduce_data, logical_lane_id,
2811/// remote_id-1-threadIdx.x, 2);
2812/// } while (logical_lane_id % 2 == 0 && size > 1);
2813/// }
2814///
2815/// There is no assumption made about the initial state of the reduction.
2816/// Any number of lanes (>=1) could be active at any position. The reduction
2817/// result is returned in the first active lane.
2818///
2819/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2820///
2821/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2822/// if (lane_id % 2 == 0 && offset > 0)
2823/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2824/// else
2825/// reduce_elem = remote_elem
2826///
2827///
2828/// Intra-Team Reduction
2829///
2830/// This function, as implemented in the runtime call
2831/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2832/// threads in a team. It first reduces within a warp using the
2833/// aforementioned algorithms. We then proceed to gather all such
2834/// reduced values at the first warp.
2835///
2836/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2837/// data from each of the "warp master" (zeroth lane of each warp, where
2838/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2839/// a mathematical sense) the problem of reduction across warp masters in
2840/// a block to the problem of warp reduction.
2841///
2842///
2843/// Inter-Team Reduction
2844///
2845/// Once a team has reduced its data to a single value, it is stored in
2846/// a global scratchpad array. Since each team has a distinct slot, this
2847/// can be done without locking.
2848///
2849/// The last team to write to the scratchpad array proceeds to reduce the
2850/// scratchpad array. One or more workers in the last team use the helper
2851/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2852/// the k'th worker reduces every k'th element.
2853///
2854/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2855/// reduce across workers and compute a globally reduced value.
2856///
2857void CGOpenMPRuntimeGPU::emitReduction(
2858 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2859 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2860 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2861 if (!CGF.HaveInsertPoint())
2862 return;
2863
2864 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2865#ifndef NDEBUG
2866 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2867#endif
2868
2869 if (Options.SimpleReduction) {
2870 assert(!TeamsReduction && !ParallelReduction &&
2871 "Invalid reduction selection in emitReduction.");
2872 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2873 ReductionOps, Options);
2874 return;
2875 }
2876
2877 assert((TeamsReduction || ParallelReduction) &&
2878 "Invalid reduction selection in emitReduction.");
2879
2880 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2881 // RedList, shuffle_reduce_func, interwarp_copy_func);
2882 // or
2883 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2884 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2885 llvm::Value *ThreadId = getThreadID(CGF, Loc);
2886
2887 llvm::Value *Res;
2888 ASTContext &C = CGM.getContext();
2889 // 1. Build a list of reduction variables.
2890 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2891 auto Size = RHSExprs.size();
2892 for (const Expr *E : Privates) {
2893 if (E->getType()->isVariablyModifiedType())
2894 // Reserve place for array size.
2895 ++Size;
2896 }
2897 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2898 QualType ReductionArrayTy =
2899 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2900 /*IndexTypeQuals=*/0);
2901 Address ReductionList =
2902 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2903 auto IPriv = Privates.begin();
2904 unsigned Idx = 0;
2905 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2906 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2907 CGF.Builder.CreateStore(
2908 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2909 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2910 Elem);
2911 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2912 // Store array size.
2913 ++Idx;
2914 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2915 llvm::Value *Size = CGF.Builder.CreateIntCast(
2916 CGF.getVLASize(
2917 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2918 .NumElts,
2919 CGF.SizeTy, /*isSigned=*/false);
2920 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2921 Elem);
2922 }
2923 }
2924
2925 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2926 ReductionList.getPointer(), CGF.VoidPtrTy);
2927 llvm::Function *ReductionFn = emitReductionFunction(
2928 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2929 Privates, LHSExprs, RHSExprs, ReductionOps);
2930 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2931 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2932 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2933 llvm::Value *InterWarpCopyFn =
2934 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2935
2936 if (ParallelReduction) {
2937 llvm::Value *Args[] = {RTLoc,
2938 ThreadId,
2939 CGF.Builder.getInt32(RHSExprs.size()),
2940 ReductionArrayTySize,
2941 RL,
2942 ShuffleAndReduceFn,
2943 InterWarpCopyFn};
2944
2945 Res = CGF.EmitRuntimeCall(
2946 OMPBuilder.getOrCreateRuntimeFunction(
2947 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2948 Args);
2949 } else {
2950 assert(TeamsReduction && "expected teams reduction.");
2951 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2952 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2953 int Cnt = 0;
2954 for (const Expr *DRE : Privates) {
2955 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2956 ++Cnt;
2957 }
2958 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2959 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2960 C.getLangOpts().OpenMPCUDAReductionBufNum);
2961 TeamsReductions.push_back(TeamReductionRec);
2962 if (!KernelTeamsReductionPtr) {
2963 KernelTeamsReductionPtr = new llvm::GlobalVariable(
2964 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
2965 llvm::GlobalValue::InternalLinkage, nullptr,
2966 "_openmp_teams_reductions_buffer_$_$ptr");
2967 }
2968 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
2969 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
2970 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
2971 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2972 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2973 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2974 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2975 ReductionFn);
2976 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2977 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2978 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2979 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2980 ReductionFn);
2981
2982 llvm::Value *Args[] = {
2983 RTLoc,
2984 ThreadId,
2985 GlobalBufferPtr,
2986 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2987 RL,
2988 ShuffleAndReduceFn,
2989 InterWarpCopyFn,
2990 GlobalToBufferCpyFn,
2991 GlobalToBufferRedFn,
2992 BufferToGlobalCpyFn,
2993 BufferToGlobalRedFn};
2994
2995 Res = CGF.EmitRuntimeCall(
2996 OMPBuilder.getOrCreateRuntimeFunction(
2997 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2998 Args);
2999 }
3000
3001 // 5. Build if (res == 1)
3002 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3003 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3004 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3005 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3006 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3007
3008 // 6. Build then branch: where we have reduced values in the master
3009 // thread in each team.
3010 // __kmpc_end_reduce{_nowait}(<gtid>);
3011 // break;
3012 CGF.EmitBlock(ThenBB);
3013
3014 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3015 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3016 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3017 auto IPriv = Privates.begin();
3018 auto ILHS = LHSExprs.begin();
3019 auto IRHS = RHSExprs.begin();
3020 for (const Expr *E : ReductionOps) {
3021 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3022 cast<DeclRefExpr>(*IRHS));
3023 ++IPriv;
3024 ++ILHS;
3025 ++IRHS;
3026 }
3027 };
3028 llvm::Value *EndArgs[] = {ThreadId};
3029 RegionCodeGenTy RCG(CodeGen);
3030 NVPTXActionTy Action(
3031 nullptr, std::nullopt,
3032 OMPBuilder.getOrCreateRuntimeFunction(
3033 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3034 EndArgs);
3035 RCG.setAction(Action);
3036 RCG(CGF);
3037 // There is no need to emit line number for unconditional branch.
3038 (void)ApplyDebugLocation::CreateEmpty(CGF);
3039 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3040}
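As an orientation aid (an illustrative sketch, not part of this source file, assuming an OpenMP offloading toolchain), the two branches above roughly correspond to user code like the following: a reduction on a combined parallel construct is lowered through __kmpc_nvptx_parallel_reduce_nowait_v2, while a teams-level reduction goes through __kmpc_nvptx_teams_reduce_nowait_v2 and the teams reduction buffer.

    // Illustrative only; the exact runtime calls depend on the execution mode
    // and on the device runtime shipped with the compiler.
    #include <cstdio>
    int main() {
      int ParSum = 0, TeamsSum = 0;
      #pragma omp target parallel for reduction(+ : ParSum) map(tofrom : ParSum)
      for (int I = 0; I < 1024; ++I)
        ParSum += I;
      #pragma omp target teams distribute reduction(+ : TeamsSum) map(tofrom : TeamsSum)
      for (int I = 0; I < 1024; ++I)
        TeamsSum += I;
      std::printf("%d %d\n", ParSum, TeamsSum);
      return 0;
    }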
3041
3042const VarDecl *
3043CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3044 const VarDecl *NativeParam) const {
3045 if (!NativeParam->getType()->isReferenceType())
3046 return NativeParam;
3047 QualType ArgType = NativeParam->getType();
3048 QualifierCollector QC;
3049 const Type *NonQualTy = QC.strip(ArgType);
3050 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3051 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3052 if (Attr->getCaptureKind() == OMPC_map) {
3053 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3054 LangAS::opencl_global);
3055 }
3056 }
3057 ArgType = CGM.getContext().getPointerType(PointeeTy);
3058 QC.addRestrict();
3059 enum { NVPTX_local_addr = 5 };
3060 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3061 ArgType = QC.apply(CGM.getContext(), ArgType);
3062 if (isa<ImplicitParamDecl>(NativeParam))
3063 return ImplicitParamDecl::Create(
3064 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3065 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3066 return ParmVarDecl::Create(
3067 CGM.getContext(),
3068 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3069 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3070 NativeParam->getIdentifier(), ArgType,
3071 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3072}
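For illustration (not part of this file), the translation above applies to by-reference captures of mapped variables in target regions; roughly, a parameter such as the one below is rewritten into a restrict-qualified pointer placed in the NVPTX local address space (address space 5).

    // Illustrative user code only.
    void inc(int &X) {
      #pragma omp target map(tofrom : X)
      X += 1; // the by-reference capture becomes a translated pointer parameter
    }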
3073
3074Address
3075CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3076 const VarDecl *NativeParam,
3077 const VarDecl *TargetParam) const {
3078 assert(NativeParam != TargetParam &&
3079 NativeParam->getType()->isReferenceType() &&
3080 "Native arg must not be the same as target arg.");
3081 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3082 QualType NativeParamType = NativeParam->getType();
3083 QualifierCollector QC;
3084 const Type *NonQualTy = QC.strip(NativeParamType);
3085 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3086 unsigned NativePointeeAddrSpace =
3087 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3088 QualType TargetTy = TargetParam->getType();
3089 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3090 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3091 // First cast to generic.
3092 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3093 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3094 cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3095 // Cast from generic to native address space.
3096 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3097 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3098 cast<llvm::PointerType>(TargetAddr->getType()),
3099 NativePointeeAddrSpace));
3100 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3101 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3102 NativeParamType);
3103 return NativeParamAddr;
3104}
3105
3106void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3107 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3108 ArrayRef<llvm::Value *> Args) const {
3109 SmallVector<llvm::Value *, 4> TargetArgs;
3110 TargetArgs.reserve(Args.size());
3111 auto *FnType = OutlinedFn.getFunctionType();
3112 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3113 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3114 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3115 break;
3116 }
3117 llvm::Type *TargetType = FnType->getParamType(I);
3118 llvm::Value *NativeArg = Args[I];
3119 if (!TargetType->isPointerTy()) {
3120 TargetArgs.emplace_back(NativeArg);
3121 continue;
3122 }
3123 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3124 NativeArg, llvm::PointerType::getWithSamePointeeType(
3125 cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3126 TargetArgs.emplace_back(
3127 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3128 }
3129 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3130}
3131
3132/// Emit a function which wraps the outlined parallel region and controls the
3133/// arguments which are passed to this function.
3134/// The wrapper ensures that the outlined function is called
3135/// with the correct arguments when data is shared.
3136llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3137 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3138 ASTContext &Ctx = CGM.getContext();
3139 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3140
3141 // Create a function that takes as argument the source thread.
3142 FunctionArgList WrapperArgs;
3143 QualType Int16QTy =
3144 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3145 QualType Int32QTy =
3146 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3147 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3148 /*Id=*/nullptr, Int16QTy,
3149 ImplicitParamDecl::Other);
3150 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3151 /*Id=*/nullptr, Int32QTy,
3152 ImplicitParamDecl::Other);
3153 WrapperArgs.emplace_back(&ParallelLevelArg);
3154 WrapperArgs.emplace_back(&WrapperArg);
3155
3156 const CGFunctionInfo &CGFI =
3157 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3158
3159 auto *Fn = llvm::Function::Create(
3160 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3161 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3162
3163 // Ensure we do not inline the function. This is trivially true for the ones
3164 // passed to __kmpc_fork_call but the ones called in serialized regions
3165 // could be inlined. This is not perfect, but it is closer to the invariant
3166 // we want, namely, every data environment starts with a new function.
3167 // TODO: We should pass the if condition to the runtime function and do the
3168 // handling there. Much cleaner code.
3169 Fn->addFnAttr(llvm::Attribute::NoInline);
3170
3171 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3172 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3173 Fn->setDoesNotRecurse();
3174
3175 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3176 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3177 D.getBeginLoc(), D.getBeginLoc());
3178
3179 const auto *RD = CS.getCapturedRecordDecl();
3180 auto CurField = RD->field_begin();
3181
3182 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3183 /*Name=*/".zero.addr");
3184 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3185 // Get the array of arguments.
3186 SmallVector<llvm::Value *, 8> Args;
3187
3188 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3189 Args.emplace_back(ZeroAddr.getPointer());
3190
3191 CGBuilderTy &Bld = CGF.Builder;
3192 auto CI = CS.capture_begin();
3193
3194 // Use global memory for data sharing.
3195 // Handle passing of global args to workers.
3196 Address GlobalArgs =
3197 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3198 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3199 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3200 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3201 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3202 DataSharingArgs);
3203
3204 // Retrieve the shared variables from the list of references returned
3205 // by the runtime. Pass the variables to the outlined function.
3206 Address SharedArgListAddress = Address::invalid();
3207 if (CS.capture_size() > 0 ||
3208 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3209 SharedArgListAddress = CGF.EmitLoadOfPointer(
3210 GlobalArgs, CGF.getContext()
3211 .getPointerType(CGF.getContext().VoidPtrTy)
3212 .castAs<PointerType>());
3213 }
3214 unsigned Idx = 0;
3215 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3216 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3217 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3218 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3219 llvm::Value *LB = CGF.EmitLoadOfScalar(
3220 TypedAddress,
3221 /*Volatile=*/false,
3222 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3223 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3224 Args.emplace_back(LB);
3225 ++Idx;
3226 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3227 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3228 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3229 llvm::Value *UB = CGF.EmitLoadOfScalar(
3230 TypedAddress,
3231 /*Volatile=*/false,
3232 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3233 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3234 Args.emplace_back(UB);
3235 ++Idx;
3236 }
3237 if (CS.capture_size() > 0) {
3238 ASTContext &CGFContext = CGF.getContext();
3239 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3240 QualType ElemTy = CurField->getType();
3241 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3242 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3243 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3244 CGF.ConvertTypeForMem(ElemTy));
3245 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3246 /*Volatile=*/false,
3247 CGFContext.getPointerType(ElemTy),
3248 CI->getLocation());
3249 if (CI->capturesVariableByCopy() &&
3250 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3251 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3252 CI->getLocation());
3253 }
3254 Args.emplace_back(Arg);
3255 }
3256 }
3257
3258 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3259 CGF.FinishFunction();
3260 return Fn;
3261}
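A minimal sketch of a construct that needs such a wrapper (illustrative, assuming an offloading toolchain): a parallel region nested inside a generic-mode target region, whose captured variables reach the worker threads through the list returned by __kmpc_get_shared_variables.

    #include <cstdio>
    int main() {
      int Result = 0;
      #pragma omp target map(tofrom : Result)
      {
        int Shared = 7; // captured by the nested parallel region below
        #pragma omp parallel
        {
          #pragma omp atomic
          Result += Shared; // reaches the outlined body via the wrapper's shared-argument list
        }
      }
      std::printf("%d\n", Result);
      return 0;
    }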
3262
3263void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3264 const Decl *D) {
3265 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3266 return;
3267
3268 assert(D && "Expected function or captured|block decl.");
3269 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3270 "Function is registered already.");
3271 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3272 "Team is set but not processed.");
3273 const Stmt *Body = nullptr;
3274 bool NeedToDelayGlobalization = false;
3275 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3276 Body = FD->getBody();
3277 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3278 Body = BD->getBody();
3279 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3280 Body = CD->getBody();
3281 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3282 if (NeedToDelayGlobalization &&
3283 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3284 return;
3285 }
3286 if (!Body)
3287 return;
3288 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3289 VarChecker.Visit(Body);
3290 const RecordDecl *GlobalizedVarsRecord =
3291 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3292 TeamAndReductions.first = nullptr;
3293 TeamAndReductions.second.clear();
3294 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3295 VarChecker.getEscapedVariableLengthDecls();
3296 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3297 return;
3298 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3299 I->getSecond().MappedParams =
3300 std::make_unique<CodeGenFunction::OMPMapVars>();
3301 I->getSecond().EscapedParameters.insert(
3302 VarChecker.getEscapedParameters().begin(),
3303 VarChecker.getEscapedParameters().end());
3304 I->getSecond().EscapedVariableLengthDecls.append(
3305 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3306 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3307 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3308 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3309 Data.insert(std::make_pair(VD, MappedVarData()));
3310 }
3311 if (!NeedToDelayGlobalization) {
3312 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3313 struct GlobalizationScope final : EHScopeStack::Cleanup {
3314 GlobalizationScope() = default;
3315
3316 void Emit(CodeGenFunction &CGF, Flags flags) override {
3317 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3318 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3319 }
3320 };
3321 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3322 }
3323}
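For illustration (not part of this file), a local variable whose address escapes into a nested parallel region is the typical case this prolog globalizes so that all threads of the team can see it.

    // Illustrative user code only.
    void escapes() {
      #pragma omp target
      {
        int Counter = 0;   // referenced by all threads of the parallel region below
        int *P = &Counter; // taking the address makes the variable escape
        #pragma omp parallel
        {
          #pragma omp atomic
          *P += 1;
        }
      }
    }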
3324
3325Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3326 const VarDecl *VD) {
3327 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3328 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3329 auto AS = LangAS::Default;
3330 switch (A->getAllocatorType()) {
3331 // Use the default allocator here as by default local vars are
3332 // thread-local.
3333 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3334 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3335 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3336 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3337 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3338 // Follow the user decision - use default allocation.
3339 return Address::invalid();
3340 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3341 // TODO: implement support for user-defined allocators.
3342 return Address::invalid();
3343 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3344 AS = LangAS::cuda_constant;
3345 break;
3346 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3347 AS = LangAS::cuda_shared;
3348 break;
3349 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3350 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3351 break;
3352 }
3353 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3354 auto *GV = new llvm::GlobalVariable(
3355 CGM.getModule(), VarTy, /*isConstant=*/false,
3356 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3357 VD->getName(),
3358 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3359 CGM.getContext().getTargetAddressSpace(AS));
3360 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3361 GV->setAlignment(Align.getAsAlign());
3362 return Address(
3363 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
3364 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3365 VD->getType().getAddressSpace()))),
3366 VarTy, Align);
3367 }
3368
3369 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3370 return Address::invalid();
3371
3372 VD = VD->getCanonicalDecl();
3373 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3374 if (I == FunctionGlobalizedDecls.end())
3375 return Address::invalid();
3376 auto VDI = I->getSecond().LocalVarData.find(VD);
3377 if (VDI != I->getSecond().LocalVarData.end())
3378 return VDI->second.PrivateAddr;
3379 if (VD->hasAttrs()) {
3380 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3381 E(VD->attr_end());
3382 IT != E; ++IT) {
3383 auto VDI = I->getSecond().LocalVarData.find(
3384 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3385 ->getCanonicalDecl());
3386 if (VDI != I->getSecond().LocalVarData.end())
3387 return VDI->second.PrivateAddr;
3388 }
3389 }
3390
3391 return Address::invalid();
3392}
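A hedged sketch of the allocator-driven placement handled above (illustrative only; omp_pteam_mem_alloc and omp_const_mem_alloc are the predefined OpenMP allocators that map to the shared and constant address spaces here):

    #include <omp.h>
    void kernel() {
      #pragma omp target
      {
        int TeamScratch; // one copy per team, placed in team-shared memory
        #pragma omp allocate(TeamScratch) allocator(omp_pteam_mem_alloc)
        TeamScratch = 0;
      }
    }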
3393
3394void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3395 FunctionGlobalizedDecls.erase(CGF.CurFn);
3396 CGOpenMPRuntime::functionFinished(CGF);
3397}
3398
3399void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3400 CodeGenFunction &CGF, const OMPLoopDirective &S,
3401 OpenMPDistScheduleClauseKind &ScheduleKind,
3402 llvm::Value *&Chunk) const {
3403 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3404 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3405 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3406 Chunk = CGF.EmitScalarConversion(
3407 RT.getGPUNumThreads(CGF),
3408 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3409 S.getIterationVariable()->getType(), S.getBeginLoc());
3410 return;
3411 }
3412 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3413 CGF, S, ScheduleKind, Chunk);
3414}
3415
3416void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3417 CodeGenFunction &CGF, const OMPLoopDirective &S,
3418 OpenMPScheduleClauseKind &ScheduleKind,
3419 const Expr *&ChunkExpr) const {
3420 ScheduleKind = OMPC_SCHEDULE_static;
3421 // Chunk size is 1 in this case.
3422 llvm::APInt ChunkSize(32, 1);
3423 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3424 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3425 SourceLocation());
3426}
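In effect, a worksharing loop without an explicit schedule clause behaves on this target as if schedule(static, 1) had been written; a minimal sketch (illustrative only):

    // Illustrative user code only.
    void saxpy(int N, float A, const float *X, float *Y) {
      // No schedule clause: defaults to schedule(static, 1) on the GPU.
      #pragma omp target teams distribute parallel for map(to : X[0:N]) map(tofrom : Y[0:N])
      for (int I = 0; I < N; ++I)
        Y[I] = A * X[I] + Y[I];
    }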
3427
3428void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3429 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3430 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3431 " Expected target-based directive.");
3432 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3433 for (const CapturedStmt::Capture &C : CS->captures()) {
3434 // Capture variables captured by reference in lambdas for target-based
3435 // directives.
3436 if (!C.capturesVariable())
3437 continue;
3438 const VarDecl *VD = C.getCapturedVar();
3439 const auto *RD = VD->getType()
3440 .getCanonicalType()
3441 .getNonReferenceType()
3442 ->getAsCXXRecordDecl();
3443 if (!RD || !RD->isLambda())
3444 continue;
3445 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3446 LValue VDLVal;
3447 if (VD->getType().getCanonicalType()->isReferenceType())
3448 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3449 else
3450 VDLVal = CGF.MakeAddrLValue(
3451 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3452 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3453 FieldDecl *ThisCapture = nullptr;
3454 RD->getCaptureFields(Captures, ThisCapture);
3455 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3456 LValue ThisLVal =
3457 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3458 llvm::Value *CXXThis = CGF.LoadCXXThis();
3459 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3460 }
3461 for (const LambdaCapture &LC : RD->captures()) {
3462 if (LC.getCaptureKind() != LCK_ByRef)
3463 continue;
3464 const ValueDecl *VD = LC.getCapturedVar();
3465 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3466 // capturing structured bindings in lambdas yet.
3467 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3468 continue;
3469 auto It = Captures.find(VD);
3470 assert(It != Captures.end() && "Found lambda capture without field.");
3471 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3472 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3473 if (VD->getType().getCanonicalType()->isReferenceType())
3474 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3475 VD->getType().getCanonicalType())
3476 .getAddress(CGF);
3477 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3478 }
3479 }
3480}
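For illustration (not part of this file), the adjustment above targets lambdas with by-reference captures that are invoked inside a target region; the captured references are re-pointed at the device copies of the mapped variables.

    // Illustrative user code only.
    void apply(int &Value) {
      auto Add = [&Value](int Delta) { Value += Delta; };
      #pragma omp target map(tofrom : Value)
      Add(5);
    }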
3481
3482bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3483 LangAS &AS) {
3484 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3485 return false;
3486 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3487 switch(A->getAllocatorType()) {
3488 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3489 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3490 // Not supported, fall back to the default memory space.
3491 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3492 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3493 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3494 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3495 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3496 AS = LangAS::Default;
3497 return true;
3498 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3499 AS = LangAS::cuda_constant;
3500 return true;
3501 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3502 AS = LangAS::cuda_shared;
3503 return true;
3504 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3505 llvm_unreachable("Expected predefined allocator for the variables with the "
3506 "static storage.");
3507 }
3508 return false;
3509}
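A sketch of a file-scope variable that takes this path (illustrative only; whether the allocate directive is accepted in this exact form depends on the compiler, and the predefined allocator determines the address space chosen above):

    #include <omp.h>
    #pragma omp declare target
    static float Gain = 2.0f;
    // The predefined constant-memory allocator maps to the constant address space.
    #pragma omp allocate(Gain) allocator(omp_const_mem_alloc)
    #pragma omp end declare target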
3510
3511// Get current CudaArch and ignore any unknown values
3512static CudaArch getCudaArch(CodeGenModule &CGM) {
3513 if (!CGM.getTarget().hasFeature("ptx"))
3514 return CudaArch::UNKNOWN;
3515 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3516 if (Feature.getValue()) {
3517 CudaArch Arch = StringToCudaArch(Feature.getKey());
3518 if (Arch != CudaArch::UNKNOWN)
3519 return Arch;
3520 }
3521 }
3522 return CudaArch::UNKNOWN;
3523}
3524
3525/// Check to see if the target architecture supports unified addressing, which
3526/// is a restriction for the OpenMP requires clause "unified_shared_memory".
3527void CGOpenMPRuntimeGPU::processRequiresDirective(
3528 const OMPRequiresDecl *D) {
3529 for (const OMPClause *Clause : D->clauselists()) {
3530 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3531 CudaArch Arch = getCudaArch(CGM);
3532 switch (Arch) {
3533 case CudaArch::SM_20:
3534 case CudaArch::SM_21:
3535 case CudaArch::SM_30:
3536 case CudaArch::SM_32:
3537 case CudaArch::SM_35:
3538 case CudaArch::SM_37:
3539 case CudaArch::SM_50:
3540 case CudaArch::SM_52:
3541 case CudaArch::SM_53: {
3542 SmallString<256> Buffer;
3543 llvm::raw_svector_ostream Out(Buffer);
3544 Out << "Target architecture " << CudaArchToString(Arch)
3545 << " does not support unified addressing";
3546 CGM.Error(Clause->getBeginLoc(), Out.str());
3547 return;
3548 }
3549 case CudaArch::SM_60:
3550 case CudaArch::SM_61:
3551 case CudaArch::SM_62:
3552 case CudaArch::SM_70:
3553 case CudaArch::SM_72:
3554 case CudaArch::SM_75:
3555 case CudaArch::SM_80:
3556 case CudaArch::SM_86:
3557 case CudaArch::SM_87:
3558 case CudaArch::SM_89:
3559 case CudaArch::SM_90:
3560 case CudaArch::GFX600:
3561 case CudaArch::GFX601:
3562 case CudaArch::GFX602:
3563 case CudaArch::GFX700:
3564 case CudaArch::GFX701:
3565 case CudaArch::GFX702:
3566 case CudaArch::GFX703:
3567 case CudaArch::GFX704:
3568 case CudaArch::GFX705:
3569 case CudaArch::GFX801:
3570 case CudaArch::GFX802:
3571 case CudaArch::GFX803:
3572 case CudaArch::GFX805:
3573 case CudaArch::GFX810:
3574 case CudaArch::GFX900:
3575 case CudaArch::GFX902:
3576 case CudaArch::GFX904:
3577 case CudaArch::GFX906:
3578 case CudaArch::GFX908:
3579 case CudaArch::GFX909:
3580 case CudaArch::GFX90a:
3581 case CudaArch::GFX90c:
3582 case CudaArch::GFX940:
3583 case CudaArch::GFX941:
3584 case CudaArch::GFX942:
3585 case CudaArch::GFX1010:
3586 case CudaArch::GFX1011:
3587 case CudaArch::GFX1012:
3588 case CudaArch::GFX1013:
3589 case CudaArch::GFX1030:
3590 case CudaArch::GFX1031:
3591 case CudaArch::GFX1032:
3592 case CudaArch::GFX1033:
3593 case CudaArch::GFX1034:
3594 case CudaArch::GFX1035:
3595 case CudaArch::GFX1036:
3596 case CudaArch::GFX1100:
3597 case CudaArch::GFX1101:
3598 case CudaArch::GFX1102:
3599 case CudaArch::GFX1103:
3600 case CudaArch::Generic:
3601 case CudaArch::UNUSED:
3602 case CudaArch::UNKNOWN:
3603 break;
3604 case CudaArch::LAST:
3605 llvm_unreachable("Unexpected Cuda arch.");
3606 }
3607 }
3608 }
3609 CGOpenMPRuntime::processRequiresDirective(D);
3610}
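For illustration (not part of this file), the diagnostic above fires when a translation unit like the following is compiled for sm_53 or older, and is accepted from sm_60 onwards and on the listed AMDGPU targets:

    // Illustrative user code only.
    #pragma omp requires unified_shared_memory
    void touch(int *P, int N) {
      #pragma omp target teams distribute parallel for
      for (int I = 0; I < N; ++I)
        P[I] += 1; // with USM the host pointer is usable directly on the device
    }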
3611
3612void CGOpenMPRuntimeGPU::clear() {
3613
3614 if (!TeamsReductions.empty()) {
3615 ASTContext &C = CGM.getContext();
3616 RecordDecl *StaticRD = C.buildImplicitRecord(
3617 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3618 StaticRD->startDefinition();
3619 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3620 QualType RecTy = C.getRecordType(TeamReductionRec);
3621 auto *Field = FieldDecl::Create(
3622 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3623 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3624 /*BW=*/nullptr, /*Mutable=*/false,
3625 /*InitStyle=*/ICIS_NoInit);
3626 Field->setAccess(AS_public);
3627 StaticRD->addDecl(Field);
3628 }
3629 StaticRD->completeDefinition();
3630 QualType StaticTy = C.getRecordType(StaticRD);
3631 llvm::Type *LLVMReductionsBufferTy =
3632 CGM.getTypes().ConvertTypeForMem(StaticTy);
3633 // FIXME: nvlink does not handle weak linkage correctly (objects with
3634 // different sizes are reported as erroneous).
3635 // Restore CommonLinkage as soon as nvlink is fixed.
3636 auto *GV = new llvm::GlobalVariable(
3637 CGM.getModule(), LLVMReductionsBufferTy,
3638 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3639 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3640 "_openmp_teams_reductions_buffer_$_");
3641 KernelTeamsReductionPtr->setInitializer(
3642 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3643 CGM.VoidPtrTy));
3644 }
3645 CGOpenMPRuntime::clear();
3646}
3647
3648llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3649 CGBuilderTy &Bld = CGF.Builder;
3650 llvm::Module *M = &CGF.CGM.getModule();
3651 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3652 llvm::Function *F = M->getFunction(LocSize);
3653 if (!F) {
3654 F = llvm::Function::Create(
3655 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3656 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3657 }
3658 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3659}
3660
3661llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3662 ArrayRef<llvm::Value *> Args{};
3663 return CGF.EmitRuntimeCall(
3664 OMPBuilder.getOrCreateRuntimeFunction(
3665 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3666 Args);
3667}
3668
3669llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3670 ArrayRef<llvm::Value *> Args{};
3671 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3672 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3673 Args);
3674}
#define V(N, I)
Definition: ASTContext.h:3230
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static CudaArch getCudaArch(CodeGenModule &CGM)
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ... or teams {distribute ....
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static CGOpenMPRuntimeGPU::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
This file defines OpenMP nodes for declarative directives.
unsigned Offset
Definition: Format.cpp:2798
This file defines OpenMP AST classes for clauses.
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
const char * Data
This file defines OpenMP AST classes for executable directives and clauses.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:182
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
Definition: ASTContext.h:1104
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
Definition: ASTContext.h:1077
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2720
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:743
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:40
A class which contains all the information about a particular captured value.
Definition: Decl.h:4361
ArrayRef< Capture > captures() const
Definition: Decl.h:4482
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6025
const BlockDecl * getBlockDecl() const
Definition: Expr.h:6037
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2817
Expr * getCallee()
Definition: Expr.h:2967
arg_range arguments()
Definition: Expr.h:3056
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3586
This captures a statement into a function.
Definition: Stmt.h:3573
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1405
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3677
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1429
capture_range captures()
Definition: Stmt.h:3711
CastKind getCastKind() const
Definition: Expr.h:3531
Expr * getSubExpr()
Definition: Expr.h:3537
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
int64_t QuantityType
Definition: CharUnits.h:40
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
An aligned address.
Definition: Address.h:29
static Address invalid()
Definition: Address.h:49
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:81
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:65
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:71
llvm::Value * getPointer() const
Definition: Address.h:54
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:60
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:870
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:99
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:169
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:177
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ... produce name = getelementptr inbounds addr, i64 0, i64 index where i64 is a...
Definition: CGBuilder.h:214
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:251
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:234
Address CreateGEP(Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:267
CGFunctionInfo - Class to encapsulate the information about a function definition.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
void setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint=false)
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen)
Helper to emit outlined function for 'target' directive.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Function * emitReductionFunction(StringRef ReducerName, SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
CGCapturedStmtInfo * CapturedStmtInfo
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
LValue EmitLValue(const Expr *E, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::Type * ConvertTypeForMem(QualType T)
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
const TargetInfo & getTarget() const
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
llvm::Type * ConvertType(QualType T)
CodeGenTypes & getTypes() const
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
This class organizes the cross-function state that is used while generating LLVM code.
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
llvm::Module & getModule() const
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
const LangOptions & getLangOpts() const
const TargetInfo & getTarget() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
ASTContext & getContext() const
llvm::LLVMContext & getLLVMContext()
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1618
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:671
unsigned getTargetAddressSpace(QualType T) const
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:353
LValue - This represents an lvalue references.
Definition: CGValue.h:171
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:352
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:348
QualType getType() const
Definition: CGValue.h:281
void setAddress(Address address)
Definition: CGValue.h:356
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
virtual void Exit(CodeGenFunction &CGF)
virtual void Enter(CodeGenFunction &CGF)
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:194
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1402
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1642
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1237
ValueDecl * getDecl()
Definition: Expr.h:1305
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1315
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:83
T * getAttr() const
Definition: DeclBase.h:556
bool hasAttrs() const
Definition: DeclBase.h:502
attr_iterator attr_end() const
Definition: DeclBase.h:526
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:952
attr_iterator attr_begin() const
Definition: DeclBase.h:523
SourceLocation getLocation() const
Definition: DeclBase.h:432
DeclContext * getDeclContext()
Definition: DeclBase.h:441
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:424
AttrVec & getAttrs()
Definition: DeclBase.h:508
bool hasAttr() const
Definition: DeclBase.h:560
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:946
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:817
This represents one expression.
Definition: Expr.h:110
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3055
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3051
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:271
QualType getType() const
Definition: Expr.h:142
Represents a member of a struct/union/class.
Definition: Decl.h:2945
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4351
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3634
@ Other
Other implicit parameter.
Definition: Decl.h:1682
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5156
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:984
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1924
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1280
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1293
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:460
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:268
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:274
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:543
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:518
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:534
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:556
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:521
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:489
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:573
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:446
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1003
This represents clause 'reduction' in the '#pragma omp ...' directives.
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:416
clauselist_range clauselists()
Definition: DeclOpenMP.h:441
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2854
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2800
A (possibly-)qualified type.
Definition: Type.h:736
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:6789
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6864
QualType getCanonicalType() const
Definition: Type.h:6716
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6604
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6611
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3983
void addAddressSpace(LangAS space)
Definition: Type.h:403
void addRestrict()
Definition: Type.h:286
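Editorial sketch only, assuming 'Ctx' is an ASTContext and 'Ty' a QualType: the QualifierCollector members above follow a strip/modify/apply pattern, e.g. to rebuild a type in a different address space.

  // Hedged sketch: move 'Ty' into the CUDA shared address space while
  // preserving its other qualifiers.
  QualifierCollector Quals;
  const Type *Stripped = Quals.strip(Ty);      // collect and remove qualifiers
  Quals.addAddressSpace(LangAS::cuda_shared);  // add the qualifier we want
  QualType SharedTy = Quals.apply(Ctx, QualType(Stripped, 0)); // reapply them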
Represents a struct/union/class.
Definition: Decl.h:4012
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:4871
Encodes a location in the source.
RetTy Visit(PTR(Stmt) S, ParamTys... P)
Definition: StmtVisitor.h:43
Stmt - This represents one statement.
Definition: Stmt.h:72
child_range children()
Definition: Stmt.cpp:286
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:196
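Illustration only, assuming 'D' is an OMPExecutableDirective: IgnoreContainers is commonly combined with the captured-statement accessors listed earlier to reach the statement a directive actually controls.

  // Hedged sketch: unwrap compound/attributed wrappers and the top-level
  // CapturedStmt to get the directive's real body.
  const Stmt *Body = D.getInnermostCapturedStmt()->getCapturedStmt()
                         ->IgnoreContainers(/*IgnoreCaptured=*/true);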
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4528
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is guaranteed to produce a correctly-aligned pointer.
Definition: TargetInfo.h:695
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:283
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1539
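As a sketch (assuming 'CGF' is a CodeGenFunction), the grid values exposed here are how target-specific constants such as the warp size are typically read instead of querying hardware registers at run time.

  // Hedged sketch: read the target's warp/wavefront size from its grid values.
  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;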
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1393
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
The base class of the type hierarchy.
Definition: Type.h:1568
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or because it is the injected-class-name type of a class template or class template partial specialization.
Definition: Type.cpp:1799
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7256
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:7497
bool isReferenceType() const
Definition: Type.h:6928
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:631
bool isLValueReferenceType() const
Definition: Type.h:6932
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:2103
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2347
UnaryOperator - This represents the unary-expression's (except sizeof and alignof), the postinc/postdec operators from postfix-expression, and various extensions.
Definition: Expr.h:2181
Expr * getSubExpr() const
Definition: Expr.h:2226
Opcode getOpcode() const
Definition: Expr.h:2221
Represent the declaration of a variable (in which case it is an lvalue), a function (in which case it is a function designator), or an enum constant.
Definition: Decl.h:701
QualType getType() const
Definition: Decl.h:712
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5148
Represents a variable declaration or definition.
Definition: Decl.h:913
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2203
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1528
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are of a specific type.
Definition: AttrIterator.h:33
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of an object with static storage duration.
CudaArch
Definition: Cuda.h:51
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:263
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:161
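Sketch only (assuming 'CGM' is a CodeGenModule): this helper maps the target CPU name coming from the command line onto the CudaArch enumeration.

  // Hedged sketch: map the -target-cpu string (e.g. "sm_70") to a CudaArch value.
  CudaArch Arch = StringToCudaArch(CGM.getTarget().getTargetOpts().CPU);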
@ CR_OpenMP
Definition: CapturedStmt.h:19
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
@ SC_None
Definition: Specifiers.h:241
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
@ C
Languages that the frontend can parse and compile.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop bound sharing.
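Illustration only, with 'DKind' an assumed OpenMPDirectiveKind: the directive-kind predicates listed above are typically combined when classifying a target region. This is a rough sketch, not the file's actual SPMD rule.

  // Hedged sketch: a combined 'target ... parallel ...' construct is a
  // candidate for SPMD-style code generation; a bare 'target' or
  // 'target teams' region is not.
  bool IsSPMDCandidate = isOpenMPTargetExecutionDirective(DKind) &&
                         isOpenMPParallelDirective(DKind);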
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
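A minimal sketch of the capture-region query; the directive kind passed in is only an example.

  // Hedged sketch: the vector receives one OpenMPDirectiveKind entry per
  // nested capture region of the (combined) directive.
  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
  getOpenMPCaptureRegions(CaptureRegions, llvm::omp::OMPD_target_teams);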
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:143
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
@ AS_public
Definition: Specifiers.h:115
llvm::Value * ScratchpadIndex
llvm::Value * ScratchpadWidth
llvm::Value * RemoteLaneOffset
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * IntTy
int