1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/OpenMPClause.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
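/// For example, emitSPMDKernel enters with EM_SPMD and emitNonSPMDKernel with
/// EM_NonSPMD; the previous mode is restored when the RAII object goes out of
/// scope.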
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
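/// Return the canonical declaration behind a private/reduction item
/// expression: array subscripts, array sections and implicit casts are
/// stripped, so 'a[i][j]' and an array section of 'a' both map back to the
/// declaration of 'a'; member expressions map to the referenced member.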
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(RefExpr);
107 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108}
109
110static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124
125 // Build struct _globalized_locals_ty {
126 // /* globalized vars */[WarpSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
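// For example, with BufSize equal to the warp size (one slot per lane), an
// escaped local 'int x' becomes a field 'int x[WarpSize]' carrying an explicit
// aligned attribute, while variables in EscapedDeclsForTeams keep their
// scalar type.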
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(VD)) {
143 Field = FieldDecl::Create(
144 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(*I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
159 ArraySizeModifier::Normal, 0);
160 }
161 Field = FieldDecl::Create(
162 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(AlignedAttr::CreateImplicit(
169 C, /*IsAlignmentExpr=*/true,
170 IntegerLiteral::Create(C, Align,
171 C.getIntTypeForBitwidth(32, /*Signed=*/0),
172 SourceLocation()),
173 {}, AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(Field);
176 MappedDeclsFields.try_emplace(VD, Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180}
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(VD);
242 else
243 DelayedVariableLengthDecls.insert(VD);
244 } else
245 EscapedDecls.insert(VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and is
465 /// used instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504/// Get the id of the warp in the block.
505/// We assume that the warp size is 32, which is always the case
506/// on the NVPTX device, to generate more efficient code.
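/// For example, with a warp size of 32 (LaneIDBits == 5) the thread with id 70
/// belongs to warp 70 >> 5 == 2.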
507static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
508 CGBuilderTy &Bld = CGF.Builder;
509 unsigned LaneIDBits =
510 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
511 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
512 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
513}
514
515/// Get the id of the current lane in the Warp.
516/// We assume that the warp size is 32, which is always the case
517/// on the NVPTX device, to generate more efficient code.
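/// For example, with a warp size of 32 the thread with id 70 occupies lane
/// 70 & 31 == 6 within its warp.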
518static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
519 CGBuilderTy &Bld = CGF.Builder;
520 unsigned LaneIDBits =
521 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
522 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
523 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
524 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
525 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
526 "nvptx_lane_id");
527}
528
529CGOpenMPRuntimeGPU::ExecutionMode
530CGOpenMPRuntimeGPU::getExecutionMode() const {
531 return CurrentExecutionMode;
532}
533
534CGOpenMPRuntimeGPU::DataSharingMode
535CGOpenMPRuntimeGPU::getDataSharingMode() const {
536 return CurrentDataSharingMode;
537}
538
539/// Check for inner (nested) SPMD construct, if any
540static bool hasNestedSPMDDirective(ASTContext &Ctx,
541 const OMPExecutableDirective &D) {
542 const auto *CS = D.getInnermostCapturedStmt();
543 const auto *Body =
544 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546
547 if (const auto *NestedDir =
548 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
549 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550 switch (D.getDirectiveKind()) {
551 case OMPD_target:
552 if (isOpenMPParallelDirective(DKind))
553 return true;
554 if (DKind == OMPD_teams) {
555 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556 /*IgnoreCaptured=*/true);
557 if (!Body)
558 return false;
559 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560 if (const auto *NND =
561 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
562 DKind = NND->getDirectiveKind();
563 if (isOpenMPParallelDirective(DKind))
564 return true;
565 }
566 }
567 return false;
568 case OMPD_target_teams:
569 return isOpenMPParallelDirective(DKind);
570 case OMPD_target_simd:
571 case OMPD_target_parallel:
572 case OMPD_target_parallel_for:
573 case OMPD_target_parallel_for_simd:
574 case OMPD_target_teams_distribute:
575 case OMPD_target_teams_distribute_simd:
576 case OMPD_target_teams_distribute_parallel_for:
577 case OMPD_target_teams_distribute_parallel_for_simd:
578 case OMPD_parallel:
579 case OMPD_for:
580 case OMPD_parallel_for:
581 case OMPD_parallel_master:
582 case OMPD_parallel_sections:
583 case OMPD_for_simd:
584 case OMPD_parallel_for_simd:
585 case OMPD_cancel:
586 case OMPD_cancellation_point:
587 case OMPD_ordered:
588 case OMPD_threadprivate:
589 case OMPD_allocate:
590 case OMPD_task:
591 case OMPD_simd:
592 case OMPD_sections:
593 case OMPD_section:
594 case OMPD_single:
595 case OMPD_master:
596 case OMPD_critical:
597 case OMPD_taskyield:
598 case OMPD_barrier:
599 case OMPD_taskwait:
600 case OMPD_taskgroup:
601 case OMPD_atomic:
602 case OMPD_flush:
603 case OMPD_depobj:
604 case OMPD_scan:
605 case OMPD_teams:
606 case OMPD_target_data:
607 case OMPD_target_exit_data:
608 case OMPD_target_enter_data:
609 case OMPD_distribute:
610 case OMPD_distribute_simd:
611 case OMPD_distribute_parallel_for:
612 case OMPD_distribute_parallel_for_simd:
613 case OMPD_teams_distribute:
614 case OMPD_teams_distribute_simd:
615 case OMPD_teams_distribute_parallel_for:
616 case OMPD_teams_distribute_parallel_for_simd:
617 case OMPD_target_update:
618 case OMPD_declare_simd:
619 case OMPD_declare_variant:
620 case OMPD_begin_declare_variant:
621 case OMPD_end_declare_variant:
622 case OMPD_declare_target:
623 case OMPD_end_declare_target:
624 case OMPD_declare_reduction:
625 case OMPD_declare_mapper:
626 case OMPD_taskloop:
627 case OMPD_taskloop_simd:
628 case OMPD_master_taskloop:
629 case OMPD_master_taskloop_simd:
630 case OMPD_parallel_master_taskloop:
631 case OMPD_parallel_master_taskloop_simd:
632 case OMPD_requires:
633 case OMPD_unknown:
634 default:
635 llvm_unreachable("Unexpected directive.");
636 }
637 }
638
639 return false;
640}
641
642static bool supportsSPMDExecutionMode(ASTContext &Ctx,
643 const OMPExecutableDirective &D) {
644 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
645 switch (DirectiveKind) {
646 case OMPD_target:
647 case OMPD_target_teams:
648 return hasNestedSPMDDirective(Ctx, D);
649 case OMPD_target_parallel_loop:
650 case OMPD_target_parallel:
651 case OMPD_target_parallel_for:
652 case OMPD_target_parallel_for_simd:
653 case OMPD_target_teams_distribute_parallel_for:
654 case OMPD_target_teams_distribute_parallel_for_simd:
655 case OMPD_target_simd:
656 case OMPD_target_teams_distribute_simd:
657 return true;
658 case OMPD_target_teams_distribute:
659 return false;
660 case OMPD_target_teams_loop:
661 // Whether this is true or not depends on how the directive will
662 // eventually be emitted.
663 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
664 return TTLD->canBeParallelFor();
665 return false;
666 case OMPD_parallel:
667 case OMPD_for:
668 case OMPD_parallel_for:
669 case OMPD_parallel_master:
670 case OMPD_parallel_sections:
671 case OMPD_for_simd:
672 case OMPD_parallel_for_simd:
673 case OMPD_cancel:
674 case OMPD_cancellation_point:
675 case OMPD_ordered:
676 case OMPD_threadprivate:
677 case OMPD_allocate:
678 case OMPD_task:
679 case OMPD_simd:
680 case OMPD_sections:
681 case OMPD_section:
682 case OMPD_single:
683 case OMPD_master:
684 case OMPD_critical:
685 case OMPD_taskyield:
686 case OMPD_barrier:
687 case OMPD_taskwait:
688 case OMPD_taskgroup:
689 case OMPD_atomic:
690 case OMPD_flush:
691 case OMPD_depobj:
692 case OMPD_scan:
693 case OMPD_teams:
694 case OMPD_target_data:
695 case OMPD_target_exit_data:
696 case OMPD_target_enter_data:
697 case OMPD_distribute:
698 case OMPD_distribute_simd:
699 case OMPD_distribute_parallel_for:
700 case OMPD_distribute_parallel_for_simd:
701 case OMPD_teams_distribute:
702 case OMPD_teams_distribute_simd:
703 case OMPD_teams_distribute_parallel_for:
704 case OMPD_teams_distribute_parallel_for_simd:
705 case OMPD_target_update:
706 case OMPD_declare_simd:
707 case OMPD_declare_variant:
708 case OMPD_begin_declare_variant:
709 case OMPD_end_declare_variant:
710 case OMPD_declare_target:
711 case OMPD_end_declare_target:
712 case OMPD_declare_reduction:
713 case OMPD_declare_mapper:
714 case OMPD_taskloop:
715 case OMPD_taskloop_simd:
716 case OMPD_master_taskloop:
717 case OMPD_master_taskloop_simd:
718 case OMPD_parallel_master_taskloop:
719 case OMPD_parallel_master_taskloop_simd:
720 case OMPD_requires:
721 case OMPD_unknown:
722 default:
723 break;
724 }
725 llvm_unreachable(
726 "Unknown programming model for OpenMP directive on NVPTX target.");
727}
728
729void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
730 StringRef ParentName,
731 llvm::Function *&OutlinedFn,
732 llvm::Constant *&OutlinedFnID,
733 bool IsOffloadEntry,
734 const RegionCodeGenTy &CodeGen) {
735 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
736 EntryFunctionState EST;
737 WrapperFunctionsMap.clear();
738
739 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
740 assert(!IsBareKernel && "bare kernel should not be at generic mode");
741
742 // Emit target region as a standalone region.
743 class NVPTXPrePostActionTy : public PrePostActionTy {
744 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
745 const OMPExecutableDirective &D;
746
747 public:
748 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
749 const OMPExecutableDirective &D)
750 : EST(EST), D(D) {}
751 void Enter(CodeGenFunction &CGF) override {
752 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
753 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
754 // Skip target region initialization.
755 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
756 }
757 void Exit(CodeGenFunction &CGF) override {
758 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
759 RT.clearLocThreadIdInsertPt(CGF);
760 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
761 }
762 } Action(EST, D);
763 CodeGen.setAction(Action);
764 IsInTTDRegion = true;
765 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
766 IsOffloadEntry, CodeGen);
767 IsInTTDRegion = false;
768}
769
770void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
771 CodeGenFunction &CGF,
772 EntryFunctionState &EST, bool IsSPMD) {
773 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
774 MaxTeamsVal = -1;
775 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
776 MinTeamsVal, MaxTeamsVal);
777
778 CGBuilderTy &Bld = CGF.Builder;
779 Bld.restoreIP(OMPBuilder.createTargetInit(
780 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
781 if (!IsSPMD)
782 emitGenericVarsProlog(CGF, EST.Loc);
783}
784
785void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
786 EntryFunctionState &EST,
787 bool IsSPMD) {
788 if (!IsSPMD)
789 emitGenericVarsEpilog(CGF);
790
791 // This is temporary until we remove the fixed sized buffer.
792 ASTContext &C = CGM.getContext();
793 RecordDecl *StaticRD = C.buildImplicitRecord(
794 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
795 StaticRD->startDefinition();
796 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
797 QualType RecTy = C.getRecordType(TeamReductionRec);
798 auto *Field = FieldDecl::Create(
799 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
800 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
801 /*BW=*/nullptr, /*Mutable=*/false,
802 /*InitStyle=*/ICIS_NoInit);
803 Field->setAccess(AS_public);
804 StaticRD->addDecl(Field);
805 }
806 StaticRD->completeDefinition();
807 QualType StaticTy = C.getRecordType(StaticRD);
808 llvm::Type *LLVMReductionsBufferTy =
809 CGM.getTypes().ConvertTypeForMem(StaticTy);
810 const auto &DL = CGM.getModule().getDataLayout();
811 uint64_t ReductionDataSize =
812 TeamsReductions.empty()
813 ? 0
814 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
815 CGBuilderTy &Bld = CGF.Builder;
816 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
817 C.getLangOpts().OpenMPCUDAReductionBufNum);
818 TeamsReductions.clear();
819}
820
821void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
822 StringRef ParentName,
823 llvm::Function *&OutlinedFn,
824 llvm::Constant *&OutlinedFnID,
825 bool IsOffloadEntry,
826 const RegionCodeGenTy &CodeGen) {
827 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
828 EntryFunctionState EST;
829
830 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
831
832 // Emit target region as a standalone region.
833 class NVPTXPrePostActionTy : public PrePostActionTy {
834 CGOpenMPRuntimeGPU &RT;
835 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
836 bool IsBareKernel;
837 DataSharingMode Mode;
838 const OMPExecutableDirective &D;
839
840 public:
841 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
842 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
843 bool IsBareKernel, const OMPExecutableDirective &D)
844 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
845 Mode(RT.CurrentDataSharingMode), D(D) {}
846 void Enter(CodeGenFunction &CGF) override {
847 if (IsBareKernel) {
848 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
849 return;
850 }
851 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
852 // Skip target region initialization.
853 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
854 }
855 void Exit(CodeGenFunction &CGF) override {
856 if (IsBareKernel) {
857 RT.CurrentDataSharingMode = Mode;
858 return;
859 }
860 RT.clearLocThreadIdInsertPt(CGF);
861 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
862 }
863 } Action(*this, EST, IsBareKernel, D);
864 CodeGen.setAction(Action);
865 IsInTTDRegion = true;
866 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
867 IsOffloadEntry, CodeGen);
868 IsInTTDRegion = false;
869}
870
871void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
872 const OMPExecutableDirective &D, StringRef ParentName,
873 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
874 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
875 if (!IsOffloadEntry) // Nothing to do.
876 return;
877
878 assert(!ParentName.empty() && "Invalid target region parent name!");
879
880 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
881 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
882 if (Mode || IsBareKernel)
883 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
884 CodeGen);
885 else
886 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
887 CodeGen);
888}
889
890CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
891 : CGOpenMPRuntime(CGM) {
892 llvm::OpenMPIRBuilderConfig Config(
893 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
894 CGM.getLangOpts().OpenMPOffloadMandatory,
895 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
896 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
897 OMPBuilder.setConfig(Config);
898
899 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
900 llvm_unreachable("OpenMP can only handle device code.");
901
902 if (CGM.getLangOpts().OpenMPCUDAMode)
903 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
904
905 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
906 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
907 return;
908
909 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
910 "__omp_rtl_debug_kind");
911 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
912 "__omp_rtl_assume_teams_oversubscription");
913 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
914 "__omp_rtl_assume_threads_oversubscription");
915 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
916 "__omp_rtl_assume_no_thread_state");
917 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
918 "__omp_rtl_assume_no_nested_parallelism");
919}
920
921void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
922 ProcBindKind ProcBind,
923 SourceLocation Loc) {
924 // Nothing to do.
925}
926
927void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
928 llvm::Value *NumThreads,
929 SourceLocation Loc) {
930 // Nothing to do.
931}
932
933void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
934 const Expr *NumTeams,
935 const Expr *ThreadLimit,
936 SourceLocation Loc) {}
937
938llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
939 CodeGenFunction &CGF, const OMPExecutableDirective &D,
940 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
941 const RegionCodeGenTy &CodeGen) {
942 // Emit target region as a standalone region.
943 bool PrevIsInTTDRegion = IsInTTDRegion;
944 IsInTTDRegion = false;
945 auto *OutlinedFun =
946 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
947 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
948 IsInTTDRegion = PrevIsInTTDRegion;
949 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
950 llvm::Function *WrapperFun =
951 createParallelDataSharingWrapper(OutlinedFun, D);
952 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
953 }
954
955 return OutlinedFun;
956}
957
958/// Get list of lastprivate variables from the teams distribute ... or
959/// teams {distribute ...} directives.
960static void
961getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 const OMPExecutableDirective *Dir = &D;
966 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
967 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
968 Ctx,
969 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
970 /*IgnoreCaptured=*/true))) {
971 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
972 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
973 Dir = nullptr;
974 }
975 }
976 if (!Dir)
977 return;
978 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
979 for (const Expr *E : C->getVarRefs())
980 Vars.push_back(getPrivateItem(E));
981 }
982}
983
984/// Get list of reduction variables from the teams ... directives.
985static void
986getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
987 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
988 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
989 "expected teams directive.");
990 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
991 for (const Expr *E : C->privates())
992 Vars.push_back(getPrivateItem(E));
993 }
994}
995
996llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
997 CodeGenFunction &CGF, const OMPExecutableDirective &D,
998 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
999 const RegionCodeGenTy &CodeGen) {
1000 SourceLocation Loc = D.getBeginLoc();
1001
1002 const RecordDecl *GlobalizedRD = nullptr;
1003 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1004 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1005 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1006 // Globalize team reductions variable unconditionally in all modes.
1007 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1008 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1009 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1010 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1011 if (!LastPrivatesReductions.empty()) {
1012 GlobalizedRD = ::buildRecordForGlobalizedVars(
1013 CGM.getContext(), std::nullopt, LastPrivatesReductions,
1014 MappedDeclsFields, WarpSize);
1015 }
1016 } else if (!LastPrivatesReductions.empty()) {
1017 assert(!TeamAndReductions.first &&
1018 "Previous team declaration is not expected.");
1019 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1020 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1021 }
1022
1023 // Emit target region as a standalone region.
1024 class NVPTXPrePostActionTy : public PrePostActionTy {
1025 SourceLocation &Loc;
1026 const RecordDecl *GlobalizedRD;
1027 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1028 &MappedDeclsFields;
1029
1030 public:
1031 NVPTXPrePostActionTy(
1032 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1033 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1034 &MappedDeclsFields)
1035 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1036 MappedDeclsFields(MappedDeclsFields) {}
1037 void Enter(CodeGenFunction &CGF) override {
1038 auto &Rt =
1039 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1040 if (GlobalizedRD) {
1041 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1042 I->getSecond().MappedParams =
1043 std::make_unique<CodeGenFunction::OMPMapVars>();
1044 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1045 for (const auto &Pair : MappedDeclsFields) {
1046 assert(Pair.getFirst()->isCanonicalDecl() &&
1047 "Expected canonical declaration");
1048 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1049 }
1050 }
1051 Rt.emitGenericVarsProlog(CGF, Loc);
1052 }
1053 void Exit(CodeGenFunction &CGF) override {
1054 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1055 .emitGenericVarsEpilog(CGF);
1056 }
1057 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1058 CodeGen.setAction(Action);
1059 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1060 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1061
1062 return OutlinedFun;
1063}
1064
1065void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1066 SourceLocation Loc) {
1067 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1068 return;
1069
1070 CGBuilderTy &Bld = CGF.Builder;
1071
1072 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1073 if (I == FunctionGlobalizedDecls.end())
1074 return;
1075
1076 for (auto &Rec : I->getSecond().LocalVarData) {
1077 const auto *VD = cast<VarDecl>(Rec.first);
1078 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1079 QualType VarTy = VD->getType();
1080
1081 // Get the local allocation of a firstprivate variable before sharing
1082 llvm::Value *ParValue;
1083 if (EscapedParam) {
1084 LValue ParLVal =
1085 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1086 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1087 }
1088
1089 // Allocate space for the variable to be globalized
1090 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1091 llvm::CallBase *VoidPtr =
1092 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1093 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1094 AllocArgs, VD->getName());
1095 // FIXME: We should use the variables actual alignment as an argument.
1096 VoidPtr->addRetAttr(llvm::Attribute::get(
1097 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1098 CGM.getContext().getTypeAlignInChars(VD->getType()).getQuantity()));
1099
1100 // Cast the void pointer and get the address of the globalized variable.
1101 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1102 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1103 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1104 LValue VarAddr =
1105 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1106 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1107 Rec.second.GlobalizedVal = VoidPtr;
1108
1109 // Assign the local allocation to the newly globalized location.
1110 if (EscapedParam) {
1111 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1112 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1113 }
1114 if (auto *DI = CGF.getDebugInfo())
1115 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1116 }
1117
1118 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1119 const auto *VD = cast<VarDecl>(ValueD);
1120 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1121 getKmpcAllocShared(CGF, VD);
1122 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1123 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1124 CGM.getContext().getDeclAlign(VD),
1125 AlignmentSource::Decl);
1126 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
1127 }
1128 I->getSecond().MappedParams->apply(CGF);
1129}
1130
1130
1131bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1132 const VarDecl *VD) const {
1133 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1134 if (I == FunctionGlobalizedDecls.end())
1135 return false;
1136
1137 // Check variable declaration is delayed:
1138 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1139}
1140
1141std::pair<llvm::Value *, llvm::Value *>
1142CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1143 const VarDecl *VD) {
1144 CGBuilderTy &Bld = CGF.Builder;
1145
1146 // Compute size and alignment.
1147 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1148 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1149 Size = Bld.CreateNUWAdd(
1150 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1151 llvm::Value *AlignVal =
1152 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1153 Size = Bld.CreateUDiv(Size, AlignVal);
1154 Size = Bld.CreateNUWMul(Size, AlignVal);
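 // The two operations above round the size up to a multiple of the alignment;
 // e.g. a 10-byte VLA with 8-byte alignment is allocated as
 // (10 + 7) / 8 * 8 == 16 bytes.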
1155
1156 // Allocate space for this VLA object to be globalized.
1157 llvm::Value *AllocArgs[] = {Size};
1158 llvm::CallBase *VoidPtr =
1159 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1160 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1161 AllocArgs, VD->getName());
1162 VoidPtr->addRetAttr(llvm::Attribute::get(
1163 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1164
1165 return std::make_pair(VoidPtr, Size);
1166}
1167
1168void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1169 CodeGenFunction &CGF,
1170 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1171 // Deallocate the memory for each globalized VLA object
1172 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1173 CGM.getModule(), OMPRTL___kmpc_free_shared),
1174 {AddrSizePair.first, AddrSizePair.second});
1175}
1176
1177void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1178 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1179 return;
1180
1181 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1182 if (I != FunctionGlobalizedDecls.end()) {
1183 // Deallocate the memory for each globalized VLA object that was
1184 // globalized in the prolog (i.e. emitGenericVarsProlog).
1185 for (const auto &AddrSizePair :
1186 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1187 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1188 CGM.getModule(), OMPRTL___kmpc_free_shared),
1189 {AddrSizePair.first, AddrSizePair.second});
1190 }
1191 // Deallocate the memory for each globalized value
1192 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1193 const auto *VD = cast<VarDecl>(Rec.first);
1194 I->getSecond().MappedParams->restore(CGF);
1195
1196 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1197 CGF.getTypeSize(VD->getType())};
1198 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1199 CGM.getModule(), OMPRTL___kmpc_free_shared),
1200 FreeArgs);
1201 }
1202 }
1203}
1204
1205void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1206 const OMPExecutableDirective &D,
1207 SourceLocation Loc,
1208 llvm::Function *OutlinedFn,
1209 ArrayRef<llvm::Value *> CapturedVars) {
1210 if (!CGF.HaveInsertPoint())
1211 return;
1212
1213 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1214
1215 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1216 /*Name=*/".zero.addr");
1217 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1218 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1219 // We don't emit any thread id function call in bare kernel, but because the
1220 // outlined function has a pointer argument, we emit a nullptr here.
1221 if (IsBareKernel)
1222 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1223 else
1224 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1225 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1226 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1227 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1228}
1229
1230void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1231 SourceLocation Loc,
1232 llvm::Function *OutlinedFn,
1233 ArrayRef<llvm::Value *> CapturedVars,
1234 const Expr *IfCond,
1235 llvm::Value *NumThreads) {
1236 if (!CGF.HaveInsertPoint())
1237 return;
1238
1239 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1240 NumThreads](CodeGenFunction &CGF,
1241 PrePostActionTy &Action) {
1242 CGBuilderTy &Bld = CGF.Builder;
1243 llvm::Value *NumThreadsVal = NumThreads;
1244 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1245 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1246 if (WFn)
1247 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1248 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1249
1250 // Create a private scope that will globalize the arguments
1251 // passed from the outside of the target region.
1252 // TODO: Is that needed?
1253 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1254
1255 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1256 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1257 "captured_vars_addrs");
1258 // There's something to share.
1259 if (!CapturedVars.empty()) {
1260 // Prepare for parallel region. Indicate the outlined function.
1261 ASTContext &Ctx = CGF.getContext();
1262 unsigned Idx = 0;
1263 for (llvm::Value *V : CapturedVars) {
1264 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1265 llvm::Value *PtrV;
1266 if (V->getType()->isIntegerTy())
1267 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1268 else
1269 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1270 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1271 Ctx.getPointerType(Ctx.VoidPtrTy));
1272 ++Idx;
1273 }
1274 }
1275
1276 llvm::Value *IfCondVal = nullptr;
1277 if (IfCond)
1278 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1279 /* isSigned */ false);
1280 else
1281 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1282
1283 if (!NumThreadsVal)
1284 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1285 else
1286 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1287
1288 assert(IfCondVal && "Expected a value");
1289 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
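 // __kmpc_parallel_51 arguments, in order: source location, global thread id,
 // if-condition, requested number of threads (-1 when unspecified), proc_bind
 // (-1 when unspecified), the outlined parallel function, its data-sharing
 // wrapper (null when no wrapper is needed), the captured-variable array and
 // its element count.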
1290 llvm::Value *Args[] = {
1291 RTLoc,
1292 getThreadID(CGF, Loc),
1293 IfCondVal,
1294 NumThreadsVal,
1295 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1296 FnPtr,
1297 ID,
1298 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1299 CGF.VoidPtrPtrTy),
1300 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1301 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1302 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1303 Args);
1304 };
1305
1306 RegionCodeGenTy RCG(ParallelGen);
1307 RCG(CGF);
1308}
1309
1310void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1311 // Always emit simple barriers!
1312 if (!CGF.HaveInsertPoint())
1313 return;
1314 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1315 // This function does not use parameters, so we can emit just default values.
1316 llvm::Value *Args[] = {
1317 llvm::ConstantPointerNull::get(
1318 cast<llvm::PointerType>(getIdentTyPointerTy())),
1319 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1320 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1321 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1322 Args);
1323}
1324
1325void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1326 SourceLocation Loc,
1327 OpenMPDirectiveKind Kind, bool,
1328 bool) {
1329 // Always emit simple barriers!
1330 if (!CGF.HaveInsertPoint())
1331 return;
1332 // Build call __kmpc_cancel_barrier(loc, thread_id);
1333 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1334 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1335 getThreadID(CGF, Loc)};
1336
1337 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338 CGM.getModule(), OMPRTL___kmpc_barrier),
1339 Args);
1340}
1341
1342void CGOpenMPRuntimeGPU::emitCriticalRegion(
1343 CodeGenFunction &CGF, StringRef CriticalName,
1344 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1345 const Expr *Hint) {
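 // The critical region is lowered to a counter loop over the team: in
 // iteration i only the thread whose id equals i executes the body, every
 // other thread falls through to the __kmpc_syncwarp point, so the body runs
 // once per thread in thread-id order.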
1346 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1347 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1348 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1349 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1350 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1351
1352 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1353
1354 // Get the mask of active threads in the warp.
1355 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1356 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1357 // Fetch team-local id of the thread.
1358 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1359
1360 // Get the width of the team.
1361 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1362
1363 // Initialize the counter variable for the loop.
1364 QualType Int32Ty =
1365 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1366 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1367 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1368 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1369 /*isInit=*/true);
1370
1371 // Block checks if loop counter exceeds upper bound.
1372 CGF.EmitBlock(LoopBB);
1373 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1374 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1375 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1376
1377 // Block tests which single thread should execute region, and which threads
1378 // should go straight to synchronisation point.
1379 CGF.EmitBlock(TestBB);
1380 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1381 llvm::Value *CmpThreadToCounter =
1382 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1383 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1384
1385 // Block emits the body of the critical region.
1386 CGF.EmitBlock(BodyBB);
1387
1388 // Output the critical statement.
1389 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1390 Hint);
1391
1392 // After the body surrounded by the critical region, the single executing
1393 // thread will jump to the synchronisation point.
1394 // Block waits for all threads in current team to finish then increments the
1395 // counter variable and returns to the loop.
1396 CGF.EmitBlock(SyncBB);
1397 // Reconverge active threads in the warp.
1398 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1399 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1400 Mask);
1401
1402 llvm::Value *IncCounterVal =
1403 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1404 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1405 CGF.EmitBranch(LoopBB);
1406
1407 // Block that is reached when all threads in the team complete the region.
1408 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1409}
1410
1411/// Cast value to the specified type.
1412static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1413 QualType ValTy, QualType CastTy,
1414 SourceLocation Loc) {
1415 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1416 "Cast type must be sized.");
1417 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1418 "Val type must be sized.");
1419 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1420 if (ValTy == CastTy)
1421 return Val;
1422 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1423 CGF.getContext().getTypeSizeInChars(CastTy))
1424 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1425 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1426 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1427 CastTy->hasSignedIntegerRepresentation());
1428 Address CastItem = CGF.CreateMemTemp(CastTy);
1429 Address ValCastItem = CastItem.withElementType(Val->getType());
1430 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1431 LValueBaseInfo(AlignmentSource::Type),
1432 TBAAAccessInfo());
1433 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1434 LValueBaseInfo(AlignmentSource::Type),
1435 TBAAAccessInfo());
1436}
1437
1438/// This function creates calls to one of two shuffle functions to copy
1439/// variables between lanes in a warp.
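/// Elements of at most 4 bytes are moved with __kmpc_shuffle_int32; larger
/// elements (up to 8 bytes) use __kmpc_shuffle_int64.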
1440static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1441 llvm::Value *Elem,
1442 QualType ElemType,
1443 llvm::Value *Offset,
1444 SourceLocation Loc) {
1445 CodeGenModule &CGM = CGF.CGM;
1446 CGBuilderTy &Bld = CGF.Builder;
1447 CGOpenMPRuntimeGPU &RT =
1448 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1449 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1450
1451 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1452 assert(Size.getQuantity() <= 8 &&
1453 "Unsupported bitwidth in shuffle instruction.");
1454
1455 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1456 ? OMPRTL___kmpc_shuffle_int32
1457 : OMPRTL___kmpc_shuffle_int64;
1458
1459 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1460 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1461 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1462 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1463 llvm::Value *WarpSize =
1464 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1465
1466 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1467 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1468 {ElemCast, Offset, WarpSize});
1469
1470 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1471}
1472
1473static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1474 Address DestAddr, QualType ElemType,
1475 llvm::Value *Offset, SourceLocation Loc) {
1476 CGBuilderTy &Bld = CGF.Builder;
1477
1478 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1479 // Create the loop over the big sized data.
1480 // ptr = (void*)Elem;
1481 // ptrEnd = (void*) Elem + 1;
1482 // Step = 8;
1483 // while (ptr + Step < ptrEnd)
1484 // shuffle((int64_t)*ptr);
1485 // Step = 4;
1486 // while (ptr + Step < ptrEnd)
1487 // shuffle((int32_t)*ptr);
1488 // ...
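 // For example, a 13-byte element is transferred as one 8-byte, one 4-byte and
 // one 1-byte shuffle; the 2-byte step is skipped because only one byte
 // remains at that point.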
1489 Address ElemPtr = DestAddr;
1490 Address Ptr = SrcAddr;
1491 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1492 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1493 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1494 if (Size < CharUnits::fromQuantity(IntSize))
1495 continue;
1496 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1497 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1498 /*Signed=*/1);
1499 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1500 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1501 IntTy);
1502 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1503 ElemPtr, IntTy->getPointerTo(), IntTy);
1504 if (Size.getQuantity() / IntSize > 1) {
1505 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1506 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1507 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1508 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1509 CGF.EmitBlock(PreCondBB);
1510 llvm::PHINode *PhiSrc =
1511 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1512 PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB);
1513 llvm::PHINode *PhiDest =
1514 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1515 PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB);
1516 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1517 ElemPtr =
1518 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1519 llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);
1520 llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);
1521 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1522 CGF.Int8Ty, PtrEndRaw,
1523 Bld.CreatePointerBitCastOrAddrSpaceCast(PtrRaw, CGF.VoidPtrTy));
1524 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1525 ThenBB, ExitBB);
1526 CGF.EmitBlock(ThenBB);
1527 llvm::Value *Res = createRuntimeShuffleFunction(
1528 CGF,
1529 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1530 LValueBaseInfo(AlignmentSource::Type),
1531 TBAAAccessInfo()),
1532 IntType, Offset, Loc);
1533 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1534 LValueBaseInfo(AlignmentSource::Type),
1535 TBAAAccessInfo());
1536 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1537 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1538 PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB);
1539 PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB);
1540 CGF.EmitBranch(PreCondBB);
1541 CGF.EmitBlock(ExitBB);
1542 } else {
1543 llvm::Value *Res = createRuntimeShuffleFunction(
1544 CGF,
1545 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1546 LValueBaseInfo(AlignmentSource::Type),
1547 TBAAAccessInfo()),
1548 IntType, Offset, Loc);
1549 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1550 LValueBaseInfo(AlignmentSource::Type),
1551 TBAAAccessInfo());
1552 Ptr = Bld.CreateConstGEP(Ptr, 1);
1553 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1554 }
1555 Size = Size % IntSize;
1556 }
1557}
1558
1559namespace {
1560enum CopyAction : unsigned {
1561 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1562 // the warp using shuffle instructions.
1563 RemoteLaneToThread,
1564 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1565 ThreadCopy,
1566};
1567} // namespace
1568
1569struct CopyOptionsTy {
1570 llvm::Value *RemoteLaneOffset;
1571 llvm::Value *ScratchpadIndex;
1572 llvm::Value *ScratchpadWidth;
1573};
1574
1575/// Emit instructions to copy a Reduce list, which contains partially
1576/// aggregated values, in the specified direction.
1577static void emitReductionListCopy(
1578 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1579 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1580 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1581
1582 CodeGenModule &CGM = CGF.CGM;
1583 ASTContext &C = CGM.getContext();
1584 CGBuilderTy &Bld = CGF.Builder;
1585
1586 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1587
1588 // Iterates, element-by-element, through the source Reduce list and
1589 // make a copy.
1590 unsigned Idx = 0;
1591 for (const Expr *Private : Privates) {
1592 Address SrcElementAddr = Address::invalid();
1593 Address DestElementAddr = Address::invalid();
1594 Address DestElementPtrAddr = Address::invalid();
1595 // Should we shuffle in an element from a remote lane?
1596 bool ShuffleInElement = false;
1597 // Set to true to update the pointer in the dest Reduce list to a
1598 // newly created element.
1599 bool UpdateDestListPtr = false;
1600 QualType PrivatePtrType = C.getPointerType(Private->getType());
1601 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1602
1603 switch (Action) {
1604 case RemoteLaneToThread: {
1605 // Step 1.1: Get the address for the src element in the Reduce list.
1606 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1607 SrcElementAddr = CGF.EmitLoadOfPointer(
1608 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1609 PrivatePtrType->castAs<PointerType>());
1610
1611 // Step 1.2: Create a temporary to store the element in the destination
1612 // Reduce list.
1613 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1614 DestElementAddr =
1615 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1616 ShuffleInElement = true;
1617 UpdateDestListPtr = true;
1618 break;
1619 }
1620 case ThreadCopy: {
1621 // Step 1.1: Get the address for the src element in the Reduce list.
1622 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1623 SrcElementAddr = CGF.EmitLoadOfPointer(
1624 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1625 PrivatePtrType->castAs<PointerType>());
1626
1627 // Step 1.2: Get the address for dest element. The destination
1628 // element has already been created on the thread's stack.
1629 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1630 DestElementAddr = CGF.EmitLoadOfPointer(
1631 DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
1632 PrivatePtrType->castAs<PointerType>());
1633 break;
1634 }
1635 }
1636
1637 // Regardless of the copy direction, we emit the load of the src
1638 // element, as it is required in all cases.
1639 SrcElementAddr = SrcElementAddr.withElementType(
1640 CGF.ConvertTypeForMem(Private->getType()));
1641 DestElementAddr =
1642 DestElementAddr.withElementType(SrcElementAddr.getElementType());
1643
1644 // Now that all active lanes have read the element in the
1645 // Reduce list, shuffle over the value from the remote lane.
1646 if (ShuffleInElement) {
1647 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1648 RemoteLaneOffset, Private->getExprLoc());
1649 } else {
1650 switch (CGF.getEvaluationKind(Private->getType())) {
1651 case TEK_Scalar: {
1652 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1653 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1655 TBAAAccessInfo());
1656 // Store the source element value to the dest element address.
1658 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1660 break;
1661 }
1662 case TEK_Complex: {
1664 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1665 Private->getExprLoc());
1667 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1668 /*isInit=*/false);
1669 break;
1670 }
1671 case TEK_Aggregate:
1673 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1674 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1676 break;
1677 }
1678 }
1679
1680 // Step 3.1: Modify reference in dest Reduce list as needed.
1681 // Modifying the reference in Reduce list to point to the newly
1682 // created element. The element is live in the current function
1683 // scope and that of functions it invokes (i.e., reduce_function).
1684 // RemoteReduceData[i] = (void*)&RemoteElem
1685 if (UpdateDestListPtr) {
1688 DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy),
1689 DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);
1690 }
1691
1692 ++Idx;
1693 }
1694}
1695
1696/// This function emits a helper that gathers Reduce lists from the first
1697/// lane of every active warp to lanes in the first warp.
1698///
1699/// void inter_warp_copy_func(void* reduce_data, num_warps)
1700/// shared smem[warp_size];
1701/// For all data entries D in reduce_data:
1702/// sync
1703/// If (I am the first lane in each warp)
1704/// Copy my local D to smem[warp_id]
1705/// sync
1706/// if (I am the first warp)
1707/// Copy smem[thread_id] to my local D
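/// The shared 'smem' staging array provides one 32-bit slot per warp, so data
/// entries wider than 32 bits are tunneled in 32-bit (or smaller) chunks over
/// several rounds of the copy loop in the function body below.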
1709 ArrayRef<const Expr *> Privates,
1710 QualType ReductionArrayTy,
1711 SourceLocation Loc) {
1712 ASTContext &C = CGM.getContext();
1713 llvm::Module &M = CGM.getModule();
1714
1715 // ReduceList: thread local Reduce list.
1716 // At the stage of the computation when this function is called, partially
1717 // aggregated values reside in the first lane of every active warp.
1718 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1719 C.VoidPtrTy, ImplicitParamKind::Other);
1720 // NumWarps: number of warps active in the parallel region. This could
1721 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1722 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1723 C.getIntTypeForBitwidth(32, /* Signed */ true),
1725 FunctionArgList Args;
1726 Args.push_back(&ReduceListArg);
1727 Args.push_back(&NumWarpsArg);
1728
1729 const CGFunctionInfo &CGFI =
1730 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1731 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1732 llvm::GlobalValue::InternalLinkage,
1733 "_omp_reduction_inter_warp_copy_func", &M);
1735 Fn->setDoesNotRecurse();
1736 CodeGenFunction CGF(CGM);
1737 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1738
1739 CGBuilderTy &Bld = CGF.Builder;
1740
1741 // This array is used as a medium to transfer, one reduce element at a time,
1742 // the data from the first lane of every warp to lanes in the first warp
1743 // in order to perform the final step of a reduction in a parallel region
1744 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1745 // for reduced latency, as well as to have a distinct copy for concurrently
1746 // executing target regions. The array is declared with weak linkage so
1747 // as to be shared across compilation units.
1748 StringRef TransferMediumName =
1749 "__openmp_nvptx_data_transfer_temporary_storage";
1750 llvm::GlobalVariable *TransferMedium =
1751 M.getGlobalVariable(TransferMediumName);
1752 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1753 if (!TransferMedium) {
1754 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1755 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1756 TransferMedium = new llvm::GlobalVariable(
1757 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1758 llvm::UndefValue::get(Ty), TransferMediumName,
1759 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1760 SharedAddressSpace);
1761 CGM.addCompilerUsedGlobal(TransferMedium);
1762 }
1763
1764 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1765 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1766 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1767 // nvptx_lane_id = nvptx_id % warpsize
1768 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1769 // nvptx_warp_id = nvptx_id / warpsize
1770 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1771
1772 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1773 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1774 Address LocalReduceList(
1776 CGF.EmitLoadOfScalar(
1777 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1779 ElemTy->getPointerTo()),
1780 ElemTy, CGF.getPointerAlign());
1781
1782 unsigned Idx = 0;
1783 for (const Expr *Private : Privates) {
1784 //
1785 // Warp master copies reduce element to transfer medium in __shared__
1786 // memory.
1787 //
1788 unsigned RealTySize =
1789 C.getTypeSizeInChars(Private->getType())
1790 .alignTo(C.getTypeAlignInChars(Private->getType()))
1791 .getQuantity();
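    // A worked example of the chunked copy below: for a 6-byte element,
    // TySize visits 4, 2, 1; the first pass copies one 4-byte chunk
    // (RealTySize becomes 6 % 4 == 2), the second copies one 2-byte chunk
    // (RealTySize becomes 0), and the loop then stops.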
1792 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1793 unsigned NumIters = RealTySize / TySize;
1794 if (NumIters == 0)
1795 continue;
1796 QualType CType = C.getIntTypeForBitwidth(
1797 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1798 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1799 CharUnits Align = CharUnits::fromQuantity(TySize);
1800 llvm::Value *Cnt = nullptr;
1801 Address CntAddr = Address::invalid();
1802 llvm::BasicBlock *PrecondBB = nullptr;
1803 llvm::BasicBlock *ExitBB = nullptr;
1804 if (NumIters > 1) {
1805 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1806 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1807 /*Volatile=*/false, C.IntTy);
1808 PrecondBB = CGF.createBasicBlock("precond");
1809 ExitBB = CGF.createBasicBlock("exit");
1810 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1811 // There is no need to emit line number for unconditional branch.
1813 CGF.EmitBlock(PrecondBB);
1814 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1815 llvm::Value *Cmp =
1816 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1817 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1818 CGF.EmitBlock(BodyBB);
1819 }
1820 // kmpc_barrier.
1821 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1822 /*EmitChecks=*/false,
1823 /*ForceSimpleCall=*/true);
1824 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1825 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1826 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1827
1828 // if (lane_id == 0)
1829 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1830 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1831 CGF.EmitBlock(ThenBB);
1832
1833 // Reduce element = LocalReduceList[i]
1834 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1835 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1836 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1837 // elemptr = ((CopyType*)(elemptrptr)) + I
1838 Address ElemPtr(ElemPtrPtr, CopyType, Align);
1839 if (NumIters > 1)
1840 ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt);
1841
1842 // Get pointer to location in transfer medium.
1843 // MediumPtr = &medium[warp_id]
1844 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1845 TransferMedium->getValueType(), TransferMedium,
1846 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1847 // Casting to actual data type.
1848 // MediumPtr = (CopyType*)MediumPtrAddr;
1849 Address MediumPtr(MediumPtrVal, CopyType, Align);
1850
1851 // elem = *elemptr
1852 //*MediumPtr = elem
1853 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1854 ElemPtr, /*Volatile=*/false, CType, Loc,
1856 // Store the source element value to the dest element address.
1857 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1859 TBAAAccessInfo());
1860
1861 Bld.CreateBr(MergeBB);
1862
1863 CGF.EmitBlock(ElseBB);
1864 Bld.CreateBr(MergeBB);
1865
1866 CGF.EmitBlock(MergeBB);
1867
1868 // kmpc_barrier.
1869 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1870 /*EmitChecks=*/false,
1871 /*ForceSimpleCall=*/true);
1872
1873 //
1874 // Warp 0 copies reduce element from transfer medium.
1875 //
1876 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1877 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1878 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1879
1880 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1881 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1882 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1883
1884 // Up to 32 threads in warp 0 are active.
1885 llvm::Value *IsActiveThread =
1886 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1887 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1888
1889 CGF.EmitBlock(W0ThenBB);
1890
1891 // SrcMediumPtr = &medium[tid]
1892 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1893 TransferMedium->getValueType(), TransferMedium,
1894 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1895 // SrcMediumVal = *SrcMediumPtr;
1896 Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1897
1898 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1899 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1900 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1901 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1902 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1903 if (NumIters > 1)
1904 TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt);
1905
1906 // *TargetElemPtr = SrcMediumVal;
1907 llvm::Value *SrcMediumValue =
1908 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1909 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1910 CType);
1911 Bld.CreateBr(W0MergeBB);
1912
1913 CGF.EmitBlock(W0ElseBB);
1914 Bld.CreateBr(W0MergeBB);
1915
1916 CGF.EmitBlock(W0MergeBB);
1917
1918 if (NumIters > 1) {
1919 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1920 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1921 CGF.EmitBranch(PrecondBB);
1923 CGF.EmitBlock(ExitBB);
1924 }
1925 RealTySize %= TySize;
1926 }
1927 ++Idx;
1928 }
1929
1930 CGF.FinishFunction();
1931 return Fn;
1932}
1933
1934/// Emit a helper that reduces data across two OpenMP threads (lanes)
1935/// in the same warp. It uses shuffle instructions to copy over data from
1936/// a remote lane's stack. The reduction algorithm performed is specified
1937/// by the fourth parameter.
1938///
1939/// Algorithm Versions.
1940/// Full Warp Reduce (argument value 0):
1941/// This algorithm assumes that all 32 lanes are active and gathers
1942/// data from these 32 lanes, producing a single resultant value.
1943/// Contiguous Partial Warp Reduce (argument value 1):
1944/// This algorithm assumes that only a *contiguous* subset of lanes
1945/// are active. This happens for the last warp in a parallel region
1946/// when the user specified num_threads is not an integer multiple of
1947/// 32. This contiguous subset always starts with the zeroth lane.
1948/// Partial Warp Reduce (argument value 2):
1949/// This algorithm gathers data from any number of lanes at any position.
1950/// All reduced values are stored in the lowest possible lane. The set
1951/// of problems every algorithm addresses is a superset of those
1952/// addressable by algorithms with a lower version number. Overhead
1953/// increases as algorithm version increases.
1954///
1955/// Terminology
1956/// Reduce element:
1957/// Reduce element refers to the individual data field with primitive
1958/// data types to be combined and reduced across threads.
1959/// Reduce list:
1960/// Reduce list refers to a collection of local, thread-private
1961/// reduce elements.
1962/// Remote Reduce list:
1963/// Remote Reduce list refers to a collection of remote (relative to
1964/// the current thread) reduce elements.
1965///
1966/// We distinguish between three states of threads that are important to
1967/// the implementation of this function.
1968/// Alive threads:
1969/// Threads in a warp executing the SIMT instruction, as distinguished from
1970/// threads that are inactive due to divergent control flow.
1971/// Active threads:
1972/// The minimal set of threads that has to be alive upon entry to this
1973/// function. The computation is correct iff active threads are alive.
1974/// Some threads are alive but they are not active because they do not
1975/// contribute to the computation in any useful manner. Turning them off
1976/// may introduce control flow overheads without any tangible benefits.
1977/// Effective threads:
1978/// In order to comply with the argument requirements of the shuffle
1979/// function, we must keep all lanes holding data alive. But at most
1980/// half of them perform value aggregation; we refer to this half of
1981/// threads as effective. The other half simply hands off its
1982/// data.
1983///
1984/// Procedure
1985/// Value shuffle:
1986/// In this step active threads transfer data from higher lane positions
1987/// in the warp to lower lane positions, creating Remote Reduce list.
1988/// Value aggregation:
1989/// In this step, effective threads combine their thread local Reduce list
1990/// with Remote Reduce list and store the result in the thread local
1991/// Reduce list.
1992/// Value copy:
1993/// In this step, we deal with the assumption made by algorithm 2
1994/// (i.e. contiguity assumption). When we have an odd number of lanes
1995/// active, say 2k+1, only k threads will be effective and therefore k
1996/// new values will be produced. However, the Reduce list owned by the
1997/// (2k+1)th thread is ignored in the value aggregation. Therefore
1998/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1999/// that the contiguity assumption still holds.
2000static llvm::Function *emitShuffleAndReduceFunction(
2002 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2003 ASTContext &C = CGM.getContext();
2004
2005 // Thread local Reduce list used to host the values of data to be reduced.
2006 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2007 C.VoidPtrTy, ImplicitParamKind::Other);
2008 // Current lane id; could be logical.
2009 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2011 // Offset of the remote source lane relative to the current lane.
2012 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2013 C.ShortTy, ImplicitParamKind::Other);
2014 // Algorithm version. This is expected to be known at compile time.
2015 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2016 C.ShortTy, ImplicitParamKind::Other);
2017 FunctionArgList Args;
2018 Args.push_back(&ReduceListArg);
2019 Args.push_back(&LaneIDArg);
2020 Args.push_back(&RemoteLaneOffsetArg);
2021 Args.push_back(&AlgoVerArg);
2022
2023 const CGFunctionInfo &CGFI =
2024 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2025 auto *Fn = llvm::Function::Create(
2026 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2027 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2029 Fn->setDoesNotRecurse();
2030
2031 CodeGenFunction CGF(CGM);
2032 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2033
2034 CGBuilderTy &Bld = CGF.Builder;
2035
2036 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2037 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2038 Address LocalReduceList(
2040 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2041 C.VoidPtrTy, SourceLocation()),
2042 ElemTy->getPointerTo()),
2043 ElemTy, CGF.getPointerAlign());
2044
2045 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2046 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2047 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2048
2049 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2050 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2051 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2052
2053 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2054 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2055 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2056
2057 // Create a local thread-private variable to host the Reduce list
2058 // from a remote lane.
2059 Address RemoteReduceList =
2060 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2061
2062 // This loop iterates through the list of reduce elements and copies,
2063 // element by element, from a remote lane in the warp to RemoteReduceList,
2064 // hosted on the thread's stack.
2065 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2066 LocalReduceList, RemoteReduceList,
2067 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2068 /*ScratchpadIndex=*/nullptr,
2069 /*ScratchpadWidth=*/nullptr});
2070
2071 // The actions to be performed on the Remote Reduce list are dependent
2072 // on the algorithm version.
2073 //
2074 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2075 // LaneId % 2 == 0 && Offset > 0):
2076 // do the reduction value aggregation
2077 //
2078 // The thread local variable Reduce list is mutated in place to host the
2079 // reduced data, which is the aggregated value produced from local and
2080 // remote lanes.
2081 //
2082 // Note that AlgoVer is expected to be a constant integer known at compile
2083 // time.
2084 // When AlgoVer==0, the first conjunction evaluates to true, making
2085 // the entire predicate true during compile time.
2086 // When AlgoVer==1, the second conjunction has only the second part to be
2087 // evaluated during runtime. The other conjunctions evaluate to false
2088 // during compile time.
2089 // When AlgoVer==2, the third conjunction has only the second part to be
2090 // evaluated during runtime. The other conjunctions evaluate to false
2091 // during compile time.
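  // For example, with AlgoVer==2 the predicate folds to
  // (LaneId % 2 == 0 && Offset > 0): an odd lane such as LaneId==3 never
  // aggregates and only supplies its Reduce list to a lower, effective lane.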
2092 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2093
2094 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2095 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2096 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2097
2098 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2099 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2100 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2101 CondAlgo2 = Bld.CreateAnd(
2102 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2103
2104 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2105 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2106
2107 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2108 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2109 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2110 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2111
2112 CGF.EmitBlock(ThenBB);
2113 // reduce_function(LocalReduceList, RemoteReduceList)
2114 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2115 LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2116 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2117 RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2119 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2120 Bld.CreateBr(MergeBB);
2121
2122 CGF.EmitBlock(ElseBB);
2123 Bld.CreateBr(MergeBB);
2124
2125 CGF.EmitBlock(MergeBB);
2126
2127 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2128 // Reduce list.
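  // For example, in the contiguous case with 5 active lanes and Offset==2,
  // lane 2 copies in lane 4's Reduce list here, so the three surviving
  // partial values stay contiguous in lanes 0..2 for the next round.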
2129 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2130 llvm::Value *CondCopy = Bld.CreateAnd(
2131 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2132
2133 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2134 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2135 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2136 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2137
2138 CGF.EmitBlock(CpyThenBB);
2139 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2140 RemoteReduceList, LocalReduceList);
2141 Bld.CreateBr(CpyMergeBB);
2142
2143 CGF.EmitBlock(CpyElseBB);
2144 Bld.CreateBr(CpyMergeBB);
2145
2146 CGF.EmitBlock(CpyMergeBB);
2147
2148 CGF.FinishFunction();
2149 return Fn;
2150}
2151
2152/// This function emits a helper that copies all the reduction variables from
2153/// the team into the provided global buffer for the reduction variables.
2154///
2155/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2156/// For all data entries D in reduce_data:
2157/// Copy local D to buffer.D[Idx]
2160 QualType ReductionArrayTy, SourceLocation Loc,
2161 const RecordDecl *TeamReductionRec,
2162 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2163 &VarFieldMap) {
2164 ASTContext &C = CGM.getContext();
2165
2166 // Buffer: global reduction buffer.
2167 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2168 C.VoidPtrTy, ImplicitParamKind::Other);
2169 // Idx: index of the buffer.
2170 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2172 // ReduceList: thread local Reduce list.
2173 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2174 C.VoidPtrTy, ImplicitParamKind::Other);
2175 FunctionArgList Args;
2176 Args.push_back(&BufferArg);
2177 Args.push_back(&IdxArg);
2178 Args.push_back(&ReduceListArg);
2179
2180 const CGFunctionInfo &CGFI =
2181 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2182 auto *Fn = llvm::Function::Create(
2183 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2184 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2186 Fn->setDoesNotRecurse();
2187 CodeGenFunction CGF(CGM);
2188 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2189
2190 CGBuilderTy &Bld = CGF.Builder;
2191
2192 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2193 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2194 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2195 Address LocalReduceList(
2197 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2198 C.VoidPtrTy, Loc),
2199 ElemTy->getPointerTo()),
2200 ElemTy, CGF.getPointerAlign());
2201 QualType StaticTy = C.getRecordType(TeamReductionRec);
2202 llvm::Type *LLVMReductionsBufferTy =
2203 CGM.getTypes().ConvertTypeForMem(StaticTy);
2204 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2205 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2206 LLVMReductionsBufferTy->getPointerTo());
2207 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2208 /*Volatile=*/false, C.IntTy,
2209 Loc)};
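  // For each private, load the element pointer stored in the local Reduce
  // list and copy the pointed-to value into the matching field of the global
  // buffer slot selected by Idx.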
2210 unsigned Idx = 0;
2211 for (const Expr *Private : Privates) {
2212 // Reduce element = LocalReduceList[i]
2213 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2214 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2215 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2216 // elemptr = ((CopyType*)(elemptrptr)) + I
2217 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2218 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2219 ElemPtrPtr, ElemTy->getPointerTo());
2220 Address ElemPtr =
2221 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2222 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2223 // Global = Buffer.VD[Idx];
2224 const FieldDecl *FD = VarFieldMap.lookup(VD);
2225 llvm::Value *BufferPtr =
2226 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2227 LValue GlobLVal = CGF.EmitLValueForField(
2228 CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2229 Address GlobAddr = GlobLVal.getAddress(CGF);
2230 GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2231 CGF.ConvertTypeForMem(Private->getType()),
2232 GlobAddr.getAlignment()));
2233 switch (CGF.getEvaluationKind(Private->getType())) {
2234 case TEK_Scalar: {
2235 llvm::Value *V = CGF.EmitLoadOfScalar(
2236 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2238 CGF.EmitStoreOfScalar(V, GlobLVal);
2239 break;
2240 }
2241 case TEK_Complex: {
2243 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2244 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2245 break;
2246 }
2247 case TEK_Aggregate:
2248 CGF.EmitAggregateCopy(GlobLVal,
2249 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2251 break;
2252 }
2253 ++Idx;
2254 }
2255
2256 CGF.FinishFunction();
2257 return Fn;
2258}
2259
2260/// This function emits a helper that reduces all the reduction variables from
2261/// the team into the provided global buffer for the reduction variables.
2262///
2263/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2264/// void *GlobPtrs[];
2265/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2266/// ...
2267/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2268/// reduce_function(GlobPtrs, reduce_data);
2271 QualType ReductionArrayTy, SourceLocation Loc,
2272 const RecordDecl *TeamReductionRec,
2273 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2274 &VarFieldMap,
2275 llvm::Function *ReduceFn) {
2276 ASTContext &C = CGM.getContext();
2277
2278 // Buffer: global reduction buffer.
2279 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2280 C.VoidPtrTy, ImplicitParamKind::Other);
2281 // Idx: index of the buffer.
2282 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2284 // ReduceList: thread local Reduce list.
2285 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2286 C.VoidPtrTy, ImplicitParamKind::Other);
2287 FunctionArgList Args;
2288 Args.push_back(&BufferArg);
2289 Args.push_back(&IdxArg);
2290 Args.push_back(&ReduceListArg);
2291
2292 const CGFunctionInfo &CGFI =
2293 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2294 auto *Fn = llvm::Function::Create(
2295 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2296 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2298 Fn->setDoesNotRecurse();
2299 CodeGenFunction CGF(CGM);
2300 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2301
2302 CGBuilderTy &Bld = CGF.Builder;
2303
2304 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2305 QualType StaticTy = C.getRecordType(TeamReductionRec);
2306 llvm::Type *LLVMReductionsBufferTy =
2307 CGM.getTypes().ConvertTypeForMem(StaticTy);
2308 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2309 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2310 LLVMReductionsBufferTy->getPointerTo());
2311
2312 // 1. Build a list of reduction variables.
2313 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2314 RawAddress ReductionList =
2315 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2316 auto IPriv = Privates.begin();
2317 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2318 /*Volatile=*/false, C.IntTy,
2319 Loc)};
2320 unsigned Idx = 0;
2321 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2322 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2323 // Global = Buffer.VD[Idx];
2324 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2325 const FieldDecl *FD = VarFieldMap.lookup(VD);
2326 llvm::Value *BufferPtr =
2327 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2328 LValue GlobLVal = CGF.EmitLValueForField(
2329 CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2330 Address GlobAddr = GlobLVal.getAddress(CGF);
2331 CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2332 /*Volatile=*/false, C.VoidPtrTy);
2333 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2334 // Store array size.
2335 ++Idx;
2336 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2337 llvm::Value *Size = CGF.Builder.CreateIntCast(
2338 CGF.getVLASize(
2339 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2340 .NumElts,
2341 CGF.SizeTy, /*isSigned=*/false);
2342 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2343 Elem);
2344 }
2345 }
2346
2347 // Call reduce_function(GlobalReduceList, ReduceList)
2348 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2349 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2350 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2351 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2353 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2354 CGF.FinishFunction();
2355 return Fn;
2356}
2357
2358/// This function emits a helper that copies all the reduction variables from
2359/// the provided global buffer back into the team's local reduction variables.
2360///
2361/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2362/// For all data entries D in reduce_data:
2363/// Copy buffer.D[Idx] to local D;
2366 QualType ReductionArrayTy, SourceLocation Loc,
2367 const RecordDecl *TeamReductionRec,
2368 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2369 &VarFieldMap) {
2370 ASTContext &C = CGM.getContext();
2371
2372 // Buffer: global reduction buffer.
2373 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2374 C.VoidPtrTy, ImplicitParamKind::Other);
2375 // Idx: index of the buffer.
2376 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2378 // ReduceList: thread local Reduce list.
2379 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2380 C.VoidPtrTy, ImplicitParamKind::Other);
2381 FunctionArgList Args;
2382 Args.push_back(&BufferArg);
2383 Args.push_back(&IdxArg);
2384 Args.push_back(&ReduceListArg);
2385
2386 const CGFunctionInfo &CGFI =
2387 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2388 auto *Fn = llvm::Function::Create(
2389 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2390 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2392 Fn->setDoesNotRecurse();
2393 CodeGenFunction CGF(CGM);
2394 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2395
2396 CGBuilderTy &Bld = CGF.Builder;
2397
2398 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2399 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2400 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2401 Address LocalReduceList(
2403 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2404 C.VoidPtrTy, Loc),
2405 ElemTy->getPointerTo()),
2406 ElemTy, CGF.getPointerAlign());
2407 QualType StaticTy = C.getRecordType(TeamReductionRec);
2408 llvm::Type *LLVMReductionsBufferTy =
2409 CGM.getTypes().ConvertTypeForMem(StaticTy);
2410 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2411 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2412 LLVMReductionsBufferTy->getPointerTo());
2413
2414 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2415 /*Volatile=*/false, C.IntTy,
2416 Loc)};
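  // For each private, load the corresponding field of the global buffer slot
  // selected by Idx and store it back through the element pointer held in the
  // local Reduce list.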
2417 unsigned Idx = 0;
2418 for (const Expr *Private : Privates) {
2419 // Reduce element = LocalReduceList[i]
2420 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2421 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2422 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2423 // elemptr = ((CopyType*)(elemptrptr)) + I
2424 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2425 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2426 ElemPtrPtr, ElemTy->getPointerTo());
2427 Address ElemPtr =
2428 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2429 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2430 // Global = Buffer.VD[Idx];
2431 const FieldDecl *FD = VarFieldMap.lookup(VD);
2432 llvm::Value *BufferPtr =
2433 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2434 LValue GlobLVal = CGF.EmitLValueForField(
2435 CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2436 Address GlobAddr = GlobLVal.getAddress(CGF);
2437 GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2438 CGF.ConvertTypeForMem(Private->getType()),
2439 GlobAddr.getAlignment()));
2440 switch (CGF.getEvaluationKind(Private->getType())) {
2441 case TEK_Scalar: {
2442 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2443 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2445 TBAAAccessInfo());
2446 break;
2447 }
2448 case TEK_Complex: {
2450 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2451 /*isInit=*/false);
2452 break;
2453 }
2454 case TEK_Aggregate:
2455 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2456 GlobLVal, Private->getType(),
2458 break;
2459 }
2460 ++Idx;
2461 }
2462
2463 CGF.FinishFunction();
2464 return Fn;
2465}
2466
2467/// This function emits a helper that reduces the reduction variables in the
2468/// provided global buffer into the team's local reduce data.
2469///
2470/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2471/// void *GlobPtrs[];
2472/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2473/// ...
2474/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2475/// reduce_function(reduce_data, GlobPtrs);
2478 QualType ReductionArrayTy, SourceLocation Loc,
2479 const RecordDecl *TeamReductionRec,
2480 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2481 &VarFieldMap,
2482 llvm::Function *ReduceFn) {
2483 ASTContext &C = CGM.getContext();
2484
2485 // Buffer: global reduction buffer.
2486 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2487 C.VoidPtrTy, ImplicitParamKind::Other);
2488 // Idx: index of the buffer.
2489 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2491 // ReduceList: thread local Reduce list.
2492 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2493 C.VoidPtrTy, ImplicitParamKind::Other);
2494 FunctionArgList Args;
2495 Args.push_back(&BufferArg);
2496 Args.push_back(&IdxArg);
2497 Args.push_back(&ReduceListArg);
2498
2499 const CGFunctionInfo &CGFI =
2500 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2501 auto *Fn = llvm::Function::Create(
2502 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2503 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2505 Fn->setDoesNotRecurse();
2506 CodeGenFunction CGF(CGM);
2507 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2508
2509 CGBuilderTy &Bld = CGF.Builder;
2510
2511 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2512 QualType StaticTy = C.getRecordType(TeamReductionRec);
2513 llvm::Type *LLVMReductionsBufferTy =
2514 CGM.getTypes().ConvertTypeForMem(StaticTy);
2515 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2516 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2517 LLVMReductionsBufferTy->getPointerTo());
2518
2519 // 1. Build a list of reduction variables.
2520 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2521 Address ReductionList =
2522 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2523 auto IPriv = Privates.begin();
2524 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2525 /*Volatile=*/false, C.IntTy,
2526 Loc)};
2527 unsigned Idx = 0;
2528 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2529 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2530 // Global = Buffer.VD[Idx];
2531 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2532 const FieldDecl *FD = VarFieldMap.lookup(VD);
2533 llvm::Value *BufferPtr =
2534 Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2535 LValue GlobLVal = CGF.EmitLValueForField(
2536 CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2537 Address GlobAddr = GlobLVal.getAddress(CGF);
2538 CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2539 /*Volatile=*/false, C.VoidPtrTy);
2540 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2541 // Store array size.
2542 ++Idx;
2543 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2544 llvm::Value *Size = CGF.Builder.CreateIntCast(
2545 CGF.getVLASize(
2546 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2547 .NumElts,
2548 CGF.SizeTy, /*isSigned=*/false);
2549 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2550 Elem);
2551 }
2552 }
2553
2554 // Call reduce_function(ReduceList, GlobalReduceList)
2555 llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);
2556 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2557 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2558 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2560 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2561 CGF.FinishFunction();
2562 return Fn;
2563}
2564
2565///
2566/// Design of OpenMP reductions on the GPU
2567///
2568/// Consider a typical OpenMP program with one or more reduction
2569/// clauses:
2570///
2571/// float foo;
2572/// double bar;
2573/// #pragma omp target teams distribute parallel for \
2574/// reduction(+:foo) reduction(*:bar)
2575/// for (int i = 0; i < N; i++) {
2576/// foo += A[i]; bar *= B[i];
2577/// }
2578///
2579/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2580/// all teams. In our OpenMP implementation on the NVPTX device an
2581/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2582/// within a team are mapped to CUDA threads within a threadblock.
2583/// Our goal is to efficiently aggregate values across all OpenMP
2584/// threads such that:
2585///
2586/// - the compiler and runtime are logically concise, and
2587/// - the reduction is performed efficiently in a hierarchical
2588/// manner as follows: within OpenMP threads in the same warp,
2589/// across warps in a threadblock, and finally across teams on
2590/// the NVPTX device.
2591///
2592/// Introduction to Decoupling
2593///
2594/// We would like to decouple the compiler and the runtime so that the
2595/// latter is ignorant of the reduction variables (number, data types)
2596/// and the reduction operators. This allows a simpler interface
2597/// and implementation while still attaining good performance.
2598///
2599/// Pseudocode for the aforementioned OpenMP program generated by the
2600/// compiler is as follows:
2601///
2602/// 1. Create private copies of reduction variables on each OpenMP
2603/// thread: 'foo_private', 'bar_private'
2604/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2605/// to it and writes the result in 'foo_private' and 'bar_private'
2606/// respectively.
2607/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2608/// and store the result on the team master:
2609///
2610/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2611/// reduceData, shuffleReduceFn, interWarpCpyFn)
2612///
2613/// where:
2614/// struct ReduceData {
2615/// double *foo;
2616/// double *bar;
2617/// } reduceData
2618/// reduceData.foo = &foo_private
2619/// reduceData.bar = &bar_private
2620///
2621/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2622/// auxiliary functions generated by the compiler that operate on
2623/// variables of type 'ReduceData'. They help the runtime perform
2624/// algorithmic steps in a data-agnostic manner.
2625///
2626/// 'shuffleReduceFn' is a pointer to a function that reduces data
2627/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2628/// same warp. It takes the following arguments as input:
2629///
2630/// a. variable of type 'ReduceData' on the calling lane,
2631/// b. its lane_id,
2632/// c. an offset relative to the current lane_id to generate a
2633/// remote_lane_id. The remote lane contains the second
2634/// variable of type 'ReduceData' that is to be reduced.
2635/// d. an algorithm version parameter determining which reduction
2636/// algorithm to use.
2637///
2638/// 'shuffleReduceFn' retrieves data from the remote lane using
2639/// efficient GPU shuffle intrinsics and reduces, using the
2640/// algorithm specified by the 4th parameter, the two operands
2641/// element-wise. The result is written to the first operand.
2642///
2643/// Different reduction algorithms are implemented in different
2644/// runtime functions, all calling 'shuffleReduceFn' to perform
2645/// the essential reduction step. Therefore, based on the 4th
2646/// parameter, this function behaves slightly differently to
2647/// cooperate with the runtime to ensure correctness under
2648/// different circumstances.
2649///
2650/// 'InterWarpCpyFn' is a pointer to a function that transfers
2651/// reduced variables across warps. It tunnels, through CUDA
2652/// shared memory, the thread-private data of type 'ReduceData'
2653/// from lane 0 of each warp to a lane in the first warp.
2654/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2655/// The last team writes the global reduced value to memory.
2656///
2657/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2658/// reduceData, shuffleReduceFn, interWarpCpyFn,
2659/// scratchpadCopyFn, loadAndReduceFn)
2660///
2661/// 'scratchpadCopyFn' is a helper that stores reduced
2662/// data from the team master to a scratchpad array in
2663/// global memory.
2664///
2665/// 'loadAndReduceFn' is a helper that loads data from
2666/// the scratchpad array and reduces it with the input
2667/// operand.
2668///
2669/// These compiler generated functions hide address
2670/// calculation and alignment information from the runtime.
2671/// 5. if ret == 1:
2672/// The team master of the last team stores the reduced
2673/// result to the globals in memory.
2674/// foo += reduceData.foo; bar *= reduceData.bar
2675///
2676///
2677/// Warp Reduction Algorithms
2678///
2679/// On the warp level, we have three algorithms implemented in the
2680/// OpenMP runtime depending on the number of active lanes:
2681///
2682/// Full Warp Reduction
2683///
2684/// The reduce algorithm within a warp where all lanes are active
2685/// is implemented in the runtime as follows:
2686///
2687/// full_warp_reduce(void *reduce_data,
2688/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2689/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2690/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2691/// }
2692///
2693/// The algorithm completes in log(2, WARPSIZE) steps.
2694///
2695/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2696/// not used; we therefore save instructions by not retrieving lane_id
2697/// from the corresponding special registers. The 4th parameter, which
2698/// represents the version of the algorithm being used, is set to 0 to
2699/// signify full warp reduction.
2700///
2701/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2702///
2703/// #reduce_elem refers to an element in the local lane's data structure
2704/// #remote_elem is retrieved from a remote lane
2705/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2706/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
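///
/// For example, with WARPSIZE == 32 the offsets are 16, 8, 4, 2 and 1; after
/// five ShuffleReduceFn calls lane 0 holds the fully reduced value.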
2707///
2708/// Contiguous Partial Warp Reduction
2709///
2710/// This reduce algorithm is used within a warp where only the first
2711/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2712/// number of OpenMP threads in a parallel region is not a multiple of
2713/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2714///
2715/// void
2716/// contiguous_partial_reduce(void *reduce_data,
2717/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2718/// int size, int lane_id) {
2719/// int curr_size;
2720/// int offset;
2721/// curr_size = size;
2722/// offset = curr_size/2;
2723/// while (offset>0) {
2724/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2725/// curr_size = (curr_size+1)/2;
2726/// offset = curr_size/2;
2727/// }
2728/// }
2729///
2730/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2731///
2732/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2733/// if (lane_id < offset)
2734/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2735/// else
2736/// reduce_elem = remote_elem
2737///
2738/// This algorithm assumes that the data to be reduced are located in a
2739/// contiguous subset of lanes starting from the first. When there is
2740/// an odd number of active lanes, the data in the last lane is not
2741/// aggregated with any other lane's data but is instead copied over.
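///
/// For example, with 5 active lanes the offset sequence is roughly 2, 1, 1;
/// after three steps the fully reduced value resides in lane 0, the last
/// lane's value having been carried forward by the copy branch above.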
2742///
2743/// Dispersed Partial Warp Reduction
2744///
2745/// This algorithm is used within a warp when any discontiguous subset of
2746/// lanes are active. It is used to implement the reduction operation
2747/// across lanes in an OpenMP simd region or in a nested parallel region.
2748///
2749/// void
2750/// dispersed_partial_reduce(void *reduce_data,
2751/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2752/// int size, remote_id;
2753/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2754/// do {
2755/// remote_id = next_active_lane_id_right_after_me();
2756/// # the above function returns 0 if no active lane
2757/// # is present right after the current lane.
2758/// size = number_of_active_lanes_in_this_warp();
2759/// logical_lane_id /= 2;
2760/// ShuffleReduceFn(reduce_data, logical_lane_id,
2761/// remote_id-1-threadIdx.x, 2);
2762/// } while (logical_lane_id % 2 == 0 && size > 1);
2763/// }
2764///
2765/// There is no assumption made about the initial state of the reduction.
2766/// Any number of lanes (>=1) could be active at any position. The reduction
2767/// result is returned in the first active lane.
2768///
2769/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2770///
2771/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2772/// if (lane_id % 2 == 0 && offset > 0)
2773/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2774/// else
2775/// reduce_elem = remote_elem
2776///
2777///
2778/// Intra-Team Reduction
2779///
2780/// This function, as implemented in the runtime call
2781/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2782/// threads in a team. It first reduces within a warp using the
2783/// aforementioned algorithms. We then proceed to gather all such
2784/// reduced values at the first warp.
2785///
2786/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2787/// data from each "warp master" (the zeroth lane of each warp, where
2788/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2789/// a mathematical sense) the problem of reduction across warp masters in
2790/// a block to the problem of warp reduction.
2791///
2792///
2793/// Inter-Team Reduction
2794///
2795/// Once a team has reduced its data to a single value, it is stored in
2796/// a global scratchpad array. Since each team has a distinct slot, this
2797/// can be done without locking.
2798///
2799/// The last team to write to the scratchpad array proceeds to reduce the
2800/// scratchpad array. One or more workers in the last team use the helper
2801/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2802/// the k'th worker reduces every k'th element.
2803///
2804/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2805/// reduce across workers and compute a globally reduced value.
2806///
2810 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2811 if (!CGF.HaveInsertPoint())
2812 return;
2813
2814 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2815#ifndef NDEBUG
2816 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2817#endif
2818
2819 if (Options.SimpleReduction) {
2820 assert(!TeamsReduction && !ParallelReduction &&
2821 "Invalid reduction selection in emitReduction.");
2822 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2823 ReductionOps, Options);
2824 return;
2825 }
2826
2827 assert((TeamsReduction || ParallelReduction) &&
2828 "Invalid reduction selection in emitReduction.");
2829
2830 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2831 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2832 int Cnt = 0;
2833 for (const Expr *DRE : Privates) {
2834 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2835 ++Cnt;
2836 }
2837
2839 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2840 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
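  // ReductionRec lays out one slot of reduce data, with one field per
  // reduction variable; its size is passed to the runtime below, and its
  // fields are used to address the team reduction buffer in the copy/reduce
  // helpers emitted for teams reductions.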
2841
2842 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2843 // RedList, shuffle_reduce_func, interwarp_copy_func);
2844 // or
2845 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2846 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2847
2848 llvm::Value *Res;
2849 // 1. Build a list of reduction variables.
2850 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2851 auto Size = RHSExprs.size();
2852 for (const Expr *E : Privates) {
2853 if (E->getType()->isVariablyModifiedType())
2854 // Reserve place for array size.
2855 ++Size;
2856 }
2857 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2858 QualType ReductionArrayTy = C.getConstantArrayType(
2859 C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
2860 /*IndexTypeQuals=*/0);
2861 Address ReductionList =
2862 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2863 auto IPriv = Privates.begin();
2864 unsigned Idx = 0;
2865 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2866 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2867 CGF.Builder.CreateStore(
2869 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2870 Elem);
2871 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2872 // Store array size.
2873 ++Idx;
2874 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2875 llvm::Value *Size = CGF.Builder.CreateIntCast(
2876 CGF.getVLASize(
2877 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2878 .NumElts,
2879 CGF.SizeTy, /*isSigned=*/false);
2880 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2881 Elem);
2882 }
2883 }
2884
2885 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2886 ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
2887 llvm::Function *ReductionFn = emitReductionFunction(
2888 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2889 Privates, LHSExprs, RHSExprs, ReductionOps);
2890 llvm::Value *ReductionDataSize =
2891 CGF.getTypeSize(C.getRecordType(ReductionRec));
2892 ReductionDataSize =
2893 CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
2894 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2895 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2896 llvm::Value *InterWarpCopyFn =
2897 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2898
2899 if (ParallelReduction) {
2900 llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2901 InterWarpCopyFn};
2902
2903 Res = CGF.EmitRuntimeCall(
2904 OMPBuilder.getOrCreateRuntimeFunction(
2905 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2906 Args);
2907 } else {
2908 assert(TeamsReduction && "expected teams reduction.");
2909 TeamsReductions.push_back(ReductionRec);
2910 auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2911 OMPBuilder.getOrCreateRuntimeFunction(
2912 CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
2913 {}, "_openmp_teams_reductions_buffer_$_$ptr");
2914 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2915 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2916 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2917 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2918 ReductionFn);
2919 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2920 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2921 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2922 CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2923 ReductionFn);
2924
2925 llvm::Value *Args[] = {
2926 RTLoc,
2927 KernelTeamsReductionPtr,
2928 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2929 ReductionDataSize,
2930 RL,
2931 ShuffleAndReduceFn,
2932 InterWarpCopyFn,
2933 GlobalToBufferCpyFn,
2934 GlobalToBufferRedFn,
2935 BufferToGlobalCpyFn,
2936 BufferToGlobalRedFn};
2937
2938 Res = CGF.EmitRuntimeCall(
2939 OMPBuilder.getOrCreateRuntimeFunction(
2940 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2941 Args);
2942 }
2943
2944 // 5. Build if (res == 1)
2945 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
2946 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
2947 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2948 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
2949 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
2950
2951 // 6. Build then branch: where we have reduced values in the master
2952 // thread in each team.
2953 // __kmpc_end_reduce{_nowait}(<gtid>);
2954 // break;
2955 CGF.EmitBlock(ThenBB);
2956
2957 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2958 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2959 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2960 auto IPriv = Privates.begin();
2961 auto ILHS = LHSExprs.begin();
2962 auto IRHS = RHSExprs.begin();
2963 for (const Expr *E : ReductionOps) {
2964 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2965 cast<DeclRefExpr>(*IRHS));
2966 ++IPriv;
2967 ++ILHS;
2968 ++IRHS;
2969 }
2970 };
2971 RegionCodeGenTy RCG(CodeGen);
2972 RCG(CGF);
2973 // There is no need to emit line number for unconditional branch.
2975 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2976}
2977
2978const VarDecl *
2980 const VarDecl *NativeParam) const {
2981 if (!NativeParam->getType()->isReferenceType())
2982 return NativeParam;
2983 QualType ArgType = NativeParam->getType();
2985 const Type *NonQualTy = QC.strip(ArgType);
2986 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2987 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2988 if (Attr->getCaptureKind() == OMPC_map) {
2989 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2991 }
2992 }
2993 ArgType = CGM.getContext().getPointerType(PointeeTy);
2994 QC.addRestrict();
2995 enum { NVPTX_local_addr = 5 };
2996 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
2997 ArgType = QC.apply(CGM.getContext(), ArgType);
2998  if (isa<ImplicitParamDecl>(NativeParam))
2999    return ImplicitParamDecl::Create(
3000        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3001 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
3002 return ParmVarDecl::Create(
3003 CGM.getContext(),
3004 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3005 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3006 NativeParam->getIdentifier(), ArgType,
3007 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3008}
3009
3010Address
3011CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3012                                        const VarDecl *NativeParam,
3013 const VarDecl *TargetParam) const {
3014 assert(NativeParam != TargetParam &&
3015 NativeParam->getType()->isReferenceType() &&
3016 "Native arg must not be the same as target arg.");
3017 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3018  QualType NativeParamType = NativeParam->getType();
3019  QualifierCollector QC;
3020  const Type *NonQualTy = QC.strip(NativeParamType);
3021 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3022 unsigned NativePointeeAddrSpace =
3023 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3024 QualType TargetTy = TargetParam->getType();
3025 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
3026 TargetTy, SourceLocation());
3027  // Cast to native address space.
3028  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3029      TargetAddr,
3030 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
3031 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3032 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3033 NativeParamType);
3034 return NativeParamAddr;
3035}
3036
3037void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3038    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3039    ArrayRef<llvm::Value *> Args) const {
3040  SmallVector<llvm::Value *, 4> TargetArgs;
3041  TargetArgs.reserve(Args.size());
3042 auto *FnType = OutlinedFn.getFunctionType();
3043 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3044 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3045 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3046 break;
3047 }
3048 llvm::Type *TargetType = FnType->getParamType(I);
3049 llvm::Value *NativeArg = Args[I];
3050 if (!TargetType->isPointerTy()) {
3051 TargetArgs.emplace_back(NativeArg);
3052 continue;
3053 }
3054 TargetArgs.emplace_back(
3055 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
3056 }
3057 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3058}
3059
3060/// Emit function which wraps the outlined parallel region
3061/// and controls the arguments which are passed to this function.
3062/// The wrapper ensures that the outlined function is called
3063/// with the correct arguments when data is shared.
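/// Roughly (editorial sketch under the names introduced below; the exact
/// signature is assembled in this function):
///   void <outlined>_wrapper(uint16_t ParallelLevel, uint32_t ThreadID) {
///     void **GlobalArgs;
///     __kmpc_get_shared_variables(&GlobalArgs);
///     <outlined>(&ThreadID, &Zero, GlobalArgs[0], ..., GlobalArgs[N-1]);
///   }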
3064llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3065 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3066 ASTContext &Ctx = CGM.getContext();
3067 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3068
3069 // Create a function that takes as argument the source thread.
3070 FunctionArgList WrapperArgs;
3071 QualType Int16QTy =
3072 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3073 QualType Int32QTy =
3074 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3075 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3076                                     /*Id=*/nullptr, Int16QTy,
3077                                     ImplicitParamKind::Other);
3078  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3079                               /*Id=*/nullptr, Int32QTy,
3080                               ImplicitParamKind::Other);
3081 WrapperArgs.emplace_back(&ParallelLevelArg);
3082 WrapperArgs.emplace_back(&WrapperArg);
3083
3084  const CGFunctionInfo &CGFI =
3085      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3086
3087 auto *Fn = llvm::Function::Create(
3088 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3089 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3090
3091 // Ensure we do not inline the function. This is trivially true for the ones
3092  // passed to __kmpc_fork_call but the ones called in serialized regions
3093  // could be inlined. This is not perfect, but it is closer to the invariant
3094 // we want, namely, every data environment starts with a new function.
3095 // TODO: We should pass the if condition to the runtime function and do the
3096 // handling there. Much cleaner code.
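  // Illustrative example (editorial note): for a serialized region such as
  //   #pragma omp parallel if(0)
  //   { ... }
  // the outlined function can be invoked directly from the enclosing frame,
  // and inlining it there would fold its data environment into the caller's.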
3097 Fn->addFnAttr(llvm::Attribute::NoInline);
3098
3099  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3100  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3101 Fn->setDoesNotRecurse();
3102
3103 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3104 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3105 D.getBeginLoc(), D.getBeginLoc());
3106
3107 const auto *RD = CS.getCapturedRecordDecl();
3108 auto CurField = RD->field_begin();
3109
3110 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3111 /*Name=*/".zero.addr");
3112 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3113 // Get the array of arguments.
3114  SmallVector<llvm::Value *, 8> Args;
3115
3116 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
3117 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
3118
3119 CGBuilderTy &Bld = CGF.Builder;
3120 auto CI = CS.capture_begin();
3121
3122 // Use global memory for data sharing.
3123 // Handle passing of global args to workers.
3124 RawAddress GlobalArgs =
3125 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3126 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3127 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3128 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3129 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3130 DataSharingArgs);
3131
3132 // Retrieve the shared variables from the list of references returned
3133 // by the runtime. Pass the variables to the outlined function.
3134 Address SharedArgListAddress = Address::invalid();
3135  if (CS.capture_size() > 0 ||
3136      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137    SharedArgListAddress = CGF.EmitLoadOfPointer(
3138        GlobalArgs, CGF.getContext()
3139                        .getPointerType(CGF.getContext().VoidPtrTy)
3140                        .castAs<PointerType>());
3141 }
3142  unsigned Idx = 0;
3143  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3144    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3145    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3146        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3147    llvm::Value *LB = CGF.EmitLoadOfScalar(
3148        TypedAddress,
3149        /*Volatile=*/false,
3150        CGF.getContext().getSizeType(),
3151        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3152 Args.emplace_back(LB);
3153 ++Idx;
3154 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3155 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3156 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3157 llvm::Value *UB = CGF.EmitLoadOfScalar(
3158 TypedAddress,
3159        /*Volatile=*/false,
3160        CGF.getContext().getSizeType(),
3161        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3162 Args.emplace_back(UB);
3163 ++Idx;
3164 }
3165 if (CS.capture_size() > 0) {
3166 ASTContext &CGFContext = CGF.getContext();
3167 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3168 QualType ElemTy = CurField->getType();
3169 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3170      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3171          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3172 CGF.ConvertTypeForMem(ElemTy));
3173 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3174 /*Volatile=*/false,
3175 CGFContext.getPointerType(ElemTy),
3176 CI->getLocation());
3177 if (CI->capturesVariableByCopy() &&
3178 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3179 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3180 CI->getLocation());
3181 }
3182 Args.emplace_back(Arg);
3183 }
3184 }
3185
3186 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3187 CGF.FinishFunction();
3188 return Fn;
3189}
3190
3191void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3192                                            const Decl *D) {
3193 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3194 return;
3195
3196 assert(D && "Expected function or captured|block decl.");
3197 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3198 "Function is registered already.");
3199 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3200 "Team is set but not processed.");
3201 const Stmt *Body = nullptr;
3202 bool NeedToDelayGlobalization = false;
3203 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3204 Body = FD->getBody();
3205 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3206 Body = BD->getBody();
3207 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3208 Body = CD->getBody();
3209 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3210 if (NeedToDelayGlobalization &&
3211 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3212 return;
3213 }
3214 if (!Body)
3215 return;
3216 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3217 VarChecker.Visit(Body);
3218 const RecordDecl *GlobalizedVarsRecord =
3219 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3220 TeamAndReductions.first = nullptr;
3221 TeamAndReductions.second.clear();
3222 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3223 VarChecker.getEscapedVariableLengthDecls();
3224 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3225 VarChecker.getDelayedVariableLengthDecls();
3226 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3227 DelayedVariableLengthDecls.empty())
3228 return;
3229 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3230 I->getSecond().MappedParams =
3231 std::make_unique<CodeGenFunction::OMPMapVars>();
3232 I->getSecond().EscapedParameters.insert(
3233 VarChecker.getEscapedParameters().begin(),
3234 VarChecker.getEscapedParameters().end());
3235 I->getSecond().EscapedVariableLengthDecls.append(
3236 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3237 I->getSecond().DelayedVariableLengthDecls.append(
3238 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
3239 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3240 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3241 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3242 Data.insert(std::make_pair(VD, MappedVarData()));
3243 }
3244 if (!NeedToDelayGlobalization) {
3245 emitGenericVarsProlog(CGF, D->getBeginLoc());
3246 struct GlobalizationScope final : EHScopeStack::Cleanup {
3247 GlobalizationScope() = default;
3248
3249 void Emit(CodeGenFunction &CGF, Flags flags) override {
3250 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3251 .emitGenericVarsEpilog(CGF);
3252 }
3253 };
3254 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3255 }
3256}
3257
3258Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3259                                                       const VarDecl *VD) {
3260 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3261 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3262 auto AS = LangAS::Default;
3263 switch (A->getAllocatorType()) {
3264 // Use the default allocator here as by default local vars are
3265 // threadlocal.
3266 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3267 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3268 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3269 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3270 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3271 // Follow the user decision - use default allocation.
3272 return Address::invalid();
3273 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3274      // TODO: implement support for user-defined allocators.
3275 return Address::invalid();
3276 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3277      AS = LangAS::cuda_constant;
3278      break;
3279    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3280      AS = LangAS::cuda_shared;
3281      break;
3282 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3283 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3284 break;
3285 }
3286 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3287 auto *GV = new llvm::GlobalVariable(
3288 CGM.getModule(), VarTy, /*isConstant=*/false,
3289 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3290 VD->getName(),
3291 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3292        CGM.getContext().getTargetAddressSpace(AS));
3293    CharUnits Align = CGM.getContext().getDeclAlign(VD);
3294 GV->setAlignment(Align.getAsAlign());
3295 return Address(
3296        llvm::ConstantExpr::getAddrSpaceCast(
3297            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3298 VD->getType().getAddressSpace()))),
3299 VarTy, Align);
3300 }
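  // Illustrative example (editorial note): a local declaration such as
  //   int scratch;
  //   #pragma omp allocate(scratch) allocator(omp_pteam_mem_alloc)
  // takes the branch above and is materialized as a module-level variable in
  // the matching GPU address space (here the team-shared one).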
3301
3302 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3303 return Address::invalid();
3304
3305 VD = VD->getCanonicalDecl();
3306 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3307 if (I == FunctionGlobalizedDecls.end())
3308 return Address::invalid();
3309 auto VDI = I->getSecond().LocalVarData.find(VD);
3310 if (VDI != I->getSecond().LocalVarData.end())
3311 return VDI->second.PrivateAddr;
3312 if (VD->hasAttrs()) {
3313    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3314         E(VD->attr_end());
3315 IT != E; ++IT) {
3316 auto VDI = I->getSecond().LocalVarData.find(
3317 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3318 ->getCanonicalDecl());
3319 if (VDI != I->getSecond().LocalVarData.end())
3320 return VDI->second.PrivateAddr;
3321 }
3322 }
3323
3324 return Address::invalid();
3325}
3326
3327void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3328  FunctionGlobalizedDecls.erase(CGF.CurFn);
3329  CGOpenMPRuntime::functionFinished(CGF);
3330}
3331
3332void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3333    CodeGenFunction &CGF, const OMPLoopDirective &S,
3334 OpenMPDistScheduleClauseKind &ScheduleKind,
3335 llvm::Value *&Chunk) const {
3336 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3337 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3338 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3339 Chunk = CGF.EmitScalarConversion(
3340 RT.getGPUNumThreads(CGF),
3341 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3342 S.getIterationVariable()->getType(), S.getBeginLoc());
3343 return;
3344 }
3345  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3346      CGF, S, ScheduleKind, Chunk);
3347}
3348
3349void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3350    CodeGenFunction &CGF, const OMPLoopDirective &S,
3351 OpenMPScheduleClauseKind &ScheduleKind,
3352 const Expr *&ChunkExpr) const {
3353 ScheduleKind = OMPC_SCHEDULE_static;
3354  // Chunk size is 1 in this case (as if 'schedule(static, 1)' were specified).
3355 llvm::APInt ChunkSize(32, 1);
3356 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3357 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3358 SourceLocation());
3359}
3360
3361void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3362    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3363  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3364         " Expected target-based directive.");
3365 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3366 for (const CapturedStmt::Capture &C : CS->captures()) {
3367 // Capture variables captured by reference in lambdas for target-based
3368 // directives.
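    // Illustrative example (editorial note):
    //   int x = 0;
    //   auto l = [&x]() { x += 1; };
    //   #pragma omp target
    //   l();
    // The by-reference capture of 'x' inside 'l' has to be re-pointed at the
    // device copy of 'x' before the call, which is what the code below does.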
3369 if (!C.capturesVariable())
3370 continue;
3371 const VarDecl *VD = C.getCapturedVar();
3372    const auto *RD = VD->getType()
3373                         .getCanonicalType()
3374                         .getNonReferenceType()
3375                         ->getAsCXXRecordDecl();
3376    if (!RD || !RD->isLambda())
3377 continue;
3378 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3379 LValue VDLVal;
3380    if (VD->getType().getCanonicalType()->isReferenceType())
3381      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3382 else
3383 VDLVal = CGF.MakeAddrLValue(
3384 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3385 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3386 FieldDecl *ThisCapture = nullptr;
3387 RD->getCaptureFields(Captures, ThisCapture);
3388 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3389 LValue ThisLVal =
3390 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3391 llvm::Value *CXXThis = CGF.LoadCXXThis();
3392 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3393 }
3394 for (const LambdaCapture &LC : RD->captures()) {
3395 if (LC.getCaptureKind() != LCK_ByRef)
3396 continue;
3397 const ValueDecl *VD = LC.getCapturedVar();
3398 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3399 // capturing structured bindings in lambdas yet.
3400 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3401 continue;
3402 auto It = Captures.find(VD);
3403 assert(It != Captures.end() && "Found lambda capture without field.");
3404 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3405 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3406      if (VD->getType().getCanonicalType()->isReferenceType())
3407        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3408 VD->getType().getCanonicalType())
3409 .getAddress(CGF);
3410 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
3411 }
3412 }
3413}
3414
3415bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3416                                                           LangAS &AS) {
3417 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3418 return false;
3419 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3420  switch (A->getAllocatorType()) {
3421 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3422 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3423 // Not supported, fallback to the default mem space.
3424 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3425 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3426 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3427 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3428 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3429 AS = LangAS::Default;
3430 return true;
3431 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3432    AS = LangAS::cuda_constant;
3433    return true;
3434  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3435    AS = LangAS::cuda_shared;
3436    return true;
3437 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3438 llvm_unreachable("Expected predefined allocator for the variables with the "
3439 "static storage.");
3440 }
3441 return false;
3442}
3443
3444// Get current CudaArch and ignore any unknown values.
3445static CudaArch getCudaArch(CodeGenModule &CGM) {
3446  if (!CGM.getTarget().hasFeature("ptx"))
3447 return CudaArch::UNKNOWN;
3448 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3449 if (Feature.getValue()) {
3450 CudaArch Arch = StringToCudaArch(Feature.getKey());
3451 if (Arch != CudaArch::UNKNOWN)
3452 return Arch;
3453 }
3454 }
3455 return CudaArch::UNKNOWN;
3456}
3457
3458/// Check to see if target architecture supports unified addressing which is
3459/// a restriction for OpenMP requires clause "unified_shared_memory".
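/// For example (editorial note), a translation unit containing
///   #pragma omp requires unified_shared_memory
/// is diagnosed here when targeting sm_53 or older, since those architectures
/// cannot honor the requirement.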
3460void CGOpenMPRuntimeGPU::processRequiresDirective(
3461    const OMPRequiresDecl *D) {
3462 for (const OMPClause *Clause : D->clauselists()) {
3463 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3464 CudaArch Arch = getCudaArch(CGM);
3465 switch (Arch) {
3466 case CudaArch::SM_20:
3467 case CudaArch::SM_21:
3468 case CudaArch::SM_30:
3469 case CudaArch::SM_32_:
3470 case CudaArch::SM_35:
3471 case CudaArch::SM_37:
3472 case CudaArch::SM_50:
3473 case CudaArch::SM_52:
3474 case CudaArch::SM_53: {
3475 SmallString<256> Buffer;
3476 llvm::raw_svector_ostream Out(Buffer);
3477 Out << "Target architecture " << CudaArchToString(Arch)
3478 << " does not support unified addressing";
3479 CGM.Error(Clause->getBeginLoc(), Out.str());
3480 return;
3481 }
3482 case CudaArch::SM_60:
3483 case CudaArch::SM_61:
3484 case CudaArch::SM_62:
3485 case CudaArch::SM_70:
3486 case CudaArch::SM_72:
3487 case CudaArch::SM_75:
3488 case CudaArch::SM_80:
3489 case CudaArch::SM_86:
3490 case CudaArch::SM_87:
3491 case CudaArch::SM_89:
3492 case CudaArch::SM_90:
3493 case CudaArch::SM_90a:
3494 case CudaArch::GFX600:
3495 case CudaArch::GFX601:
3496 case CudaArch::GFX602:
3497 case CudaArch::GFX700:
3498 case CudaArch::GFX701:
3499 case CudaArch::GFX702:
3500 case CudaArch::GFX703:
3501 case CudaArch::GFX704:
3502 case CudaArch::GFX705:
3503 case CudaArch::GFX801:
3504 case CudaArch::GFX802:
3505 case CudaArch::GFX803:
3506 case CudaArch::GFX805:
3507 case CudaArch::GFX810:
3508 case CudaArch::GFX900:
3509 case CudaArch::GFX902:
3510 case CudaArch::GFX904:
3511 case CudaArch::GFX906:
3512 case CudaArch::GFX908:
3513 case CudaArch::GFX909:
3514 case CudaArch::GFX90a:
3515 case CudaArch::GFX90c:
3516 case CudaArch::GFX940:
3517 case CudaArch::GFX941:
3518 case CudaArch::GFX942:
3519 case CudaArch::GFX1010:
3520 case CudaArch::GFX1011:
3521 case CudaArch::GFX1012:
3522 case CudaArch::GFX1013:
3523 case CudaArch::GFX1030:
3524 case CudaArch::GFX1031:
3525 case CudaArch::GFX1032:
3526 case CudaArch::GFX1033:
3527 case CudaArch::GFX1034:
3528 case CudaArch::GFX1035:
3529 case CudaArch::GFX1036:
3530 case CudaArch::GFX1100:
3531 case CudaArch::GFX1101:
3532 case CudaArch::GFX1102:
3533 case CudaArch::GFX1103:
3534 case CudaArch::GFX1150:
3535 case CudaArch::GFX1151:
3536 case CudaArch::GFX1200:
3537 case CudaArch::GFX1201:
3538 case CudaArch::Generic:
3539 case CudaArch::UNUSED:
3540 case CudaArch::UNKNOWN:
3541 break;
3542 case CudaArch::LAST:
3543 llvm_unreachable("Unexpected Cuda arch.");
3544 }
3545 }
3546 }
3547  CGOpenMPRuntime::processRequiresDirective(D);
3548}
3549
3550llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3551  CGBuilderTy &Bld = CGF.Builder;
3552 llvm::Module *M = &CGF.CGM.getModule();
3553 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3554 llvm::Function *F = M->getFunction(LocSize);
3555 if (!F) {
3556 F = llvm::Function::Create(
3557 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3558 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3559 }
3560 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3561}
3562
3563llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3564  ArrayRef<llvm::Value *> Args{};
3565  return CGF.EmitRuntimeCall(
3566 OMPBuilder.getOrCreateRuntimeFunction(
3567 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3568 Args);
3569}
3570
3571llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3572  ArrayRef<llvm::Value *> Args{};
3573  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3574 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3575 Args);
3576}