1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
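// Illustrative usage sketch only (this mirrors how emitSPMDKernel and
// emitNonSPMDKernel below use the RAII object; not additional code):
//
//   {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
//     // ... emit the target region with CurrentExecutionMode == EM_SPMD ...
//   } // destructor restores the previous execution mode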
87
88static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(RefExpr);
107 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108}
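// For exposition only: given a clause item such as `a[i][j]` or an array
// section `a[0:n]`, getPrivateItem() strips the subscripts/sections and
// returns the canonical declaration of the base variable `a`; for a member
// expression `s.x` it returns the declaration of the member `x`.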
109
110static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124
125 // Build struct _globalized_locals_ty {
126 // /* globalized vars */[WarpSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(VD)) {
143 Field = FieldDecl::Create(
144 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(*I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
159 ArraySizeModifier::Normal, 0);
160 }
161 Field = FieldDecl::Create(
162 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(AlignedAttr::CreateImplicit(
169 C, /*IsAlignmentExpr=*/true,
170 IntegerLiteral::Create(C, Align,
171 C.getIntTypeForBitwidth(32, /*Signed=*/0),
172 SourceLocation()),
173 {}, AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(Field);
176 MappedDeclsFields.try_emplace(VD, Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180}
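// For exposition only: an escaped parallel-level variable `int x` globalized
// with BufSize == WarpSize ends up roughly as a field
//   int x[WarpSize] __attribute__((aligned(4)));
// of _globalized_locals_ty, while teams-level (SingleEscaped) variables become
// plain scalar fields that keep the declaration's own alignment attributes.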
181
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208 // Check if we need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(VD);
242 else
243 DelayedVariableLengthDecls.insert(VD);
244 } else
245 EscapedDecls.insert(VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and is
465 /// used instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Returns the set of escaped local variables that are actually parameters
485 /// passed by value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
504CGOpenMPRuntimeGPU::ExecutionMode
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
509CGOpenMPRuntimeGPU::DataSharingMode
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
515static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
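// As an illustration, a directive nest such as
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
// is reported by the helper above as containing a nested SPMD construct, so
// the enclosing target region can be emitted in SPMD mode.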
616
617static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
749 MaxTeamsVal = -1;
750 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
751 MinTeamsVal, MaxTeamsVal);
752
753 CGBuilderTy &Bld = CGF.Builder;
754 Bld.restoreIP(OMPBuilder.createTargetInit(
755 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed sized buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 QualType RecTy = C.getRecordType(TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
775 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(Field);
780 }
781 StaticRD->completeDefinition();
782 QualType StaticTy = C.getRecordType(StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
792 C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
865CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 OMPBuilder.setConfig(Config);
873
874 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875 llvm_unreachable("OpenMP can only handle device code.");
876
877 if (CGM.getLangOpts().OpenMPCUDAMode)
878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
882 return;
883
884 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
885 "__omp_rtl_debug_kind");
886 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
887 "__omp_rtl_assume_teams_oversubscription");
888 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
889 "__omp_rtl_assume_threads_oversubscription");
890 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
891 "__omp_rtl_assume_no_thread_state");
892 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
893 "__omp_rtl_assume_no_nested_parallelism");
894}
895
896void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
897 ProcBindKind ProcBind,
898 SourceLocation Loc) {
899 // Nothing to do.
900}
901
902void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
903 llvm::Value *NumThreads,
904 SourceLocation Loc) {
905 // Nothing to do.
906}
907
908void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
909 const Expr *NumTeams,
910 const Expr *ThreadLimit,
911 SourceLocation Loc) {}
912
913llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914 CodeGenFunction &CGF, const OMPExecutableDirective &D,
915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916 const RegionCodeGenTy &CodeGen) {
917 // Emit target region as a standalone region.
918 bool PrevIsInTTDRegion = IsInTTDRegion;
919 IsInTTDRegion = false;
920 auto *OutlinedFun =
921 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
922 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923 IsInTTDRegion = PrevIsInTTDRegion;
924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925 llvm::Function *WrapperFun =
926 createParallelDataSharingWrapper(OutlinedFun, D);
927 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928 }
929
930 return OutlinedFun;
931}
932
933/// Get list of lastprivate variables from the teams distribute ... or
934/// teams {distribute ...} directives.
935static void
936getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939 "expected teams directive.");
940 const OMPExecutableDirective *Dir = &D;
941 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943 Ctx,
944 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945 /*IgnoreCaptured=*/true))) {
946 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
947 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
948 Dir = nullptr;
949 }
950 }
951 if (!Dir)
952 return;
953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954 for (const Expr *E : C->getVarRefs())
955 Vars.push_back(getPrivateItem(E));
956 }
957}
958
959/// Get list of reduction variables from the teams ... directives.
960static void
961getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966 for (const Expr *E : C->privates())
967 Vars.push_back(getPrivateItem(E));
968 }
969}
970
971llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
972 CodeGenFunction &CGF, const OMPExecutableDirective &D,
973 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
974 const RegionCodeGenTy &CodeGen) {
975 SourceLocation Loc = D.getBeginLoc();
976
977 const RecordDecl *GlobalizedRD = nullptr;
978 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
979 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
980 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
981 // Globalize team reductions variable unconditionally in all modes.
982 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
983 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
984 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
985 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
986 if (!LastPrivatesReductions.empty()) {
987 GlobalizedRD = ::buildRecordForGlobalizedVars(
988 CGM.getContext(), std::nullopt, LastPrivatesReductions,
989 MappedDeclsFields, WarpSize);
990 }
991 } else if (!LastPrivatesReductions.empty()) {
992 assert(!TeamAndReductions.first &&
993 "Previous team declaration is not expected.");
994 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
995 std::swap(TeamAndReductions.second, LastPrivatesReductions);
996 }
997
998 // Emit target region as a standalone region.
999 class NVPTXPrePostActionTy : public PrePostActionTy {
1000 SourceLocation &Loc;
1001 const RecordDecl *GlobalizedRD;
1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 &MappedDeclsFields;
1004
1005 public:
1006 NVPTXPrePostActionTy(
1007 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1009 &MappedDeclsFields)
1010 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1011 MappedDeclsFields(MappedDeclsFields) {}
1012 void Enter(CodeGenFunction &CGF) override {
1013 auto &Rt =
1014 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1015 if (GlobalizedRD) {
1016 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1017 I->getSecond().MappedParams =
1018 std::make_unique<CodeGenFunction::OMPMapVars>();
1019 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1020 for (const auto &Pair : MappedDeclsFields) {
1021 assert(Pair.getFirst()->isCanonicalDecl() &&
1022 "Expected canonical declaration");
1023 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1024 }
1025 }
1026 Rt.emitGenericVarsProlog(CGF, Loc);
1027 }
1028 void Exit(CodeGenFunction &CGF) override {
1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1030 .emitGenericVarsEpilog(CGF);
1031 }
1032 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1033 CodeGen.setAction(Action);
1034 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1035 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036
1037 return OutlinedFun;
1038}
1039
1040void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1041 SourceLocation Loc) {
1042 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1043 return;
1044
1045 CGBuilderTy &Bld = CGF.Builder;
1046
1047 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1048 if (I == FunctionGlobalizedDecls.end())
1049 return;
1050
1051 for (auto &Rec : I->getSecond().LocalVarData) {
1052 const auto *VD = cast<VarDecl>(Rec.first);
1053 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1054 QualType VarTy = VD->getType();
1055
1056 // Get the local allocation of a firstprivate variable before sharing
1057 llvm::Value *ParValue;
1058 if (EscapedParam) {
1059 LValue ParLVal =
1060 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1061 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1062 }
1063
1064 // Allocate space for the variable to be globalized
1065 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1066 llvm::CallBase *VoidPtr =
1067 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1068 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1069 AllocArgs, VD->getName());
1070 // FIXME: We should use the variables actual alignment as an argument.
1071 VoidPtr->addRetAttr(llvm::Attribute::get(
1072 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1073 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1074
1075 // Cast the void pointer and get the address of the globalized variable.
1076 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1077 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1078 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1079 LValue VarAddr =
1080 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1081 Rec.second.PrivateAddr = VarAddr.getAddress();
1082 Rec.second.GlobalizedVal = VoidPtr;
1083
1084 // Assign the local allocation to the newly globalized location.
1085 if (EscapedParam) {
1086 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1087 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1088 }
1089 if (auto *DI = CGF.getDebugInfo())
1090 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1091 }
1092
1093 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1094 const auto *VD = cast<VarDecl>(ValueD);
1095 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1096 getKmpcAllocShared(CGF, VD);
1097 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1098 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1099 CGM.getContext().getDeclAlign(VD),
1100 AlignmentSource::Decl);
1101 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1102 }
1103 I->getSecond().MappedParams->apply(CGF);
1104}
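// For reference, the prolog above lowers each escaped variable roughly to the
// following runtime calls (illustrative IR sketch only; value names and the
// alignment are made up for the example):
//
//   %x.glob = call align 8 ptr @__kmpc_alloc_shared(i64 %size)
//   store i32 %x.orig, ptr %x.glob   ; only for parameters captured by value
//   ; subsequent uses of the variable are redirected to %x.glob
//
// The matching @__kmpc_free_shared calls are emitted in
// emitGenericVarsEpilog() below.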
1105
1106bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1107 const VarDecl *VD) const {
1108 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1109 if (I == FunctionGlobalizedDecls.end())
1110 return false;
1111
1112 // Check variable declaration is delayed:
1113 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1114}
1115
1116std::pair<llvm::Value *, llvm::Value *>
1117CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1118 const VarDecl *VD) {
1119 CGBuilderTy &Bld = CGF.Builder;
1120
1121 // Compute size and alignment.
1122 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1123 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1124 Size = Bld.CreateNUWAdd(
1125 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1126 llvm::Value *AlignVal =
1127 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1128 Size = Bld.CreateUDiv(Size, AlignVal);
1129 Size = Bld.CreateNUWMul(Size, AlignVal);
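 // The add/udiv/mul sequence above rounds Size up to a multiple of the
 // declared alignment; e.g. Size = 10 and Align = 8 give (10 + 7) / 8 * 8 = 16.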
1130
1131 // Allocate space for this VLA object to be globalized.
1132 llvm::Value *AllocArgs[] = {Size};
1133 llvm::CallBase *VoidPtr =
1134 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1135 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1136 AllocArgs, VD->getName());
1137 VoidPtr->addRetAttr(llvm::Attribute::get(
1138 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1139
1140 return std::make_pair(VoidPtr, Size);
1141}
1142
1143void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1144 CodeGenFunction &CGF,
1145 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1146 // Deallocate the memory for each globalized VLA object
1147 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1148 CGM.getModule(), OMPRTL___kmpc_free_shared),
1149 {AddrSizePair.first, AddrSizePair.second});
1150}
1151
1152void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1153 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1154 return;
1155
1156 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1157 if (I != FunctionGlobalizedDecls.end()) {
1158 // Deallocate the memory for each globalized VLA object that was
1159 // globalized in the prolog (i.e. emitGenericVarsProlog).
1160 for (const auto &AddrSizePair :
1161 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1162 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1163 CGM.getModule(), OMPRTL___kmpc_free_shared),
1164 {AddrSizePair.first, AddrSizePair.second});
1165 }
1166 // Deallocate the memory for each globalized value
1167 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1168 const auto *VD = cast<VarDecl>(Rec.first);
1169 I->getSecond().MappedParams->restore(CGF);
1170
1171 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1172 CGF.getTypeSize(VD->getType())};
1173 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1174 CGM.getModule(), OMPRTL___kmpc_free_shared),
1175 FreeArgs);
1176 }
1177 }
1178}
1179
1180void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1181 const OMPExecutableDirective &D,
1182 SourceLocation Loc,
1183 llvm::Function *OutlinedFn,
1184 ArrayRef<llvm::Value *> CapturedVars) {
1185 if (!CGF.HaveInsertPoint())
1186 return;
1187
1188 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1189
1191 /*Name=*/".zero.addr");
1192 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1193 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1194 // We don't emit any thread id function call in bare kernel, but because the
1195 // outlined function has a pointer argument, we emit a nullptr here.
1196 if (IsBareKernel)
1197 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1198 else
1199 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1200 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1201 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1202 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1203}
1204
1205void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1206 SourceLocation Loc,
1207 llvm::Function *OutlinedFn,
1208 ArrayRef<llvm::Value *> CapturedVars,
1209 const Expr *IfCond,
1210 llvm::Value *NumThreads) {
1211 if (!CGF.HaveInsertPoint())
1212 return;
1213
1214 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1215 NumThreads](CodeGenFunction &CGF,
1216 PrePostActionTy &Action) {
1217 CGBuilderTy &Bld = CGF.Builder;
1218 llvm::Value *NumThreadsVal = NumThreads;
1219 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1220 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1221 if (WFn)
1222 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1223 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1224
1225 // Create a private scope that will globalize the arguments
1226 // passed from the outside of the target region.
1227 // TODO: Is that needed?
1228 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1229
1230 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1231 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1232 "captured_vars_addrs");
1233 // There's something to share.
1234 if (!CapturedVars.empty()) {
1235 // Prepare for parallel region. Indicate the outlined function.
1236 ASTContext &Ctx = CGF.getContext();
1237 unsigned Idx = 0;
1238 for (llvm::Value *V : CapturedVars) {
1239 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1240 llvm::Value *PtrV;
1241 if (V->getType()->isIntegerTy())
1242 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1243 else
1244 PtrV = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1245 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1246 Ctx.getPointerType(Ctx.VoidPtrTy));
1247 ++Idx;
1248 }
1249 }
1250
1251 llvm::Value *IfCondVal = nullptr;
1252 if (IfCond)
1253 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1254 /* isSigned */ false);
1255 else
1256 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1257
1258 if (!NumThreadsVal)
1259 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1260 else
1261 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1262
1263 assert(IfCondVal && "Expected a value");
1264 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1265 llvm::Value *Args[] = {
1266 RTLoc,
1267 getThreadID(CGF, Loc),
1268 IfCondVal,
1269 NumThreadsVal,
1270 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1271 FnPtr,
1272 ID,
1273 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1274 CGF.VoidPtrPtrTy),
1275 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1276 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1278 Args);
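 // For reference, the arguments passed to __kmpc_parallel_51 above are, in
 // order: the source location, the caller's thread id, the evaluated if-clause
 // (1 when absent), the requested number of threads (-1 when absent), a -1
 // proc_bind placeholder, the outlined parallel function, its data-sharing
 // wrapper (a null pointer when no wrapper is used), the array of
 // captured-variable addresses, and the number of captured variables.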
1279 };
1280
1281 RegionCodeGenTy RCG(ParallelGen);
1282 RCG(CGF);
1283}
1284
1285void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1286 // Always emit simple barriers!
1287 if (!CGF.HaveInsertPoint())
1288 return;
1289 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1290 // This function does not use parameters, so we can emit just default values.
1291 llvm::Value *Args[] = {
1292 llvm::ConstantPointerNull::get(
1293 cast<llvm::PointerType>(getIdentTyPointerTy())),
1294 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1295 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1296 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1297 Args);
1298}
1299
1300void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1301 SourceLocation Loc,
1302 OpenMPDirectiveKind Kind, bool,
1303 bool) {
1304 // Always emit simple barriers!
1305 if (!CGF.HaveInsertPoint())
1306 return;
1307 // Build call __kmpc_cancel_barrier(loc, thread_id);
1308 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1309 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1310 getThreadID(CGF, Loc)};
1311
1312 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1313 CGM.getModule(), OMPRTL___kmpc_barrier),
1314 Args);
1315}
1316
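// The lowering below serializes the threads of a team through the critical
// region with a counter loop; a rough sketch of the emitted control flow
// (illustrative pseudo code only):
//
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);   // reconverge before the next iteration
//   }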
1317void CGOpenMPRuntimeGPU::emitCriticalRegion(
1318 CodeGenFunction &CGF, StringRef CriticalName,
1319 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1320 const Expr *Hint) {
1321 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1322 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1323 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1324 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1325 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1326
1327 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1328
1329 // Get the mask of active threads in the warp.
1330 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1331 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1332 // Fetch team-local id of the thread.
1333 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1334
1335 // Get the width of the team.
1336 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1337
1338 // Initialize the counter variable for the loop.
1339 QualType Int32Ty =
1340 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1341 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1342 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1343 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1344 /*isInit=*/true);
1345
1346 // Block checks if loop counter exceeds upper bound.
1347 CGF.EmitBlock(LoopBB);
1348 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1349 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1350 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1351
1352 // Block tests which single thread should execute the region, and which
1353 // threads should go straight to the synchronisation point.
1354 CGF.EmitBlock(TestBB);
1355 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1356 llvm::Value *CmpThreadToCounter =
1357 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1358 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1359
1360 // Block emits the body of the critical region.
1361 CGF.EmitBlock(BodyBB);
1362
1363 // Output the critical statement.
1364 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1365 Hint);
1366
1367 // After the body surrounded by the critical region, the single executing
1368 // thread will jump to the synchronisation point.
1369 // Block waits for all threads in current team to finish then increments the
1370 // counter variable and returns to the loop.
1371 CGF.EmitBlock(SyncBB);
1372 // Reconverge active threads in the warp.
1373 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1374 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1375 Mask);
1376
1377 llvm::Value *IncCounterVal =
1378 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1379 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1380 CGF.EmitBranch(LoopBB);
1381
1382 // Block that is reached when all threads in the team complete the region.
1383 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1384}
1385
1386/// Cast value to the specified type.
1387static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1388 QualType ValTy, QualType CastTy,
1389 SourceLocation Loc) {
1390 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1391 "Cast type must sized.");
1392 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1393 "Val type must sized.");
1394 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1395 if (ValTy == CastTy)
1396 return Val;
1397 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1398 CGF.getContext().getTypeSizeInChars(CastTy))
1399 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1400 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1401 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1402 CastTy->hasSignedIntegerRepresentation());
1403 Address CastItem = CGF.CreateMemTemp(CastTy);
1404 Address ValCastItem = CastItem.withElementType(Val->getType());
1405 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1406 LValueBaseInfo(AlignmentSource::Type),
1407 TBAAAccessInfo());
1408 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1409 LValueBaseInfo(AlignmentSource::Type),
1410 TBAAAccessInfo());
1411}
1412
1413///
1414/// Design of OpenMP reductions on the GPU
1415///
1416/// Consider a typical OpenMP program with one or more reduction
1417/// clauses:
1418///
1419/// float foo;
1420/// double bar;
1421/// #pragma omp target teams distribute parallel for \
1422/// reduction(+:foo) reduction(*:bar)
1423/// for (int i = 0; i < N; i++) {
1424/// foo += A[i]; bar *= B[i];
1425/// }
1426///
1427/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428/// all teams. In our OpenMP implementation on the NVPTX device an
1429/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430/// within a team are mapped to CUDA threads within a threadblock.
1431/// Our goal is to efficiently aggregate values across all OpenMP
1432/// threads such that:
1433///
1434/// - the compiler and runtime are logically concise, and
1435/// - the reduction is performed efficiently in a hierarchical
1436/// manner as follows: within OpenMP threads in the same warp,
1437/// across warps in a threadblock, and finally across teams on
1438/// the NVPTX device.
1439///
1440/// Introduction to Decoupling
1441///
1442/// We would like to decouple the compiler and the runtime so that the
1443/// latter is ignorant of the reduction variables (number, data types)
1444/// and the reduction operators. This allows a simpler interface
1445/// and implementation while still attaining good performance.
1446///
1447/// Pseudocode for the aforementioned OpenMP program generated by the
1448/// compiler is as follows:
1449///
1450/// 1. Create private copies of reduction variables on each OpenMP
1451/// thread: 'foo_private', 'bar_private'
1452/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453/// to it and writes the result in 'foo_private' and 'bar_private'
1454/// respectively.
1455/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456/// and store the result on the team master:
1457///
1458/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459/// reduceData, shuffleReduceFn, interWarpCpyFn)
1460///
1461/// where:
1462/// struct ReduceData {
1463/// double *foo;
1464/// double *bar;
1465/// } reduceData
1466/// reduceData.foo = &foo_private
1467/// reduceData.bar = &bar_private
1468///
1469/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470/// auxiliary functions generated by the compiler that operate on
1471/// variables of type 'ReduceData'. They aid the runtime in performing
1472/// algorithmic steps in a data agnostic manner.
1473///
1474/// 'shuffleReduceFn' is a pointer to a function that reduces data
1475/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1476/// same warp. It takes the following arguments as input:
1477///
1478/// a. variable of type 'ReduceData' on the calling lane,
1479/// b. its lane_id,
1480/// c. an offset relative to the current lane_id to generate a
1481/// remote_lane_id. The remote lane contains the second
1482/// variable of type 'ReduceData' that is to be reduced.
1483/// d. an algorithm version parameter determining which reduction
1484/// algorithm to use.
1485///
1486/// 'shuffleReduceFn' retrieves data from the remote lane using
1487/// efficient GPU shuffle intrinsics and reduces, using the
1488/// algorithm specified by the 4th parameter, the two operands
1489/// element-wise. The result is written to the first operand.
1490///
1491/// Different reduction algorithms are implemented in different
1492/// runtime functions, all calling 'shuffleReduceFn' to perform
1493/// the essential reduction step. Therefore, based on the 4th
1494/// parameter, this function behaves slightly differently to
1495/// cooperate with the runtime to ensure correctness under
1496/// different circumstances.
1497///
1498/// 'InterWarpCpyFn' is a pointer to a function that transfers
1499/// reduced variables across warps. It tunnels, through CUDA
1500/// shared memory, the thread-private data of type 'ReduceData'
1501/// from lane 0 of each warp to a lane in the first warp.
1502/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503/// The last team writes the global reduced value to memory.
1504///
1505/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506/// reduceData, shuffleReduceFn, interWarpCpyFn,
1507/// scratchpadCopyFn, loadAndReduceFn)
1508///
1509/// 'scratchpadCopyFn' is a helper that stores reduced
1510/// data from the team master to a scratchpad array in
1511/// global memory.
1512///
1513/// 'loadAndReduceFn' is a helper that loads data from
1514/// the scratchpad array and reduces it with the input
1515/// operand.
1516///
1517/// These compiler generated functions hide address
1518/// calculation and alignment information from the runtime.
1519/// 5. if ret == 1:
1520/// The team master of the last team stores the reduced
1521/// result to the globals in memory.
1522/// foo += reduceData.foo; bar *= reduceData.bar
1523///
1524///
1525/// Warp Reduction Algorithms
1526///
1527/// On the warp level, we have three algorithms implemented in the
1528/// OpenMP runtime depending on the number of active lanes:
1529///
1530/// Full Warp Reduction
1531///
1532/// The reduce algorithm within a warp where all lanes are active
1533/// is implemented in the runtime as follows:
1534///
1535/// full_warp_reduce(void *reduce_data,
1536/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1539/// }
1540///
1541/// The algorithm completes in log(2, WARPSIZE) steps.
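///
/// As a concrete illustration, with WARPSIZE == 32 the loop above performs
/// five shuffle-and-reduce steps with offsets 16, 8, 4, 2 and 1, after which
/// lane 0 holds the value reduced over the whole warp.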
1542///
1543/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544/// not used; therefore, we save instructions by not retrieving lane_id
1545/// from the corresponding special registers. The 4th parameter, which
1546/// represents the version of the algorithm being used, is set to 0 to
1547/// signify full warp reduction.
1548///
1549/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550///
1551/// #reduce_elem refers to an element in the local lane's data structure
1552/// #remote_elem is retrieved from a remote lane
1553/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1555///
1556/// Contiguous Partial Warp Reduction
1557///
1558/// This reduce algorithm is used within a warp where only the first
1559/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560/// number of OpenMP threads in a parallel region is not a multiple of
1561/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562///
1563/// void
1564/// contiguous_partial_reduce(void *reduce_data,
1565/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566/// int size, int lane_id) {
1567/// int curr_size;
1568/// int offset;
1569/// curr_size = size;
1570/// offset = curr_size/2;
1571/// while (offset>0) {
1572/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573/// curr_size = (curr_size+1)/2;
1574/// offset = curr_size/2;
1575/// }
1576/// }
1577///
1578/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579///
1580/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581/// if (lane_id < offset)
1582/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1583/// else
1584/// reduce_elem = remote_elem
1585///
1586/// This algorithm assumes that the data to be reduced are located in a
1587/// contiguous subset of lanes starting from the first. When there is
1588/// an odd number of active lanes, the data in the last lane is not
1589/// aggregated with any other lane's data but is instead copied over.
1590///
1591/// Dispersed Partial Warp Reduction
1592///
1593/// This algorithm is used within a warp when any discontiguous subset of
1594/// lanes are active. It is used to implement the reduction operation
1595/// across lanes in an OpenMP simd region or in a nested parallel region.
1596///
1597/// void
1598/// dispersed_partial_reduce(void *reduce_data,
1599/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600/// int size, remote_id;
1601/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602/// do {
1603/// remote_id = next_active_lane_id_right_after_me();
1604/// # the above function returns 0 if no active lane
1605/// # is present right after the current lane.
1606/// size = number_of_active_lanes_in_this_warp();
1607/// logical_lane_id /= 2;
1608/// ShuffleReduceFn(reduce_data, logical_lane_id,
1609/// remote_id-1-threadIdx.x, 2);
1610/// } while (logical_lane_id % 2 == 0 && size > 1);
1611/// }
1612///
1613/// There is no assumption made about the initial state of the reduction.
1614/// Any number of lanes (>=1) could be active at any position. The reduction
1615/// result is returned in the first active lane.
1616///
1617/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618///
1619/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620/// if (lane_id % 2 == 0 && offset > 0)
1621/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1622/// else
1623/// reduce_elem = remote_elem
1624///
1625///
1626/// Intra-Team Reduction
1627///
1628/// This function, as implemented in the runtime call
1629/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630/// threads in a team. It first reduces within a warp using the
1631/// aforementioned algorithms. We then proceed to gather all such
1632/// reduced values at the first warp.
1633///
1634/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635/// data from each "warp master" (the zeroth lane of each warp, where
1636/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637/// a mathematical sense) the problem of reduction across warp masters in
1638/// a block to the problem of warp reduction.
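///
/// Editor's illustration (hypothetical user code): a reduction such as
///
///   #pragma omp target parallel for reduction(+ : sum) map(tofrom : sum)
///   for (int i = 0; i < N; ++i)
///     sum += a[i];
///
/// would take this path: each warp reduces with one of the algorithms above,
/// the warp masters are combined via 'InterWarpCpyFn', and the team's result
/// is produced by '__kmpc_nvptx_parallel_reduce_nowait_v2'.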
1639///
1640///
1641/// Inter-Team Reduction
1642///
1643/// Once a team has reduced its data to a single value, it is stored in
1644/// a global scratchpad array. Since each team has a distinct slot, this
1645/// can be done without locking.
1646///
1647/// The last team to write to the scratchpad array proceeds to reduce the
1648/// scratchpad array. One or more workers in the last team use the helper
1649/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1650/// the k'th worker reduces every k'th element.
1651///
1652/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653/// reduce across workers and compute a globally reduced value.
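///
/// Editor's illustration (hypothetical user code): adding a teams level, e.g.
///
///   #pragma omp target teams distribute parallel for reduction(+ : sum) \
///       map(tofrom : sum)
///   for (int i = 0; i < N; ++i)
///     sum += a[i];
///
/// exercises both stages: every team writes its partial result to its
/// scratchpad slot, and the last team to finish reduces the scratchpad into
/// the final value of 'sum'.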
1654///
1655void CGOpenMPRuntimeGPU::emitReduction(
1656    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1657    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1658    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1659 if (!CGF.HaveInsertPoint())
1660 return;
1661
1662 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1663 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
1664 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
1665
1666  ASTContext &C = CGM.getContext();
1667
1668 if (Options.SimpleReduction) {
1669 assert(!TeamsReduction && !ParallelReduction &&
1670 "Invalid reduction selection in emitReduction.");
1671 (void)ParallelReduction;
1672 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1673 ReductionOps, Options);
1674 return;
1675 }
1676
1677 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1678 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1679 int Cnt = 0;
1680 for (const Expr *DRE : Privates) {
1681 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
1682 ++Cnt;
1683 }
1684 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1685 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
1686
1687 if (TeamsReduction)
1688 TeamsReductions.push_back(ReductionRec);
1689
1690 // Source location for the ident struct
1691 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1692
1693 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1694 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1695 CGF.AllocaInsertPt->getIterator());
1696 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1697 CGF.Builder.GetInsertPoint());
1698 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1699 CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
1700  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1701
1703 unsigned Idx = 0;
1704 for (const Expr *Private : Privates) {
1705 llvm::Type *ElementType;
1706 llvm::Value *Variable;
1707 llvm::Value *PrivateVariable;
1708 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1709 ElementType = CGF.ConvertTypeForMem(Private->getType());
1710 const auto *RHSVar =
1711 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
1712 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
1713 const auto *LHSVar =
1714 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
1715 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
1716 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1717 switch (CGF.getEvaluationKind(Private->getType())) {
1718 case TEK_Scalar:
1719 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1720 break;
1721 case TEK_Complex:
1722 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1723 break;
1724 case TEK_Aggregate:
1725 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1726 break;
1727 }
1728 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1729 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1730 llvm::Function *NewFunc) {
1731 CGF.Builder.restoreIP(CodeGenIP);
1732 auto *CurFn = CGF.CurFn;
1733 CGF.CurFn = NewFunc;
1734
1735 *LHSPtr = CGF.GetAddrOfLocalVar(
1736 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
1737 .emitRawPointer(CGF);
1738 *RHSPtr = CGF.GetAddrOfLocalVar(
1739 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
1740 .emitRawPointer(CGF);
1741
1742 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
1743 cast<DeclRefExpr>(LHSExprs[I]),
1744 cast<DeclRefExpr>(RHSExprs[I]));
1745
1746 CGF.CurFn = CurFn;
1747
1748 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1749 CGF.Builder.GetInsertPoint());
1750 };
1751 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
1752 ElementType, Variable, PrivateVariable, EvalKind,
1753 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1754 Idx++;
1755 }
1756
1757 CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
1758 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
1759 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1760 CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
1761 RTLoc));
1762 return;
1763}
1764
1765const VarDecl *
1766CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1767 const VarDecl *NativeParam) const {
1768 if (!NativeParam->getType()->isReferenceType())
1769 return NativeParam;
1770 QualType ArgType = NativeParam->getType();
1772  QualifierCollector QC;
1773 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1774 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1775 if (Attr->getCaptureKind() == OMPC_map) {
1776 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1778 }
1779 }
1780 ArgType = CGM.getContext().getPointerType(PointeeTy);
1781 QC.addRestrict();
1782 enum { NVPTX_local_addr = 5 };
1783 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
1784 ArgType = QC.apply(CGM.getContext(), ArgType);
1785  if (isa<ImplicitParamDecl>(NativeParam))
1786    return ImplicitParamDecl::Create(
1787 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
1788 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
1789 return ParmVarDecl::Create(
1790 CGM.getContext(),
1791 const_cast<DeclContext *>(NativeParam->getDeclContext()),
1792 NativeParam->getBeginLoc(), NativeParam->getLocation(),
1793 NativeParam->getIdentifier(), ArgType,
1794 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1795}
1796
1797Address
1798CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1799 const VarDecl *NativeParam,
1800 const VarDecl *TargetParam) const {
1801 assert(NativeParam != TargetParam &&
1802 NativeParam->getType()->isReferenceType() &&
1803 "Native arg must not be the same as target arg.");
1804 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1805  QualType NativeParamType = NativeParam->getType();
1806  QualifierCollector QC;
1807 const Type *NonQualTy = QC.strip(NativeParamType);
1808 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1809 unsigned NativePointeeAddrSpace =
1810 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1811 QualType TargetTy = TargetParam->getType();
1812 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
1813 TargetTy, SourceLocation());
1814  // Cast to native address space.
1815  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1816 TargetAddr,
1817 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1818 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1819 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1820 NativeParamType);
1821 return NativeParamAddr;
1822}
1823
1824void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826    ArrayRef<llvm::Value *> Args) const {
1827  SmallVector<llvm::Value *, 4> TargetArgs;
1828 TargetArgs.reserve(Args.size());
1829 auto *FnType = OutlinedFn.getFunctionType();
1830 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832 TargetArgs.append(std::next(Args.begin(), I), Args.end());
1833 break;
1834 }
1835 llvm::Type *TargetType = FnType->getParamType(I);
1836 llvm::Value *NativeArg = Args[I];
1837 if (!TargetType->isPointerTy()) {
1838 TargetArgs.emplace_back(NativeArg);
1839 continue;
1840 }
1841 TargetArgs.emplace_back(
1842 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1843 }
1844 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1845}
1846
1847/// Emit function which wraps the outlined parallel region
1848/// and controls the arguments which are passed to this function.
1849/// The wrapper ensures that the outlined function is called
1850/// with the correct arguments when data is shared.
1851llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1852 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1853 ASTContext &Ctx = CGM.getContext();
1854 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1855
1856 // Create a function that takes as argument the source thread.
1857 FunctionArgList WrapperArgs;
1858 QualType Int16QTy =
1859 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1860 QualType Int32QTy =
1861 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1862 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1863                                     /*Id=*/nullptr, Int16QTy,
1864                                     ImplicitParamKind::Other);
1865  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1866                               /*Id=*/nullptr, Int32QTy,
1867                               ImplicitParamKind::Other);
1868 WrapperArgs.emplace_back(&ParallelLevelArg);
1869 WrapperArgs.emplace_back(&WrapperArg);
1870
1871  const CGFunctionInfo &CGFI =
1872      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
1873
1874 auto *Fn = llvm::Function::Create(
1875 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1876 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1877
1878 // Ensure we do not inline the function. This is trivially true for the ones
1879  // passed to __kmpc_fork_call but the ones called in serialized regions
1880  // could be inlined. This is not perfect, but it is closer to the invariant
1881 // we want, namely, every data environment starts with a new function.
1882 // TODO: We should pass the if condition to the runtime function and do the
1883 // handling there. Much cleaner code.
1884 Fn->addFnAttr(llvm::Attribute::NoInline);
1885
1886  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1887 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1888 Fn->setDoesNotRecurse();
1889
1890 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1891 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1892 D.getBeginLoc(), D.getBeginLoc());
1893
1894 const auto *RD = CS.getCapturedRecordDecl();
1895 auto CurField = RD->field_begin();
1896
1897 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1898 /*Name=*/".zero.addr");
1899 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1900  // Get the array of arguments.
1901  SmallVector<llvm::Value *, 8> Args;
1902
1903 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
1904 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1905
1906 CGBuilderTy &Bld = CGF.Builder;
1907 auto CI = CS.capture_begin();
1908
1909 // Use global memory for data sharing.
1910 // Handle passing of global args to workers.
1911 RawAddress GlobalArgs =
1912 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1913 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1914 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1915 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1916 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1917 DataSharingArgs);
1918
1919 // Retrieve the shared variables from the list of references returned
1920 // by the runtime. Pass the variables to the outlined function.
1921 Address SharedArgListAddress = Address::invalid();
1922 if (CS.capture_size() > 0 ||
1923 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1924 SharedArgListAddress = CGF.EmitLoadOfPointer(
1925 GlobalArgs, CGF.getContext()
1927 .castAs<PointerType>());
1928 }
1929 unsigned Idx = 0;
1930 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1931    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1932    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1933 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1934 llvm::Value *LB = CGF.EmitLoadOfScalar(
1935 TypedAddress,
1936 /*Volatile=*/false,
1938 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1939 Args.emplace_back(LB);
1940 ++Idx;
1941 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1942 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1943 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1944 llvm::Value *UB = CGF.EmitLoadOfScalar(
1945 TypedAddress,
1946 /*Volatile=*/false,
1948 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1949 Args.emplace_back(UB);
1950 ++Idx;
1951 }
1952 if (CS.capture_size() > 0) {
1953 ASTContext &CGFContext = CGF.getContext();
1954 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1955 QualType ElemTy = CurField->getType();
1956      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1957      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1958 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
1959 CGF.ConvertTypeForMem(ElemTy));
1960 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1961 /*Volatile=*/false,
1962 CGFContext.getPointerType(ElemTy),
1963 CI->getLocation());
1964 if (CI->capturesVariableByCopy() &&
1965 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1966 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
1967 CI->getLocation());
1968 }
1969 Args.emplace_back(Arg);
1970 }
1971 }
1972
1973 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
1974 CGF.FinishFunction();
1975 return Fn;
1976}
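// Editor's sketch (hypothetical, standalone host code, not the IR this
// function emits): the shape of the wrapper created above. A stand-in for
// __kmpc_get_shared_variables hands back the shared-argument array; the
// wrapper unpacks it and forwards the pointers to the outlined body, which is
// how captured variables reach the parallel region. All names are
// illustrative only.
#include <cstdint>
#include <cstdio>

static void **SharedArgs; // set by the thread that "forks" the region

static void kmpc_get_shared_variables(void ***Args) { *Args = SharedArgs; }

static void outlined_parallel_fn(int32_t * /*Tid*/, int32_t * /*BoundTid*/,
                                 int *Counter) {
  ++*Counter; // stands in for the real parallel body
}

static void outlined_parallel_fn_wrapper(uint16_t /*ParallelLevel*/,
                                         int32_t Tid) {
  int32_t Zero = 0;
  void **GlobalArgs = nullptr;
  kmpc_get_shared_variables(&GlobalArgs); // like __kmpc_get_shared_variables
  outlined_parallel_fn(&Tid, &Zero, static_cast<int *>(GlobalArgs[0]));
}

int main() {
  int Counter = 0;
  void *Args[] = {&Counter};
  SharedArgs = Args;              // "share" the captured variable with workers
  for (int32_t T = 0; T < 4; ++T) // pretend four worker threads run the wrapper
    outlined_parallel_fn_wrapper(/*ParallelLevel=*/0, T);
  std::printf("%d\n", Counter);   // prints 4
}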
1977
1978void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1979                                            const Decl *D) {
1980 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1981 return;
1982
1983 assert(D && "Expected function or captured|block decl.");
1984 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1985 "Function is registered already.");
1986 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1987 "Team is set but not processed.");
1988 const Stmt *Body = nullptr;
1989 bool NeedToDelayGlobalization = false;
1990 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
1991 Body = FD->getBody();
1992 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
1993 Body = BD->getBody();
1994 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
1995 Body = CD->getBody();
1996 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1997 if (NeedToDelayGlobalization &&
1998 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1999 return;
2000 }
2001 if (!Body)
2002 return;
2003 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2004 VarChecker.Visit(Body);
2005 const RecordDecl *GlobalizedVarsRecord =
2006 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2007 TeamAndReductions.first = nullptr;
2008 TeamAndReductions.second.clear();
2009 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2010 VarChecker.getEscapedVariableLengthDecls();
2011 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2012 VarChecker.getDelayedVariableLengthDecls();
2013 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2014 DelayedVariableLengthDecls.empty())
2015 return;
2016 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2017 I->getSecond().MappedParams =
2018 std::make_unique<CodeGenFunction::OMPMapVars>();
2019 I->getSecond().EscapedParameters.insert(
2020 VarChecker.getEscapedParameters().begin(),
2021 VarChecker.getEscapedParameters().end());
2022 I->getSecond().EscapedVariableLengthDecls.append(
2023 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
2024 I->getSecond().DelayedVariableLengthDecls.append(
2025 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2026 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2027 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2028 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2029 Data.insert(std::make_pair(VD, MappedVarData()));
2030 }
2031 if (!NeedToDelayGlobalization) {
2032 emitGenericVarsProlog(CGF, D->getBeginLoc());
2033 struct GlobalizationScope final : EHScopeStack::Cleanup {
2034 GlobalizationScope() = default;
2035
2036 void Emit(CodeGenFunction &CGF, Flags flags) override {
2037 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2038 .emitGenericVarsEpilog(CGF);
2039 }
2040 };
2041 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2042 }
2043}
2044
2045Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2046                                                       const VarDecl *VD) {
2047 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2048 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2049 auto AS = LangAS::Default;
2050 switch (A->getAllocatorType()) {
2051 // Use the default allocator here as by default local vars are
2052 // threadlocal.
2053 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2054 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2055 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2057 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2058 // Follow the user decision - use default allocation.
2059 return Address::invalid();
2060 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2061      // TODO: implement support for user-defined allocators.
2062 return Address::invalid();
2063 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2065 break;
2066 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2068 break;
2069 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2070 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2071 break;
2072 }
2073 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2074 auto *GV = new llvm::GlobalVariable(
2075 CGM.getModule(), VarTy, /*isConstant=*/false,
2076 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2077 VD->getName(),
2078        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2079        CGM.getContext().getTargetAddressSpace(AS));
2080 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2081 GV->setAlignment(Align.getAsAlign());
2082    return Address(
2083        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2084 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
2085 VD->getType().getAddressSpace()))),
2086 VarTy, Align);
2087 }
2088
2089 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2090 return Address::invalid();
2091
2092 VD = VD->getCanonicalDecl();
2093 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2094 if (I == FunctionGlobalizedDecls.end())
2095 return Address::invalid();
2096 auto VDI = I->getSecond().LocalVarData.find(VD);
2097 if (VDI != I->getSecond().LocalVarData.end())
2098 return VDI->second.PrivateAddr;
2099  if (VD->hasAttrs()) {
2100    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2101 E(VD->attr_end());
2102 IT != E; ++IT) {
2103 auto VDI = I->getSecond().LocalVarData.find(
2104 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2105 ->getCanonicalDecl());
2106 if (VDI != I->getSecond().LocalVarData.end())
2107 return VDI->second.PrivateAddr;
2108 }
2109 }
2110
2111 return Address::invalid();
2112}
2113
2114void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2115  FunctionGlobalizedDecls.erase(CGF.CurFn);
2116  CGOpenMPRuntime::functionFinished(CGF);
2117}
2118
2119void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120    CodeGenFunction &CGF, const OMPLoopDirective &S,
2121 OpenMPDistScheduleClauseKind &ScheduleKind,
2122 llvm::Value *&Chunk) const {
2123 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126 Chunk = CGF.EmitScalarConversion(
2127 RT.getGPUNumThreads(CGF),
2128 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2129 S.getIterationVariable()->getType(), S.getBeginLoc());
2130 return;
2131 }
2132  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133      CGF, S, ScheduleKind, Chunk);
2134}
2135
2136void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137    CodeGenFunction &CGF, const OMPLoopDirective &S,
2138 OpenMPScheduleClauseKind &ScheduleKind,
2139 const Expr *&ChunkExpr) const {
2140 ScheduleKind = OMPC_SCHEDULE_static;
2141 // Chunk size is 1 in this case.
2142 llvm::APInt ChunkSize(32, 1);
2143 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2144 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2145 SourceLocation());
2146}
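// Editor's note (illustrative): taken together, the two defaults above make an
// SPMD target loop behave as if it had been written with
//
//   dist_schedule(static, <threads-per-team>) schedule(static, 1)
//
// i.e. each team takes one thread-count-sized chunk of the distribute space at
// a time and consecutive iterations go to consecutive threads, which tends to
// favor coalesced memory accesses on GPUs.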
2147
2148void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2149    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2150 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2151 " Expected target-based directive.");
2152 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2153 for (const CapturedStmt::Capture &C : CS->captures()) {
2154 // Capture variables captured by reference in lambdas for target-based
2155 // directives.
2156 if (!C.capturesVariable())
2157 continue;
2158 const VarDecl *VD = C.getCapturedVar();
2159    const auto *RD = VD->getType()
2160                         .getCanonicalType()
2161                         .getNonReferenceType()
2162                         ->getAsCXXRecordDecl();
2163 if (!RD || !RD->isLambda())
2164 continue;
2165 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2166    LValue VDLVal;
2167    if (VD->getType().getCanonicalType()->isReferenceType())
2168 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2169 else
2170 VDLVal = CGF.MakeAddrLValue(
2171 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
2172 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2173 FieldDecl *ThisCapture = nullptr;
2174 RD->getCaptureFields(Captures, ThisCapture);
2175 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2176 LValue ThisLVal =
2177 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2178 llvm::Value *CXXThis = CGF.LoadCXXThis();
2179 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2180 }
2181 for (const LambdaCapture &LC : RD->captures()) {
2182 if (LC.getCaptureKind() != LCK_ByRef)
2183 continue;
2184 const ValueDecl *VD = LC.getCapturedVar();
2185 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2186 // capturing structured bindings in lambdas yet.
2187 if (!CS->capturesVariable(cast<VarDecl>(VD)))
2188 continue;
2189 auto It = Captures.find(VD);
2190 assert(It != Captures.end() && "Found lambda capture without field.");
2191 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2192      Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2193      if (VD->getType().getCanonicalType()->isReferenceType())
2194 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2195 VD->getType().getCanonicalType())
2196 .getAddress();
2197 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2198 }
2199 }
2200}
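// Editor's illustration (hypothetical user code, not part of this file): the
// rewiring above targets by-reference lambda captures used inside a target
// region, e.g.:
void lambda_capture_example() {
  int x = 0;
  auto addOne = [&x] { x += 1; };    // 'x' captured by reference
#pragma omp target map(tofrom : x)   // lambda body runs on the device
  addOne();                          // the capture must point at the device 'x'
}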
2201
2202bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2203                                                          LangAS &AS) {
2204 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2205 return false;
2206 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2207 switch(A->getAllocatorType()) {
2208 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2209 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2210 // Not supported, fallback to the default mem space.
2211 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2212 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2213 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2214 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2215 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2216 AS = LangAS::Default;
2217 return true;
2218 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2220 return true;
2221 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2223 return true;
2224 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2225 llvm_unreachable("Expected predefined allocator for the variables with the "
2226 "static storage.");
2227 }
2228 return false;
2229}
2230
2231// Get current OffloadArch and ignore any unknown values
2232static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2233 if (!CGM.getTarget().hasFeature("ptx"))
2234 return OffloadArch::UNKNOWN;
2235 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2236 if (Feature.getValue()) {
2237 OffloadArch Arch = StringToOffloadArch(Feature.getKey());
2238 if (Arch != OffloadArch::UNKNOWN)
2239 return Arch;
2240 }
2241 }
2242 return OffloadArch::UNKNOWN;
2243}
2244
2245/// Check to see if target architecture supports unified addressing which is
2246/// a restriction for OpenMP requires clause "unified_shared_memory".
2247void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2248  for (const OMPClause *Clause : D->clauselists()) {
2249    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2250      OffloadArch Arch = getOffloadArch(CGM);
2251      switch (Arch) {
2252 case OffloadArch::SM_20:
2253 case OffloadArch::SM_21:
2254 case OffloadArch::SM_30:
2256 case OffloadArch::SM_35:
2257 case OffloadArch::SM_37:
2258 case OffloadArch::SM_50:
2259 case OffloadArch::SM_52:
2260 case OffloadArch::SM_53: {
2261 SmallString<256> Buffer;
2262 llvm::raw_svector_ostream Out(Buffer);
2263 Out << "Target architecture " << OffloadArchToString(Arch)
2264 << " does not support unified addressing";
2265 CGM.Error(Clause->getBeginLoc(), Out.str());
2266 return;
2267 }
2268 case OffloadArch::SM_60:
2269 case OffloadArch::SM_61:
2270 case OffloadArch::SM_62:
2271 case OffloadArch::SM_70:
2272 case OffloadArch::SM_72:
2273 case OffloadArch::SM_75:
2274 case OffloadArch::SM_80:
2275 case OffloadArch::SM_86:
2276 case OffloadArch::SM_87:
2277 case OffloadArch::SM_89:
2278 case OffloadArch::SM_90:
2334 break;
2335 case OffloadArch::LAST:
2336 llvm_unreachable("Unexpected GPU arch.");
2337 }
2338 }
2339 }
2340  CGOpenMPRuntime::processRequiresDirective(D);
2341}
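// Editor's illustration (hypothetical user code): the check above rejects a
// translation unit containing
//
//   #pragma omp requires unified_shared_memory
//
// when compiling for sm_53 or older, and accepts sm_60 and newer targets.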
2342
2343llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2344  CGBuilderTy &Bld = CGF.Builder;
2345 llvm::Module *M = &CGF.CGM.getModule();
2346 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2347 llvm::Function *F = M->getFunction(LocSize);
2348 if (!F) {
2349 F = llvm::Function::Create(
2350 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
2351 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
2352 }
2353 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
2354}
2355
2356llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2357  ArrayRef<llvm::Value *> Args{};
2358  return CGF.EmitRuntimeCall(
2359 OMPBuilder.getOrCreateRuntimeFunction(
2360 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2361 Args);
2362}