clang 23.0.0git
CGOpenMPRuntimeGPU.cpp
Go to the documentation of this file.
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX, AMDGCN and SPIR-V.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CGDebugInfo.h"
16#include "CodeGenFunction.h"
17#include "clang/AST/Attr.h"
22#include "clang/Basic/Cuda.h"
23#include "llvm/ADT/SmallPtrSet.h"
24#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
25#include "llvm/Frontend/OpenMP/OMPGridValues.h"
26
27using namespace clang;
28using namespace CodeGen;
29using namespace llvm::omp;
30
31namespace {
32/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33class NVPTXActionTy final : public PrePostActionTy {
34 llvm::FunctionCallee EnterCallee = nullptr;
35 ArrayRef<llvm::Value *> EnterArgs;
36 llvm::FunctionCallee ExitCallee = nullptr;
37 ArrayRef<llvm::Value *> ExitArgs;
38 bool Conditional = false;
39 llvm::BasicBlock *ContBlock = nullptr;
40
41public:
42 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43 ArrayRef<llvm::Value *> EnterArgs,
44 llvm::FunctionCallee ExitCallee,
45 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
46 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
47 ExitArgs(ExitArgs), Conditional(Conditional) {}
48 void Enter(CodeGenFunction &CGF) override {
49 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
50 if (Conditional) {
51 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
52 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
53 ContBlock = CGF.createBasicBlock("omp_if.end");
54 // Generate the branch (If-stmt)
55 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
56 CGF.EmitBlock(ThenBlock);
57 }
58 }
59 void Done(CodeGenFunction &CGF) {
60 // Emit the rest of blocks/branches
61 CGF.EmitBranch(ContBlock);
62 CGF.EmitBlock(ContBlock, true);
63 }
64 void Exit(CodeGenFunction &CGF) override {
65 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
66 }
67};
68
69/// A class to track the execution mode when codegening directives within
70/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
71/// to the target region and used by containing directives such as 'parallel'
72/// to emit optimized code.
73class ExecutionRuntimeModesRAII {
74private:
78
79public:
80 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = EntryMode;
85 }
86 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
87};
88
89static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
90 RefExpr = RefExpr->IgnoreParens();
91 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
92 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
93 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
94 Base = TempASE->getBase()->IgnoreParenImpCasts();
95 RefExpr = Base;
96 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
97 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
99 Base = TempOASE->getBase()->IgnoreParenImpCasts();
100 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
101 Base = TempASE->getBase()->IgnoreParenImpCasts();
102 RefExpr = Base;
103 }
104 RefExpr = RefExpr->IgnoreParenImpCasts();
105 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
106 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
107 const auto *ME = cast<MemberExpr>(RefExpr);
108 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
109}
110
111static RecordDecl *buildRecordForGlobalizedVars(
113 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
114 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
115 &MappedDeclsFields,
116 int BufSize) {
117 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
118 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
119 return nullptr;
120 SmallVector<VarsDataTy, 4> GlobalizedVars;
121 for (const ValueDecl *D : EscapedDecls)
122 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
123 for (const ValueDecl *D : EscapedDeclsForTeams)
124 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
125
126 // Build struct _globalized_locals_ty {
127 // /* globalized vars */[WarSize] align (decl_align)
128 // /* globalized vars */ for EscapedDeclsForTeams
129 // };
130 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
131 GlobalizedRD->startDefinition();
132 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(llvm::from_range,
133 EscapedDeclsForTeams);
134 for (const auto &Pair : GlobalizedVars) {
135 const ValueDecl *VD = Pair.second;
136 QualType Type = VD->getType();
138 Type = C.getPointerType(Type.getNonReferenceType());
139 else
140 Type = Type.getNonReferenceType();
141 SourceLocation Loc = VD->getLocation();
142 FieldDecl *Field;
143 if (SingleEscaped.count(VD)) {
144 Field = FieldDecl::Create(
145 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
146 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
147 /*BW=*/nullptr, /*Mutable=*/false,
148 /*InitStyle=*/ICIS_NoInit);
149 Field->setAccess(AS_public);
150 if (VD->hasAttrs()) {
151 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
152 E(VD->getAttrs().end());
153 I != E; ++I)
154 Field->addAttr(*I);
155 }
156 } else {
157 if (BufSize > 1) {
158 llvm::APInt ArraySize(32, BufSize);
159 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
161 }
162 Field = FieldDecl::Create(
163 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
164 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
165 /*BW=*/nullptr, /*Mutable=*/false,
166 /*InitStyle=*/ICIS_NoInit);
167 Field->setAccess(AS_public);
168 llvm::APInt Align(32, Pair.first.getQuantity());
169 Field->addAttr(AlignedAttr::CreateImplicit(
170 C, /*IsAlignmentExpr=*/true,
172 C.getIntTypeForBitwidth(32, /*Signed=*/0),
174 {}, AlignedAttr::GNU_aligned));
175 }
176 GlobalizedRD->addDecl(Field);
177 MappedDeclsFields.try_emplace(VD, Field);
178 }
179 GlobalizedRD->completeDefinition();
180 return GlobalizedRD;
181}
182
183/// Get the list of variables that can escape their declaration context.
184class CheckVarsEscapingDeclContext final
185 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
186 CodeGenFunction &CGF;
187 llvm::SetVector<const ValueDecl *> EscapedDecls;
188 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
189 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
190 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
191 RecordDecl *GlobalizedRD = nullptr;
192 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
193 bool AllEscaped = false;
194 bool IsForCombinedParallelRegion = false;
195
196 void markAsEscaped(const ValueDecl *VD) {
197 // Do not globalize declare target variables.
198 if (!isa<VarDecl>(VD) ||
199 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
200 return;
202 // Use user-specified allocation.
203 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
204 return;
205 // Variables captured by value must be globalized.
206 bool IsCaptured = false;
207 if (auto *CSI = CGF.CapturedStmtInfo) {
208 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
209 // Check if need to capture the variable that was already captured by
210 // value in the outer region.
211 IsCaptured = true;
212 if (!IsForCombinedParallelRegion) {
213 if (!FD->hasAttrs())
214 return;
215 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
216 if (!Attr)
217 return;
218 if (((Attr->getCaptureKind() != OMPC_map) &&
219 !isOpenMPPrivate(Attr->getCaptureKind())) ||
220 ((Attr->getCaptureKind() == OMPC_map) &&
221 !FD->getType()->isAnyPointerType()))
222 return;
223 }
224 if (!FD->getType()->isReferenceType()) {
225 assert(!VD->getType()->isVariablyModifiedType() &&
226 "Parameter captured by value with variably modified type");
227 EscapedParameters.insert(VD);
228 } else if (!IsForCombinedParallelRegion) {
229 return;
230 }
231 }
232 }
233 if ((!CGF.CapturedStmtInfo ||
234 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
235 VD->getType()->isReferenceType())
236 // Do not globalize variables with reference type.
237 return;
238 if (VD->getType()->isVariablyModifiedType()) {
239 // If not captured at the target region level then mark the escaped
240 // variable as delayed.
241 if (IsCaptured)
242 EscapedVariableLengthDecls.insert(VD);
243 else
244 DelayedVariableLengthDecls.insert(VD);
245 } else
246 EscapedDecls.insert(VD);
247 }
248
249 void VisitValueDecl(const ValueDecl *VD) {
250 if (VD->getType()->isLValueReferenceType())
251 markAsEscaped(VD);
252 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
253 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
254 const bool SavedAllEscaped = AllEscaped;
255 AllEscaped = VD->getType()->isLValueReferenceType();
256 Visit(VarD->getInit());
257 AllEscaped = SavedAllEscaped;
258 }
259 }
260 }
261 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
262 ArrayRef<OMPClause *> Clauses,
263 bool IsCombinedParallelRegion) {
264 if (!S)
265 return;
266 for (const CapturedStmt::Capture &C : S->captures()) {
267 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
268 const ValueDecl *VD = C.getCapturedVar();
269 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
270 if (IsCombinedParallelRegion) {
271 // Check if the variable is privatized in the combined construct and
272 // those private copies must be shared in the inner parallel
273 // directive.
274 IsForCombinedParallelRegion = false;
275 for (const OMPClause *C : Clauses) {
276 if (!isOpenMPPrivate(C->getClauseKind()) ||
277 C->getClauseKind() == OMPC_reduction ||
278 C->getClauseKind() == OMPC_linear ||
279 C->getClauseKind() == OMPC_private)
280 continue;
281 ArrayRef<const Expr *> Vars;
282 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
283 Vars = PC->getVarRefs();
284 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
285 Vars = PC->getVarRefs();
286 else
287 llvm_unreachable("Unexpected clause.");
288 for (const auto *E : Vars) {
289 const Decl *D =
290 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
291 if (D == VD->getCanonicalDecl()) {
292 IsForCombinedParallelRegion = true;
293 break;
294 }
295 }
296 if (IsForCombinedParallelRegion)
297 break;
298 }
299 }
300 markAsEscaped(VD);
302 VisitValueDecl(VD);
303 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
304 }
305 }
306 }
307
308 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
309 assert(!GlobalizedRD &&
310 "Record for globalized variables is built already.");
311 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
312 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
313 if (IsInTTDRegion)
314 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
315 else
316 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
317 GlobalizedRD = ::buildRecordForGlobalizedVars(
318 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
319 MappedDeclsFields, WarpSize);
320 }
321
322public:
323 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
324 ArrayRef<const ValueDecl *> TeamsReductions)
325 : CGF(CGF), EscapedDecls(llvm::from_range, TeamsReductions) {}
326 ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
502} // anonymous namespace
503
505CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507}
508
510CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512}
513
514/// Check for inner (nested) SPMD construct, if any
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615}
616
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702}
703
704void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743}
744
745void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs;
749 Attrs.ExecFlags =
750 IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD
751 : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
752 computeMinAndMaxThreadsAndTeams(D, CGF, Attrs);
753
754 CGBuilderTy &Bld = CGF.Builder;
755 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, Attrs));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, EST.Loc);
758}
759
760void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766 // This is temporary until we remove the fixed sized buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 CanQualType RecTy = C.getCanonicalTagType(TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
775 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(Field);
780 }
781 StaticRD->completeDefinition();
782 CanQualType StaticTy = C.getCanonicalTagType(StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
792 C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794}
795
796void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844}
845
846void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863}
864
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 Config.setDefaultTargetAS(
873 CGM.getContext().getTargetInfo().getTargetAddressSpace(LangAS::Default));
874 Config.setRuntimeCC(CGM.getRuntimeCC());
875
876 OMPBuilder.setConfig(Config);
877
878 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
879 llvm_unreachable("OpenMP can only handle device code.");
880
881 if (CGM.getLangOpts().OpenMPCUDAMode)
882 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
883
884 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
885 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
886 return;
887
888 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
889 "__omp_rtl_debug_kind");
890 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
891 "__omp_rtl_assume_teams_oversubscription");
892 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
893 "__omp_rtl_assume_threads_oversubscription");
894 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
895 "__omp_rtl_assume_no_thread_state");
896 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
897 "__omp_rtl_assume_no_nested_parallelism");
898}
899
901 ProcBindKind ProcBind,
902 SourceLocation Loc) {
903 // Nothing to do.
904}
905
907 const Expr *Message,
908 SourceLocation Loc) {
909 CGM.getDiags().Report(Loc, diag::warn_omp_gpu_unsupported_clause)
910 << getOpenMPClauseName(OMPC_message);
911 return nullptr;
912}
913
914llvm::Value *
916 SourceLocation Loc) {
917 CGM.getDiags().Report(Loc, diag::warn_omp_gpu_unsupported_clause)
918 << getOpenMPClauseName(OMPC_severity);
919 return nullptr;
920}
921
923 CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc,
925 SourceLocation SeverityLoc, const Expr *Message,
926 SourceLocation MessageLoc) {
927 if (Modifier == OMPC_NUMTHREADS_strict) {
928 CGM.getDiags().Report(Loc,
929 diag::warn_omp_gpu_unsupported_modifier_for_clause)
930 << "strict" << getOpenMPClauseName(OMPC_num_threads);
931 return;
932 }
933
934 // Nothing to do.
935}
936
938 const Expr *NumTeams,
939 const Expr *ThreadLimit,
940 SourceLocation Loc) {}
941
944 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
945 const RegionCodeGenTy &CodeGen) {
946 // Emit target region as a standalone region.
947 bool PrevIsInTTDRegion = IsInTTDRegion;
948 IsInTTDRegion = false;
949 auto *OutlinedFun =
951 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
952 IsInTTDRegion = PrevIsInTTDRegion;
953 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
954 llvm::Function *WrapperFun =
955 createParallelDataSharingWrapper(OutlinedFun, D);
956 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
957 }
958
959 return OutlinedFun;
960}
961
962/// Get list of lastprivate variables from the teams distribute ... or
963/// teams {distribute ...} directives.
964static void
967 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
968 "expected teams directive.");
969 const OMPExecutableDirective *Dir = &D;
970 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
972 Ctx,
973 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
974 /*IgnoreCaptured=*/true))) {
975 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
976 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
977 Dir = nullptr;
978 }
979 }
980 if (!Dir)
981 return;
982 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
983 for (const Expr *E : C->getVarRefs())
984 Vars.push_back(getPrivateItem(E));
985 }
986}
987
988/// Get list of reduction variables from the teams ... directives.
989static void
992 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
993 "expected teams directive.");
994 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
995 for (const Expr *E : C->privates())
996 Vars.push_back(getPrivateItem(E));
997 }
998}
999
1002 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
1003 const RegionCodeGenTy &CodeGen) {
1004 SourceLocation Loc = D.getBeginLoc();
1005
1006 const RecordDecl *GlobalizedRD = nullptr;
1007 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1009 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1010 // Globalize team reductions variable unconditionally in all modes.
1011 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1012 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1013 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1014 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1015 if (!LastPrivatesReductions.empty()) {
1016 GlobalizedRD = ::buildRecordForGlobalizedVars(
1017 CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields,
1018 WarpSize);
1019 }
1020 } else if (!LastPrivatesReductions.empty()) {
1021 assert(!TeamAndReductions.first &&
1022 "Previous team declaration is not expected.");
1023 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1024 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1025 }
1026
1027 // Emit target region as a standalone region.
1028 class NVPTXPrePostActionTy : public PrePostActionTy {
1029 SourceLocation &Loc;
1030 const RecordDecl *GlobalizedRD;
1031 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1032 &MappedDeclsFields;
1033
1034 public:
1035 NVPTXPrePostActionTy(
1036 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1037 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1038 &MappedDeclsFields)
1039 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1040 MappedDeclsFields(MappedDeclsFields) {}
1041 void Enter(CodeGenFunction &CGF) override {
1042 auto &Rt =
1043 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1044 if (GlobalizedRD) {
1045 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1046 I->getSecond().MappedParams =
1047 std::make_unique<CodeGenFunction::OMPMapVars>();
1048 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1049 for (const auto &Pair : MappedDeclsFields) {
1050 assert(Pair.getFirst()->isCanonicalDecl() &&
1051 "Expected canonical declaration");
1052 Data.try_emplace(Pair.getFirst());
1053 }
1054 }
1055 Rt.emitGenericVarsProlog(CGF, Loc);
1056 }
1057 void Exit(CodeGenFunction &CGF) override {
1058 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1059 .emitGenericVarsEpilog(CGF);
1060 }
1061 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1062 CodeGen.setAction(Action);
1063 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1064 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1065
1066 return OutlinedFun;
1067}
1068
// Prolog for "generic" data-sharing mode: for every local variable of the
// current function that was recorded as escaping into a parallel region
// (FunctionGlobalizedDecls), allocate device-shared storage with
// __kmpc_alloc_shared and redirect the variable's address to it. Escaped
// *parameters* additionally have their incoming value copied into the new
// storage. The matching deallocation is emitGenericVarsEpilog.
// NOTE(review): this is a scraped rendering; doxygen source lines 1102 and
// 1128 (hyperlink-only lines) are absent below and are flagged in place.
1069void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1070 SourceLocation Loc) {
// Globalization is only required in generic data-sharing mode; SPMD/custom
// modes keep variables local.
1071 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1072 return;
1073
1074 CGBuilderTy &Bld = CGF.Builder;
1075
// Nothing was recorded as escaping for this function: nothing to globalize.
1076 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1077 if (I == FunctionGlobalizedDecls.end())
1078 return;
1079
// Fixed-size escaped variables.
1080 for (auto &Rec : I->getSecond().LocalVarData) {
1081 const auto *VD = cast<VarDecl>(Rec.first);
1082 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1083 QualType VarTy = VD->getType();
1084
1085 // Get the local allocation of a firstprivate variable before sharing
1086 llvm::Value *ParValue;
1087 if (EscapedParam) {
1088 LValue ParLVal =
1089 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1090 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1091 }
1092
1093 // Allocate space for the variable to be globalized
1094 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1095 llvm::CallBase *VoidPtr =
1096 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1097 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1098 AllocArgs, VD->getName());
1099 // FIXME: We should use the variables actual alignment as an argument.
1100 VoidPtr->addRetAttr(llvm::Attribute::get(
1101 CGM.getLLVMContext(), llvm::Attribute::Alignment,
// NOTE(review): source line 1102 — the alignment quantity passed to the
// attribute — is not visible in this rendering; verify against upstream.
1103
1104 // Cast the void pointer and get the address of the globalized variable.
1105 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1106 VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack")
1107 LValue VarAddr =
1108 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
// Record both the usable address and the raw runtime allocation; the epilog
// frees Rec.second.GlobalizedVal.
1109 Rec.second.PrivateAddr = VarAddr.getAddress();
1110 Rec.second.GlobalizedVal = VoidPtr;
1111
1112 // Assign the local allocation to the newly globalized location.
1113 if (EscapedParam) {
1114 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1115 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1116 }
// Attach a debug location to the allocation call so the shared-memory
// allocation is attributable to the variable's declaration.
1117 if (auto *DI = CGF.getDebugInfo())
1118 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1119 }
1120
// Variable-length (VLA) escaped variables: size is computed at runtime by
// getKmpcAllocShared; the (address, size) pair is kept for the epilog's free.
1121 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1122 const auto *VD = cast<VarDecl>(ValueD);
1123 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1124 getKmpcAllocShared(CGF, VD);
1125 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1126 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1127 CGM.getContext().getDeclAlign(VD),
// NOTE(review): source line 1128 — the trailing AlignmentSource/BaseInfo
// argument of MakeAddrLValue — is not visible in this rendering.
1129 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1130 }
// Make all the remapped addresses take effect for subsequent codegen.
1131 I->getSecond().MappedParams->apply(CGF);
1132}
1133
1135 const VarDecl *VD) const {
1136 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1137 if (I == FunctionGlobalizedDecls.end())
1138 return false;
1139
1140 // Check variable declaration is delayed:
1141 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1142}
1143
1144std::pair<llvm::Value *, llvm::Value *>
1146 const VarDecl *VD) {
1147 CGBuilderTy &Bld = CGF.Builder;
1148
1149 // Compute size and alignment.
1150 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1151 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1152 Size = Bld.CreateNUWAdd(
1153 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1154 llvm::Value *AlignVal =
1155 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1156 Size = Bld.CreateUDiv(Size, AlignVal);
1157 Size = Bld.CreateNUWMul(Size, AlignVal);
1158
1159 // Allocate space for this VLA object to be globalized.
1160 llvm::Value *AllocArgs[] = {Size};
1161 llvm::CallBase *VoidPtr =
1162 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1163 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1164 AllocArgs, VD->getName());
1165 VoidPtr->addRetAttr(llvm::Attribute::get(
1166 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1167
1168 return std::make_pair(VoidPtr, Size);
1169}
1170
1172 CodeGenFunction &CGF,
1173 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1174 // Deallocate the memory for each globalized VLA object
1175 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1176 CGM.getModule(), OMPRTL___kmpc_free_shared),
1177 {AddrSizePair.first, AddrSizePair.second});
1178}
1179
1180void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1181 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1182 return;
1183
1184 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1185 if (I != FunctionGlobalizedDecls.end()) {
1186 // Deallocate the memory for each globalized VLA object that was
1187 // globalized in the prolog (i.e. emitGenericVarsProlog).
1188 for (const auto &AddrSizePair :
1189 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1190 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1191 CGM.getModule(), OMPRTL___kmpc_free_shared),
1192 {AddrSizePair.first, AddrSizePair.second});
1193 }
1194 // Deallocate the memory for each globalized value
1195 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1196 const auto *VD = cast<VarDecl>(Rec.first);
1197 I->getSecond().MappedParams->restore(CGF);
1198
1199 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1200 CGF.getTypeSize(VD->getType())};
1201 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1202 CGM.getModule(), OMPRTL___kmpc_free_shared),
1203 FreeArgs);
1204 }
1205 }
1206}
1207
1209 const OMPExecutableDirective &D,
1210 SourceLocation Loc,
1211 llvm::Function *OutlinedFn,
1212 ArrayRef<llvm::Value *> CapturedVars) {
1213 if (!CGF.HaveInsertPoint())
1214 return;
1215
1216 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1217
1219 /*Name=*/".zero.addr");
1220 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1222 // We don't emit any thread id function call in bare kernel, but because the
1223 // outlined function has a pointer argument, we emit a nullptr here.
1224 if (IsBareKernel)
1225 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1226 else
1227 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1228 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1229 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1230 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1231}
1232
1234 CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
1235 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond,
1236 llvm::Value *NumThreads, OpenMPNumThreadsClauseModifier NumThreadsModifier,
1237 OpenMPSeverityClauseKind Severity, const Expr *Message) {
1238 if (!CGF.HaveInsertPoint())
1239 return;
1240
1241 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1242 NumThreads](CodeGenFunction &CGF,
1243 PrePostActionTy &Action) {
1244 CGBuilderTy &Bld = CGF.Builder;
1245 llvm::Value *NumThreadsVal = NumThreads;
1246 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1247 llvm::PointerType *FnPtrTy = llvm::PointerType::get(
1248 CGF.getLLVMContext(), CGM.getDataLayout().getProgramAddressSpace());
1249
1250 llvm::Value *ID = llvm::ConstantPointerNull::get(FnPtrTy);
1251 if (WFn)
1252 ID = Bld.CreateBitOrPointerCast(WFn, FnPtrTy);
1253
1254 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, FnPtrTy);
1255
1256 // Create a private scope that will globalize the arguments
1257 // passed from the outside of the target region.
1258 // TODO: Is that needed?
1259 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1260
1261 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1262 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1263 "captured_vars_addrs");
1264 // There's something to share.
1265 if (!CapturedVars.empty()) {
1266 // Prepare for parallel region. Indicate the outlined function.
1267 ASTContext &Ctx = CGF.getContext();
1268 unsigned Idx = 0;
1269 for (llvm::Value *V : CapturedVars) {
1270 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1271 llvm::Value *PtrV;
1272 if (V->getType()->isIntegerTy())
1273 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1274 else
1276 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1277 Ctx.getPointerType(Ctx.VoidPtrTy));
1278 ++Idx;
1279 }
1280 }
1281
1282 llvm::Value *IfCondVal = nullptr;
1283 if (IfCond)
1284 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1285 /* isSigned */ false);
1286 else
1287 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1288
1289 if (!NumThreadsVal)
1290 NumThreadsVal = llvm::ConstantInt::getAllOnesValue(CGF.Int32Ty);
1291 else
1292 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1293
1294 // No strict prescriptiveness for the number of threads.
1295 llvm::Value *StrictNumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, 0);
1296
1297 assert(IfCondVal && "Expected a value");
1298 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1299 llvm::Value *Args[] = {
1300 RTLoc,
1301 getThreadID(CGF, Loc),
1302 IfCondVal,
1303 NumThreadsVal,
1304 llvm::ConstantInt::getAllOnesValue(CGF.Int32Ty),
1305 FnPtr,
1306 ID,
1307 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1308 CGF.VoidPtrPtrTy),
1309 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size()),
1310 StrictNumThreadsVal};
1311
1312 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1313 CGM.getModule(), OMPRTL___kmpc_parallel_60),
1314 Args);
1315 };
1316
1317 RegionCodeGenTy RCG(ParallelGen);
1318 RCG(CGF);
1319}
1320
// Emit a simple CTA-wide (team-wide) synchronization via
// __kmpc_barrier_simple_spmd. Both arguments are don't-care values for this
// runtime entry point, so a null location and thread id 0 are passed.
// NOTE(review): scraped rendering — doxygen source line 1329 (the pointer
// type for the null ident argument) is absent below.
1321void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1322 // Always emit simple barriers!
1323 if (!CGF.HaveInsertPoint())
1324 return;
1325 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1326 // This function does not use parameters, so we can emit just default values.
1327 llvm::Value *Args[] = {
1328 llvm::ConstantPointerNull::get(
// NOTE(review): line 1329 (pointer type operand) not visible here.
1330 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1331 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1332 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1333 Args);
1334}
1335
1337 SourceLocation Loc,
1338 OpenMPDirectiveKind Kind, bool,
1339 bool) {
1340 // Always emit simple barriers!
1341 if (!CGF.HaveInsertPoint())
1342 return;
1343 // Build call __kmpc_cancel_barrier(loc, thread_id);
1344 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1345 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1346 getThreadID(CGF, Loc)};
1347
1348 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1349 CGM.getModule(), OMPRTL___kmpc_barrier),
1350 Args);
1351}
1352
1354 CodeGenFunction &CGF, StringRef CriticalName,
1355 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1356 const Expr *Hint) {
1357 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1358 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1359 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1360 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1361 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1362
1363 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1364
1365 // Get the mask of active threads in the warp.
1366 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1367 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1368 // Fetch team-local id of the thread.
1369 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1370
1371 // Get the width of the team.
1372 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1373
1374 // Initialize the counter variable for the loop.
1375 QualType Int32Ty =
1376 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1377 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1378 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1379 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1380 /*isInit=*/true);
1381
1382 // Block checks if loop counter exceeds upper bound.
1383 CGF.EmitBlock(LoopBB);
1384 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1385 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1386 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1387
1388 // Block tests which single thread should execute region, and which threads
1389 // should go straight to synchronisation point.
1390 CGF.EmitBlock(TestBB);
1391 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1392 llvm::Value *CmpThreadToCounter =
1393 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1394 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1395
1396 // Block emits the body of the critical region.
1397 CGF.EmitBlock(BodyBB);
1398
1399 // Output the critical statement.
1400 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1401 Hint);
1402
1403 // After the body surrounded by the critical region, the single executing
1404 // thread will jump to the synchronisation point.
1405 // Block waits for all threads in current team to finish then increments the
1406 // counter variable and returns to the loop.
1407 CGF.EmitBlock(SyncBB);
1408 // Reconverge active threads in the warp.
1409 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1410 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1411 Mask);
1412
1413 llvm::Value *IncCounterVal =
1414 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1415 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1416 CGF.EmitBranch(LoopBB);
1417
1418 // Block that is reached when all threads in the team complete the region.
1419 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1420}
1421
1422/// Cast value to the specified type.
///
/// Strategy, in order of preference:
///   1. identical types        -> return \p Val unchanged;
///   2. same size in chars     -> plain bitcast;
///   3. both integer types     -> integer cast (widening/narrowing);
///   4. otherwise              -> spill to a temporary typed as \p CastTy and
///      reload, i.e. reinterpret the bytes through memory.
/// NOTE(review): scraped rendering — doxygen source lines 1438 (signedness
/// operand of CreateIntCast), 1442 and 1445 (LValueBaseInfo operands of the
/// store/load) are absent below.
1423static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1424 QualType ValTy, QualType CastTy,
1425 SourceLocation Loc) {
// Both types must be complete/sized or the size comparisons below are
// meaningless.
1426 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1427 "Cast type must sized.");
1428 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1429 "Val type must sized.");
1430 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1431 if (ValTy == CastTy)
1432 return Val;
// Equal storage size: reinterpret in-register without going through memory.
1433 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1434 CGF.getContext().getTypeSizeInChars(CastTy))
1435 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1436 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1437 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
// NOTE(review): line 1438 (signedness argument) not visible here.
// Fallback: store as ValTy into a CastTy-sized temporary, reload as CastTy.
1439 Address CastItem = CGF.CreateMemTemp(CastTy);
1440 Address ValCastItem = CastItem.withElementType(Val->getType());
1441 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1443 TBAAAccessInfo());
1444 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1446 TBAAAccessInfo());
1447}
1448
1449///
1450/// Design of OpenMP reductions on the GPU
1451///
1452/// Consider a typical OpenMP program with one or more reduction
1453/// clauses:
1454///
1455/// float foo;
1456/// double bar;
1457/// #pragma omp target teams distribute parallel for \
1458/// reduction(+:foo) reduction(*:bar)
1459/// for (int i = 0; i < N; i++) {
1460/// foo += A[i]; bar *= B[i];
1461/// }
1462///
1463/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1464/// all teams. In our OpenMP implementation on the NVPTX device an
1465/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1466/// within a team are mapped to CUDA threads within a threadblock.
1467/// Our goal is to efficiently aggregate values across all OpenMP
1468/// threads such that:
1469///
1470/// - the compiler and runtime are logically concise, and
1471/// - the reduction is performed efficiently in a hierarchical
1472/// manner as follows: within OpenMP threads in the same warp,
1473/// across warps in a threadblock, and finally across teams on
1474/// the NVPTX device.
1475///
1476/// Introduction to Decoupling
1477///
1478/// We would like to decouple the compiler and the runtime so that the
1479/// latter is ignorant of the reduction variables (number, data types)
1480/// and the reduction operators. This allows a simpler interface
1481/// and implementation while still attaining good performance.
1482///
1483/// Pseudocode for the aforementioned OpenMP program generated by the
1484/// compiler is as follows:
1485///
1486/// 1. Create private copies of reduction variables on each OpenMP
1487/// thread: 'foo_private', 'bar_private'
1488/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1489/// to it and writes the result in 'foo_private' and 'bar_private'
1490/// respectively.
1491/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1492/// and store the result on the team master:
1493///
1494/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1495/// reduceData, shuffleReduceFn, interWarpCpyFn)
1496///
1497/// where:
1498/// struct ReduceData {
1499/// double *foo;
1500/// double *bar;
1501/// } reduceData
1502/// reduceData.foo = &foo_private
1503/// reduceData.bar = &bar_private
1504///
1505/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1506/// auxiliary functions generated by the compiler that operate on
1507/// variables of type 'ReduceData'. They aid the runtime perform
1508/// algorithmic steps in a data agnostic manner.
1509///
1510/// 'shuffleReduceFn' is a pointer to a function that reduces data
1511/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1512/// same warp. It takes the following arguments as input:
1513///
1514/// a. variable of type 'ReduceData' on the calling lane,
1515/// b. its lane_id,
1516/// c. an offset relative to the current lane_id to generate a
1517/// remote_lane_id. The remote lane contains the second
1518/// variable of type 'ReduceData' that is to be reduced.
1519/// d. an algorithm version parameter determining which reduction
1520/// algorithm to use.
1521///
1522/// 'shuffleReduceFn' retrieves data from the remote lane using
1523/// efficient GPU shuffle intrinsics and reduces, using the
1524/// algorithm specified by the 4th parameter, the two operands
1525/// element-wise. The result is written to the first operand.
1526///
1527/// Different reduction algorithms are implemented in different
1528/// runtime functions, all calling 'shuffleReduceFn' to perform
1529/// the essential reduction step. Therefore, based on the 4th
1530/// parameter, this function behaves slightly differently to
1531/// cooperate with the runtime to ensure correctness under
1532/// different circumstances.
1533///
1534/// 'InterWarpCpyFn' is a pointer to a function that transfers
1535/// reduced variables across warps. It tunnels, through CUDA
1536/// shared memory, the thread-private data of type 'ReduceData'
1537/// from lane 0 of each warp to a lane in the first warp.
1538/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1539/// The last team writes the global reduced value to memory.
1540///
1541/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1542/// reduceData, shuffleReduceFn, interWarpCpyFn,
1543/// scratchpadCopyFn, loadAndReduceFn)
1544///
1545/// 'scratchpadCopyFn' is a helper that stores reduced
1546/// data from the team master to a scratchpad array in
1547/// global memory.
1548///
1549/// 'loadAndReduceFn' is a helper that loads data from
1550/// the scratchpad array and reduces it with the input
1551/// operand.
1552///
1553/// These compiler generated functions hide address
1554/// calculation and alignment information from the runtime.
1555/// 5. if ret == 1:
1556/// The team master of the last team stores the reduced
1557/// result to the globals in memory.
1558/// foo += reduceData.foo; bar *= reduceData.bar
1559///
1560///
1561/// Warp Reduction Algorithms
1562///
1563/// On the warp level, we have three algorithms implemented in the
1564/// OpenMP runtime depending on the number of active lanes:
1565///
1566/// Full Warp Reduction
1567///
1568/// The reduce algorithm within a warp where all lanes are active
1569/// is implemented in the runtime as follows:
1570///
1571/// full_warp_reduce(void *reduce_data,
1572/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1573/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1574/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1575/// }
1576///
1577/// The algorithm completes in log(2, WARPSIZE) steps.
1578///
1579/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1580/// not used therefore we save instructions by not retrieving lane_id
1581/// from the corresponding special registers. The 4th parameter, which
1582/// represents the version of the algorithm being used, is set to 0 to
1583/// signify full warp reduction.
1584///
1585/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1586///
1587/// #reduce_elem refers to an element in the local lane's data structure
1588/// #remote_elem is retrieved from a remote lane
1589/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1590/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1591///
1592/// Contiguous Partial Warp Reduction
1593///
1594/// This reduce algorithm is used within a warp where only the first
1595/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1596/// number of OpenMP threads in a parallel region is not a multiple of
1597/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1598///
1599/// void
1600/// contiguous_partial_reduce(void *reduce_data,
1601/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1602/// int size, int lane_id) {
1603/// int curr_size;
1604/// int offset;
1605/// curr_size = size;
1606/// offset = curr_size/2;
1607/// while (offset>0) {
1608/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1609/// curr_size = (curr_size+1)/2;
1610/// offset = curr_size/2;
1611/// }
1612/// }
1613///
1614/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1615///
1616/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1617/// if (lane_id < offset)
1618/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1619/// else
1620/// reduce_elem = remote_elem
1621///
1622/// This algorithm assumes that the data to be reduced are located in a
1623/// contiguous subset of lanes starting from the first. When there is
1624/// an odd number of active lanes, the data in the last lane is not
1625/// aggregated with any other lane's data but is instead copied over.
1626///
1627/// Dispersed Partial Warp Reduction
1628///
1629/// This algorithm is used within a warp when any discontiguous subset of
1630/// lanes are active. It is used to implement the reduction operation
1631/// across lanes in an OpenMP simd region or in a nested parallel region.
1632///
1633/// void
1634/// dispersed_partial_reduce(void *reduce_data,
1635/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1636/// int size, remote_id;
1637/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1638/// do {
1639/// remote_id = next_active_lane_id_right_after_me();
1640/// # the above function returns 0 if no active lane
1641/// # is present right after the current lane.
1642/// size = number_of_active_lanes_in_this_warp();
1643/// logical_lane_id /= 2;
1644/// ShuffleReduceFn(reduce_data, logical_lane_id,
1645/// remote_id-1-threadIdx.x, 2);
1646/// } while (logical_lane_id % 2 == 0 && size > 1);
1647/// }
1648///
1649/// There is no assumption made about the initial state of the reduction.
1650/// Any number of lanes (>=1) could be active at any position. The reduction
1651/// result is returned in the first active lane.
1652///
1653/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1654///
1655/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1656/// if (lane_id % 2 == 0 && offset > 0)
1657/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1658/// else
1659/// reduce_elem = remote_elem
1660///
1661///
1662/// Intra-Team Reduction
1663///
1664/// This function, as implemented in the runtime call
1665/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1666/// threads in a team. It first reduces within a warp using the
1667/// aforementioned algorithms. We then proceed to gather all such
1668/// reduced values at the first warp.
1669///
1670/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1671/// data from each of the "warp master" (zeroth lane of each warp, where
1672/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1673/// a mathematical sense) the problem of reduction across warp masters in
1674/// a block to the problem of warp reduction.
1675///
1676///
1677/// Inter-Team Reduction
1678///
1679/// Once a team has reduced its data to a single value, it is stored in
1680/// a global scratchpad array. Since each team has a distinct slot, this
1681/// can be done without locking.
1682///
1683/// The last team to write to the scratchpad array proceeds to reduce the
1684/// scratchpad array. One or more workers in the last team use the helper
1685/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1686/// the k'th worker reduces every k'th element.
1687///
1688/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1689/// reduce across workers and compute a globally reduced value.
1690///
1694 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1695 if (!CGF.HaveInsertPoint())
1696 return;
1697
1698 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1699 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
1700
1701 ASTContext &C = CGM.getContext();
1702
1703 if (Options.SimpleReduction) {
1704 assert(!TeamsReduction && !ParallelReduction &&
1705 "Invalid reduction selection in emitReduction.");
1706 (void)ParallelReduction;
1707 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1708 ReductionOps, Options);
1709 return;
1710 }
1711
1712 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1713 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1714 int Cnt = 0;
1715 for (const Expr *DRE : Privates) {
1716 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
1717 ++Cnt;
1718 }
1719 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1720 CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1);
1721
1722 if (TeamsReduction)
1723 TeamsReductions.push_back(ReductionRec);
1724
1725 // Source location for the ident struct
1726 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1727
1728 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1729 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1730 CGF.AllocaInsertPt->getIterator());
1731 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1732 CGF.Builder.GetInsertPoint());
1733 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1734 CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
1736
1738 unsigned Idx = 0;
1739 for (const Expr *Private : Privates) {
1740 llvm::Type *ElementType;
1741 llvm::Value *Variable;
1742 llvm::Value *PrivateVariable;
1743 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1744 ElementType = CGF.ConvertTypeForMem(Private->getType());
1745 const auto *RHSVar =
1746 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
1747 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
1748 const auto *LHSVar =
1749 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
1750 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
1751 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1752 switch (CGF.getEvaluationKind(Private->getType())) {
1753 case TEK_Scalar:
1754 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1755 break;
1756 case TEK_Complex:
1757 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1758 break;
1759 case TEK_Aggregate:
1760 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1761 break;
1762 }
1763 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1764 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1765 llvm::Function *NewFunc) {
1766 CGF.Builder.restoreIP(CodeGenIP);
1767 auto *CurFn = CGF.CurFn;
1768 CGF.CurFn = NewFunc;
1769
1770 *LHSPtr = CGF.GetAddrOfLocalVar(
1771 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
1772 .emitRawPointer(CGF);
1773 *RHSPtr = CGF.GetAddrOfLocalVar(
1774 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
1775 .emitRawPointer(CGF);
1776
1777 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
1778 cast<DeclRefExpr>(LHSExprs[I]),
1779 cast<DeclRefExpr>(RHSExprs[I]));
1780
1781 CGF.CurFn = CurFn;
1782
1783 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1784 CGF.Builder.GetInsertPoint());
1785 };
1786 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
1787 ElementType, Variable, PrivateVariable, EvalKind,
1788 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen,
1789 /*DataPtrPtrGen=*/nullptr));
1790 Idx++;
1791 }
1792
1793 llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
1794 cantFail(OMPBuilder.createReductionsGPU(
1795 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false,
1796 TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1797 CGF.getTarget().getGridValue(),
1798 C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
1799 CGF.Builder.restoreIP(AfterIP);
1800}
1801
1802const VarDecl *
// Translates a native outlined-function parameter for the device: a
// reference-typed parameter is rewritten as a restrict-qualified pointer
// to its pointee type; non-reference parameters pass through unchanged.
// NOTE(review): doxygen-rendered dump — original lines 1803 (function-name
// line), 1808, 1814, 1821 and 1823 were hyperlinked identifiers and are
// missing here, so this block is not compilable as shown.
1804 const VarDecl *NativeParam) const {
// Non-reference parameters need no translation.
1805 if (!NativeParam->getType()->isReferenceType())
1806 return NativeParam;
1807 QualType ArgType = NativeParam->getType();
// (The QualifierCollector QC declaration was on the missing line 1808.)
1809 const Type *NonQualTy = QC.strip(ArgType);
1810 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
// Map-captured fields get an address-space-qualified pointee; the address
// space argument was on the missing line 1814.
1811 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1812 if (Attr->getCaptureKind() == OMPC_map) {
1813 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1815 }
1816 }
// Rebuild the parameter type: pointer-to-pointee with restrict plus the
// qualifiers stripped from the original reference type.
1817 ArgType = CGM.getContext().getPointerType(PointeeTy);
1818 QC.addRestrict();
1819 ArgType = QC.apply(CGM.getContext(), ArgType);
// Implicit parameters are recreated as ImplicitParamDecl (the Create call
// continues on the missing lines 1821/1823); explicit ones as ParmVarDecl.
1820 if (isa<ImplicitParamDecl>(NativeParam))
1822 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
1824 return ParmVarDecl::Create(
1825 CGM.getContext(),
1826 const_cast<DeclContext *>(NativeParam->getDeclContext()),
1827 NativeParam->getBeginLoc(), NativeParam->getLocation(),
1828 NativeParam->getIdentifier(), ArgType,
1829 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1830}
1831
1832Address
// Returns the address of the native argument given the address of the
// target-specific parameter: loads the target pointer, casts it into the
// native pointee's address space, and stores it into a fresh temporary
// whose address is returned.
// NOTE(review): doxygen dump — missing lines 1833 (function-name line),
// 1841 (QualifierCollector declaration) and 1850 (the
// CreatePointerBitCastOrAddrSpaceCast call head).
1834 const VarDecl *NativeParam,
1835 const VarDecl *TargetParam) const {
1836 assert(NativeParam != TargetParam &&
1837 NativeParam->getType()->isReferenceType() &&
1838 "Native arg must not be the same as target arg.");
1839 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1840 QualType NativeParamType = NativeParam->getType();
1842 const Type *NonQualTy = QC.strip(NativeParamType);
1843 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
// Address space the native (reference) pointee lives in on the target.
1844 unsigned NativePointeeAddrSpace =
1845 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1846 QualType TargetTy = TargetParam->getType();
1847 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
1848 TargetTy, SourceLocation());
1849 // Cast to native address space.
1851 TargetAddr,
1852 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
// Materialize the casted pointer into a temporary of the native type.
1853 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1854 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1855 NativeParamType);
1856 return NativeParamAddr;
1857}
1858
// Emits a call to the outlined function, address-space-casting each pointer
// argument to the exact parameter type the callee expects.
// NOTE(review): doxygen dump — missing lines 1859 (function-name line) and
// 1862 (the declaration of the TargetArgs vector used below).
1860 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1861 ArrayRef<llvm::Value *> Args) const {
1863 TargetArgs.reserve(Args.size());
1864 auto *FnType = OutlinedFn.getFunctionType();
1865 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
// Once past the fixed parameters of a vararg callee, forward the remaining
// arguments unchanged and stop.
1866 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1867 TargetArgs.append(std::next(Args.begin(), I), Args.end());
1868 break;
1869 }
1870 llvm::Type *TargetType = FnType->getParamType(I);
1871 llvm::Value *NativeArg = Args[I];
// Only pointer parameters may need an address-space cast.
1872 if (!TargetType->isPointerTy()) {
1873 TargetArgs.emplace_back(NativeArg);
1874 continue;
1875 }
1876 TargetArgs.emplace_back(
1877 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1878 }
// Delegate the actual call emission to the base runtime.
1879 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1880}
1881
1882/// Emit function which wraps the outline parallel region
1883/// and controls the arguments which are passed to this function.
1884/// The wrapper ensures that the outlined function is called
1885/// with the correct arguments when data is shared.
// NOTE(review): doxygen dump — several original lines are missing from this
// rendering (1899, 1902, 1907, 1921, 1936, 1961, 1967, 1972, 1982), e.g. the
// declaration of Args (line 1936) and of TypedAddress (line 1967).
1886llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1887 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1888 ASTContext &Ctx = CGM.getContext();
1889 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1890
1891 // Create a function that takes as argument the source thread.
1892 FunctionArgList WrapperArgs;
1893 QualType Int16QTy =
1894 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1895 QualType Int32QTy =
1896 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
// Wrapper signature: (uint16 parallel level, uint32 thread id).
1897 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1898 /*Id=*/nullptr, Int16QTy,
1900 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1901 /*Id=*/nullptr, Int32QTy,
1903 WrapperArgs.emplace_back(&ParallelLevelArg);
1904 WrapperArgs.emplace_back(&WrapperArg);
1905
1906 const CGFunctionInfo &CGFI =
1908
// The wrapper gets the outlined function's name with a "_wrapper" suffix
// and internal linkage.
1909 auto *Fn = llvm::Function::Create(
1910 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1911 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1912
1913 // Ensure we do not inline the function. This is trivially true for the ones
1914 // passed to __kmpc_fork_call but the ones called in serialized regions
1915 // could be inlined. This is not perfect but it is closer to the invariant
1916 // we want, namely, every data environment starts with a new function.
1917 // TODO: We should pass the if condition to the runtime function and do the
1918 // handling there. Much cleaner code.
1919 Fn->addFnAttr(llvm::Attribute::NoInline);
1920
1922 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1923 Fn->setDoesNotRecurse();
1924
1925 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1926 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1927 D.getBeginLoc(), D.getBeginLoc());
1928
1929 const auto *RD = CS.getCapturedRecordDecl();
1930 auto CurField = RD->field_begin();
1931
// First two outlined-function arguments: thread id and a zeroed i32
// (bound thread id), per the standard outlined-parallel ABI used below.
1932 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1933 /*Name=*/".zero.addr");
1934 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1935 // Get the array of arguments.
1937
1938 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
1939 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1940
1941 CGBuilderTy &Bld = CGF.Builder;
1942 auto CI = CS.capture_begin();
1943
1944 // Use global memory for data sharing.
1945 // Handle passing of global args to workers.
1946 RawAddress GlobalArgs =
1947 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1948 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1949 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
// The runtime fills global_args with the list of shared variables.
1950 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1951 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1952 DataSharingArgs);
1953
1954 // Retrieve the shared variables from the list of references returned
1955 // by the runtime. Pass the variables to the outlined function.
1956 Address SharedArgListAddress = Address::invalid();
1957 if (CS.capture_size() > 0 ||
1958 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1959 SharedArgListAddress = CGF.EmitLoadOfPointer(
1960 GlobalArgs, CGF.getContext()
1962 .castAs<PointerType>())
1963 }
// For loop-bound-sharing directives the first two shared slots are the
// lower and upper bounds; forward them as extra arguments.
1964 unsigned Idx = 0;
1965 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1966 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1968 Src, Bld.getPtrTy(0), CGF.SizeTy);
1969 llvm::Value *LB = CGF.EmitLoadOfScalar(
1970 TypedAddress,
1971 /*Volatile=*/false,
1973 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1974 Args.emplace_back(LB);
1975 ++Idx;
1976 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1977 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0),
1978 CGF.SizeTy);
1979 llvm::Value *UB = CGF.EmitLoadOfScalar(
1980 TypedAddress,
1981 /*Volatile=*/false,
1983 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1984 Args.emplace_back(UB);
1985 ++Idx;
1986 }
// Remaining shared slots correspond to the directive's captures, in order.
1987 if (CS.capture_size() > 0) {
1988 ASTContext &CGFContext = CGF.getContext();
1989 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1990 QualType ElemTy = CurField->getType();
1991 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1992 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1993 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
1994 CGF.ConvertTypeForMem(ElemTy));
1995 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1996 /*Volatile=*/false,
1997 CGFContext.getPointerType(ElemTy),
1998 CI->getLocation());
// By-copy captures of non-pointer values are widened to uintptr for the
// outlined-function ABI.
1999 if (CI->capturesVariableByCopy() &&
2000 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
2001 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
2002 CI->getLocation());
2003 }
2004 Args.emplace_back(Arg);
2005 }
2006 }
2007
2008 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
2009 CGF.FinishFunction();
2010 return Fn;
2011}
2012
// Emits the OpenMP-specific function prolog: in generic data-sharing mode,
// scans the body for variables that escape their data environment and
// registers them for globalization; pushes a cleanup that emits the
// matching epilog.
// NOTE(review): doxygen dump — the function-name line (original line 2013)
// is missing from this rendering.
2014 const Decl *D) {
2015 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2016 return;
2017
2018 assert(D && "Expected function or captured|block decl.");
2019 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
2020 "Function is registered already.");
2021 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
2022 "Team is set but not processed.");
2023 const Stmt *Body = nullptr;
2024 bool NeedToDelayGlobalization = false;
// Pick the body depending on the declaration kind; for captured OpenMP
// regions globalization is delayed, and skipped entirely in SPMD mode.
2025 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
2026 Body = FD->getBody();
2027 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
2028 Body = BD->getBody();
2029 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
2030 Body = CD->getBody();
2031 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
2032 if (NeedToDelayGlobalization &&
2033 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
2034 return;
2035 }
2036 if (!Body)
2037 return;
// Walk the body to find escaping variables/parameters.
2038 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2039 VarChecker.Visit(Body);
2040 const RecordDecl *GlobalizedVarsRecord =
2041 VarChecker.getGlobalizedRecord(IsInTTDRegion);
// Team/reduction bookkeeping is consumed here; reset it.
2042 TeamAndReductions.first = nullptr;
2043 TeamAndReductions.second.clear();
2044 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2045 VarChecker.getEscapedVariableLengthDecls();
2046 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2047 VarChecker.getDelayedVariableLengthDecls();
2048 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2049 DelayedVariableLengthDecls.empty())
2050 return;
// Record what must be globalized for this function.
2051 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2052 I->getSecond().MappedParams =
2053 std::make_unique<CodeGenFunction::OMPMapVars>();
2054 I->getSecond().EscapedParameters.insert(
2055 VarChecker.getEscapedParameters().begin(),
2056 VarChecker.getEscapedParameters().end());
2057 I->getSecond().EscapedVariableLengthDecls.append(
2058 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
2059 I->getSecond().DelayedVariableLengthDecls.append(
2060 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2061 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2062 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2063 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2064 Data.try_emplace(VD);
2065 }
// Emit the prolog now and schedule the epilog via an EH-stack cleanup so it
// also runs on exceptional exit.
2066 if (!NeedToDelayGlobalization) {
2067 emitGenericVarsProlog(CGF, D->getBeginLoc());
2068 struct GlobalizationScope final : EHScopeStack::Cleanup {
2069 GlobalizationScope() = default;
2070
2071 void Emit(CodeGenFunction &CGF, Flags flags) override {
2072 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2073 .emitGenericVarsEpilog(CGF);
2074 }
2075 };
2076 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2077 }
2078}
2079
// Returns the OpenMP-specific address for a local variable: an internal
// global in the allocator-selected address space for omp allocate
// variables, or the globalized private address recorded for this function;
// Address::invalid() when no special handling applies.
// NOTE(review): doxygen dump — missing lines 2080 (function-name line),
// 2097/2100 (address-space assignments for the const/pteam allocators),
// 2116 (address-space-cast call head) and 2133 (attribute-iterator
// declaration).
2081 const VarDecl *VD) {
2082 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2083 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2084 auto AS = LangAS::Default;
// Choose the language address space from the requested allocator; thread
// and user-defined allocators are not handled here.
2085 switch (A->getAllocatorType()) {
2086 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2087 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2088 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2089 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2090 break;
2091 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2092 return Address::invalid();
2093 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2094 // TODO: implement support for user-defined allocators.
2095 return Address::invalid();
2096 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2098 break;
2099 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2101 break;
2102 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2103 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2104 break;
2105 }
// Back the variable with an internal global (poison-initialized) in the
// chosen address space, then cast to the variable's own address space.
2106 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2107 auto *GV = new llvm::GlobalVariable(
2108 CGM.getModule(), VarTy, /*isConstant=*/false,
2109 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2110 VD->getName(),
2111 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2112 CGM.getContext().getTargetAddressSpace(AS));
2113 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2114 GV->setAlignment(Align.getAsAlign());
2115 return Address(
2117 GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace(
2118 VD->getType().getAddressSpace()))),
2119 VarTy, Align);
2120 }
2121
2122 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2123 return Address::invalid();
2124
// Look up the globalized (escaped) address recorded by emitFunctionProlog.
2125 VD = VD->getCanonicalDecl();
2126 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2127 if (I == FunctionGlobalizedDecls.end())
2128 return Address::invalid();
2129 auto VDI = I->getSecond().LocalVarData.find(VD);
2130 if (VDI != I->getSecond().LocalVarData.end())
2131 return VDI->second.PrivateAddr;
// Also honor attribute-referenced aliases of globalized variables (the
// iterator declaration was on the missing line 2133).
2132 if (VD->hasAttrs()) {
2134 E(VD->attr_end());
2135 IT != E; ++IT) {
2136 auto VDI = I->getSecond().LocalVarData.find(
2137 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2138 ->getCanonicalDecl());
2139 if (VDI != I->getSecond().LocalVarData.end())
2140 return VDI->second.PrivateAddr;
2141 }
2142 }
2143
2144 return Address::invalid();
2145}
2146
// Cleans up per-function state: drops this function's globalized-decls
// entry.
// NOTE(review): doxygen dump — missing lines 2147 (function-name line) and
// 2149 (presumably the delegating call to the base class per the overview
// index; confirm against the upstream source).
2148 FunctionGlobalizedDecls.erase(CGF.CurFn);
2150}
2151
// Chooses the default dist_schedule: in SPMD mode, static with a chunk
// equal to the number of GPU threads (converted to the iteration-variable
// type); otherwise defers to the base implementation.
// NOTE(review): doxygen dump — missing lines 2152 (function-name line) and
// 2165 (the base-class call head completed by line 2166 below).
2153 CodeGenFunction &CGF, const OMPLoopDirective &S,
2154 OpenMPDistScheduleClauseKind &ScheduleKind,
2155 llvm::Value *&Chunk) const {
2156 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2157 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2158 ScheduleKind = OMPC_DIST_SCHEDULE_static;
// Chunk = number of threads, converted from i32 to the IV's type.
2159 Chunk = CGF.EmitScalarConversion(
2160 RT.getGPUNumThreads(CGF),
2161 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2162 S.getIterationVariable()->getType(), S.getBeginLoc());
2163 return;
2164 }
2166 CGF, S, ScheduleKind, Chunk);
2167}
2168
// Chooses the default schedule clause for the GPU: always static with a
// chunk size of 1, materialized as an i32 IntegerLiteral.
// NOTE(review): doxygen dump — the function-name line (original line 2169)
// is missing from this rendering.
2170 CodeGenFunction &CGF, const OMPLoopDirective &S,
2171 OpenMPScheduleClauseKind &ScheduleKind,
2172 const Expr *&ChunkExpr) const {
2173 ScheduleKind = OMPC_SCHEDULE_static;
2174 // Chunk size is 1 in this case.
2175 llvm::APInt ChunkSize(32, 1);
2176 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2177 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2178 SourceLocation());
2179}
2180
// For target-based directives, re-wires by-reference lambda captures: for
// every captured lambda object, stores the current 'this' pointer and the
// device-side addresses of by-reference captured variables into the
// lambda's capture fields.
// NOTE(review): doxygen dump — missing lines 2181 (function-name line),
// 2193-2195 (the chain deriving RD from VD's type), 2200 (the condition
// guarding the reference-vs-value branch) and 2226 (condition before the
// reference reload).
2182 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2183 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2184 " Expected target-based directive.");
2185 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2186 for (const CapturedStmt::Capture &C : CS->captures()) {
2187 // Capture variables captured by reference in lambdas for target-based
2188 // directives.
2189 if (!C.capturesVariable())
2190 continue;
2191 const VarDecl *VD = C.getCapturedVar();
2192 const auto *RD = VD->getType()
2196 if (!RD || !RD->isLambda())
2197 continue;
2198 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2199 LValue VDLVal;
2201 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2202 else
2203 VDLVal = CGF.MakeAddrLValue(
2204 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
// Map each lambda capture to its backing FieldDecl.
2205 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2206 FieldDecl *ThisCapture = nullptr;
2207 RD->getCaptureFields(Captures, ThisCapture);
// Patch the captured 'this', if the enclosing region captured it too.
2208 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2209 LValue ThisLVal =
2210 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2211 llvm::Value *CXXThis = CGF.LoadCXXThis();
2212 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2213 }
// Patch each by-reference capture with the device-side variable address.
2214 for (const LambdaCapture &LC : RD->captures()) {
2215 if (LC.getCaptureKind() != LCK_ByRef)
2216 continue;
2217 const ValueDecl *VD = LC.getCapturedVar();
2218 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2219 // capturing structured bindings in lambdas yet.
2220 if (!CS->capturesVariable(cast<VarDecl>(VD)))
2221 continue;
2222 auto It = Captures.find(VD);
2223 assert(It != Captures.end() && "Found lambda capture without field.");
2224 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2225 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2227 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2228 VD->getType().getCanonicalType())
2229 .getAddress();
2230 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2231 }
2232 }
2233}
2234
// Checks whether a global variable carries a predefined OMPAllocateDeclAttr
// allocator and, if so, reports the language address space the allocator
// maps to via AS. Returns false when there is no allocate attribute.
// NOTE(review): doxygen dump — missing lines 2235 (function-name line) and
// 2252/2255 (the AS assignments for the const/pteam allocator cases).
2236 LangAS &AS) {
2237 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2238 return false;
2239 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2240 switch(A->getAllocatorType()) {
2241 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2242 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2243 // Not supported, fallback to the default mem space.
2244 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2245 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2246 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2247 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2248 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2249 AS = LangAS::Default;
2250 return true;
2251 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2253 return true;
2254 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2256 return true;
// User-defined allocators never reach here for statics.
2257 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2258 llvm_unreachable("Expected predefined allocator for the variables with the "
2259 "static storage.");
2260 }
2261 return false;
2262}
2263
2264// Get current OffloadArch and ignore any unknown values
// Scans the target feature map for the enabled SM feature and returns the
// corresponding OffloadArch; UNKNOWN for non-PTX targets or when no match.
// NOTE(review): doxygen dump — missing lines 2265 (function-name line) and
// 2270-2271 (the conversion of the feature name to Arch inside the loop).
2266 if (!CGM.getTarget().hasFeature("ptx"))
2267 return OffloadArch::UNKNOWN;
2268 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2269 if (Feature.getValue()) {
2272 return Arch;
2273 }
2274 }
2275 return OffloadArch::UNKNOWN;
2276}
2277
2278/// Check to see if target architecture supports unified addressing which is
2279/// a restriction for OpenMP requires clause "unified_shared_memory".
// For a 'requires unified_shared_memory' clause, emits a diagnostic when the
// offload architecture predates unified addressing (SM_53 and older) and
// accepts newer architectures.
// NOTE(review): doxygen dump — missing lines 2280 (function-name line),
// 2283 (the getOffloadArch call), 2288, 2313-2387 (the long run of newer
// accepted architectures between SM_90 and the break) and 2394 (the
// delegating base call before the closing brace).
2281 for (const OMPClause *Clause : D->clauselists()) {
2282 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2284 switch (Arch) {
// Architectures without unified addressing: report an error.
2285 case OffloadArch::SM_20:
2286 case OffloadArch::SM_21:
2287 case OffloadArch::SM_30:
2289 case OffloadArch::SM_35:
2290 case OffloadArch::SM_37:
2291 case OffloadArch::SM_50:
2292 case OffloadArch::SM_52:
2293 case OffloadArch::SM_53: {
2294 SmallString<256> Buffer;
2295 llvm::raw_svector_ostream Out(Buffer);
2296 Out << "Target architecture " << OffloadArchToString(Arch)
2297 << " does not support unified addressing";
2298 CGM.Error(Clause->getBeginLoc(), Out.str());
2299 return;
2300 }
// Architectures that do support unified addressing fall through to break.
2301 case OffloadArch::SM_60:
2302 case OffloadArch::SM_61:
2303 case OffloadArch::SM_62:
2304 case OffloadArch::SM_70:
2305 case OffloadArch::SM_72:
2306 case OffloadArch::SM_75:
2307 case OffloadArch::SM_80:
2308 case OffloadArch::SM_86:
2309 case OffloadArch::SM_87:
2310 case OffloadArch::SM_88:
2311 case OffloadArch::SM_89:
2312 case OffloadArch::SM_90:
2388 break;
2389 case OffloadArch::LAST:
2390 llvm_unreachable("Unexpected GPU arch.");
2391 }
2392 }
2393 }
2395}
2396
// Returns the maximum number of threads in the current block by calling the
// device runtime's __kmpc_get_hardware_num_threads_in_block, declaring it
// on demand if the module does not have it yet.
// NOTE(review): doxygen dump — the function-name line (original line 2397)
// is missing from this rendering.
2398 CGBuilderTy &Bld = CGF.Builder;
2399 llvm::Module *M = &CGF.CGM.getModule();
2400 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2401 llvm::Function *F = M->getFunction(LocSize);
// Declare the runtime function lazily: external i32().
2402 if (!F) {
2403 F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false),
2404 llvm::GlobalVariable::ExternalLinkage, LocSize,
2405 &CGF.CGM.getModule());
2406 }
2407 return Bld.CreateCall(F, {}, "nvptx_num_threads");
2408}
2409
// Returns the id of the current thread in the block via the device runtime
// call __kmpc_get_hardware_thread_id_in_block.
// NOTE(review): doxygen dump — missing lines 2410 (function-name line) and
// 2411 (the declaration of the Args list passed below).
2412 return CGF.EmitRuntimeCall(
2413 OMPBuilder.getOrCreateRuntimeFunction(
2414 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2415 Args);
2416}
#define V(N, I)
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ... or teams {distribute ....
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static OffloadArch getOffloadArch(CodeGenModule &CGM)
This file defines OpenMP nodes for declarative directives.
This file defines OpenMP AST classes for clauses.
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, bool AllowAssumedSizeArray=false, StringRef DiagType="")
This file defines OpenMP AST classes for executable directives and clauses.
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition Decl.cpp:4701
This represents 'ompx_bare' clause in the 'pragma omp target teams ...' directive.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition ASTContext.h:226
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
QualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
const TargetInfo & getTargetInfo() const
Definition ASTContext.h:916
Attr - This represents one attribute.
Definition Attr.h:46
ArrayRef< Capture > captures() const
Definition Decl.h:4801
const BlockDecl * getBlockDecl() const
Definition Expr.h:6683
Expr * getCallee()
Definition Expr.h:3093
arg_range arguments()
Definition Expr.h:3198
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition Stmt.h:3942
This captures a statement into a function.
Definition Stmt.h:3929
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition Stmt.cpp:1517
capture_range captures()
Definition Stmt.h:4067
CastKind getCastKind() const
Definition Expr.h:3723
Expr * getSubExpr()
Definition Expr.h:3729
CharUnits - This is an opaque type for sizes expressed in character units.
Definition CharUnits.h:38
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition CharUnits.h:185
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition Address.h:128
static Address invalid()
Definition Address.h:176
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
Definition Address.h:253
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition Address.h:276
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition CGBuilder.h:146
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition CGBuilder.h:213
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ... produce name = getelementptr inbounds addr, i64 0, i64 index where i64 is a...
Definition CGBuilder.h:251
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition CGBuilder.h:271
CGFunctionInfo - Class to encapsulate the information about a function definition.
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32global_tid, int proc_bind) to generate...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ DS_Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args={}) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
bool isGPU() const override
Returns true if the current target is a GPU.
llvm::Value * emitSeverityClause(OpenMPSeverityClauseKind Severity, SourceLocation Loc) override
llvm::Value * emitMessageClause(CodeGenFunction &CGF, const Expr *Message, SourceLocation Loc) override
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads, OpenMPNumThreadsClauseModifier NumThreadsModifier=OMPC_NUMTHREADS_unknown, OpenMPSeverityClauseKind Severity=OMPC_SEVERITY_fatal, const Expr *Message=nullptr) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier=OMPC_NUMTHREADS_unknown, OpenMPSeverityClauseKind Severity=OMPC_SEVERITY_fatal, SourceLocation SeverityLoc=SourceLocation(), const Expr *Message=nullptr, SourceLocation MessageLoc=SourceLocation()) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32global_tid, kmp_int32 num_threads) ...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen)
Helper to emit outlined function for 'target' directive.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, CodeGenFunction &CGF, llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
Helper to determine the min/max number of threads/teams for D.
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args={}) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition CGExpr.cpp:3355
CGCapturedStmtInfo * CapturedStmtInfo
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
Definition CGExpr.cpp:3364
LValue MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T)
Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known to be unsigned.
llvm::AssertingVH< llvm::Instruction > AllocaInsertPt
AllocaInsertPoint - This is an instruction in the entry block before which we prefer to insert alloca...
llvm::DebugLoc SourceLocToDebugLoc(SourceLocation Location)
Converts Location to a DebugLoc, if debug information is enabled.
RawAddress CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition CGExpr.cpp:176
const TargetInfo & getTarget() const
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
Definition CGExpr.cpp:232
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
Definition CGExpr.cpp:5858
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
llvm::Type * ConvertTypeForMem(QualType T)
CodeGenTypes & getTypes() const
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Definition CGStmt.cpp:660
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
Definition CGExpr.cpp:189
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
llvm::LLVMContext & getLLVMContext()
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition CGStmt.cpp:640
This class organizes the cross-function state that is used while generating LLVM code.
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
llvm::Module & getModule() const
const TargetInfo & getTarget() const
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
ASTContext & getContext() const
llvm::LLVMContext & getLLVMContext()
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition CGCall.cpp:1801
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition CGCall.cpp:742
unsigned getTargetAddressSpace(QualType T) const
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition CGCall.h:375
LValue - This represents an lvalue reference.
Definition CGValue.h:183
Address getAddress() const
Definition CGValue.h:373
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
An abstract representation of an aligned address.
Definition Address.h:42
llvm::Value * getPointer() const
Definition Address.h:66
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition DeclBase.h:1449
void addDecl(Decl *D)
Add the declaration D into this context.
ValueDecl * getDecl()
Definition Expr.h:1341
decl_range decls()
Definition Stmt.h:1671
T * getAttr() const
Definition DeclBase.h:573
bool hasAttrs() const
Definition DeclBase.h:518
attr_iterator attr_end() const
Definition DeclBase.h:542
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition DeclBase.h:984
attr_iterator attr_begin() const
Definition DeclBase.h:539
SourceLocation getLocation() const
Definition DeclBase.h:439
DeclContext * getDeclContext()
Definition DeclBase.h:448
SourceLocation getBeginLoc() const LLVM_READONLY
Definition DeclBase.h:431
AttrVec & getAttrs()
Definition DeclBase.h:524
bool hasAttr() const
Definition DeclBase.h:577
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition DeclBase.h:978
SourceLocation getBeginLoc() const LLVM_READONLY
Definition Decl.h:831
This represents one expression.
Definition Expr.h:112
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition Expr.cpp:3090
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition Expr.cpp:3086
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition Expr.h:284
Represents a member of a struct/union/class.
Definition Decl.h:3160
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition Decl.cpp:4701
GlobalDecl - represents a global declaration.
Definition GlobalDecl.h:57
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition Decl.cpp:5603
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition Expr.cpp:975
Describes the capture of a variable or of this, or of a C++1y init-capture.
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition ExprCXX.cpp:1358
capture_range captures() const
Retrieve this lambda's captures.
Definition ExprCXX.cpp:1371
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition Decl.h:295
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition Decl.h:301
This is a basic class for representing a single OpenMP clause.
This represents 'pragma omp requires...' directive.
Definition DeclOpenMP.h:479
clauselist_range clauselists()
Definition DeclOpenMP.h:504
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition Decl.cpp:2958
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition TypeBase.h:3336
A (possibly-)qualified type.
Definition TypeBase.h:937
LangAS getAddressSpace() const
Return the address space of this type.
Definition TypeBase.h:8514
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition TypeBase.h:8573
QualType getCanonicalType() const
Definition TypeBase.h:8440
A qualifier set is used to build a set of qualifiers.
Definition TypeBase.h:8328
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition TypeBase.h:8335
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition Type.cpp:4734
Represents a struct/union/class.
Definition Decl.h:4327
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition Decl.cpp:5292
Scope - A scope is a transient data structure that is used while parsing the program.
Definition Scope.h:41
Encodes a location in the source.
Stmt - This represents one statement.
Definition Stmt.h:86
child_range children()
Definition Stmt.cpp:304
void startDefinition()
Starts the definition of this tag declaration.
Definition Decl.cpp:4907
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with 'operator new(size_t)' is gua...
Definition TargetInfo.h:766
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition TargetInfo.h:326
virtual const llvm::omp::GV & getGridValue() const
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
The base class of the type hierarchy.
Definition TypeBase.h:1839
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition Type.h:26
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition TypeBase.h:9035
bool isReferenceType() const
Definition TypeBase.h:8649
bool isLValueReferenceType() const
Definition TypeBase.h:8653
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g....
Definition Type.cpp:2274
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition TypeBase.h:2808
Expr * getSubExpr() const
Definition Expr.h:2288
Opcode getOpcode() const
Definition Expr.h:2283
Represent the declaration of a variable (in which case it is an lvalue) or a function (in which case it ...
Definition Decl.h:712
QualType getType() const
Definition Decl.h:723
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition Decl.cpp:5588
Represents a variable declaration or definition.
Definition Decl.h:926
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition Decl.cpp:2269
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition Decl.h:1578
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
Definition CGValue.h:155
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
Definition CGValue.h:146
The JSON file list parser is used to communicate input to InstallAPI.
CanQual< Type > CanQualType
Represents a canonical, potentially-qualified type.
Privates[]
This class represents the 'transparent' clause in the 'pragma omp task' directive.
bool isa(CodeGen::Address addr)
Definition Address.h:330
@ ICIS_NoInit
No in-class initializer.
Definition Specifiers.h:272
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition Lambda.h:37
@ Private
'private' clause, allowed on 'parallel', 'serial', 'loop', 'parallel loop', and 'serial loop' constru...
@ AS_public
Definition Specifiers.h:124
@ CR_OpenMP
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
@ SC_None
Definition Specifiers.h:250
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
OffloadArch StringToOffloadArch(llvm::StringRef S)
OpenMPSeverityClauseKind
OpenMP attributes for 'severity' clause.
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
LangAS
Defines the address space values used by the address space qualifier of QualType.
const char * OffloadArchToString(OffloadArch A)
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition OpenMPKinds.h:25
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
OpenMPNumThreadsClauseModifier
U cast(CodeGen::Address addr)
Definition Address.h:327
@ CXXThis
Parameter for C++ 'this' argument.
Definition Decl.h:1734
@ Other
Other implicit parameter.
Definition Decl.h:1746
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition OpenMPKinds.h:31
unsigned long uint64_t