22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/Support/MathExtras.h"
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm::omp;
33 llvm::FunctionCallee EnterCallee =
nullptr;
35 llvm::FunctionCallee ExitCallee =
nullptr;
38 llvm::BasicBlock *ContBlock =
nullptr;
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43 llvm::FunctionCallee ExitCallee,
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
50 llvm::Value *CallBool = CGF.
Builder.CreateIsNotNull(EnterRes);
54 CGF.
Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
// RAII guard for the current kernel execution mode (SPMD vs. generic):
// records the mode at construction and restores it in the destructor.
// NOTE(review): the member declarations (ExecMode, SavedExecMode) and the
// rest of the constructor are elided in this excerpt -- fragment only.
72 class ExecutionRuntimeModesRAII {
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
// Restore whatever mode was active when the guard was created.
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
// Machine configuration constants for the GPU target.  Only the global
// memory alignment (in bytes) used when laying out globalized variables is
// visible here; the other enumerators are elided in this excerpt.
92 enum MachineConfiguration :
unsigned {
97 GlobalMemoryAlignment = 128,
102 if (
const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
103 const Expr *
Base = ASE->getBase()->IgnoreParenImpCasts();
104 while (
const auto *TempASE = dyn_cast<ArraySubscriptExpr>(
Base))
105 Base = TempASE->getBase()->IgnoreParenImpCasts();
107 }
else if (
auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
108 const Expr *
Base = OASE->getBase()->IgnoreParenImpCasts();
109 while (
const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(
Base))
110 Base = TempOASE->getBase()->IgnoreParenImpCasts();
111 while (
const auto *TempASE = dyn_cast<ArraySubscriptExpr>(
Base))
112 Base = TempASE->getBase()->IgnoreParenImpCasts();
116 if (
const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
117 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
118 const auto *ME = cast<MemberExpr>(RefExpr);
119 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
// Builds the implicit record type "_globalized_locals_ty" that packs the
// escaped (globalized) variables into fields, and fills MappedDeclsFields
// with the decl -> field mapping.  Returns nothing useful when there is no
// escaped variable at all.
// NOTE(review): large parts of the parameter list, the field-construction
// loop body, and the return statement are elided in this excerpt.
123 static RecordDecl *buildRecordForGlobalizedVars(
126 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
127 &MappedDeclsFields,
int BufSize) {
// Fast path: nothing escaped, nothing to build.
129 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
133 GlobalizedVars.emplace_back(
135 C.getDeclAlign(D).getQuantity(),
// Team-escaped decls are recorded with their natural alignment.
138 for (
const ValueDecl *D : EscapedDeclsForTeams)
139 GlobalizedVars.emplace_back(
C.getDeclAlign(D), D);
// Stable-sort by decreasing alignment (Pair.first is the alignment),
// presumably to minimize padding in the generated record -- TODO confirm.
140 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
141 return L.first > R.first;
// Create the implicit record that will hold all globalized values.
149 RecordDecl *GlobalizedRD =
C.buildImplicitRecord(
"_globalized_locals_ty");
150 GlobalizedRD->startDefinition();
152 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
153 for (
const auto &Pair : GlobalizedVars) {
// Reference-typed variables are globalized as pointers to the
// non-reference type.
157 Type =
C.getPointerType(
Type.getNonReferenceType());
162 if (SingleEscaped.count(VD)) {
187 GlobalMemoryAlignment)));
// Attach an explicit alignment attribute to the generated field.
188 Field->addAttr(AlignedAttr::CreateImplicit(
191 C.getIntTypeForBitwidth(32, 0),
195 GlobalizedRD->addDecl(Field);
// Remember which field backs this declaration for later lookups.
196 MappedDeclsFields.try_emplace(VD, Field);
198 GlobalizedRD->completeDefinition();
203 class CheckVarsEscapingDeclContext final
206 llvm::SetVector<const ValueDecl *> EscapedDecls;
207 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
210 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
211 bool AllEscaped =
false;
212 bool IsForCombinedParallelRegion =
false;
214 void markAsEscaped(
const ValueDecl *VD) {
216 if (!isa<VarDecl>(VD) ||
217 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
224 if (
auto *CSI = CGF.CapturedStmtInfo) {
225 if (
const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
228 if (!IsForCombinedParallelRegion) {
231 const auto *
Attr = FD->getAttr<OMPCaptureKindAttr>();
234 if (((
Attr->getCaptureKind() != OMPC_map) &&
236 ((
Attr->getCaptureKind() == OMPC_map) &&
237 !FD->getType()->isAnyPointerType()))
240 if (!FD->getType()->isReferenceType()) {
242 "Parameter captured by value with variably modified type");
243 EscapedParameters.insert(VD);
244 }
else if (!IsForCombinedParallelRegion) {
249 if ((!CGF.CapturedStmtInfo ||
250 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
255 EscapedVariableLengthDecls.insert(VD);
257 EscapedDecls.insert(VD);
260 void VisitValueDecl(
const ValueDecl *VD) {
263 if (
const auto *VarD = dyn_cast<VarDecl>(VD)) {
264 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
265 const bool SavedAllEscaped = AllEscaped;
267 Visit(VarD->getInit());
268 AllEscaped = SavedAllEscaped;
274 bool IsCombinedParallelRegion) {
278 if (
C.capturesVariable() && !
C.capturesVariableByCopy()) {
280 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
281 if (IsCombinedParallelRegion) {
285 IsForCombinedParallelRegion =
false;
288 C->getClauseKind() == OMPC_reduction ||
289 C->getClauseKind() == OMPC_linear ||
290 C->getClauseKind() == OMPC_private)
293 if (
const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
294 Vars = PC->getVarRefs();
295 else if (
const auto *PC = dyn_cast<OMPLastprivateClause>(C))
296 Vars = PC->getVarRefs();
298 llvm_unreachable(
"Unexpected clause.");
299 for (
const auto *E : Vars) {
303 IsForCombinedParallelRegion =
true;
307 if (IsForCombinedParallelRegion)
312 if (isa<OMPCapturedExprDecl>(VD))
314 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
319 void buildRecordForGlobalizedVars(
bool IsInTTDRegion) {
320 assert(!GlobalizedRD &&
321 "Record for globalized variables is built already.");
323 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
325 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
327 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
328 GlobalizedRD = ::buildRecordForGlobalizedVars(
329 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
330 MappedDeclsFields, WarpSize);
336 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
338 virtual ~CheckVarsEscapingDeclContext() =
default;
339 void VisitDeclStmt(
const DeclStmt *S) {
342 for (
const Decl *D : S->decls())
343 if (
const auto *VD = dyn_cast_or_null<ValueDecl>(D))
357 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
358 VisitStmt(S->getCapturedStmt());
361 VisitOpenMPCapturedStmt(
363 CaptureRegions.back() == OMPD_parallel &&
371 if (
C.capturesVariable() && !
C.capturesVariableByCopy()) {
374 if (isa<OMPCapturedExprDecl>(VD))
383 if (
C.capturesVariable()) {
393 void VisitBlockExpr(
const BlockExpr *E) {
398 const VarDecl *VD =
C.getVariable();
405 void VisitCallExpr(
const CallExpr *E) {
411 if (Arg->isLValue()) {
412 const bool SavedAllEscaped = AllEscaped;
415 AllEscaped = SavedAllEscaped;
428 if (isa<OMPCapturedExprDecl>(VD))
437 const bool SavedAllEscaped = AllEscaped;
440 AllEscaped = SavedAllEscaped;
449 const bool SavedAllEscaped = AllEscaped;
452 AllEscaped = SavedAllEscaped;
457 void VisitExpr(
const Expr *E) {
460 bool SavedAllEscaped = AllEscaped;
466 AllEscaped = SavedAllEscaped;
468 void VisitStmt(
const Stmt *S) {
471 for (
const Stmt *Child : S->children())
478 const RecordDecl *getGlobalizedRecord(
bool IsInTTDRegion) {
480 buildRecordForGlobalizedVars(IsInTTDRegion);
486 assert(GlobalizedRD &&
487 "Record for globalized variables must be generated already.");
488 auto I = MappedDeclsFields.find(VD);
489 if (I == MappedDeclsFields.end())
491 return I->getSecond();
496 return EscapedDecls.getArrayRef();
501 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters()
const {
502 return EscapedParameters;
508 return EscapedVariableLengthDecls.getArrayRef();
518 unsigned LaneIDBits =
521 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits,
"nvptx_warp_id");
529 unsigned LaneIDBits =
531 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
533 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
// Returns the execution mode currently being generated for (the value of
// CurrentExecutionMode).  NOTE(review): the return type is on a preceding
// line that is elided from this excerpt.
538 CGOpenMPRuntimeGPU::getExecutionMode()
const {
539 return CurrentExecutionMode;
556 if (
const auto *NestedDir =
557 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
563 if (DKind == OMPD_teams) {
564 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
569 if (
const auto *NND =
570 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
571 DKind = NND->getDirectiveKind();
577 case OMPD_target_teams:
579 case OMPD_target_simd:
580 case OMPD_target_parallel:
581 case OMPD_target_parallel_for:
582 case OMPD_target_parallel_for_simd:
583 case OMPD_target_teams_distribute:
584 case OMPD_target_teams_distribute_simd:
585 case OMPD_target_teams_distribute_parallel_for:
586 case OMPD_target_teams_distribute_parallel_for_simd:
589 case OMPD_parallel_for:
590 case OMPD_parallel_master:
591 case OMPD_parallel_sections:
593 case OMPD_parallel_for_simd:
595 case OMPD_cancellation_point:
597 case OMPD_threadprivate:
615 case OMPD_target_data:
616 case OMPD_target_exit_data:
617 case OMPD_target_enter_data:
618 case OMPD_distribute:
619 case OMPD_distribute_simd:
620 case OMPD_distribute_parallel_for:
621 case OMPD_distribute_parallel_for_simd:
622 case OMPD_teams_distribute:
623 case OMPD_teams_distribute_simd:
624 case OMPD_teams_distribute_parallel_for:
625 case OMPD_teams_distribute_parallel_for_simd:
626 case OMPD_target_update:
627 case OMPD_declare_simd:
628 case OMPD_declare_variant:
629 case OMPD_begin_declare_variant:
630 case OMPD_end_declare_variant:
631 case OMPD_declare_target:
632 case OMPD_end_declare_target:
633 case OMPD_declare_reduction:
634 case OMPD_declare_mapper:
636 case OMPD_taskloop_simd:
637 case OMPD_master_taskloop:
638 case OMPD_master_taskloop_simd:
639 case OMPD_parallel_master_taskloop:
640 case OMPD_parallel_master_taskloop_simd:
644 llvm_unreachable(
"Unexpected directive.");
656 case OMPD_target_teams:
658 case OMPD_target_parallel:
659 case OMPD_target_parallel_for:
660 case OMPD_target_parallel_for_simd:
661 case OMPD_target_teams_distribute_parallel_for:
662 case OMPD_target_teams_distribute_parallel_for_simd:
663 case OMPD_target_simd:
664 case OMPD_target_teams_distribute_simd:
666 case OMPD_target_teams_distribute:
670 case OMPD_parallel_for:
671 case OMPD_parallel_master:
672 case OMPD_parallel_sections:
674 case OMPD_parallel_for_simd:
676 case OMPD_cancellation_point:
678 case OMPD_threadprivate:
696 case OMPD_target_data:
697 case OMPD_target_exit_data:
698 case OMPD_target_enter_data:
699 case OMPD_distribute:
700 case OMPD_distribute_simd:
701 case OMPD_distribute_parallel_for:
702 case OMPD_distribute_parallel_for_simd:
703 case OMPD_teams_distribute:
704 case OMPD_teams_distribute_simd:
705 case OMPD_teams_distribute_parallel_for:
706 case OMPD_teams_distribute_parallel_for_simd:
707 case OMPD_target_update:
708 case OMPD_declare_simd:
709 case OMPD_declare_variant:
710 case OMPD_begin_declare_variant:
711 case OMPD_end_declare_variant:
712 case OMPD_declare_target:
713 case OMPD_end_declare_target:
714 case OMPD_declare_reduction:
715 case OMPD_declare_mapper:
717 case OMPD_taskloop_simd:
718 case OMPD_master_taskloop:
719 case OMPD_master_taskloop_simd:
720 case OMPD_parallel_master_taskloop:
721 case OMPD_parallel_master_taskloop_simd:
728 "Unknown programming model for OpenMP directive on NVPTX target.");
732 StringRef ParentName,
733 llvm::Function *&OutlinedFn,
734 llvm::Constant *&OutlinedFnID,
737 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
738 EntryFunctionState EST;
739 WrapperFunctionsMap.clear();
743 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
746 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
751 RT.emitKernelInit(CGF, EST,
false);
753 RT.setLocThreadIdInsertPt(CGF,
true);
759 RT.emitKernelDeinit(CGF, EST,
false);
763 IsInTTDRegion =
true;
764 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
765 IsOffloadEntry, CodeGen);
766 IsInTTDRegion =
false;
770 EntryFunctionState &EST,
bool IsSPMD) {
772 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
774 emitGenericVarsProlog(CGF, EST.Loc);
778 EntryFunctionState &EST,
781 emitGenericVarsEpilog(CGF);
784 OMPBuilder.createTargetDeinit(Bld, IsSPMD);
788 StringRef ParentName,
789 llvm::Function *&OutlinedFn,
790 llvm::Constant *&OutlinedFnID,
793 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
794 EntryFunctionState EST;
799 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
803 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
804 : RT(RT), EST(EST) {}
806 RT.emitKernelInit(CGF, EST,
true);
812 RT.emitKernelDeinit(CGF, EST,
true);
814 } Action(*
this, EST);
816 IsInTTDRegion =
true;
817 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
818 IsOffloadEntry, CodeGen);
819 IsInTTDRegion =
false;
830 auto *GVMode =
new llvm::GlobalVariable(
832 llvm::GlobalValue::WeakAnyLinkage,
833 llvm::ConstantInt::get(CGM.
Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
834 : OMP_TGT_EXEC_MODE_GENERIC),
835 Twine(Name,
"_exec_mode"));
// Emits the outlined device function for a target region, dispatching to
// either the SPMD or the generic (non-SPMD) kernel emission path.
// NOTE(review): the remaining parameters, the mode-selection condition,
// and the trailing call arguments are elided in this excerpt.
840 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
842 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
// A target region must always carry a parent function name to mangle from.
847 assert(!ParentName.empty() &&
"Invalid target region parent name!");
851 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
854 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 llvm::OpenMPIRBuilderConfig Config(
CGM.
getLangOpts().OpenMPIsDevice,
true,
869 llvm_unreachable(
"OpenMP can only handle device code.");
876 "__omp_rtl_debug_kind");
878 "__omp_rtl_assume_teams_oversubscription");
880 "__omp_rtl_assume_threads_oversubscription");
882 "__omp_rtl_assume_no_thread_state");
884 "__omp_rtl_assume_no_nested_parallelism");
888 ProcBindKind ProcBind,
898 llvm::Value *NumThreads,
904 const Expr *NumTeams,
905 const Expr *ThreadLimit,
912 bool PrevIsInTTDRegion = IsInTTDRegion;
913 IsInTTDRegion =
false;
916 D, ThreadIDVar, InnermostKind, CodeGen));
917 IsInTTDRegion = PrevIsInTTDRegion;
919 llvm::Function *WrapperFun =
920 createParallelDataSharingWrapper(OutlinedFun, D);
921 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
933 "expected teams directive.");
940 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
948 for (
const Expr *E : C->getVarRefs())
958 "expected teams directive.");
960 for (
const Expr *E : C->privates())
972 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
979 if (!LastPrivatesReductions.empty()) {
980 GlobalizedRD = ::buildRecordForGlobalizedVars(
982 MappedDeclsFields, WarpSize);
984 }
else if (!LastPrivatesReductions.empty()) {
985 assert(!TeamAndReductions.first &&
986 "Previous team declaration is not expected.");
988 std::swap(TeamAndReductions.second, LastPrivatesReductions);
995 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
999 NVPTXPrePostActionTy(
1001 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1004 MappedDeclsFields(MappedDeclsFields) {}
1009 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.
CurFn).first;
1010 I->getSecond().MappedParams =
1011 std::make_unique<CodeGenFunction::OMPMapVars>();
1012 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1013 for (
const auto &Pair : MappedDeclsFields) {
1014 assert(Pair.getFirst()->isCanonicalDecl() &&
1015 "Expected canonical declaration");
1016 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1019 Rt.emitGenericVarsProlog(CGF, Loc);
1023 .emitGenericVarsEpilog(CGF);
1025 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1028 D, ThreadIDVar, InnermostKind, CodeGen);
1035 bool WithSPMDCheck) {
1042 const auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
1043 if (I == FunctionGlobalizedDecls.end())
1046 for (
auto &Rec : I->getSecond().LocalVarData) {
1047 const auto *VD = cast<VarDecl>(Rec.first);
1048 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1052 llvm::Value *ParValue;
1061 llvm::CallBase *VoidPtr =
1066 VoidPtr->addRetAttr(llvm::Attribute::get(
1073 VoidPtr, VarPtrTy, VD->
getName() +
"_on_stack");
1075 Rec.second.PrivateAddr = VarAddr.
getAddress(CGF);
1076 Rec.second.GlobalizedVal = VoidPtr;
1081 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.
getAddress(CGF));
1084 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->
getLocation()));
1086 for (
const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1091 Size = Bld.CreateNUWAdd(
1093 llvm::Value *AlignVal =
1096 Size = Bld.CreateUDiv(Size, AlignVal);
1097 Size = Bld.CreateNUWMul(Size, AlignVal);
1101 llvm::CallBase *VoidPtr =
1105 VoidPtr->addRetAttr(
1109 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1110 std::pair<llvm::Value *, llvm::Value *>(
1115 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1116 Base.getAddress(CGF));
1118 I->getSecond().MappedParams->apply(CGF);
1122 bool WithSPMDCheck) {
1127 const auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
1128 if (I != FunctionGlobalizedDecls.end()) {
1130 for (
auto AddrSizePair :
1131 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1134 {AddrSizePair.first, AddrSizePair.second});
1137 for (
auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1138 const auto *VD = cast<VarDecl>(Rec.first);
1139 I->getSecond().MappedParams->restore(CGF);
1141 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1153 llvm::Function *OutlinedFn,
1163 OutlinedFnArgs.push_back(ZeroAddr.
getPointer());
1164 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1170 llvm::Function *OutlinedFn,
1173 llvm::Value *NumThreads) {
1177 auto &&ParallelGen = [
this, Loc, OutlinedFn, CapturedVars, IfCond,
1181 llvm::Value *NumThreadsVal = NumThreads;
1182 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1186 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn,
CGM.
Int8PtrTy);
1193 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1194 llvm::ArrayType::get(
CGM.
VoidPtrTy, CapturedVars.size()),
1195 "captured_vars_addrs");
1197 if (!CapturedVars.empty()) {
1201 for (llvm::Value *
V : CapturedVars) {
1204 if (
V->getType()->isIntegerTy())
1205 PtrV = Bld.CreateIntToPtr(
V, CGF.VoidPtrTy);
1208 CGF.EmitStoreOfScalar(PtrV, Dst,
false,
1214 llvm::Value *IfCondVal =
nullptr;
1216 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1219 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1222 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1224 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1226 assert(IfCondVal &&
"Expected a value");
1228 llvm::Value *Args[] = {
1233 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1236 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.
getPointer(),
1238 llvm::ConstantInt::get(
CGM.
SizeTy, CapturedVars.size())};
1239 CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
1254 llvm::Value *Args[] = {
1255 llvm::ConstantPointerNull::get(
1257 llvm::ConstantInt::get(CGF.
Int32Ty, 0,
true)};
1294 CGM.
getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1312 llvm::Value *CmpLoopBound = CGF.
Builder.CreateICmpSLT(CounterVal, TeamWidth);
1313 CGF.
Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1319 llvm::Value *CmpThreadToCounter =
1320 CGF.
Builder.CreateICmpEQ(ThreadID, CounterVal);
1321 CGF.
Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1340 llvm::Value *IncCounterVal =
1354 "Cast type must sized.");
1356 "Val type must sized.");
1358 if (ValTy == CastTy)
1362 return CGF.
Builder.CreateBitCast(Val, LLVMCastTy);
1364 return CGF.
Builder.CreateIntCast(Val, LLVMCastTy,
1392 assert(Size.getQuantity() <= 8 &&
1393 "Unsupported bitwidth in shuffle instruction.");
1395 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1396 ? OMPRTL___kmpc_shuffle_int32
1397 : OMPRTL___kmpc_shuffle_int64;
1401 Size.getQuantity() <= 4 ? 32 : 64, 1);
1402 llvm::Value *ElemCast =
castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1403 llvm::Value *WarpSize =
1407 OMPBuilder.getOrCreateRuntimeFunction(CGM.
getModule(), ShuffleFn),
1408 {ElemCast, Offset, WarpSize});
1433 for (
int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1443 ElemPtr, IntTy->getPointerTo(), IntTy);
1444 if (Size.getQuantity() / IntSize > 1) {
1448 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1450 llvm::PHINode *PhiSrc =
1451 Bld.CreatePHI(Ptr.
getType(), 2);
1452 PhiSrc->addIncoming(Ptr.
getPointer(), CurrentBB);
1453 llvm::PHINode *PhiDest =
1454 Bld.CreatePHI(ElemPtr.
getType(), 2);
1455 PhiDest->addIncoming(ElemPtr.
getPointer(), CurrentBB);
1459 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1463 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1477 PhiSrc->addIncoming(LocalPtr.
getPointer(), ThenBB);
1478 PhiDest->addIncoming(LocalElemPtr.
getPointer(), ThenBB);
1494 Size = Size % IntSize;
1499 enum CopyAction :
unsigned {
1530 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1531 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1532 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1537 unsigned Size = Privates.size();
1538 for (
const Expr *Private : Privates) {
1543 bool ShuffleInElement =
false;
1546 bool UpdateDestListPtr =
false;
1549 bool IncrScratchpadSrc =
false;
1550 bool IncrScratchpadDest =
false;
1551 QualType PrivatePtrType =
C.getPointerType(Private->getType());
1552 llvm::Type *PrivateLlvmPtrType = CGF.
ConvertType(PrivatePtrType);
1555 case RemoteLaneToThread: {
1560 SrcElementPtrAddr, PrivateLlvmPtrType),
1567 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1568 ShuffleInElement =
true;
1569 UpdateDestListPtr =
true;
1577 SrcElementPtrAddr, PrivateLlvmPtrType),
1585 DestElementPtrAddr, PrivateLlvmPtrType),
1589 case ThreadToScratchpad: {
1594 SrcElementPtrAddr, PrivateLlvmPtrType),
1599 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
1600 llvm::Value *CurrentOffset =
1601 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1602 llvm::Value *ScratchPadElemAbsolutePtrVal =
1603 Bld.CreateNUWAdd(DestBase.
getPointer(), CurrentOffset);
1604 ScratchPadElemAbsolutePtrVal =
1605 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1606 DestElementAddr =
Address(ScratchPadElemAbsolutePtrVal, CGF.
Int8Ty,
1607 C.getTypeAlignInChars(Private->getType()));
1608 IncrScratchpadDest =
true;
1611 case ScratchpadToThread: {
1614 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
1615 llvm::Value *CurrentOffset =
1616 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1617 llvm::Value *ScratchPadElemAbsolutePtrVal =
1618 Bld.CreateNUWAdd(SrcBase.
getPointer(), CurrentOffset);
1619 ScratchPadElemAbsolutePtrVal =
1620 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1621 SrcElementAddr =
Address(ScratchPadElemAbsolutePtrVal, CGF.
Int8Ty,
1622 C.getTypeAlignInChars(Private->getType()));
1623 IncrScratchpadSrc =
true;
1629 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1630 UpdateDestListPtr =
true;
1644 if (ShuffleInElement) {
1645 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1646 RemoteLaneOffset, Private->getExprLoc());
1651 SrcElementAddr,
false, Private->getType(),
1656 Elem, DestElementAddr,
false, Private->getType(),
1663 Private->getExprLoc());
1683 if (UpdateDestListPtr) {
1686 DestElementPtrAddr,
false,
1693 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1696 llvm::Value *ScratchpadBasePtr =
1698 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
1699 ScratchpadBasePtr = Bld.CreateNUWAdd(
1701 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1704 ScratchpadBasePtr = Bld.CreateNUWSub(
1705 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.
SizeTy, 1));
1706 ScratchpadBasePtr = Bld.CreateUDiv(
1708 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
1709 ScratchpadBasePtr = Bld.CreateNUWAdd(
1710 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.
SizeTy, 1));
1711 ScratchpadBasePtr = Bld.CreateNUWMul(
1713 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
1715 if (IncrScratchpadDest)
1754 C.getIntTypeForBitwidth(32,
true),
1757 Args.push_back(&ReduceListArg);
1758 Args.push_back(&NumWarpsArg);
1764 "_omp_reduction_inter_warp_copy_func", &M);
1766 Fn->setDoesNotRecurse();
1779 StringRef TransferMediumName =
1780 "__openmp_nvptx_data_transfer_temporary_storage";
1781 llvm::GlobalVariable *TransferMedium =
1782 M.getGlobalVariable(TransferMediumName);
1784 if (!TransferMedium) {
1785 auto *Ty = llvm::ArrayType::get(CGM.
Int32Ty, WarpSize);
1787 TransferMedium =
new llvm::GlobalVariable(
1788 M, Ty,
false, llvm::GlobalVariable::WeakAnyLinkage,
1789 llvm::UndefValue::get(Ty), TransferMediumName,
1790 nullptr, llvm::GlobalVariable::NotThreadLocal,
1791 SharedAddressSpace);
1808 AddrReduceListArg,
false, C.VoidPtrTy, Loc,
1810 ElemTy->getPointerTo()),
1814 for (
const Expr *Private : Privates) {
1819 unsigned RealTySize =
1820 C.getTypeSizeInChars(Private->getType())
1821 .alignTo(C.getTypeAlignInChars(Private->getType()))
1823 for (
unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1824 unsigned NumIters = RealTySize / TySize;
1827 QualType CType = C.getIntTypeForBitwidth(
1831 llvm::Value *Cnt =
nullptr;
1833 llvm::BasicBlock *PrecondBB =
nullptr;
1834 llvm::BasicBlock *ExitBB =
nullptr;
1847 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.
IntTy, NumIters));
1848 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1860 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID,
"warp_master");
1861 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1876 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1877 TransferMedium->getValueType(), TransferMedium,
1878 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1884 CopyType->getPointerTo(
1885 MediumPtrVal->getType()->getPointerAddressSpace())),
1891 ElemPtr,
false, CType, Loc,
1898 Bld.CreateBr(MergeBB);
1901 Bld.CreateBr(MergeBB);
1919 AddrNumWarpsArg,
false, C.IntTy, Loc);
1922 llvm::Value *IsActiveThread =
1923 Bld.CreateICmpULT(ThreadID, NumWarpsVal,
"is_active_thread");
1924 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1929 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1930 TransferMedium->getValueType(), TransferMedium,
1931 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1936 CopyType->getPointerTo(
1937 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1943 TargetElemPtrPtr,
false, C.VoidPtrTy, Loc);
1947 TargetElemPtr = Bld.
CreateGEP(TargetElemPtr, Cnt);
1950 llvm::Value *SrcMediumValue =
1954 Bld.CreateBr(W0MergeBB);
1957 Bld.CreateBr(W0MergeBB);
1962 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.
IntTy, 1));
1968 RealTySize %= TySize;
2061 Args.push_back(&ReduceListArg);
2062 Args.push_back(&LaneIDArg);
2063 Args.push_back(&RemoteLaneOffsetArg);
2064 Args.push_back(&AlgoVerArg);
2068 auto *Fn = llvm::Function::Create(
2070 "_omp_reduction_shuffle_and_reduce_func", &CGM.
getModule());
2072 Fn->setDoesNotRecurse();
2085 ElemTy->getPointerTo()),
2103 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.remote_reduce_list");
2109 LocalReduceList, RemoteReduceList,
2110 {RemoteLaneOffsetArgVal,
2135 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2137 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2138 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2139 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2141 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2142 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2143 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2144 CondAlgo2 = Bld.CreateAnd(
2145 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2147 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2148 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2153 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2162 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2163 Bld.CreateBr(MergeBB);
2166 Bld.CreateBr(MergeBB);
2172 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2173 llvm::Value *CondCopy = Bld.CreateAnd(
2174 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2179 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2183 RemoteReduceList, LocalReduceList);
2184 Bld.CreateBr(CpyMergeBB);
2187 Bld.CreateBr(CpyMergeBB);
2205 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2219 Args.push_back(&BufferArg);
2220 Args.push_back(&IdxArg);
2221 Args.push_back(&ReduceListArg);
2225 auto *Fn = llvm::Function::Create(
2227 "_omp_reduction_list_to_global_copy_func", &CGM.
getModule());
2229 Fn->setDoesNotRecurse();
2242 ElemTy->getPointerTo()),
2244 QualType StaticTy = C.getRecordType(TeamReductionRec);
2245 llvm::Type *LLVMReductionsBufferTy =
2249 LLVMReductionsBufferTy->getPointerTo());
2250 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2255 for (
const Expr *Private : Privates) {
2263 ElemPtrPtr, ElemTy->getPointerTo());
2265 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2266 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2268 const FieldDecl *FD = VarFieldMap.lookup(VD);
2272 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.
getElementType(),
2280 ElemPtr,
false, Private->
getType(), Loc,
2317 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2319 llvm::Function *ReduceFn) {
2332 Args.push_back(&BufferArg);
2333 Args.push_back(&IdxArg);
2334 Args.push_back(&ReduceListArg);
2338 auto *Fn = llvm::Function::Create(
2340 "_omp_reduction_list_to_global_reduce_func", &CGM.
getModule());
2342 Fn->setDoesNotRecurse();
2349 QualType StaticTy = C.getRecordType(TeamReductionRec);
2350 llvm::Type *LLVMReductionsBufferTy =
2354 LLVMReductionsBufferTy->getPointerTo());
2359 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2360 auto IPriv = Privates.begin();
2361 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2366 for (
unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2369 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2370 const FieldDecl *FD = VarFieldMap.lookup(VD);
2374 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2378 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2382 llvm::Value *Size = CGF.
Builder.CreateIntCast(
2393 llvm::Value *GlobalReduceList =
2397 AddrReduceListArg,
false, C.VoidPtrTy, Loc);
2399 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2414 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2428 Args.push_back(&BufferArg);
2429 Args.push_back(&IdxArg);
2430 Args.push_back(&ReduceListArg);
2434 auto *Fn = llvm::Function::Create(
2436 "_omp_reduction_global_to_list_copy_func", &CGM.
getModule());
2438 Fn->setDoesNotRecurse();
2451 ElemTy->getPointerTo()),
2453 QualType StaticTy = C.getRecordType(TeamReductionRec);
2454 llvm::Type *LLVMReductionsBufferTy =
2458 LLVMReductionsBufferTy->getPointerTo());
2460 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2465 for (
const Expr *Private : Privates) {
2473 ElemPtrPtr, ElemTy->getPointerTo());
2475 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2476 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2478 const FieldDecl *FD = VarFieldMap.lookup(VD);
2482 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.
getElementType(),
2527 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2529 llvm::Function *ReduceFn) {
2542 Args.push_back(&BufferArg);
2543 Args.push_back(&IdxArg);
2544 Args.push_back(&ReduceListArg);
2548 auto *Fn = llvm::Function::Create(
2550 "_omp_reduction_global_to_list_reduce_func", &CGM.
getModule());
2552 Fn->setDoesNotRecurse();
2559 QualType StaticTy = C.getRecordType(TeamReductionRec);
2560 llvm::Type *LLVMReductionsBufferTy =
2564 LLVMReductionsBufferTy->getPointerTo());
2569 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2570 auto IPriv = Privates.begin();
2571 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2576 for (
unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2579 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2580 const FieldDecl *FD = VarFieldMap.lookup(VD);
2584 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2588 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2592 llvm::Value *Size = CGF.
Builder.CreateIntCast(
2603 llvm::Value *GlobalReduceList =
2607 AddrReduceListArg,
false, C.VoidPtrTy, Loc);
2609 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2868 if (Options.SimpleReduction) {
2869 assert(!TeamsReduction && !ParallelReduction &&
2870 "Invalid reduction selection in emitReduction.");
2872 ReductionOps, Options);
2876 assert((TeamsReduction || ParallelReduction) &&
2877 "Invalid reduction selection in emitReduction.");
2890 auto Size = RHSExprs.size();
2891 for (
const Expr *E : Privates) {
2901 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2902 auto IPriv = Privates.begin();
2904 for (
unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2910 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2914 llvm::Value *Size = CGF.
Builder.CreateIntCast(
2926 llvm::Function *ReductionFn =
2928 Privates, LHSExprs, RHSExprs, ReductionOps);
2929 llvm::Value *ReductionArrayTySize = CGF.
getTypeSize(ReductionArrayTy);
2931 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2932 llvm::Value *InterWarpCopyFn =
2935 if (ParallelReduction) {
2936 llvm::Value *Args[] = {RTLoc,
2938 CGF.
Builder.getInt32(RHSExprs.size()),
2939 ReductionArrayTySize,
2946 CGM.
getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2949 assert(TeamsReduction &&
"expected teams reduction.");
2950 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2953 for (
const Expr *DRE : Privates) {
2954 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2957 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2958 CGM.
getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2959 C.getLangOpts().OpenMPCUDAReductionBufNum);
2960 TeamsReductions.push_back(TeamReductionRec);
2961 if (!KernelTeamsReductionPtr) {
2962 KernelTeamsReductionPtr =
new llvm::GlobalVariable(
2965 "_openmp_teams_reductions_buffer_$_$ptr");
2969 false, C.getPointerType(C.VoidPtrTy), Loc);
2971 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2973 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2976 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2978 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2981 llvm::Value *Args[] = {
2985 CGF.
Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2989 GlobalToBufferCpyFn,
2990 GlobalToBufferRedFn,
2991 BufferToGlobalCpyFn,
2992 BufferToGlobalRedFn};
2996 CGM.
getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3003 llvm::Value *Cond = CGF.
Builder.CreateICmpEQ(
3004 Res, llvm::ConstantInt::get(
CGM.
Int32Ty, 1));
3005 CGF.
Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3014 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3016 auto IPriv = Privates.begin();
3017 auto ILHS = LHSExprs.begin();
3018 auto IRHS = RHSExprs.begin();
3019 for (
const Expr *E : ReductionOps) {
3021 cast<DeclRefExpr>(*IRHS));
3027 llvm::Value *EndArgs[] = {ThreadId};
3029 NVPTXActionTy Action(
3030 nullptr, std::nullopt,
3032 CGM.
getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3038 CGF.EmitBlock(ExitBB,
true);
3043 const VarDecl *NativeParam)
const {
3048 const Type *NonQualTy = QC.
strip(ArgType);
3050 if (
const auto *
Attr = FD->
getAttr<OMPCaptureKindAttr>()) {
3051 if (
Attr->getCaptureKind() == OMPC_map) {
3058 enum { NVPTX_local_addr = 5 };
3061 if (isa<ImplicitParamDecl>(NativeParam))
3076 const VarDecl *TargetParam)
const {
3077 assert(NativeParam != TargetParam &&
3079 "Native arg must not be the same as target arg.");
3083 const Type *NonQualTy = QC.
strip(NativeParamType);
3085 unsigned NativePointeeAddrSpace =
3092 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3093 cast<llvm::PointerType>(TargetAddr->getType()), 0));
3096 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3097 cast<llvm::PointerType>(TargetAddr->getType()),
3098 NativePointeeAddrSpace));
3102 return NativeParamAddr;
3109 TargetArgs.reserve(Args.size());
3110 auto *FnType = OutlinedFn.getFunctionType();
3111 for (
unsigned I = 0, E = Args.size(); I < E; ++I) {
3112 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3113 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3116 llvm::Type *TargetType = FnType->getParamType(I);
3117 llvm::Value *NativeArg = Args[I];
3118 if (!TargetType->isPointerTy()) {
3119 TargetArgs.emplace_back(NativeArg);
3123 NativeArg, llvm::PointerType::getWithSamePointeeType(
3124 cast<llvm::PointerType>(NativeArg->getType()), 0));
3125 TargetArgs.emplace_back(
3135 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3152 WrapperArgs.emplace_back(&ParallelLevelArg);
3153 WrapperArgs.emplace_back(&WrapperArg);
3158 auto *Fn = llvm::Function::Create(
3160 Twine(OutlinedParallelFn->getName(),
"_wrapper"), &
CGM.
getModule());
3168 Fn->addFnAttr(llvm::Attribute::NoInline);
3172 Fn->setDoesNotRecurse();
3178 const auto *RD = CS.getCapturedRecordDecl();
3179 auto CurField = RD->field_begin();
3191 auto CI = CS.capture_begin();
3197 llvm::Value *GlobalArgsPtr = GlobalArgs.
getPointer();
3198 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3206 if (CS.capture_size() > 0 ||
3222 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3223 Args.emplace_back(LB);
3232 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3233 Args.emplace_back(UB);
3236 if (CS.capture_size() > 0) {
3238 for (
unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3239 QualType ElemTy = CurField->getType();
3248 if (CI->capturesVariableByCopy() &&
3249 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3253 Args.emplace_back(Arg);
3267 assert(D &&
"Expected function or captured|block decl.");
3268 assert(FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 &&
3269 "Function is registered already.");
3270 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3271 "Team is set but not processed.");
3272 const Stmt *Body =
nullptr;
3273 bool NeedToDelayGlobalization =
false;
3274 if (
const auto *FD = dyn_cast<FunctionDecl>(D)) {
3275 Body = FD->getBody();
3276 }
else if (
const auto *BD = dyn_cast<BlockDecl>(D)) {
3277 Body = BD->getBody();
3278 }
else if (
const auto *CD = dyn_cast<CapturedDecl>(D)) {
3279 Body = CD->getBody();
3281 if (NeedToDelayGlobalization &&
3287 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3288 VarChecker.Visit(Body);
3290 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3291 TeamAndReductions.first =
nullptr;
3292 TeamAndReductions.second.clear();
3294 VarChecker.getEscapedVariableLengthDecls();
3295 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3297 auto I = FunctionGlobalizedDecls.try_emplace(CGF.
CurFn).first;
3298 I->getSecond().MappedParams =
3299 std::make_unique<CodeGenFunction::OMPMapVars>();
3300 I->getSecond().EscapedParameters.insert(
3301 VarChecker.getEscapedParameters().begin(),
3302 VarChecker.getEscapedParameters().end());
3303 I->getSecond().EscapedVariableLengthDecls.append(
3304 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3305 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3306 for (
const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3308 Data.insert(std::make_pair(VD, MappedVarData()));
3310 if (!NeedToDelayGlobalization) {
3311 emitGenericVarsProlog(CGF, D->
getBeginLoc(),
true);
3313 GlobalizationScope() =
default;
3317 .emitGenericVarsEpilog(CGF,
true);
3326 if (VD && VD->
hasAttr<OMPAllocateDeclAttr>()) {
3327 const auto *A = VD->
getAttr<OMPAllocateDeclAttr>();
3329 switch (A->getAllocatorType()) {
3332 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3333 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3334 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3335 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3336 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3339 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3342 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3345 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3348 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3349 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3353 auto *GV =
new llvm::GlobalVariable(
3357 nullptr, llvm::GlobalValue::NotThreadLocal,
3372 auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
3373 if (I == FunctionGlobalizedDecls.end())
3375 auto VDI = I->getSecond().LocalVarData.find(VD);
3376 if (VDI != I->getSecond().LocalVarData.end())
3377 return VDI->second.PrivateAddr;
3382 auto VDI = I->getSecond().LocalVarData.find(
3383 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3384 ->getCanonicalDecl());
3385 if (VDI != I->getSecond().LocalVarData.end())
3386 return VDI->second.PrivateAddr;
3394 FunctionGlobalizedDecls.erase(CGF.
CurFn);
3401 llvm::Value *&Chunk)
const {
3404 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3408 S.getIterationVariable()->getType(), S.getBeginLoc());
3412 CGF, S, ScheduleKind, Chunk);
3418 const Expr *&ChunkExpr)
const {
3419 ScheduleKind = OMPC_SCHEDULE_static;
3430 " Expected target-based directive.");
3435 if (!C.capturesVariable())
3437 const VarDecl *VD = C.getCapturedVar();
3438 const auto *RD = VD->
getType()
3442 if (!RD || !RD->isLambda())
3451 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3453 RD->getCaptureFields(Captures, ThisCapture);
3463 const ValueDecl *VD = LC.getCapturedVar();
3468 auto It = Captures.find(VD);
3469 assert(It != Captures.end() &&
"Found lambda capture without field.");
3483 if (!VD || !VD->
hasAttr<OMPAllocateDeclAttr>())
3485 const auto *A = VD->
getAttr<OMPAllocateDeclAttr>();
3486 switch(A->getAllocatorType()) {
3487 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3488 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3490 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3491 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3492 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3493 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3494 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3497 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3500 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3503 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3504 llvm_unreachable(
"Expected predefined allocator for the variables with the "
3515 if (Feature.getValue()) {
3529 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3542 llvm::raw_svector_ostream Out(Buffer);
3544 <<
" does not support unified addressing";
3545 CGM.
Error(Clause->getBeginLoc(), Out.str());
3602 llvm_unreachable(
"Unexpected Cuda arch.");
3611 if (!TeamsReductions.empty()) {
3613 RecordDecl *StaticRD = C.buildImplicitRecord(
3616 for (
const RecordDecl *TeamReductionRec : TeamsReductions) {
3617 QualType RecTy = C.getRecordType(TeamReductionRec);
3627 QualType StaticTy = C.getRecordType(StaticRD);
3628 llvm::Type *LLVMReductionsBufferTy =
3633 auto *GV =
new llvm::GlobalVariable(
3636 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3637 "_openmp_teams_reductions_buffer_$_");
3638 KernelTeamsReductionPtr->setInitializer(
3639 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3648 const char *LocSize =
"__kmpc_get_hardware_num_threads_in_block";
3649 llvm::Function *F = M->getFunction(LocSize);
3651 F = llvm::Function::Create(
3652 llvm::FunctionType::get(CGF.
Int32Ty, std::nullopt,
false),
3655 return Bld.CreateCall(F, std::nullopt,
"nvptx_num_threads");
3662 CGM.
getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),