21 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
23 #include "llvm/Support/MathExtras.h"
25 using namespace clang;
26 using namespace CodeGen;
27 using namespace llvm::omp;
// NOTE(review): fragment of an RAII-style pre/post action type ("NVPTXActionTy").
// The embedded source line numbers (32, 34, 37, ...) jump, so interior lines are
// missing from this extraction — do not treat this as a complete definition.
// Visible state: enter/exit runtime callees and a continuation block.
32 llvm::FunctionCallee EnterCallee =
nullptr;
34 llvm::FunctionCallee ExitCallee =
nullptr;
37 llvm::BasicBlock *ContBlock =
nullptr;
// Constructor fragment: stores the enter/exit callees and their arguments.
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 llvm::FunctionCallee ExitCallee,
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
// Fragment: branches on the (non-null) result of the enter call —
// presumably a conditional guard around the action body; confirm against
// the full source.
49 llvm::Value *CallBool = CGF.
Builder.CreateIsNotNull(EnterRes);
53 CGF.
Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
// RAII helper that saves/restores the current execution mode (and optionally a
// runtime-mode flag) for the enclosing scope. Interior lines are missing from
// this extraction (source numbers jump 71 -> 76 -> 82 ...).
71 class ExecutionRuntimeModesRAII {
// Saved copies restored by the destructor; RuntimeMode stays null when the
// single-argument constructor is used.
76 bool SavedRuntimeMode =
false;
77 bool *RuntimeMode =
nullptr;
// Constructor fragment: save the current execution mode only.
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
// Constructor fragment: additionally save the runtime-mode flag and overwrite
// it with FullRuntimeMode for the lifetime of this object.
88 bool &RuntimeMode,
bool FullRuntimeMode)
89 : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
90 SavedExecMode = ExecMode;
91 SavedRuntimeMode = RuntimeMode;
93 RuntimeMode = FullRuntimeMode;
// Destructor: restore both saved values; RuntimeMode is dereferenced only on
// the two-flag path (a guard is presumably on the missing line 97 — confirm).
95 ~ExecutionRuntimeModesRAII() {
96 ExecMode = SavedExecMode;
98 *RuntimeMode = SavedRuntimeMode;
// Target machine configuration constants. Only two enumerators are visible in
// this fragment; others (between source lines 106 and 114) appear to be missing.
106 enum MachineConfiguration :
unsigned {
// Alignment (in bytes) used for globalized variables in global memory.
111 GlobalMemoryAlignment = 128,
// Shared-memory size constant; exact use not visible in this fragment.
114 SharedMemorySize = 128,
// Fragment of a helper that strips array subscripts / array sections off a
// reference expression and returns the canonical underlying ValueDecl.
// The function signature line is not visible in this extraction.
119 if (
const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
// Peel nested array subscripts down to the base expression.
120 const Expr *
Base = ASE->getBase()->IgnoreParenImpCasts();
121 while (
const auto *TempASE = dyn_cast<ArraySubscriptExpr>(
Base))
122 Base = TempASE->getBase()->IgnoreParenImpCasts();
124 }
else if (
auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
// Peel nested OpenMP array sections, then any remaining subscripts.
125 const Expr *
Base = OASE->getBase()->IgnoreParenImpCasts();
126 while (
const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(
Base))
127 Base = TempOASE->getBase()->IgnoreParenImpCasts();
128 while (
const auto *TempASE = dyn_cast<ArraySubscriptExpr>(
Base))
129 Base = TempASE->getBase()->IgnoreParenImpCasts();
// After stripping, the expression is either a DeclRefExpr or a MemberExpr;
// return the canonical declaration in both cases.
133 if (
const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
134 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
135 const auto *ME = cast<MemberExpr>(RefExpr);
136 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
// Builds an implicit record type ("_globalized_locals_ty") holding all
// escaped variables that must be globalized, and fills MappedDeclsFields with
// the decl -> field mapping. Many interior lines are missing from this
// extraction; the visible code shows only the overall shape.
140 static RecordDecl *buildRecordForGlobalizedVars(
143 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
144 &MappedDeclsFields,
int BufSize) {
// Nothing to build when no variable escapes.
146 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
150 GlobalizedVars.emplace_back(
152 C.getDeclAlign(D).getQuantity(),
155 for (
const ValueDecl *D : EscapedDeclsForTeams)
156 GlobalizedVars.emplace_back(
C.getDeclAlign(D), D);
// Stable-sort by descending alignment so larger-aligned fields come first.
157 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
158 return L.first > R.first;
166 RecordDecl *GlobalizedRD =
C.buildImplicitRecord(
"_globalized_locals_ty");
167 GlobalizedRD->startDefinition();
169 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
170 for (
const auto &Pair : GlobalizedVars) {
// Reference-typed variables are stored as pointers in the record.
174 Type =
C.getPointerType(
Type.getNonReferenceType());
// Teams-level (single-escaped) variables get a different field layout than
// parallel-level ones; the exact alternative branch is not visible here.
179 if (SingleEscaped.count(VD)) {
204 GlobalMemoryAlignment)));
205 Field->addAttr(AlignedAttr::CreateImplicit(
208 C.getIntTypeForBitwidth(32, 0),
212 GlobalizedRD->addDecl(Field);
213 MappedDeclsFields.try_emplace(VD, Field);
215 GlobalizedRD->completeDefinition();
// Statement visitor that detects variables escaping their declaration context
// in a target region (address taken, captured by reference, etc.) so they can
// be globalized. This is a fragmentary extraction — interior lines of every
// method are missing; comments below describe only what the visible code shows.
220 class CheckVarsEscapingDeclContext final
// Escaped variables, split by whether their length is known statically.
223 llvm::SetVector<const ValueDecl *> EscapedDecls;
224 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
225 // Mapping from escaped decl to its field in the globalized record.
227 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
228 bool AllEscaped =
false;
229 bool IsForCombinedParallelRegion =
false;
// Record VD as escaped, filing it into the appropriate set. Declare-target
// declarations and non-VarDecls are ignored.
231 void markAsEscaped(
const ValueDecl *VD) {
233 if (!isa<VarDecl>(VD) ||
234 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
// Captured-statement parameters: decide escape based on the capture kind
// recorded by the frontend (OMPCaptureKindAttr).
241 if (
auto *CSI = CGF.CapturedStmtInfo) {
242 if (
const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
245 if (!IsForCombinedParallelRegion) {
248 const auto *
Attr = FD->getAttr<OMPCaptureKindAttr>();
251 if (((
Attr->getCaptureKind() != OMPC_map) &&
253 ((
Attr->getCaptureKind() == OMPC_map) &&
254 !FD->getType()->isAnyPointerType()))
257 if (!FD->getType()->isReferenceType()) {
259 "Parameter captured by value with variably modified type");
260 EscapedParameters.insert(VD);
261 }
else if (!IsForCombinedParallelRegion) {
266 if ((!CGF.CapturedStmtInfo ||
267 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
// Variably-sized escapees are tracked separately (their storage must be
// allocated dynamically).
272 EscapedVariableLengthDecls.insert(VD);
274 EscapedDecls.insert(VD);
// Visit a variable's initializer with AllEscaped temporarily saved/restored.
277 void VisitValueDecl(
const ValueDecl *VD) {
280 if (
const auto *VarD = dyn_cast<VarDecl>(VD)) {
281 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
282 const bool SavedAllEscaped = AllEscaped;
284 Visit(VarD->getInit());
285 AllEscaped = SavedAllEscaped;
// Fragment: visit captures of an OpenMP captured statement, marking
// by-reference captures as escaped. For combined parallel regions,
// firstprivate/lastprivate clause variables are inspected to decide whether
// the capture really escapes.
291 bool IsCombinedParallelRegion) {
295 if (
C.capturesVariable() && !
C.capturesVariableByCopy()) {
297 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
298 if (IsCombinedParallelRegion) {
302 IsForCombinedParallelRegion =
false;
// reduction/linear/private clauses suppress the combined-region treatment.
305 C->getClauseKind() == OMPC_reduction ||
306 C->getClauseKind() == OMPC_linear ||
307 C->getClauseKind() == OMPC_private)
310 if (
const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
311 Vars = PC->getVarRefs();
312 else if (
const auto *PC = dyn_cast<OMPLastprivateClause>(C))
313 Vars = PC->getVarRefs();
315 llvm_unreachable(
"Unexpected clause.");
316 for (
const auto *E : Vars) {
320 IsForCombinedParallelRegion =
true;
324 if (IsForCombinedParallelRegion)
329 if (isa<OMPCapturedExprDecl>(VD))
331 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
// Build the globalized-variables record for everything collected so far.
// Delegates to the file-scope ::buildRecordForGlobalizedVars, passing the
// target's warp size as the buffer size.
336 void buildRecordForGlobalizedVars(
bool IsInTTDRegion) {
337 assert(!GlobalizedRD &&
338 "Record for globalized variables is built already.");
340 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
342 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
344 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
345 GlobalizedRD = ::buildRecordForGlobalizedVars(
346 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
347 MappedDeclsFields, WarpSize);
// Constructor fragment: seeds EscapedDecls with the teams reductions.
353 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
355 virtual ~CheckVarsEscapingDeclContext() =
default;
356 void VisitDeclStmt(
const DeclStmt *S) {
359 for (
const Decl *D : S->decls())
360 if (
const auto *VD = dyn_cast_or_null<ValueDecl>(D))
// Fragment: OpenMP executable-directive visitation — an unknown single
// capture region is traversed directly; otherwise the captured statement is
// visited with combined-parallel-region detection.
374 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
375 VisitStmt(S->getCapturedStmt());
378 VisitOpenMPCapturedStmt(
380 CaptureRegions.back() == OMPD_parallel &&
// Fragment: captured-statement visitation — by-reference captures escape.
388 if (
C.capturesVariable() && !
C.capturesVariableByCopy()) {
391 if (isa<OMPCapturedExprDecl>(VD))
400 if (
C.capturesVariable()) {
// Block captures always escape the declaration context.
410 void VisitBlockExpr(
const BlockExpr *E) {
415 const VarDecl *VD =
C.getVariable();
// Call arguments passed as lvalues are treated as escaping (the callee may
// retain the address); AllEscaped is saved/restored around the sub-visit.
422 void VisitCallExpr(
const CallExpr *E) {
428 if (Arg->isLValue()) {
429 const bool SavedAllEscaped = AllEscaped;
432 AllEscaped = SavedAllEscaped;
445 if (isa<OMPCapturedExprDecl>(VD))
447 else if (
const auto *VarD = dyn_cast<VarDecl>(VD))
448 if (VarD->isInitCapture())
// Repeated save/visit/restore pattern for address-of-like expressions;
// enclosing method signatures are missing from this extraction.
455 const bool SavedAllEscaped = AllEscaped;
458 AllEscaped = SavedAllEscaped;
467 const bool SavedAllEscaped = AllEscaped;
470 AllEscaped = SavedAllEscaped;
475 void VisitExpr(
const Expr *E) {
478 bool SavedAllEscaped = AllEscaped;
484 AllEscaped = SavedAllEscaped;
// Generic statement traversal: recurse into all children.
486 void VisitStmt(
const Stmt *S) {
489 for (
const Stmt *Child : S->children())
// Lazily build and return the record for the globalized variables.
496 const RecordDecl *getGlobalizedRecord(
bool IsInTTDRegion) {
498 buildRecordForGlobalizedVars(IsInTTDRegion);
// Look up the field for a given escaped decl; the record must exist.
504 assert(GlobalizedRD &&
505 "Record for globalized variables must be generated already.");
506 auto I = MappedDeclsFields.find(VD);
507 if (I == MappedDeclsFields.end())
509 return I->getSecond();
// Accessors for the collected escape sets.
514 return EscapedDecls.getArrayRef();
519 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters()
const {
520 return EscapedParameters;
526 return EscapedVariableLengthDecls.getArrayRef();
// Fragments of three small helpers:
//  - warp ID: thread ID arithmetically shifted right by LaneIDBits;
//  - lane ID: thread ID masked with the low LaneIDBits bits;
//  - getExecutionMode: returns the cached current execution mode.
// The LaneIDBits initializers are missing from this extraction.
536 unsigned LaneIDBits =
539 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits,
"nvptx_warp_id")
547 unsigned LaneIDBits =
// Mask selecting the lane bits (assumes a 32-bit thread ID).
549 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
551 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
556 CGOpenMPRuntimeGPU::getExecutionMode()
const {
557 return CurrentExecutionMode;
// Fragments of the SPMD-mode analysis:
//  - first, a walk that inspects the directive nested inside a target/teams
//    region to decide whether SPMD execution is possible;
//  - second, a switch over the target directive kind classifying it.
// Large runs of case labels are preserved verbatim; the actions attached to
// each group of cases fall on lines missing from this extraction.
574 if (
const auto *NestedDir =
575 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
// For a nested 'teams' directive, descend one more level to find the inner
// directive kind.
581 if (DKind == OMPD_teams) {
582 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
587 if (
const auto *NND =
588 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
589 DKind = NND->getDirectiveKind();
// Directive-kind classification (group outcomes not visible here).
595 case OMPD_target_teams:
597 case OMPD_target_simd:
598 case OMPD_target_parallel:
599 case OMPD_target_parallel_for:
600 case OMPD_target_parallel_for_simd:
601 case OMPD_target_teams_distribute:
602 case OMPD_target_teams_distribute_simd:
603 case OMPD_target_teams_distribute_parallel_for:
604 case OMPD_target_teams_distribute_parallel_for_simd:
607 case OMPD_parallel_for:
608 case OMPD_parallel_master:
609 case OMPD_parallel_sections:
611 case OMPD_parallel_for_simd:
613 case OMPD_cancellation_point:
615 case OMPD_threadprivate:
633 case OMPD_target_data:
634 case OMPD_target_exit_data:
635 case OMPD_target_enter_data:
636 case OMPD_distribute:
637 case OMPD_distribute_simd:
638 case OMPD_distribute_parallel_for:
639 case OMPD_distribute_parallel_for_simd:
640 case OMPD_teams_distribute:
641 case OMPD_teams_distribute_simd:
642 case OMPD_teams_distribute_parallel_for:
643 case OMPD_teams_distribute_parallel_for_simd:
644 case OMPD_target_update:
645 case OMPD_declare_simd:
646 case OMPD_declare_variant:
647 case OMPD_begin_declare_variant:
648 case OMPD_end_declare_variant:
649 case OMPD_declare_target:
650 case OMPD_end_declare_target:
651 case OMPD_declare_reduction:
652 case OMPD_declare_mapper:
654 case OMPD_taskloop_simd:
655 case OMPD_master_taskloop:
656 case OMPD_master_taskloop_simd:
657 case OMPD_parallel_master_taskloop:
658 case OMPD_parallel_master_taskloop_simd:
662 llvm_unreachable(
"Unexpected directive.");
// Second switch: classify the top-level target directive itself.
672 switch (DirectiveKind) {
674 case OMPD_target_teams:
676 case OMPD_target_parallel:
677 case OMPD_target_parallel_for:
678 case OMPD_target_parallel_for_simd:
679 case OMPD_target_teams_distribute_parallel_for:
680 case OMPD_target_teams_distribute_parallel_for_simd:
681 case OMPD_target_simd:
682 case OMPD_target_teams_distribute_simd:
684 case OMPD_target_teams_distribute:
688 case OMPD_parallel_for:
689 case OMPD_parallel_master:
690 case OMPD_parallel_sections:
692 case OMPD_parallel_for_simd:
694 case OMPD_cancellation_point:
696 case OMPD_threadprivate:
714 case OMPD_target_data:
715 case OMPD_target_exit_data:
716 case OMPD_target_enter_data:
717 case OMPD_distribute:
718 case OMPD_distribute_simd:
719 case OMPD_distribute_parallel_for:
720 case OMPD_distribute_parallel_for_simd:
721 case OMPD_teams_distribute:
722 case OMPD_teams_distribute_simd:
723 case OMPD_teams_distribute_parallel_for:
724 case OMPD_teams_distribute_parallel_for_simd:
725 case OMPD_target_update:
726 case OMPD_declare_simd:
727 case OMPD_declare_variant:
728 case OMPD_begin_declare_variant:
729 case OMPD_end_declare_variant:
730 case OMPD_declare_target:
731 case OMPD_end_declare_target:
732 case OMPD_declare_reduction:
733 case OMPD_declare_mapper:
735 case OMPD_taskloop_simd:
736 case OMPD_master_taskloop:
737 case OMPD_master_taskloop_simd:
738 case OMPD_parallel_master_taskloop:
739 case OMPD_parallel_master_taskloop_simd:
746 "Unknown programming model for OpenMP directive on NVPTX target.");
// Fragments of the lightweight-runtime analysis:
//  - a static-schedule check on a loop directive;
//  - a nested-directive walk deciding whether the lightweight (SPMD, no full
//    runtime) mode can be used;
//  - a final switch classifying the top-level target directive.
// As elsewhere, the per-group actions of the switches are on missing lines.
754 "Expected loop-based directive.");
// Lightweight mode requires a static schedule on the nested loop.
759 return C->getScheduleKind() == OMPC_SCHEDULE_static;
772 if (
const auto *NestedDir =
773 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
781 if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
// Nested 'parallel': descend to the directive inside it.
783 if (DKind == OMPD_parallel) {
784 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
789 if (
const auto *NND =
790 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
791 DKind = NND->getDirectiveKind();
796 }
// Nested 'teams': may itself contain a 'parallel' — descend two levels.
else if (DKind == OMPD_teams) {
797 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
802 if (
const auto *NND =
803 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
804 DKind = NND->getDirectiveKind();
809 if (DKind == OMPD_parallel) {
810 Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
815 if (
const auto *NND =
816 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
817 DKind = NND->getDirectiveKind();
826 case OMPD_target_teams:
831 if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
833 if (DKind == OMPD_parallel) {
834 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
839 if (
const auto *NND =
840 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
841 DKind = NND->getDirectiveKind();
848 case OMPD_target_parallel:
849 if (DKind == OMPD_simd)
853 case OMPD_target_teams_distribute:
854 case OMPD_target_simd:
855 case OMPD_target_parallel_for:
856 case OMPD_target_parallel_for_simd:
857 case OMPD_target_teams_distribute_simd:
858 case OMPD_target_teams_distribute_parallel_for:
859 case OMPD_target_teams_distribute_parallel_for_simd:
862 case OMPD_parallel_for:
863 case OMPD_parallel_master:
864 case OMPD_parallel_sections:
866 case OMPD_parallel_for_simd:
868 case OMPD_cancellation_point:
870 case OMPD_threadprivate:
888 case OMPD_target_data:
889 case OMPD_target_exit_data:
890 case OMPD_target_enter_data:
891 case OMPD_distribute:
892 case OMPD_distribute_simd:
893 case OMPD_distribute_parallel_for:
894 case OMPD_distribute_parallel_for_simd:
895 case OMPD_teams_distribute:
896 case OMPD_teams_distribute_simd:
897 case OMPD_teams_distribute_parallel_for:
898 case OMPD_teams_distribute_parallel_for_simd:
899 case OMPD_target_update:
900 case OMPD_declare_simd:
901 case OMPD_declare_variant:
902 case OMPD_begin_declare_variant:
903 case OMPD_end_declare_variant:
904 case OMPD_declare_target:
905 case OMPD_end_declare_target:
906 case OMPD_declare_reduction:
907 case OMPD_declare_mapper:
909 case OMPD_taskloop_simd:
910 case OMPD_master_taskloop:
911 case OMPD_master_taskloop_simd:
912 case OMPD_parallel_master_taskloop:
913 case OMPD_parallel_master_taskloop_simd:
917 llvm_unreachable(
"Unexpected directive.");
// Final classification of the top-level directive kind.
931 switch (DirectiveKind) {
933 case OMPD_target_teams:
934 case OMPD_target_parallel:
936 case OMPD_target_parallel_for:
937 case OMPD_target_parallel_for_simd:
938 case OMPD_target_teams_distribute_parallel_for:
939 case OMPD_target_teams_distribute_parallel_for_simd:
942 case OMPD_target_simd:
943 case OMPD_target_teams_distribute_simd:
945 case OMPD_target_teams_distribute:
949 case OMPD_parallel_for:
950 case OMPD_parallel_master:
951 case OMPD_parallel_sections:
953 case OMPD_parallel_for_simd:
955 case OMPD_cancellation_point:
957 case OMPD_threadprivate:
975 case OMPD_target_data:
976 case OMPD_target_exit_data:
977 case OMPD_target_enter_data:
978 case OMPD_distribute:
979 case OMPD_distribute_simd:
980 case OMPD_distribute_parallel_for:
981 case OMPD_distribute_parallel_for_simd:
982 case OMPD_teams_distribute:
983 case OMPD_teams_distribute_simd:
984 case OMPD_teams_distribute_parallel_for:
985 case OMPD_teams_distribute_parallel_for_simd:
986 case OMPD_target_update:
987 case OMPD_declare_simd:
988 case OMPD_declare_variant:
989 case OMPD_begin_declare_variant:
990 case OMPD_end_declare_variant:
991 case OMPD_declare_target:
992 case OMPD_end_declare_target:
993 case OMPD_declare_reduction:
994 case OMPD_declare_mapper:
996 case OMPD_taskloop_simd:
997 case OMPD_master_taskloop:
998 case OMPD_master_taskloop_simd:
999 case OMPD_parallel_master_taskloop:
1000 case OMPD_parallel_master_taskloop_simd:
1007 "Unknown programming model for OpenMP directive on NVPTX target.");
// Kernel-emission fragments:
//  - a non-SPMD ("generic") kernel emitter with a pre/post action that calls
//    emitKernelInit/emitKernelDeinit with IsSPMD=false;
//  - emitKernelInit/Deinit, which delegate to OMPBuilder createTargetInit /
//    createTargetDeinit and manage the generic-vars prolog/epilog;
//  - an SPMD kernel emitter with the IsSPMD=true counterpart action;
//  - the exec-mode marker global, createOffloadEntry (nvvm.annotations
//    "kernel" metadata + "kernel" fn attribute), and the top-level
//    emitTargetOutlinedFunction dispatching between SPMD and non-SPMD.
// Interior lines are missing throughout this extraction.
1011 StringRef ParentName,
1012 llvm::Function *&OutlinedFn,
1013 llvm::Constant *&OutlinedFnID,
1014 bool IsOffloadEntry,
// Save/restore the current execution mode for the duration of emission.
1016 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1017 EntryFunctionState EST;
1018 WrapperFunctionsMap.clear();
1022 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1025 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
// Generic (non-SPMD) kernel: init/deinit with IsSPMD=false.
1030 RT.emitKernelInit(CGF, EST,
false);
1032 RT.setLocThreadIdInsertPt(CGF,
true);
1038 RT.emitKernelDeinit(CGF, EST,
false);
// Mark that we are inside a target-teams-distribute region while emitting.
1042 IsInTTDRegion =
true;
1043 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1044 IsOffloadEntry, CodeGen);
1045 IsInTTDRegion =
false;
// emitKernelInit: delegate target initialization to the OpenMPIRBuilder.
1049 EntryFunctionState &EST,
bool IsSPMD) {
1051 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1052 IsInTargetMasterThreadRegion = IsSPMD;
1054 emitGenericVarsProlog(CGF, EST.Loc);
// emitKernelDeinit: epilog for globalized vars, then target deinit.
1058 EntryFunctionState &EST,
1061 emitGenericVarsEpilog(CGF);
1064 OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
// SPMD kernel emitter.
1068 StringRef ParentName,
1069 llvm::Function *&OutlinedFn,
1070 llvm::Constant *&OutlinedFnID,
1071 bool IsOffloadEntry,
// Also force full runtime when requested via -fopenmp-cuda-force-full-runtime.
1073 ExecutionRuntimeModesRAII ModeRAII(
1074 CurrentExecutionMode, RequiresFullRuntime,
1075 CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1077 EntryFunctionState EST;
1082 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1086 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1087 : RT(RT), EST(EST) {}
1089 RT.emitKernelInit(CGF, EST,
true);
1095 RT.emitKernelDeinit(CGF, EST,
true);
1097 } Action(*
this, EST);
1099 IsInTTDRegion =
true;
1100 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1101 IsOffloadEntry, CodeGen);
1102 IsInTTDRegion =
false;
// Emit a weak global "<kernel>_exec_mode" recording SPMD vs generic mode for
// the device runtime/plugin.
1113 auto *GVMode =
new llvm::GlobalVariable(
1115 llvm::GlobalValue::WeakAnyLinkage,
1116 llvm::ConstantInt::get(CGM.
Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1117 : OMP_TGT_EXEC_MODE_GENERIC),
1118 Twine(Name,
"_exec_mode"));
// createOffloadEntry: annotate the outlined function as a GPU kernel.
1122 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *
ID,
1123 llvm::Constant *Addr,
1124 uint64_t Size, int32_t,
1125 llvm::GlobalValue::LinkageTypes) {
1128 llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
1132 llvm::Module &M = CGM.getModule();
1133 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
// NVVM convention: !nvvm.annotations {fn, "kernel", i32 1} marks a kernel.
1136 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata(
"nvvm.annotations");
1138 llvm::Metadata *MDVals[] = {
1139 llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx,
"kernel"),
1140 llvm::ConstantAsMetadata::get(
1141 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1143 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
// Also attach a string "kernel" function attribute.
1146 Fn->addFnAttr(llvm::Attribute::get(Ctx,
"kernel"));
// Top-level dispatch: SPMD vs non-SPMD kernel emission.
1149 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1151 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1153 if (!IsOffloadEntry)
1156 assert(!ParentName.empty() &&
"Invalid target region parent name!");
1160 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1163 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
// Ident_t reserved-flag encoding for the device runtime, plus fragments of the
// CGOpenMPRuntimeGPU constructor (runtime-config globals) and clause-emitter
// signatures. Interior lines are missing from this extraction.
1172 enum ModeFlagsTy :
unsigned {
// Bit 0: SPMD mode; bit 1: lightweight ("simple") runtime.
1174 KMP_IDENT_SPMD_MODE = 0x01,
1176 KMP_IDENT_SIMPLE_RT_MODE = 0x02,
// "Undefined" = not-SPMD but simple-runtime — a combination the encoding
// below never produces for a known mode.
1181 static const ModeFlagsTy UndefinedMode =
1182 (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1186 switch (getExecutionMode()) {
1188 if (requiresFullRuntime())
1189 return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1190 return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1192 assert(requiresFullRuntime() &&
"Expected full runtime.");
1193 return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1195 return UndefinedMode;
1197 llvm_unreachable(
"Unknown flags are requested.");
// Constructor fragment: this runtime only supports device compilation, and
// emits the __omp_rtl_* configuration globals consumed by the device runtime.
1203 llvm_unreachable(
"OpenMP can only handle device code.");
1210 "__omp_rtl_debug_kind");
1212 "__omp_rtl_assume_teams_oversubscription");
1214 "__omp_rtl_assume_threads_oversubscription");
1216 "__omp_rtl_assume_no_thread_state");
// Signature fragments of clause emitters (proc_bind, num_threads,
// num_teams/thread_limit).
1220 ProcBindKind ProcBind,
1230 llvm::Value *NumThreads,
1236 const Expr *NumTeams,
1237 const Expr *ThreadLimit,
// emitParallelOutlinedFunction fragment: an RAII action flips
// IsInParallelRegion around the outlining, TTD/master-thread flags are
// saved/restored, and (under a condition on a missing line) a data-sharing
// wrapper function is created and cached for the outlined function.
1245 bool &IsInParallelRegion;
1246 bool PrevIsInParallelRegion;
1249 NVPTXPrePostActionTy(
bool &IsInParallelRegion)
1250 : IsInParallelRegion(IsInParallelRegion) {}
1252 PrevIsInParallelRegion = IsInParallelRegion;
1253 IsInParallelRegion =
true;
1256 IsInParallelRegion = PrevIsInParallelRegion;
1258 } Action(IsInParallelRegion);
// Outlining a parallel region leaves any enclosing TTD/master-thread context.
1260 bool PrevIsInTTDRegion = IsInTTDRegion;
1261 IsInTTDRegion =
false;
1262 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1263 IsInTargetMasterThreadRegion =
false;
1266 D, ThreadIDVar, InnermostKind, CodeGen));
1267 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1268 IsInTTDRegion = PrevIsInTTDRegion;
// Wrap the outlined function for data sharing when not nested in another
// parallel region (full condition is on missing lines 1269-1270).
1270 !IsInParallelRegion) {
1271 llvm::Function *WrapperFun =
1272 createParallelDataSharingWrapper(OutlinedFun, D);
1273 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
// Teams outlining fragments: collect lastprivate/reduction variables from the
// teams directive, optionally build the globalized-vars record for them, and
// install a pre/post action that sets up the per-function globalized-decls
// bookkeeping (MappedParams, LocalVarData) and runs the generic-vars
// prolog/epilog. Interior lines are missing throughout.
1285 "expected teams directive.");
1292 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1300 for (
const Expr *E : C->getVarRefs())
1310 "expected teams directive.");
1312 for (
const Expr *E : C->privates())
1324 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
// Build the record only when there are lastprivates/reductions to globalize.
1331 if (!LastPrivatesReductions.empty()) {
1332 GlobalizedRD = ::buildRecordForGlobalizedVars(
1334 MappedDeclsFields, WarpSize);
1336 }
// Otherwise stash the reductions for the enclosing team context.
else if (!LastPrivatesReductions.empty()) {
1337 assert(!TeamAndReductions.first &&
1338 "Previous team declaration is not expected.");
1340 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1347 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1351 NVPTXPrePostActionTy(
1353 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1355 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1356 MappedDeclsFields(MappedDeclsFields) {}
// Enter: register every mapped decl in the current function's
// globalized-decls table, then emit the prolog.
1361 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.
CurFn).first;
1362 I->getSecond().MappedParams =
1363 std::make_unique<CodeGenFunction::OMPMapVars>();
1364 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1365 for (
const auto &Pair : MappedDeclsFields) {
1366 assert(Pair.getFirst()->isCanonicalDecl() &&
1367 "Expected canonical declaration");
1368 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1371 Rt.emitGenericVarsProlog(CGF, Loc);
// Exit: tear down the globalized variables.
1375 .emitGenericVarsEpilog(CGF);
1377 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1380 D, ThreadIDVar, InnermostKind, CodeGen);
// emitGenericVarsProlog fragment: for each globalized local in the current
// function, allocate device-shared storage via a runtime call (the callee name
// falls on a missing line), record the resulting address, and map the variable
// to it. Variable-length escapees get a separately computed, alignment-padded
// size. Interior lines are missing throughout.
1387 bool WithSPMDCheck) {
1394 const auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
1395 if (I == FunctionGlobalizedDecls.end())
1398 for (
auto &Rec : I->getSecond().LocalVarData) {
1399 const auto *VD = cast<VarDecl>(Rec.first);
1400 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1404 llvm::Value *ParValue;
// Runtime allocation call returning a void* for the globalized storage.
1413 llvm::CallBase *VoidPtr =
1418 VoidPtr->addRetAttr(llvm::Attribute::get(
// Name the casted pointer "<var>_on_stack" for readability of the IR.
1425 VoidPtr, VarPtrTy, VD->
getName() +
"_on_stack");
1427 Rec.second.PrivateAddr = VarAddr.
getAddress(CGF);
1428 Rec.second.GlobalizedVal = VoidPtr;
1433 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.
getAddress(CGF));
// Attach the variable's source location to the allocation call for debug info.
1436 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->
getLocation()));
// Variable-length declarations: compute a padded dynamic size
// (round up to AlignVal via udiv+mul) before allocating.
1438 for (
const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1443 Size = Bld.CreateNUWAdd(
1445 llvm::Value *AlignVal =
1448 Size = Bld.CreateUDiv(Size, AlignVal);
1449 Size = Bld.CreateNUWMul(Size, AlignVal);
1453 llvm::CallBase *VoidPtr =
1457 VoidPtr->addRetAttr(
// Remember (ptr, size) so the epilog can free in reverse order.
1461 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1462 std::pair<llvm::Value *, llvm::Value *>(
1467 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1468 Base.getAddress(CGF));
1470 I->getSecond().MappedParams->apply(CGF);
// emitGenericVarsEpilog fragment: free the dynamically-sized escapees and the
// globalized locals in reverse order of allocation, restoring the original
// variable mappings. The freeing runtime callee names fall on missing lines.
1474 bool WithSPMDCheck) {
1479 const auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
1480 if (I != FunctionGlobalizedDecls.end()) {
// Variable-length allocations first, in reverse.
1482 for (
auto AddrSizePair :
1483 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1486 {AddrSizePair.first, AddrSizePair.second});
// Then the fixed globalized locals, also in reverse.
1489 for (
auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1490 const auto *VD = cast<VarDecl>(Rec.first);
1491 I->getSecond().MappedParams->restore(CGF);
1493 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
// Teams-call fragment: the outlined function is invoked with a zero
// thread-id/bound-id address followed by the captured variables.
1505 llvm::Function *OutlinedFn,
1515 OutlinedFnArgs.push_back(ZeroAddr.
getPointer());
1516 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
// emitParallelCall fragment: packs the captured variables into a
// "captured_vars_addrs" array of void*, evaluates the if-clause and
// num_threads values (defaulting to 1 / -1 respectively), and emits the
// device-runtime parallel entry call (callee name on a missing line) taking
// the wrapper/outlined function pointer plus the packed arguments.
1522 llvm::Function *OutlinedFn,
1525 llvm::Value *NumThreads) {
1529 auto &&ParallelGen = [
this, Loc, OutlinedFn, CapturedVars, IfCond,
1533 llvm::Value *NumThreadsVal = NumThreads;
// Use the data-sharing wrapper created at outlining time, if any.
1534 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1538 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn,
CGM.
Int8PtrTy);
// One void* slot per captured variable.
1545 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1546 llvm::ArrayType::get(
CGM.
VoidPtrTy, CapturedVars.size()),
1547 "captured_vars_addrs");
1549 if (!CapturedVars.empty()) {
1553 for (llvm::Value *
V : CapturedVars) {
// Integer captures are passed by casting the value itself to a pointer.
1556 if (
V->getType()->isIntegerTy())
1557 PtrV = Bld.CreateIntToPtr(
V, CGF.VoidPtrTy);
1560 CGF.EmitStoreOfScalar(PtrV, Dst,
false,
// if-clause: evaluate to i32, or constant 1 when absent.
1566 llvm::Value *IfCondVal =
nullptr;
1568 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1571 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
// num_threads: -1 means "no limit specified".
1574 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1576 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1578 assert(IfCondVal &&
"Expected a value");
1580 llvm::Value *Args[] = {
1585 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1588 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.
getPointer(),
1590 llvm::ConstantInt::get(
CGM.
SizeTy, CapturedVars.size())};
1591 CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
// Fragments of CTA synchronization and the critical-region loop: a runtime
// call with (null, 0) arguments, the active-thread-mask query, and a loop
// that serializes threads by comparing the thread ID against a counter.
// Interior lines are missing throughout.
1606 llvm::Value *Args[] = {
1607 llvm::ConstantPointerNull::get(
1609 llvm::ConstantInt::get(CGF.
Int32Ty, 0,
true)};
// Query the warp's active thread mask from the device runtime.
1646 CGM.
getModule(), OMPRTL___kmpc_warp_active_thread_mask));
// Loop header: continue while the counter is below the team width.
1664 llvm::Value *CmpLoopBound = CGF.
Builder.CreateICmpSLT(CounterVal, TeamWidth);
1665 CGF.
Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
// Only the thread whose ID equals the counter enters the body; the rest wait.
1671 llvm::Value *CmpThreadToCounter =
1672 CGF.
Builder.CreateICmpEQ(ThreadID, CounterVal);
1673 CGF.
Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1692 llvm::Value *IncCounterVal =
// Fragments of castValueToType (bitcast pointers/FP, intcast integers) and
// createRuntimeShuffleFunction, which picks __kmpc_shuffle_int32 or
// __kmpc_shuffle_int64 by element size (<= 4 bytes -> int32) and calls it
// with the casted element, lane offset, and warp size.
1706 "Cast type must sized.");
1708 "Val type must sized.");
// No-op when source and destination types already match.
1710 if (ValTy == CastTy)
1714 return CGF.
Builder.CreateBitCast(Val, LLVMCastTy);
1716 return CGF.
Builder.CreateIntCast(Val, LLVMCastTy,
// Shuffles support at most 8-byte elements.
1744 assert(Size.getQuantity() <= 8 &&
1745 "Unsupported bitwidth in shuffle instruction.");
1747 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1748 ? OMPRTL___kmpc_shuffle_int32
1749 : OMPRTL___kmpc_shuffle_int64;
1753 Size.getQuantity() <= 4 ? 32 : 64, 1);
1754 llvm::Value *ElemCast =
castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1755 llvm::Value *WarpSize =
1759 OMPBuilder.getOrCreateRuntimeFunction(CGM.
getModule(), ShuffleFn),
1760 {ElemCast, Offset, WarpSize});
// shuffleAndStore fragment: copies an element across lanes in chunks of
// decreasing power-of-two size (8, 4, 2, 1 bytes). When a chunk size divides
// the element multiple times, a small loop with PHI-advanced source and
// destination pointers handles the repetitions. Interior lines are missing.
1785 for (
int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1795 ElemPtr, IntTy->getPointerTo(), IntTy);
// More than one chunk of this size remains -> emit a copy loop.
1796 if (Size.getQuantity() / IntSize > 1) {
1800 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
// PHIs carry the advancing src/dst pointers across loop iterations.
1802 llvm::PHINode *PhiSrc =
1803 Bld.CreatePHI(Ptr.
getType(), 2);
1804 PhiSrc->addIncoming(Ptr.
getPointer(), CurrentBB);
1805 llvm::PHINode *PhiDest =
1806 Bld.CreatePHI(ElemPtr.
getType(), 2);
1807 PhiDest->addIncoming(ElemPtr.
getPointer(), CurrentBB);
// Loop while at least IntSize bytes remain between the pointers.
1811 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1815 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1829 PhiSrc->addIncoming(LocalPtr.
getPointer(), ThenBB);
1830 PhiDest->addIncoming(LocalElemPtr.
getPointer(), ThenBB);
// Remaining bytes after all full chunks of this size.
1846 Size = Size % IntSize;
// CopyAction enum fragment plus the reduction-list copy routine: for each
// private reduction element, select source/destination addresses depending on
// the copy direction (remote lane <-> thread <-> scratchpad), optionally
// shuffle the element in from a remote lane, copy it, and advance scratchpad
// base pointers with alignment rounding. Interior lines are missing throughout.
1851 enum CopyAction :
unsigned {
1882 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1883 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1884 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1889 unsigned Size = Privates.size();
1890 for (
const Expr *Private : Privates) {
// Per-element flags set by the CopyAction switch below.
1895 bool ShuffleInElement =
false;
1898 bool UpdateDestListPtr =
false;
1901 bool IncrScratchpadSrc =
false;
1902 bool IncrScratchpadDest =
false;
1903 QualType PrivatePtrType =
C.getPointerType(Private->getType());
1904 llvm::Type *PrivateLlvmPtrType = CGF.
ConvertType(PrivatePtrType);
// RemoteLaneToThread: read via shuffle from a remote lane into a fresh
// temporary, then publish the temporary's address in the dest list.
1907 case RemoteLaneToThread: {
1912 SrcElementPtrAddr, PrivateLlvmPtrType),
1919 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1920 ShuffleInElement =
true;
1921 UpdateDestListPtr =
true;
1929 SrcElementPtrAddr, PrivateLlvmPtrType),
1937 DestElementPtrAddr, PrivateLlvmPtrType),
// ThreadToScratchpad: compute the absolute scratchpad slot address from the
// base pointer, element size, and scratchpad index.
1941 case ThreadToScratchpad: {
1946 SrcElementPtrAddr, PrivateLlvmPtrType),
1951 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
1952 llvm::Value *CurrentOffset =
1953 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1954 llvm::Value *ScratchPadElemAbsolutePtrVal =
1955 Bld.CreateNUWAdd(DestBase.
getPointer(), CurrentOffset);
1956 ScratchPadElemAbsolutePtrVal =
1957 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1958 DestElementAddr =
Address(ScratchPadElemAbsolutePtrVal, CGF.
Int8Ty,
1959 C.getTypeAlignInChars(Private->getType()));
1960 IncrScratchpadDest =
true;
// ScratchpadToThread: mirror of the above, reading from the scratchpad into
// a fresh temporary.
1963 case ScratchpadToThread: {
1966 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
1967 llvm::Value *CurrentOffset =
1968 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1969 llvm::Value *ScratchPadElemAbsolutePtrVal =
1970 Bld.CreateNUWAdd(SrcBase.
getPointer(), CurrentOffset);
1971 ScratchPadElemAbsolutePtrVal =
1972 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.
VoidPtrTy);
1973 SrcElementAddr =
Address(ScratchPadElemAbsolutePtrVal, CGF.
Int8Ty,
1974 C.getTypeAlignInChars(Private->getType()));
1975 IncrScratchpadSrc =
true;
1981 CGF.
CreateMemTemp(Private->getType(),
".omp.reduction.element");
1982 UpdateDestListPtr =
true;
// Perform the copy: either via inter-lane shuffle or a plain load/store.
1996 if (ShuffleInElement) {
1997 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1998 RemoteLaneOffset, Private->getExprLoc());
2003 SrcElementAddr,
false, Private->getType(),
2008 Elem, DestElementAddr,
false, Private->getType(),
2015 Private->getExprLoc());
// When a fresh temporary was used, store its address into the dest list.
2035 if (UpdateDestListPtr) {
2038 DestElementPtrAddr,
false,
// Advance the scratchpad base between elements: add width*elemsize, then
// round up to GlobalMemoryAlignment (sub 1, udiv, add 1, mul pattern).
2045 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2048 llvm::Value *ScratchpadBasePtr =
2050 llvm::Value *ElementSizeInChars = CGF.
getTypeSize(Private->getType());
2051 ScratchpadBasePtr = Bld.CreateNUWAdd(
2053 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2056 ScratchpadBasePtr = Bld.CreateNUWSub(
2057 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.
SizeTy, 1));
2058 ScratchpadBasePtr = Bld.CreateUDiv(
2060 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
2061 ScratchpadBasePtr = Bld.CreateNUWAdd(
2062 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.
SizeTy, 1));
2063 ScratchpadBasePtr = Bld.CreateNUWMul(
2065 llvm::ConstantInt::get(CGM.
SizeTy, GlobalMemoryAlignment));
2067 if (IncrScratchpadDest)
// Inter-warp copy function fragment: builds
// "_omp_reduction_inter_warp_copy_func", which moves reduction data between
// warps through a shared-memory staging array
// ("__openmp_nvptx_data_transfer_temporary_storage", one i32 slot per warp).
// Each warp's lane 0 writes its element to the staging slot; then the first
// NumWarps threads of warp 0 read the slots back. Elements larger than 4
// bytes are processed in 4/2/1-byte sub-chunks with an optional counter loop.
// Interior lines are missing throughout.
2106 C.getIntTypeForBitwidth(32,
true),
2109 Args.push_back(&ReduceListArg);
2110 Args.push_back(&NumWarpsArg);
2116 "_omp_reduction_inter_warp_copy_func", &M);
2118 Fn->setDoesNotRecurse();
// Lazily create the shared staging buffer (WarpSize i32 slots, weak linkage,
// shared address space).
2131 StringRef TransferMediumName =
2132 "__openmp_nvptx_data_transfer_temporary_storage";
2133 llvm::GlobalVariable *TransferMedium =
2134 M.getGlobalVariable(TransferMediumName);
2136 if (!TransferMedium) {
2137 auto *Ty = llvm::ArrayType::get(CGM.
Int32Ty, WarpSize);
2139 TransferMedium =
new llvm::GlobalVariable(
2140 M, Ty,
false, llvm::GlobalVariable::WeakAnyLinkage,
2141 llvm::UndefValue::get(Ty), TransferMediumName,
2142 nullptr, llvm::GlobalVariable::NotThreadLocal,
2143 SharedAddressSpace);
2160 AddrReduceListArg,
false, C.VoidPtrTy, Loc,
2162 ElemTy->getPointerTo()),
2166 for (
const Expr *Private : Privates) {
// Real (aligned) size of the element; processed in shrinking chunk sizes.
2171 unsigned RealTySize =
2172 C.getTypeSizeInChars(Private->getType())
2173 .alignTo(C.getTypeAlignInChars(Private->getType()))
2175 for (
unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2176 unsigned NumIters = RealTySize / TySize;
2179 QualType CType = C.getIntTypeForBitwidth(
// Optional counter loop when multiple chunks of this size are needed.
2183 llvm::Value *Cnt =
nullptr;
2185 llvm::BasicBlock *PrecondBB =
nullptr;
2186 llvm::BasicBlock *ExitBB =
nullptr;
2199 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.
IntTy, NumIters));
2200 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
// Only lane 0 of each warp ("warp master") writes to the staging slot.
2212 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID,
"warp_master");
2213 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2228 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2229 TransferMedium->getValueType(), TransferMedium,
2230 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2236 CopyType->getPointerTo(
2237 MediumPtrVal->getType()->getPointerAddressSpace())),
2243 ElemPtr,
false, CType, Loc,
2250 Bld.CreateBr(MergeBB);
2253 Bld.CreateBr(MergeBB);
// Read-back phase: only the first NumWarps threads participate.
2271 AddrNumWarpsArg,
false, C.IntTy, Loc);
2274 llvm::Value *IsActiveThread =
2275 Bld.CreateICmpULT(ThreadID, NumWarpsVal,
"is_active_thread");
2276 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2281 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2282 TransferMedium->getValueType(), TransferMedium,
2283 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2288 CopyType->getPointerTo(
2289 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
2295 TargetElemPtrPtr,
false, C.VoidPtrTy, Loc);
2299 TargetElemPtr = Bld.
CreateGEP(TargetElemPtr, Cnt);
2302 llvm::Value *SrcMediumValue =
2306 Bld.CreateBr(W0MergeBB);
2309 Bld.CreateBr(W0MergeBB);
2314 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.
IntTy, 1));
// Remainder after all chunks of this size.
2320 RealTySize %= TySize;
2413 Args.push_back(&ReduceListArg);
2414 Args.push_back(&LaneIDArg);
2415 Args.push_back(&RemoteLaneOffsetArg);
2416 Args.push_back(&AlgoVerArg);
2420 auto *Fn = llvm::Function::Create(
2422 "_omp_reduction_shuffle_and_reduce_func", &CGM.
getModule());
2424 Fn->setDoesNotRecurse();
2437 ElemTy->getPointerTo()),
2455 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.remote_reduce_list");
2461 LocalReduceList, RemoteReduceList,
2462 {RemoteLaneOffsetArgVal,
2487 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2489 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2490 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2491 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2493 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2494 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2495 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2496 CondAlgo2 = Bld.CreateAnd(
2497 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2499 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2500 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2505 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2514 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2515 Bld.CreateBr(MergeBB);
2518 Bld.CreateBr(MergeBB);
2524 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2525 llvm::Value *CondCopy = Bld.CreateAnd(
2526 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2531 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2535 RemoteReduceList, LocalReduceList);
2536 Bld.CreateBr(CpyMergeBB);
2539 Bld.CreateBr(CpyMergeBB);
2557 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2571 Args.push_back(&BufferArg);
2572 Args.push_back(&IdxArg);
2573 Args.push_back(&ReduceListArg);
2577 auto *Fn = llvm::Function::Create(
2579 "_omp_reduction_list_to_global_copy_func", &CGM.
getModule());
2581 Fn->setDoesNotRecurse();
2594 ElemTy->getPointerTo()),
2596 QualType StaticTy = C.getRecordType(TeamReductionRec);
2597 llvm::Type *LLVMReductionsBufferTy =
2601 LLVMReductionsBufferTy->getPointerTo());
2602 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2607 for (
const Expr *Private : Privates) {
2615 ElemPtrPtr, ElemTy->getPointerTo());
2617 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2618 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2620 const FieldDecl *FD = VarFieldMap.lookup(VD);
2624 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.
getElementType(),
2632 ElemPtr,
false, Private->
getType(), Loc,
2669 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2671 llvm::Function *ReduceFn) {
2684 Args.push_back(&BufferArg);
2685 Args.push_back(&IdxArg);
2686 Args.push_back(&ReduceListArg);
2690 auto *Fn = llvm::Function::Create(
2692 "_omp_reduction_list_to_global_reduce_func", &CGM.
getModule());
2694 Fn->setDoesNotRecurse();
2701 QualType StaticTy = C.getRecordType(TeamReductionRec);
2702 llvm::Type *LLVMReductionsBufferTy =
2706 LLVMReductionsBufferTy->getPointerTo());
2711 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2712 auto IPriv = Privates.begin();
2713 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2718 for (
unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2721 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2722 const FieldDecl *FD = VarFieldMap.lookup(VD);
2726 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2730 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2734 llvm::Value *Size = CGF.
Builder.CreateIntCast(
2745 llvm::Value *GlobalReduceList =
2749 AddrReduceListArg,
false, C.VoidPtrTy, Loc);
2751 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2766 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2780 Args.push_back(&BufferArg);
2781 Args.push_back(&IdxArg);
2782 Args.push_back(&ReduceListArg);
2786 auto *Fn = llvm::Function::Create(
2788 "_omp_reduction_global_to_list_copy_func", &CGM.
getModule());
2790 Fn->setDoesNotRecurse();
2803 ElemTy->getPointerTo()),
2805 QualType StaticTy = C.getRecordType(TeamReductionRec);
2806 llvm::Type *LLVMReductionsBufferTy =
2810 LLVMReductionsBufferTy->getPointerTo());
2812 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2817 for (
const Expr *Private : Privates) {
2825 ElemPtrPtr, ElemTy->getPointerTo());
2827 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2828 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2830 const FieldDecl *FD = VarFieldMap.lookup(VD);
2834 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.
getElementType(),
2879 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2881 llvm::Function *ReduceFn) {
2894 Args.push_back(&BufferArg);
2895 Args.push_back(&IdxArg);
2896 Args.push_back(&ReduceListArg);
2900 auto *Fn = llvm::Function::Create(
2902 "_omp_reduction_global_to_list_reduce_func", &CGM.
getModule());
2904 Fn->setDoesNotRecurse();
2911 QualType StaticTy = C.getRecordType(TeamReductionRec);
2912 llvm::Type *LLVMReductionsBufferTy =
2916 LLVMReductionsBufferTy->getPointerTo());
2921 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
2922 auto IPriv = Privates.begin();
2923 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.
Int32Ty),
2928 for (
unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2931 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2932 const FieldDecl *FD = VarFieldMap.lookup(VD);
2936 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2940 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2944 llvm::Value *Size = CGF.
Builder.CreateIntCast(
2955 llvm::Value *GlobalReduceList =
2959 AddrReduceListArg,
false, C.VoidPtrTy, Loc);
2961 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3220 if (Options.SimpleReduction) {
3221 assert(!TeamsReduction && !ParallelReduction &&
3222 "Invalid reduction selection in emitReduction.");
3224 ReductionOps, Options);
3228 assert((TeamsReduction || ParallelReduction) &&
3229 "Invalid reduction selection in emitReduction.");
3242 auto Size = RHSExprs.size();
3243 for (
const Expr *E : Privates) {
3253 CGF.
CreateMemTemp(ReductionArrayTy,
".omp.reduction.red_list");
3254 auto IPriv = Privates.begin();
3256 for (
unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3262 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3266 llvm::Value *Size = CGF.
Builder.CreateIntCast(
3278 llvm::Function *ReductionFn =
3280 Privates, LHSExprs, RHSExprs, ReductionOps);
3281 llvm::Value *ReductionArrayTySize = CGF.
getTypeSize(ReductionArrayTy);
3283 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3284 llvm::Value *InterWarpCopyFn =
3287 if (ParallelReduction) {
3288 llvm::Value *Args[] = {RTLoc,
3290 CGF.
Builder.getInt32(RHSExprs.size()),
3291 ReductionArrayTySize,
3298 CGM.
getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3301 assert(TeamsReduction &&
"expected teams reduction.");
3302 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3305 for (
const Expr *DRE : Privates) {
3306 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3309 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3310 CGM.
getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3311 C.getLangOpts().OpenMPCUDAReductionBufNum);
3312 TeamsReductions.push_back(TeamReductionRec);
3313 if (!KernelTeamsReductionPtr) {
3314 KernelTeamsReductionPtr =
new llvm::GlobalVariable(
3317 "_openmp_teams_reductions_buffer_$_$ptr");
3321 false, C.getPointerType(C.VoidPtrTy), Loc);
3323 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3325 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3328 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3330 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3333 llvm::Value *Args[] = {
3337 CGF.
Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3341 GlobalToBufferCpyFn,
3342 GlobalToBufferRedFn,
3343 BufferToGlobalCpyFn,
3344 BufferToGlobalRedFn};
3348 CGM.
getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3355 llvm::Value *Cond = CGF.
Builder.CreateICmpEQ(
3356 Res, llvm::ConstantInt::get(
CGM.
Int32Ty, 1));
3357 CGF.
Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3366 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3368 auto IPriv = Privates.begin();
3369 auto ILHS = LHSExprs.begin();
3370 auto IRHS = RHSExprs.begin();
3371 for (
const Expr *E : ReductionOps) {
3373 cast<DeclRefExpr>(*IRHS));
3379 llvm::Value *EndArgs[] = {ThreadId};
3381 NVPTXActionTy Action(
3382 nullptr, llvm::None,
3384 CGM.
getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3390 CGF.EmitBlock(ExitBB,
true);
3395 const VarDecl *NativeParam)
const {
3400 const Type *NonQualTy = QC.
strip(ArgType);
3402 if (
const auto *
Attr = FD->
getAttr<OMPCaptureKindAttr>()) {
3403 if (
Attr->getCaptureKind() == OMPC_map) {
3410 enum { NVPTX_local_addr = 5 };
3413 if (isa<ImplicitParamDecl>(NativeParam))
3428 const VarDecl *TargetParam)
const {
3429 assert(NativeParam != TargetParam &&
3431 "Native arg must not be the same as target arg.");
3435 const Type *NonQualTy = QC.
strip(NativeParamType);
3437 unsigned NativePointeeAddrSpace =
3444 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3445 cast<llvm::PointerType>(TargetAddr->getType()), 0));
3448 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3449 cast<llvm::PointerType>(TargetAddr->getType()),
3450 NativePointeeAddrSpace));
3454 return NativeParamAddr;
3461 TargetArgs.reserve(Args.size());
3462 auto *FnType = OutlinedFn.getFunctionType();
3463 for (
unsigned I = 0, E = Args.size(); I < E; ++I) {
3464 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3465 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3468 llvm::Type *TargetType = FnType->getParamType(I);
3469 llvm::Value *NativeArg = Args[I];
3470 if (!TargetType->isPointerTy()) {
3471 TargetArgs.emplace_back(NativeArg);
3475 NativeArg, llvm::PointerType::getWithSamePointeeType(
3476 cast<llvm::PointerType>(NativeArg->getType()), 0));
3477 TargetArgs.emplace_back(
3487 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3504 WrapperArgs.emplace_back(&ParallelLevelArg);
3505 WrapperArgs.emplace_back(&WrapperArg);
3510 auto *Fn = llvm::Function::Create(
3512 Twine(OutlinedParallelFn->getName(),
"_wrapper"), &
CGM.
getModule());
3520 Fn->addFnAttr(llvm::Attribute::NoInline);
3524 Fn->setDoesNotRecurse();
3530 const auto *RD = CS.getCapturedRecordDecl();
3531 auto CurField = RD->field_begin();
3543 auto CI = CS.capture_begin();
3549 llvm::Value *GlobalArgsPtr = GlobalArgs.
getPointer();
3550 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3558 if (CS.capture_size() > 0 ||
3574 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3575 Args.emplace_back(LB);
3584 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3585 Args.emplace_back(UB);
3588 if (CS.capture_size() > 0) {
3590 for (
unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3591 QualType ElemTy = CurField->getType();
3600 if (CI->capturesVariableByCopy() &&
3601 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3605 Args.emplace_back(Arg);
3619 assert(D &&
"Expected function or captured|block decl.");
3620 assert(FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 &&
3621 "Function is registered already.");
3622 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3623 "Team is set but not processed.");
3624 const Stmt *Body =
nullptr;
3625 bool NeedToDelayGlobalization =
false;
3626 if (
const auto *FD = dyn_cast<FunctionDecl>(D)) {
3627 Body = FD->getBody();
3628 }
else if (
const auto *BD = dyn_cast<BlockDecl>(D)) {
3629 Body = BD->getBody();
3630 }
else if (
const auto *CD = dyn_cast<CapturedDecl>(D)) {
3631 Body = CD->getBody();
3633 if (NeedToDelayGlobalization &&
3639 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3640 VarChecker.Visit(Body);
3642 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3643 TeamAndReductions.first =
nullptr;
3644 TeamAndReductions.second.clear();
3646 VarChecker.getEscapedVariableLengthDecls();
3647 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3649 auto I = FunctionGlobalizedDecls.try_emplace(CGF.
CurFn).first;
3650 I->getSecond().MappedParams =
3651 std::make_unique<CodeGenFunction::OMPMapVars>();
3652 I->getSecond().EscapedParameters.insert(
3653 VarChecker.getEscapedParameters().begin(),
3654 VarChecker.getEscapedParameters().end());
3655 I->getSecond().EscapedVariableLengthDecls.append(
3656 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3657 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3658 for (
const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3660 Data.insert(std::make_pair(VD, MappedVarData()));
3662 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3663 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3664 VarChecker.Visit(Body);
3665 I->getSecond().SecondaryLocalVarData.emplace();
3666 DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
3667 for (
const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3669 Data.insert(std::make_pair(VD, MappedVarData()));
3672 if (!NeedToDelayGlobalization) {
3673 emitGenericVarsProlog(CGF, D->
getBeginLoc(),
true);
3675 GlobalizationScope() =
default;
3679 .emitGenericVarsEpilog(CGF,
true);
3688 if (VD && VD->
hasAttr<OMPAllocateDeclAttr>()) {
3689 const auto *A = VD->
getAttr<OMPAllocateDeclAttr>();
3691 switch (A->getAllocatorType()) {
3694 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3695 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3696 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3697 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3698 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3701 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3704 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3707 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3710 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3711 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3715 auto *GV =
new llvm::GlobalVariable(
3719 nullptr, llvm::GlobalValue::NotThreadLocal,
3734 auto I = FunctionGlobalizedDecls.find(CGF.
CurFn);
3735 if (I == FunctionGlobalizedDecls.end())
3737 auto VDI = I->getSecond().LocalVarData.find(VD);
3738 if (VDI != I->getSecond().LocalVarData.end())
3739 return VDI->second.PrivateAddr;
3744 auto VDI = I->getSecond().LocalVarData.find(
3745 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3746 ->getCanonicalDecl());
3747 if (VDI != I->getSecond().LocalVarData.end())
3748 return VDI->second.PrivateAddr;
3756 FunctionGlobalizedDecls.erase(CGF.
CurFn);
3763 llvm::Value *&Chunk)
const {
3766 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3770 S.getIterationVariable()->getType(), S.getBeginLoc());
3774 CGF, S, ScheduleKind, Chunk);
3780 const Expr *&ChunkExpr)
const {
3781 ScheduleKind = OMPC_SCHEDULE_static;
3792 " Expected target-based directive.");
3797 if (!C.capturesVariable())
3799 const VarDecl *VD = C.getCapturedVar();
3800 const auto *RD = VD->
getType()
3804 if (!RD || !RD->isLambda())
3813 llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
3815 RD->getCaptureFields(Captures, ThisCapture);
3825 const VarDecl *VD = LC.getCapturedVar();
3828 auto It = Captures.find(VD);
3829 assert(It != Captures.end() &&
"Found lambda capture without field.");
3843 if (!VD || !VD->
hasAttr<OMPAllocateDeclAttr>())
3845 const auto *A = VD->
getAttr<OMPAllocateDeclAttr>();
3846 switch(A->getAllocatorType()) {
3847 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3848 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3850 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3851 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3852 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3853 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3854 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3857 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3860 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3863 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3864 llvm_unreachable(
"Expected predefined allocator for the variables with the "
3875 if (Feature.getValue()) {
3889 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3902 llvm::raw_svector_ostream Out(Buffer);
3904 <<
" does not support unified addressing";
3905 CGM.
Error(Clause->getBeginLoc(), Out.str());
3959 llvm_unreachable(
"Unexpected Cuda arch.");
3968 if (!TeamsReductions.empty()) {
3970 RecordDecl *StaticRD = C.buildImplicitRecord(
3973 for (
const RecordDecl *TeamReductionRec : TeamsReductions) {
3974 QualType RecTy = C.getRecordType(TeamReductionRec);
3984 QualType StaticTy = C.getRecordType(StaticRD);
3985 llvm::Type *LLVMReductionsBufferTy =
3990 auto *GV =
new llvm::GlobalVariable(
3993 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3994 "_openmp_teams_reductions_buffer_$_");
3995 KernelTeamsReductionPtr->setInitializer(
3996 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4005 const char *LocSize =
"__kmpc_get_hardware_num_threads_in_block";
4006 llvm::Function *F = M->getFunction(LocSize);
4008 F = llvm::Function::Create(
4009 llvm::FunctionType::get(CGF.
Int32Ty, llvm::None,
false),
4012 return Bld.CreateCall(F, llvm::None,
"nvptx_num_threads");
4019 CGM.
getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),