1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for OpenMP runtime code generation specialized to NVPTX
10 // targets.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeNVPTX.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "clang/AST/StmtOpenMP.h"
18 #include "clang/AST/StmtVisitor.h"
19 #include "clang/Basic/Cuda.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
64  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
65  /// global_tid, void *global_buffer, int32_t num_of_records, void*
66  /// reduce_data,
67  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
68  /// lane_offset, int16_t shortCircuit),
69  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
70  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
71  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
72  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
73  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
74  /// *buffer, int idx, void *reduce_data));
75  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
76  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
77  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
78  /// Call to void __kmpc_data_sharing_init_stack();
79  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
80  /// Call to void __kmpc_data_sharing_init_stack_spmd();
81  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
82  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
83  /// int16_t UseSharedMemory);
84  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
85  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
86  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
87  /// Call to void __kmpc_begin_sharing_variables(void ***args,
88  /// size_t n_args);
89  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
90  /// Call to void __kmpc_end_sharing_variables();
91  OMPRTL_NVPTX__kmpc_end_sharing_variables,
92  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
93  OMPRTL_NVPTX__kmpc_get_shared_variables,
94  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
95  /// global_tid);
96  OMPRTL_NVPTX__kmpc_parallel_level,
97  /// Call to int8_t __kmpc_is_spmd_exec_mode();
98  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
99  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
100  /// const void *buf, size_t size, int16_t is_shared, const void **res);
101  OMPRTL_NVPTX__kmpc_get_team_static_memory,
102  /// Call to void __kmpc_restore_team_static_memory(int16_t
103  /// isSPMDExecutionMode, int16_t is_shared);
104  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
105  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
106  OMPRTL__kmpc_barrier,
107  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
108  /// global_tid);
109  OMPRTL__kmpc_barrier_simple_spmd,
110  /// Call to int32_t __kmpc_warp_active_thread_mask(void);
111  OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
112  /// Call to void __kmpc_syncwarp(int32_t Mask);
113  OMPRTL_NVPTX__kmpc_syncwarp,
114 };
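// Note: the llvm::FunctionCallee for each enumerator above is materialized on
// demand by createNVPTXRuntimeFunction() later in this file.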
115 
116 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
117 class NVPTXActionTy final : public PrePostActionTy {
118  llvm::FunctionCallee EnterCallee = nullptr;
119  ArrayRef<llvm::Value *> EnterArgs;
120  llvm::FunctionCallee ExitCallee = nullptr;
121  ArrayRef<llvm::Value *> ExitArgs;
122  bool Conditional = false;
123  llvm::BasicBlock *ContBlock = nullptr;
124 
125 public:
126  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
127  ArrayRef<llvm::Value *> EnterArgs,
128  llvm::FunctionCallee ExitCallee,
129  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
130  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
131  ExitArgs(ExitArgs), Conditional(Conditional) {}
132  void Enter(CodeGenFunction &CGF) override {
133  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
134  if (Conditional) {
135  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
136  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
137  ContBlock = CGF.createBasicBlock("omp_if.end");
138  // Generate the branch (If-stmt)
139  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
140  CGF.EmitBlock(ThenBlock);
141  }
142  }
143  void Done(CodeGenFunction &CGF) {
144  // Emit the rest of blocks/branches
145  CGF.EmitBranch(ContBlock);
146  CGF.EmitBlock(ContBlock, true);
147  }
148  void Exit(CodeGenFunction &CGF) override {
149  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
150  }
151 };
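// When Conditional is true, Enter() emits an omp_if.then block guarded by the
// result of the enter runtime call; Done() closes the diamond by branching to
// and emitting the omp_if.end continuation block.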
152 
153 /// A class to track the execution mode when codegening directives within
154 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
155 /// to the target region and used by containing directives such as 'parallel'
156 /// to emit optimized code.
157 class ExecutionRuntimeModesRAII {
158 private:
159  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
160  CGOpenMPRuntimeNVPTX::EM_Unknown;
161  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
162  bool SavedRuntimeMode = false;
163  bool *RuntimeMode = nullptr;
164 
165 public:
166  /// Constructor for Non-SPMD mode.
167  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
168  : ExecMode(ExecMode) {
169  SavedExecMode = ExecMode;
170  ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
171  }
172  /// Constructor for SPMD mode.
173  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
174  bool &RuntimeMode, bool FullRuntimeMode)
175  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
176  SavedExecMode = ExecMode;
177  SavedRuntimeMode = RuntimeMode;
178  ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
179  RuntimeMode = FullRuntimeMode;
180  }
181  ~ExecutionRuntimeModesRAII() {
182  ExecMode = SavedExecMode;
183  if (RuntimeMode)
184  *RuntimeMode = SavedRuntimeMode;
185  }
186 };
187 
188 /// GPU Configuration: This information can be derived from cuda registers;
189 /// however, providing compile-time constants helps generate more efficient
190 /// code. For all practical purposes this is fine because the configuration
191 /// is the same for all known NVPTX architectures.
192 enum MachineConfiguration : unsigned {
193  WarpSize = 32,
194  /// Number of bits required to represent a lane identifier, which is
195  /// computed as log_2(WarpSize).
196  LaneIDBits = 5,
197  LaneIDMask = WarpSize - 1,
198 
199  /// Global memory alignment for performance.
200  GlobalMemoryAlignment = 128,
201 
202  /// Maximal size of the shared memory buffer.
203  SharedMemorySize = 128,
204 };
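// getNVPTXWarpID() and getNVPTXLaneID() below rely on these constants:
// warp id = tid >> LaneIDBits and lane id = tid & LaneIDMask.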
205 
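/// Return the base declaration referenced by a private item expression,
/// looking through parens, implicit casts, array subscripts and OpenMP array
/// sections.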
206 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
207  RefExpr = RefExpr->IgnoreParens();
208  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
209  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
210  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
211  Base = TempASE->getBase()->IgnoreParenImpCasts();
212  RefExpr = Base;
213  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
214  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
215  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
216  Base = TempOASE->getBase()->IgnoreParenImpCasts();
217  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
218  Base = TempASE->getBase()->IgnoreParenImpCasts();
219  RefExpr = Base;
220  }
221  RefExpr = RefExpr->IgnoreParenImpCasts();
222  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
223  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
224  const auto *ME = cast<MemberExpr>(RefExpr);
225  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
226 }
227 
228 
229 static RecordDecl *buildRecordForGlobalizedVars(
230  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
231  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
232  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
233  &MappedDeclsFields, int BufSize) {
234  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
235  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
236  return nullptr;
237  SmallVector<VarsDataTy, 4> GlobalizedVars;
238  for (const ValueDecl *D : EscapedDecls)
239  GlobalizedVars.emplace_back(
240  CharUnits::fromQuantity(std::max(
241  C.getDeclAlign(D).getQuantity(),
242  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
243  D);
244  for (const ValueDecl *D : EscapedDeclsForTeams)
245  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
246  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
247  return L.first > R.first;
248  });
249 
250  // Build struct _globalized_locals_ty {
251  // /* globalized vars */[WarpSize] align (max(decl_align,
252  // GlobalMemoryAlignment))
253  // /* globalized vars */ for EscapedDeclsForTeams
254  // };
255  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
256  GlobalizedRD->startDefinition();
257  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
258  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
259  for (const auto &Pair : GlobalizedVars) {
260  const ValueDecl *VD = Pair.second;
261  QualType Type = VD->getType();
262  if (Type->isLValueReferenceType())
263  Type = C.getPointerType(Type.getNonReferenceType());
264  else
265  Type = Type.getNonReferenceType();
266  SourceLocation Loc = VD->getLocation();
267  FieldDecl *Field;
268  if (SingleEscaped.count(VD)) {
269  Field = FieldDecl::Create(
270  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
271  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
272  /*BW=*/nullptr, /*Mutable=*/false,
273  /*InitStyle=*/ICIS_NoInit);
274  Field->setAccess(AS_public);
275  if (VD->hasAttrs()) {
276  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
277  E(VD->getAttrs().end());
278  I != E; ++I)
279  Field->addAttr(*I);
280  }
281  } else {
282  llvm::APInt ArraySize(32, BufSize);
283  Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
284  Field = FieldDecl::Create(
285  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
286  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
287  /*BW=*/nullptr, /*Mutable=*/false,
288  /*InitStyle=*/ICIS_NoInit);
289  Field->setAccess(AS_public);
290  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
291  static_cast<CharUnits::QuantityType>(
292  GlobalMemoryAlignment)));
293  Field->addAttr(AlignedAttr::CreateImplicit(
294  C, /*IsAlignmentExpr=*/true,
295  IntegerLiteral::Create(C, Align,
296  C.getIntTypeForBitwidth(32, /*Signed=*/0),
297  SourceLocation()),
298  {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
299  }
300  GlobalizedRD->addDecl(Field);
301  MappedDeclsFields.try_emplace(VD, Field);
302  }
303  GlobalizedRD->completeDefinition();
304  return GlobalizedRD;
305 }
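// In the record built above, variables escaping from a parallel region become
// arrays of BufSize elements (WarpSize when called from the visitor below)
// aligned to GlobalMemoryAlignment, while team-level escapes
// (EscapedDeclsForTeams) get a single field keeping the declaration's own
// alignment attributes.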
306 
307 /// Get the list of variables that can escape their declaration context.
308 class CheckVarsEscapingDeclContext final
309  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
310  CodeGenFunction &CGF;
311  llvm::SetVector<const ValueDecl *> EscapedDecls;
312  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
313  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
314  RecordDecl *GlobalizedRD = nullptr;
315  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
316  bool AllEscaped = false;
317  bool IsForCombinedParallelRegion = false;
318 
319  void markAsEscaped(const ValueDecl *VD) {
320  // Do not globalize declare target variables.
321  if (!isa<VarDecl>(VD) ||
322  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
323  return;
324  VD = cast<ValueDecl>(VD->getCanonicalDecl());
325  // Use user-specified allocation.
326  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
327  return;
328  // Variables captured by value must be globalized.
329  if (auto *CSI = CGF.CapturedStmtInfo) {
330  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
331  // Check if need to capture the variable that was already captured by
332  // value in the outer region.
333  if (!IsForCombinedParallelRegion) {
334  if (!FD->hasAttrs())
335  return;
336  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
337  if (!Attr)
338  return;
339  if (((Attr->getCaptureKind() != OMPC_map) &&
340  !isOpenMPPrivate(
341  static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
342  ((Attr->getCaptureKind() == OMPC_map) &&
343  !FD->getType()->isAnyPointerType()))
344  return;
345  }
346  if (!FD->getType()->isReferenceType()) {
347  assert(!VD->getType()->isVariablyModifiedType() &&
348  "Parameter captured by value with variably modified type");
349  EscapedParameters.insert(VD);
350  } else if (!IsForCombinedParallelRegion) {
351  return;
352  }
353  }
354  }
355  if ((!CGF.CapturedStmtInfo ||
356  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
357  VD->getType()->isReferenceType())
358  // Do not globalize variables with reference type.
359  return;
360  if (VD->getType()->isVariablyModifiedType())
361  EscapedVariableLengthDecls.insert(VD);
362  else
363  EscapedDecls.insert(VD);
364  }
365 
366  void VisitValueDecl(const ValueDecl *VD) {
367  if (VD->getType()->isLValueReferenceType())
368  markAsEscaped(VD);
369  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
370  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
371  const bool SavedAllEscaped = AllEscaped;
372  AllEscaped = VD->getType()->isLValueReferenceType();
373  Visit(VarD->getInit());
374  AllEscaped = SavedAllEscaped;
375  }
376  }
377  }
378  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
379  ArrayRef<OMPClause *> Clauses,
380  bool IsCombinedParallelRegion) {
381  if (!S)
382  return;
383  for (const CapturedStmt::Capture &C : S->captures()) {
384  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
385  const ValueDecl *VD = C.getCapturedVar();
386  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
387  if (IsCombinedParallelRegion) {
388  // Check if the variable is privatized in the combined construct and
389  // those private copies must be shared in the inner parallel
390  // directive.
391  IsForCombinedParallelRegion = false;
392  for (const OMPClause *C : Clauses) {
393  if (!isOpenMPPrivate(C->getClauseKind()) ||
394  C->getClauseKind() == OMPC_reduction ||
395  C->getClauseKind() == OMPC_linear ||
396  C->getClauseKind() == OMPC_private)
397  continue;
398  ArrayRef<const Expr *> Vars;
399  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
400  Vars = PC->getVarRefs();
401  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
402  Vars = PC->getVarRefs();
403  else
404  llvm_unreachable("Unexpected clause.");
405  for (const auto *E : Vars) {
406  const Decl *D =
407  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
408  if (D == VD->getCanonicalDecl()) {
409  IsForCombinedParallelRegion = true;
410  break;
411  }
412  }
413  if (IsForCombinedParallelRegion)
414  break;
415  }
416  }
417  markAsEscaped(VD);
418  if (isa<OMPCapturedExprDecl>(VD))
419  VisitValueDecl(VD);
420  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
421  }
422  }
423  }
424 
425  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
426  assert(!GlobalizedRD &&
427  "Record for globalized variables is built already.");
428  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
429  if (IsInTTDRegion)
430  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
431  else
432  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
433  GlobalizedRD = ::buildRecordForGlobalizedVars(
434  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
435  MappedDeclsFields, WarpSize);
436  }
437 
438 public:
439  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
440  ArrayRef<const ValueDecl *> TeamsReductions)
441  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
442  }
443  virtual ~CheckVarsEscapingDeclContext() = default;
444  void VisitDeclStmt(const DeclStmt *S) {
445  if (!S)
446  return;
447  for (const Decl *D : S->decls())
448  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
449  VisitValueDecl(VD);
450  }
451  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
452  if (!D)
453  return;
454  if (!D->hasAssociatedStmt())
455  return;
456  if (const auto *S =
457  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
458  // Do not analyze directives that do not actually require capturing,
459  // like `omp for` or `omp simd` directives.
460  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
461  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
462  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
463  VisitStmt(S->getCapturedStmt());
464  return;
465  }
466  VisitOpenMPCapturedStmt(
467  S, D->clauses(),
468  CaptureRegions.back() == OMPD_parallel &&
469  isOpenMPDistributeDirective(D->getDirectiveKind()));
470  }
471  }
472  void VisitCapturedStmt(const CapturedStmt *S) {
473  if (!S)
474  return;
475  for (const CapturedStmt::Capture &C : S->captures()) {
476  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
477  const ValueDecl *VD = C.getCapturedVar();
478  markAsEscaped(VD);
479  if (isa<OMPCapturedExprDecl>(VD))
480  VisitValueDecl(VD);
481  }
482  }
483  }
484  void VisitLambdaExpr(const LambdaExpr *E) {
485  if (!E)
486  return;
487  for (const LambdaCapture &C : E->captures()) {
488  if (C.capturesVariable()) {
489  if (C.getCaptureKind() == LCK_ByRef) {
490  const ValueDecl *VD = C.getCapturedVar();
491  markAsEscaped(VD);
492  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
493  VisitValueDecl(VD);
494  }
495  }
496  }
497  }
498  void VisitBlockExpr(const BlockExpr *E) {
499  if (!E)
500  return;
501  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
502  if (C.isByRef()) {
503  const VarDecl *VD = C.getVariable();
504  markAsEscaped(VD);
505  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
506  VisitValueDecl(VD);
507  }
508  }
509  }
510  void VisitCallExpr(const CallExpr *E) {
511  if (!E)
512  return;
513  for (const Expr *Arg : E->arguments()) {
514  if (!Arg)
515  continue;
516  if (Arg->isLValue()) {
517  const bool SavedAllEscaped = AllEscaped;
518  AllEscaped = true;
519  Visit(Arg);
520  AllEscaped = SavedAllEscaped;
521  } else {
522  Visit(Arg);
523  }
524  }
525  Visit(E->getCallee());
526  }
527  void VisitDeclRefExpr(const DeclRefExpr *E) {
528  if (!E)
529  return;
530  const ValueDecl *VD = E->getDecl();
531  if (AllEscaped)
532  markAsEscaped(VD);
533  if (isa<OMPCapturedExprDecl>(VD))
534  VisitValueDecl(VD);
535  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
536  if (VarD->isInitCapture())
537  VisitValueDecl(VD);
538  }
539  void VisitUnaryOperator(const UnaryOperator *E) {
540  if (!E)
541  return;
542  if (E->getOpcode() == UO_AddrOf) {
543  const bool SavedAllEscaped = AllEscaped;
544  AllEscaped = true;
545  Visit(E->getSubExpr());
546  AllEscaped = SavedAllEscaped;
547  } else {
548  Visit(E->getSubExpr());
549  }
550  }
551  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
552  if (!E)
553  return;
554  if (E->getCastKind() == CK_ArrayToPointerDecay) {
555  const bool SavedAllEscaped = AllEscaped;
556  AllEscaped = true;
557  Visit(E->getSubExpr());
558  AllEscaped = SavedAllEscaped;
559  } else {
560  Visit(E->getSubExpr());
561  }
562  }
563  void VisitExpr(const Expr *E) {
564  if (!E)
565  return;
566  bool SavedAllEscaped = AllEscaped;
567  if (!E->isLValue())
568  AllEscaped = false;
569  for (const Stmt *Child : E->children())
570  if (Child)
571  Visit(Child);
572  AllEscaped = SavedAllEscaped;
573  }
574  void VisitStmt(const Stmt *S) {
575  if (!S)
576  return;
577  for (const Stmt *Child : S->children())
578  if (Child)
579  Visit(Child);
580  }
581 
582  /// Returns the record that handles all the escaped local variables and is
583  /// used instead of their original storage.
584  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
585  if (!GlobalizedRD)
586  buildRecordForGlobalizedVars(IsInTTDRegion);
587  return GlobalizedRD;
588  }
589 
590  /// Returns the field in the globalized record for the escaped variable.
591  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
592  assert(GlobalizedRD &&
593  "Record for globalized variables must be generated already.");
594  auto I = MappedDeclsFields.find(VD);
595  if (I == MappedDeclsFields.end())
596  return nullptr;
597  return I->getSecond();
598  }
599 
600  /// Returns the list of the escaped local variables/parameters.
601  ArrayRef<const ValueDecl *> getEscapedDecls() const {
602  return EscapedDecls.getArrayRef();
603  }
604 
605  /// Returns the set used to check whether an escaped local variable is
606  /// actually a parameter passed by value.
607  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
608  return EscapedParameters;
609  }
610 
611  /// Returns the list of the escaped variables with the variably modified
612  /// types.
613  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
614  return EscapedVariableLengthDecls.getArrayRef();
615  }
616 };
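// The visitor above collects three disjoint sets: EscapedDecls (fixed-size
// locals to globalize), EscapedVariableLengthDecls (locals with variably
// modified types), and EscapedParameters (by-value parameters whose storage
// escapes).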
617 } // anonymous namespace
618 
619 /// Get the GPU warp size.
620 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
621  return CGF.EmitRuntimeCall(
622  llvm::Intrinsic::getDeclaration(
623  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
624  "nvptx_warp_size");
625 }
626 
627 /// Get the id of the current thread on the GPU.
628 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
629  return CGF.EmitRuntimeCall(
630  llvm::Intrinsic::getDeclaration(
631  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
632  "nvptx_tid");
633 }
634 
635 /// Get the id of the warp in the block.
636 /// We assume that the warp size is 32, which is always the case
637 /// on the NVPTX device, to generate more efficient code.
638 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
639  CGBuilderTy &Bld = CGF.Builder;
640  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
641 }
642 
643 /// Get the id of the current lane in the Warp.
644 /// We assume that the warp size is 32, which is always the case
645 /// on the NVPTX device, to generate more efficient code.
646 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
647  CGBuilderTy &Bld = CGF.Builder;
648  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
649  "nvptx_lane_id");
650 }
651 
652 /// Get the maximum number of threads in a block of the GPU.
653 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
654  return CGF.EmitRuntimeCall(
655  llvm::Intrinsic::getDeclaration(
656  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
657  "nvptx_num_threads");
658 }
659 
660 /// Get the value of the thread_limit clause in the teams directive.
661 /// For the 'generic' execution mode, the runtime encodes thread_limit in
662 /// the launch parameters, always starting thread_limit+warpSize threads per
663 /// CTA. The threads in the last warp are reserved for master execution.
664 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
665 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
666  bool IsInSPMDExecutionMode = false) {
667  CGBuilderTy &Bld = CGF.Builder;
668  return IsInSPMDExecutionMode
669  ? getNVPTXNumThreads(CGF)
670  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
671  "thread_limit");
672 }
673 
674 /// Get the thread id of the OMP master thread.
675 /// The master thread id is the first thread (lane) of the last warp in the
676 /// GPU block. Warp size is assumed to be some power of 2.
677 /// Thread id is 0 indexed.
678 /// E.g: If NumThreads is 33, master id is 32.
679 /// If NumThreads is 64, master id is 32.
680 /// If NumThreads is 1024, master id is 992.
681 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
682  CGBuilderTy &Bld = CGF.Builder;
683  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
684 
685  // We assume that the warp size is a power of 2.
686  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
687 
688  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
689  Bld.CreateNot(Mask), "master_tid");
690 }
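// Worked example: with NumThreads = 1024 and WarpSize = 32, Mask = 31 and
// master_tid = (1024 - 1) & ~31 = 992, matching the comment above.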
691 
692 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
693  CodeGenModule &CGM, SourceLocation Loc)
694  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
695  Loc(Loc) {
696  createWorkerFunction(CGM);
697 }
698 
699 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
700  CodeGenModule &CGM) {
701  // Create a worker function with no arguments.
702 
703  WorkerFn = llvm::Function::Create(
704  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
705  /*placeholder=*/"_worker", &CGM.getModule());
706  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
707  WorkerFn->setDoesNotRecurse();
708 }
709 
710 CGOpenMPRuntimeNVPTX::ExecutionMode
711 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
712  return CurrentExecutionMode;
713 }
714 
715 static CGOpenMPRuntimeNVPTX::DataSharingMode
716 getDataSharingMode(CodeGenModule &CGM) {
717  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
718  : CGOpenMPRuntimeNVPTX::Generic;
719 }
720 
721 /// Check for inner (nested) SPMD construct, if any
722 static bool hasNestedSPMDDirective(ASTContext &Ctx,
723  const OMPExecutableDirective &D) {
724  const auto *CS = D.getInnermostCapturedStmt();
725  const auto *Body =
726  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
727  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
728 
729  if (const auto *NestedDir =
730  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
731  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
732  switch (D.getDirectiveKind()) {
733  case OMPD_target:
734  if (isOpenMPParallelDirective(DKind))
735  return true;
736  if (DKind == OMPD_teams) {
737  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
738  /*IgnoreCaptured=*/true);
739  if (!Body)
740  return false;
741  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
742  if (const auto *NND =
743  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
744  DKind = NND->getDirectiveKind();
745  if (isOpenMPParallelDirective(DKind))
746  return true;
747  }
748  }
749  return false;
750  case OMPD_target_teams:
751  return isOpenMPParallelDirective(DKind);
752  case OMPD_target_simd:
753  case OMPD_target_parallel:
754  case OMPD_target_parallel_for:
755  case OMPD_target_parallel_for_simd:
756  case OMPD_target_teams_distribute:
757  case OMPD_target_teams_distribute_simd:
758  case OMPD_target_teams_distribute_parallel_for:
759  case OMPD_target_teams_distribute_parallel_for_simd:
760  case OMPD_parallel:
761  case OMPD_for:
762  case OMPD_parallel_for:
763  case OMPD_parallel_sections:
764  case OMPD_for_simd:
765  case OMPD_parallel_for_simd:
766  case OMPD_cancel:
767  case OMPD_cancellation_point:
768  case OMPD_ordered:
769  case OMPD_threadprivate:
770  case OMPD_allocate:
771  case OMPD_task:
772  case OMPD_simd:
773  case OMPD_sections:
774  case OMPD_section:
775  case OMPD_single:
776  case OMPD_master:
777  case OMPD_critical:
778  case OMPD_taskyield:
779  case OMPD_barrier:
780  case OMPD_taskwait:
781  case OMPD_taskgroup:
782  case OMPD_atomic:
783  case OMPD_flush:
784  case OMPD_teams:
785  case OMPD_target_data:
786  case OMPD_target_exit_data:
787  case OMPD_target_enter_data:
788  case OMPD_distribute:
789  case OMPD_distribute_simd:
790  case OMPD_distribute_parallel_for:
791  case OMPD_distribute_parallel_for_simd:
792  case OMPD_teams_distribute:
793  case OMPD_teams_distribute_simd:
794  case OMPD_teams_distribute_parallel_for:
795  case OMPD_teams_distribute_parallel_for_simd:
796  case OMPD_target_update:
797  case OMPD_declare_simd:
798  case OMPD_declare_variant:
799  case OMPD_declare_target:
800  case OMPD_end_declare_target:
801  case OMPD_declare_reduction:
802  case OMPD_declare_mapper:
803  case OMPD_taskloop:
804  case OMPD_taskloop_simd:
805  case OMPD_requires:
806  case OMPD_unknown:
807  llvm_unreachable("Unexpected directive.");
808  }
809  }
810 
811  return false;
812 }
813 
814 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
815  const OMPExecutableDirective &D) {
816  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
817  switch (DirectiveKind) {
818  case OMPD_target:
819  case OMPD_target_teams:
820  return hasNestedSPMDDirective(Ctx, D);
821  case OMPD_target_parallel:
822  case OMPD_target_parallel_for:
823  case OMPD_target_parallel_for_simd:
824  case OMPD_target_teams_distribute_parallel_for:
825  case OMPD_target_teams_distribute_parallel_for_simd:
826  case OMPD_target_simd:
827  case OMPD_target_teams_distribute_simd:
828  return true;
829  case OMPD_target_teams_distribute:
830  return false;
831  case OMPD_parallel:
832  case OMPD_for:
833  case OMPD_parallel_for:
834  case OMPD_parallel_sections:
835  case OMPD_for_simd:
836  case OMPD_parallel_for_simd:
837  case OMPD_cancel:
838  case OMPD_cancellation_point:
839  case OMPD_ordered:
840  case OMPD_threadprivate:
841  case OMPD_allocate:
842  case OMPD_task:
843  case OMPD_simd:
844  case OMPD_sections:
845  case OMPD_section:
846  case OMPD_single:
847  case OMPD_master:
848  case OMPD_critical:
849  case OMPD_taskyield:
850  case OMPD_barrier:
851  case OMPD_taskwait:
852  case OMPD_taskgroup:
853  case OMPD_atomic:
854  case OMPD_flush:
855  case OMPD_teams:
856  case OMPD_target_data:
857  case OMPD_target_exit_data:
858  case OMPD_target_enter_data:
859  case OMPD_distribute:
860  case OMPD_distribute_simd:
861  case OMPD_distribute_parallel_for:
862  case OMPD_distribute_parallel_for_simd:
863  case OMPD_teams_distribute:
864  case OMPD_teams_distribute_simd:
865  case OMPD_teams_distribute_parallel_for:
866  case OMPD_teams_distribute_parallel_for_simd:
867  case OMPD_target_update:
868  case OMPD_declare_simd:
869  case OMPD_declare_variant:
870  case OMPD_declare_target:
871  case OMPD_end_declare_target:
872  case OMPD_declare_reduction:
873  case OMPD_declare_mapper:
874  case OMPD_taskloop:
875  case OMPD_taskloop_simd:
876  case OMPD_requires:
877  case OMPD_unknown:
878  break;
879  }
880  llvm_unreachable(
881  "Unknown programming model for OpenMP directive on NVPTX target.");
882 }
883 
884 /// Check if the directive is loop-based and either has no schedule clause
885 /// at all or has static scheduling.
886 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
887  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
888  isOpenMPLoopDirective(D.getDirectiveKind()) &&
889  "Expected loop-based directive.");
890  return !D.hasClausesOfKind<OMPOrderedClause>() &&
891  (!D.hasClausesOfKind<OMPScheduleClause>() ||
892  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
893  [](const OMPScheduleClause *C) {
894  return C->getScheduleKind() == OMPC_SCHEDULE_static;
895  }));
896 }
897 
898 /// Check for inner (nested) lightweight runtime construct, if any
899 static bool hasNestedLightweightDirective(ASTContext &Ctx,
900  const OMPExecutableDirective &D) {
901  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
902  const auto *CS = D.getInnermostCapturedStmt();
903  const auto *Body =
904  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
905  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
906 
907  if (const auto *NestedDir =
908  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
909  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
910  switch (D.getDirectiveKind()) {
911  case OMPD_target:
912  if (isOpenMPParallelDirective(DKind) &&
913  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
914  hasStaticScheduling(*NestedDir))
915  return true;
916  if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
917  return true;
918  if (DKind == OMPD_parallel) {
919  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
920  /*IgnoreCaptured=*/true);
921  if (!Body)
922  return false;
923  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
924  if (const auto *NND =
925  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
926  DKind = NND->getDirectiveKind();
927  if (isOpenMPWorksharingDirective(DKind) &&
928  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
929  return true;
930  }
931  } else if (DKind == OMPD_teams) {
932  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
933  /*IgnoreCaptured=*/true);
934  if (!Body)
935  return false;
936  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
937  if (const auto *NND =
938  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
939  DKind = NND->getDirectiveKind();
940  if (isOpenMPParallelDirective(DKind) &&
941  isOpenMPWorksharingDirective(DKind) &&
942  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
943  return true;
944  if (DKind == OMPD_parallel) {
945  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
946  /*IgnoreCaptured=*/true);
947  if (!Body)
948  return false;
949  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
950  if (const auto *NND =
951  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
952  DKind = NND->getDirectiveKind();
953  if (isOpenMPWorksharingDirective(DKind) &&
954  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
955  return true;
956  }
957  }
958  }
959  }
960  return false;
961  case OMPD_target_teams:
962  if (isOpenMPParallelDirective(DKind) &&
963  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
964  hasStaticScheduling(*NestedDir))
965  return true;
966  if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
967  return true;
968  if (DKind == OMPD_parallel) {
969  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
970  /*IgnoreCaptured=*/true);
971  if (!Body)
972  return false;
973  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
974  if (const auto *NND =
975  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
976  DKind = NND->getDirectiveKind();
977  if (isOpenMPWorksharingDirective(DKind) &&
978  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
979  return true;
980  }
981  }
982  return false;
983  case OMPD_target_parallel:
984  if (DKind == OMPD_simd)
985  return true;
986  return isOpenMPWorksharingDirective(DKind) &&
987  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
988  case OMPD_target_teams_distribute:
989  case OMPD_target_simd:
990  case OMPD_target_parallel_for:
991  case OMPD_target_parallel_for_simd:
992  case OMPD_target_teams_distribute_simd:
993  case OMPD_target_teams_distribute_parallel_for:
994  case OMPD_target_teams_distribute_parallel_for_simd:
995  case OMPD_parallel:
996  case OMPD_for:
997  case OMPD_parallel_for:
998  case OMPD_parallel_sections:
999  case OMPD_for_simd:
1000  case OMPD_parallel_for_simd:
1001  case OMPD_cancel:
1002  case OMPD_cancellation_point:
1003  case OMPD_ordered:
1004  case OMPD_threadprivate:
1005  case OMPD_allocate:
1006  case OMPD_task:
1007  case OMPD_simd:
1008  case OMPD_sections:
1009  case OMPD_section:
1010  case OMPD_single:
1011  case OMPD_master:
1012  case OMPD_critical:
1013  case OMPD_taskyield:
1014  case OMPD_barrier:
1015  case OMPD_taskwait:
1016  case OMPD_taskgroup:
1017  case OMPD_atomic:
1018  case OMPD_flush:
1019  case OMPD_teams:
1020  case OMPD_target_data:
1021  case OMPD_target_exit_data:
1022  case OMPD_target_enter_data:
1023  case OMPD_distribute:
1024  case OMPD_distribute_simd:
1025  case OMPD_distribute_parallel_for:
1026  case OMPD_distribute_parallel_for_simd:
1027  case OMPD_teams_distribute:
1028  case OMPD_teams_distribute_simd:
1029  case OMPD_teams_distribute_parallel_for:
1030  case OMPD_teams_distribute_parallel_for_simd:
1031  case OMPD_target_update:
1032  case OMPD_declare_simd:
1033  case OMPD_declare_variant:
1034  case OMPD_declare_target:
1035  case OMPD_end_declare_target:
1036  case OMPD_declare_reduction:
1037  case OMPD_declare_mapper:
1038  case OMPD_taskloop:
1039  case OMPD_taskloop_simd:
1040  case OMPD_requires:
1041  case OMPD_unknown:
1042  llvm_unreachable("Unexpected directive.");
1043  }
1044  }
1045 
1046  return false;
1047 }
1048 
1049 /// Checks if the construct supports lightweight runtime. It must be SPMD
1050 /// construct + inner loop-based construct with static scheduling.
1051 static bool supportsLightweightRuntime(ASTContext &Ctx,
1052  const OMPExecutableDirective &D) {
1053  if (!supportsSPMDExecutionMode(Ctx, D))
1054  return false;
1055  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1056  switch (DirectiveKind) {
1057  case OMPD_target:
1058  case OMPD_target_teams:
1059  case OMPD_target_parallel:
1060  return hasNestedLightweightDirective(Ctx, D);
1061  case OMPD_target_parallel_for:
1062  case OMPD_target_parallel_for_simd:
1063  case OMPD_target_teams_distribute_parallel_for:
1064  case OMPD_target_teams_distribute_parallel_for_simd:
1065  // (Last|First)-privates must be shared in parallel region.
1066  return hasStaticScheduling(D);
1067  case OMPD_target_simd:
1068  case OMPD_target_teams_distribute_simd:
1069  return true;
1070  case OMPD_target_teams_distribute:
1071  return false;
1072  case OMPD_parallel:
1073  case OMPD_for:
1074  case OMPD_parallel_for:
1075  case OMPD_parallel_sections:
1076  case OMPD_for_simd:
1077  case OMPD_parallel_for_simd:
1078  case OMPD_cancel:
1079  case OMPD_cancellation_point:
1080  case OMPD_ordered:
1081  case OMPD_threadprivate:
1082  case OMPD_allocate:
1083  case OMPD_task:
1084  case OMPD_simd:
1085  case OMPD_sections:
1086  case OMPD_section:
1087  case OMPD_single:
1088  case OMPD_master:
1089  case OMPD_critical:
1090  case OMPD_taskyield:
1091  case OMPD_barrier:
1092  case OMPD_taskwait:
1093  case OMPD_taskgroup:
1094  case OMPD_atomic:
1095  case OMPD_flush:
1096  case OMPD_teams:
1097  case OMPD_target_data:
1098  case OMPD_target_exit_data:
1099  case OMPD_target_enter_data:
1100  case OMPD_distribute:
1101  case OMPD_distribute_simd:
1102  case OMPD_distribute_parallel_for:
1103  case OMPD_distribute_parallel_for_simd:
1104  case OMPD_teams_distribute:
1105  case OMPD_teams_distribute_simd:
1106  case OMPD_teams_distribute_parallel_for:
1107  case OMPD_teams_distribute_parallel_for_simd:
1108  case OMPD_target_update:
1109  case OMPD_declare_simd:
1110  case OMPD_declare_variant:
1111  case OMPD_declare_target:
1112  case OMPD_end_declare_target:
1113  case OMPD_declare_reduction:
1114  case OMPD_declare_mapper:
1115  case OMPD_taskloop:
1116  case OMPD_taskloop_simd:
1117  case OMPD_requires:
1118  case OMPD_unknown:
1119  break;
1120  }
1121  llvm_unreachable(
1122  "Unknown programming model for OpenMP directive on NVPTX target.");
1123 }
1124 
1125 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1126  StringRef ParentName,
1127  llvm::Function *&OutlinedFn,
1128  llvm::Constant *&OutlinedFnID,
1129  bool IsOffloadEntry,
1130  const RegionCodeGenTy &CodeGen) {
1131  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1132  EntryFunctionState EST;
1133  WorkerFunctionState WST(CGM, D.getBeginLoc());
1134  Work.clear();
1135  WrapperFunctionsMap.clear();
1136 
1137  // Emit target region as a standalone region.
1138  class NVPTXPrePostActionTy : public PrePostActionTy {
1139  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1140  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1141 
1142  public:
1143  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1144  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1145  : EST(EST), WST(WST) {}
1146  void Enter(CodeGenFunction &CGF) override {
1147  auto &RT =
1148  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1149  RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1150  // Skip target region initialization.
1151  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1152  }
1153  void Exit(CodeGenFunction &CGF) override {
1154  auto &RT =
1155  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1156  RT.clearLocThreadIdInsertPt(CGF);
1157  RT.emitNonSPMDEntryFooter(CGF, EST);
1158  }
1159  } Action(EST, WST);
1160  CodeGen.setAction(Action);
1161  IsInTTDRegion = true;
1162  // Reserve place for the globalized memory.
1163  GlobalizedRecords.emplace_back();
1164  if (!KernelStaticGlobalized) {
1165  KernelStaticGlobalized = new llvm::GlobalVariable(
1166  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1167  llvm::GlobalValue::InternalLinkage,
1168  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1169  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1170  llvm::GlobalValue::NotThreadLocal,
1171  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1172  }
1173  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1174  IsOffloadEntry, CodeGen);
1175  IsInTTDRegion = false;
1176 
1177  // Now change the name of the worker function to correspond to this target
1178  // region's entry function.
1179  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1180 
1181  // Create the worker function
1182  emitWorkerFunction(WST);
1183 }
1184 
1185 // Set up NVPTX threads for the master-worker OpenMP scheme.
1186 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1187  EntryFunctionState &EST,
1188  WorkerFunctionState &WST) {
1189  CGBuilderTy &Bld = CGF.Builder;
1190 
1191  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1192  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1193  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1194  EST.ExitBB = CGF.createBasicBlock(".exit");
1195 
1196  llvm::Value *IsWorker =
1197  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1198  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1199 
1200  CGF.EmitBlock(WorkerBB);
1201  emitCall(CGF, WST.Loc, WST.WorkerFn);
1202  CGF.EmitBranch(EST.ExitBB);
1203 
1204  CGF.EmitBlock(MasterCheckBB);
1205  llvm::Value *IsMaster =
1206  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1207  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1208 
1209  CGF.EmitBlock(MasterBB);
1210  IsInTargetMasterThreadRegion = true;
1211  // SEQUENTIAL (MASTER) REGION START
1212  // First action in sequential region:
1213  // Initialize the state of the OpenMP runtime library on the GPU.
1214  // TODO: Optimize runtime initialization and pass in correct value.
1215  llvm::Value *Args[] = {getThreadLimit(CGF),
1216  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1217  CGF.EmitRuntimeCall(
1218  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1219 
1220  // For data sharing, we need to initialize the stack.
1221  CGF.EmitRuntimeCall(
1222  createNVPTXRuntimeFunction(
1223  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1224 
1225  emitGenericVarsProlog(CGF, WST.Loc);
1226 }
1227 
1228 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1229  EntryFunctionState &EST) {
1230  IsInTargetMasterThreadRegion = false;
1231  if (!CGF.HaveInsertPoint())
1232  return;
1233 
1234  emitGenericVarsEpilog(CGF);
1235 
1236  if (!EST.ExitBB)
1237  EST.ExitBB = CGF.createBasicBlock(".exit");
1238 
1239  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1240  CGF.EmitBranch(TerminateBB);
1241 
1242  CGF.EmitBlock(TerminateBB);
1243  // Signal termination condition.
1244  // TODO: Optimize runtime initialization and pass in correct value.
1245  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1246  CGF.EmitRuntimeCall(
1247  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1248  // Barrier to terminate worker threads.
1249  syncCTAThreads(CGF);
1250  // Master thread jumps to exit point.
1251  CGF.EmitBranch(EST.ExitBB);
1252 
1253  CGF.EmitBlock(EST.ExitBB);
1254  EST.ExitBB = nullptr;
1255 }
1256 
1257 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1258  StringRef ParentName,
1259  llvm::Function *&OutlinedFn,
1260  llvm::Constant *&OutlinedFnID,
1261  bool IsOffloadEntry,
1262  const RegionCodeGenTy &CodeGen) {
1263  ExecutionRuntimeModesRAII ModeRAII(
1264  CurrentExecutionMode, RequiresFullRuntime,
1265  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1266  !supportsLightweightRuntime(CGM.getContext(), D));
1267  EntryFunctionState EST;
1268 
1269  // Emit target region as a standalone region.
1270  class NVPTXPrePostActionTy : public PrePostActionTy {
1271  CGOpenMPRuntimeNVPTX &RT;
1272  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1273  const OMPExecutableDirective &D;
1274 
1275  public:
1276  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1277  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1278  const OMPExecutableDirective &D)
1279  : RT(RT), EST(EST), D(D) {}
1280  void Enter(CodeGenFunction &CGF) override {
1281  RT.emitSPMDEntryHeader(CGF, EST, D);
1282  // Skip target region initialization.
1283  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1284  }
1285  void Exit(CodeGenFunction &CGF) override {
1286  RT.clearLocThreadIdInsertPt(CGF);
1287  RT.emitSPMDEntryFooter(CGF, EST);
1288  }
1289  } Action(*this, EST, D);
1290  CodeGen.setAction(Action);
1291  IsInTTDRegion = true;
1292  // Reserve place for the globalized memory.
1293  GlobalizedRecords.emplace_back();
1294  if (!KernelStaticGlobalized) {
1295  KernelStaticGlobalized = new llvm::GlobalVariable(
1296  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1297  llvm::GlobalValue::InternalLinkage,
1298  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1299  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1300  llvm::GlobalValue::NotThreadLocal,
1301  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1302  }
1303  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1304  IsOffloadEntry, CodeGen);
1305  IsInTTDRegion = false;
1306 }
1307 
1308 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1309  CodeGenFunction &CGF, EntryFunctionState &EST,
1310  const OMPExecutableDirective &D) {
1311  CGBuilderTy &Bld = CGF.Builder;
1312 
1313  // Setup BBs in entry function.
1314  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1315  EST.ExitBB = CGF.createBasicBlock(".exit");
1316 
1317  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1318  /*RequiresOMPRuntime=*/
1319  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1320  /*RequiresDataSharing=*/Bld.getInt16(0)};
1321  CGF.EmitRuntimeCall(
1322  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1323 
1324  if (RequiresFullRuntime) {
1325  // For data sharing, we need to initialize the stack.
1326  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1327  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1328  }
1329 
1330  CGF.EmitBranch(ExecuteBB);
1331 
1332  CGF.EmitBlock(ExecuteBB);
1333 
1334  IsInTargetMasterThreadRegion = true;
1335 }
1336 
1337 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1338  EntryFunctionState &EST) {
1339  IsInTargetMasterThreadRegion = false;
1340  if (!CGF.HaveInsertPoint())
1341  return;
1342 
1343  if (!EST.ExitBB)
1344  EST.ExitBB = CGF.createBasicBlock(".exit");
1345 
1346  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1347  CGF.EmitBranch(OMPDeInitBB);
1348 
1349  CGF.EmitBlock(OMPDeInitBB);
1350  // DeInitialize the OMP state in the runtime; called by all active threads.
1351  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1352  CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1353  CGF.EmitRuntimeCall(
1354  createNVPTXRuntimeFunction(
1355  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1356  CGF.EmitBranch(EST.ExitBB);
1357 
1358  CGF.EmitBlock(EST.ExitBB);
1359  EST.ExitBB = nullptr;
1360 }
1361 
1362 // Create a unique global variable to indicate the execution mode of this target
1363 // region. The execution mode is either 'generic', or 'spmd' depending on the
1364 // target directive. This variable is picked up by the offload library to setup
1365 // the device appropriately before kernel launch. If the execution mode is
1366 // 'generic', the runtime reserves one warp for the master, otherwise, all
1367 // warps participate in parallel work.
1368 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1369  bool Mode) {
1370  auto *GVMode =
1371  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1372  llvm::GlobalValue::WeakAnyLinkage,
1373  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1374  Twine(Name, "_exec_mode"));
1375  CGM.addCompilerUsedGlobal(GVMode);
1376 }
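// Note the inversion: Mode == true (SPMD) stores 0 in <Name>_exec_mode, while
// generic mode stores 1; the offload library reads this value before launch.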
1377 
1378 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1379  ASTContext &Ctx = CGM.getContext();
1380 
1381  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1382  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1383  WST.Loc, WST.Loc);
1384  emitWorkerLoop(CGF, WST);
1385  CGF.FinishFunction();
1386 }
1387 
1388 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1389  WorkerFunctionState &WST) {
1390  //
1391  // The workers enter this loop and wait for parallel work from the master.
1392  // When the master encounters a parallel region it sets up the work + variable
1393  // arguments, and wakes up the workers. The workers first check to see if
1394  // they are required for the parallel region, i.e., within the # of requested
1395  // parallel threads. The activated workers load the variable arguments and
1396  // execute the parallel work.
1397  //
1398 
1399  CGBuilderTy &Bld = CGF.Builder;
1400 
1401  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1402  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1403  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1404  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1405  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1406  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1407 
1408  CGF.EmitBranch(AwaitBB);
1409 
1410  // Workers wait for work from master.
1411  CGF.EmitBlock(AwaitBB);
1412  // Wait for parallel work
1413  syncCTAThreads(CGF);
1414 
1415  Address WorkFn =
1416  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1417  Address ExecStatus =
1418  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1419  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1420  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1421 
1422  // TODO: Optimize runtime initialization and pass in correct value.
1423  llvm::Value *Args[] = {WorkFn.getPointer(),
1424  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1425  llvm::Value *Ret = CGF.EmitRuntimeCall(
1426  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1427  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1428 
1429  // On termination condition (workid == 0), exit loop.
1430  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1431  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1432  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1433 
1434  // Activate requested workers.
1435  CGF.EmitBlock(SelectWorkersBB);
1436  llvm::Value *IsActive =
1437  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1438  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1439 
1440  // Signal start of parallel region.
1441  CGF.EmitBlock(ExecuteBB);
1442  // Skip initialization.
1443  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1444 
1445  // Process work items: outlined parallel functions.
1446  for (llvm::Function *W : Work) {
1447  // Try to match this outlined function.
1448  llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1449 
1450  llvm::Value *WorkFnMatch =
1451  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1452 
1453  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1454  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1455  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1456 
1457  // Execute this outlined function.
1458  CGF.EmitBlock(ExecuteFNBB);
1459 
1460  // Insert call to work function via shared wrapper. The shared
1461  // wrapper takes two arguments:
1462  // - the parallelism level;
1463  // - the thread ID;
1464  emitCall(CGF, WST.Loc, W,
1465  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1466 
1467  // Go to end of parallel region.
1468  CGF.EmitBranch(TerminateBB);
1469 
1470  CGF.EmitBlock(CheckNextBB);
1471  }
1472  // Default case: call to outlined function through pointer if the target
1473  // region makes a declare target call that may contain an orphaned parallel
1474  // directive.
1475  auto *ParallelFnTy =
1476  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1477  /*isVarArg=*/false);
1478  llvm::Value *WorkFnCast =
1479  Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1480  // Insert call to work function via shared wrapper. The shared
1481  // wrapper takes two arguments:
1482  // - the parallelism level;
1483  // - the thread ID;
1484  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1485  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1486  // Go to end of parallel region.
1487  CGF.EmitBranch(TerminateBB);
1488 
1489  // Signal end of parallel region.
1490  CGF.EmitBlock(TerminateBB);
1491  CGF.EmitRuntimeCall(
1492  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1493  llvm::None);
1494  CGF.EmitBranch(BarrierBB);
1495 
1496  // All active and inactive workers wait at a barrier after parallel region.
1497  CGF.EmitBlock(BarrierBB);
1498  // Barrier after parallel region.
1499  syncCTAThreads(CGF);
1500  CGF.EmitBranch(AwaitBB);
1501 
1502  // Exit target region.
1503  CGF.EmitBlock(ExitBB);
1504  // Skip initialization.
1505  clearLocThreadIdInsertPt(CGF);
1506 }
1507 
1508 /// Returns the specified OpenMP runtime function for the current OpenMP
1509 /// implementation. Specialized for the NVPTX device.
1510 /// \param Function OpenMP runtime function.
1511 /// \return Specified function.
1512 llvm::FunctionCallee
1513 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1514  llvm::FunctionCallee RTLFn = nullptr;
1515  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1516  case OMPRTL_NVPTX__kmpc_kernel_init: {
1517  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1518  // RequiresOMPRuntime);
1519  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1520  auto *FnTy =
1521  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1522  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1523  break;
1524  }
1525  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1526  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1527  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1528  auto *FnTy =
1529  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1530  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1531  break;
1532  }
1533  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1534  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1535  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1536  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1537  auto *FnTy =
1538  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1539  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1540  break;
1541  }
1542  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1543  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1544  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1545  auto *FnTy =
1546  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1547  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1548  break;
1549  }
1550  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1551  /// Build void __kmpc_kernel_prepare_parallel(
1552  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1553  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1554  auto *FnTy =
1555  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1556  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1557  break;
1558  }
1559  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1560  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1561  /// int16_t IsOMPRuntimeInitialized);
1562  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1563  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1564  auto *FnTy =
1565  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1566  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1567  break;
1568  }
1569  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1570  /// Build void __kmpc_kernel_end_parallel();
1571  auto *FnTy =
1572  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1573  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1574  break;
1575  }
1576  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1577  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1578  // global_tid);
1579  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1580  auto *FnTy =
1581  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1583  break;
1584  }
1585  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1586  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1587  // global_tid);
1588  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1589  auto *FnTy =
1590  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1591  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1592  break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1595  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1596  // int16_t lane_offset, int16_t warp_size);
1597  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1598  auto *FnTy =
1599  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1600  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1601  break;
1602  }
1603  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1604  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1605  // int16_t lane_offset, int16_t warp_size);
1606  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1607  auto *FnTy =
1608  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1609  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1610  break;
1611  }
1612  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1613  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1614  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1615  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1616  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1617  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1618  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1619  CGM.Int16Ty, CGM.Int16Ty};
1620  auto *ShuffleReduceFnTy =
1621  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1622  /*isVarArg=*/false);
1623  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1624  auto *InterWarpCopyFnTy =
1625  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1626  /*isVarArg=*/false);
1627  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1628  CGM.Int32Ty,
1629  CGM.Int32Ty,
1630  CGM.SizeTy,
1631  CGM.VoidPtrTy,
1632  ShuffleReduceFnTy->getPointerTo(),
1633  InterWarpCopyFnTy->getPointerTo()};
1634  auto *FnTy =
1635  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1636  RTLFn = CGM.CreateRuntimeFunction(
1637  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1638  break;
1639  }
1640  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1641  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1642  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1643  auto *FnTy =
1644  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1645  RTLFn = CGM.CreateRuntimeFunction(
1646  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1647  break;
1648  }
1649  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1650  // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1651  // global_tid, void *global_buffer, int32_t num_of_records, void*
1652  // reduce_data,
1653  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1654  // lane_offset, int16_t shortCircuit),
1655  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1656  // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1657  // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1658  // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1659  // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1660  // *buffer, int idx, void *reduce_data));
1661  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1662  CGM.Int16Ty, CGM.Int16Ty};
1663  auto *ShuffleReduceFnTy =
1664  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1665  /*isVarArg=*/false);
1666  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1667  auto *InterWarpCopyFnTy =
1668  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1669  /*isVarArg=*/false);
1670  llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1671  CGM.VoidPtrTy};
1672  auto *GlobalListFnTy =
1673  llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1674  /*isVarArg=*/false);
1675  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1676  CGM.Int32Ty,
1677  CGM.VoidPtrTy,
1678  CGM.Int32Ty,
1679  CGM.VoidPtrTy,
1680  ShuffleReduceFnTy->getPointerTo(),
1681  InterWarpCopyFnTy->getPointerTo(),
1682  GlobalListFnTy->getPointerTo(),
1683  GlobalListFnTy->getPointerTo(),
1684  GlobalListFnTy->getPointerTo(),
1685  GlobalListFnTy->getPointerTo()};
1686  auto *FnTy =
1687  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1688  RTLFn = CGM.CreateRuntimeFunction(
1689  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1690  break;
1691  }
1692  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1693  /// Build void __kmpc_data_sharing_init_stack();
1694  auto *FnTy =
1695  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1696  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1697  break;
1698  }
1699  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1700  /// Build void __kmpc_data_sharing_init_stack_spmd();
1701  auto *FnTy =
1702  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1703  RTLFn =
1704  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1705  break;
1706  }
1707  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1708  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1709  // int16_t UseSharedMemory);
1710  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1711  auto *FnTy =
1712  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1713  RTLFn = CGM.CreateRuntimeFunction(
1714  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1715  break;
1716  }
1717  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1718  // Build void __kmpc_data_sharing_pop_stack(void *a);
1719  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1720  auto *FnTy =
1721  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1722  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1723  /*Name=*/"__kmpc_data_sharing_pop_stack");
1724  break;
1725  }
1726  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1727  /// Build void __kmpc_begin_sharing_variables(void ***args,
1728  /// size_t n_args);
1729  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1730  auto *FnTy =
1731  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1732  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1733  break;
1734  }
1735  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1736  /// Build void __kmpc_end_sharing_variables();
1737  auto *FnTy =
1738  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1739  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1740  break;
1741  }
1742  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1743  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1744  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1745  auto *FnTy =
1746  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1747  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1748  break;
1749  }
1750  case OMPRTL_NVPTX__kmpc_parallel_level: {
1751  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1752  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1753  auto *FnTy =
1754  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1755  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1756  break;
1757  }
1758  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1759  // Build int8_t __kmpc_is_spmd_exec_mode();
1760  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1761  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1762  break;
1763  }
1764  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1765  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1766  // const void *buf, size_t size, int16_t is_shared, const void **res);
1767  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1768  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1769  auto *FnTy =
1770  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1771  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1772  break;
1773  }
1774  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1775  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1776  // int16_t is_shared);
1777  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1778  auto *FnTy =
1779  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1780  RTLFn =
1781  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1782  break;
1783  }
1784  case OMPRTL__kmpc_barrier: {
1785  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1786  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1787  auto *FnTy =
1788  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1789  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1790  cast<llvm::Function>(RTLFn.getCallee())
1791  ->addFnAttr(llvm::Attribute::Convergent);
1792  break;
1793  }
1794  case OMPRTL__kmpc_barrier_simple_spmd: {
1795  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1796  // global_tid);
1797  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1798  auto *FnTy =
1799  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1800  RTLFn =
1801  CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1802  cast<llvm::Function>(RTLFn.getCallee())
1803  ->addFnAttr(llvm::Attribute::Convergent);
1804  break;
1805  }
1806  case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: {
1807  // Build int32_t __kmpc_warp_active_thread_mask(void);
1808  auto *FnTy =
1809  llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false);
1810  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask");
1811  break;
1812  }
1813  case OMPRTL_NVPTX__kmpc_syncwarp: {
1814  // Build void __kmpc_syncwarp(kmp_int32 Mask);
1815  auto *FnTy =
1816  llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false);
1817  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_syncwarp");
1818  break;
1819  }
1820  }
1821  return RTLFn;
1822 }
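// Editor's note (illustrative, not part of the original file): callers in this
// file pair createNVPTXRuntimeFunction() with EmitRuntimeCall(); a minimal
// sketch for the no-argument entry point __kmpc_kernel_end_parallel:
//
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
//       llvm::None);
//
// The returned llvm::FunctionCallee carries both the declared type and the
// callee, so call sites do not need to restate the signature.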
1823 
1824 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1825  llvm::Constant *Addr,
1826  uint64_t Size, int32_t,
1827  llvm::GlobalValue::LinkageTypes) {
1828  // TODO: Add support for global variables on the device after declare target
1829  // support.
1830  if (!isa<llvm::Function>(Addr))
1831  return;
1832  llvm::Module &M = CGM.getModule();
1833  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1834 
1835  // Get "nvvm.annotations" metadata node
1836  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1837 
1838  llvm::Metadata *MDVals[] = {
1839  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1840  llvm::ConstantAsMetadata::get(
1841  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1842  // Append metadata to nvvm.annotations
1843  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1844 }
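// Editor's note (illustrative, not part of the original file): for a target
// entry point @__omp_offloading_..._foo the metadata appended above is roughly
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void (...)* @__omp_offloading_..._foo, !"kernel", i32 1}
//
// which is what the NVPTX backend uses to treat the function as a kernel
// rather than an ordinary device function.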
1845 
1846 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1847  const OMPExecutableDirective &D, StringRef ParentName,
1848  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1849  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1850  if (!IsOffloadEntry) // Nothing to do.
1851  return;
1852 
1853  assert(!ParentName.empty() && "Invalid target region parent name!");
1854 
1855  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1856  if (Mode)
1857  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1858  CodeGen);
1859  else
1860  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1861  CodeGen);
1862 
1863  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1864 }
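// Editor's note (illustrative, not part of the original file): as a rough
// example, a combined construct such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i)
//     a[i] += b[i];
//
// is typically eligible for SPMD mode (all threads enter the outlined region
// together), whereas a target region with sequential code between the teams
// and parallel parts generally takes the non-SPMD master/worker scheme.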
1865 
1866 namespace {
1867 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1868 /// Enum for accessing the reserved_2 field of the ident_t struct.
1869 enum ModeFlagsTy : unsigned {
1870  /// Bit set to 1 when in SPMD mode.
1871  KMP_IDENT_SPMD_MODE = 0x01,
1872  /// Bit set to 1 when a simplified runtime is used.
1873  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1874  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1875 };
1876 
1877 /// Special Undefined mode: the combination of non-SPMD mode and the simple runtime.
1878 static const ModeFlagsTy UndefinedMode =
1879  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1880 } // anonymous namespace
1881 
1882 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1883  switch (getExecutionMode()) {
1884  case EM_SPMD:
1885  if (requiresFullRuntime())
1886  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1887  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1888  case EM_NonSPMD:
1889  assert(requiresFullRuntime() && "Expected full runtime.");
1890  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1891  case EM_Unknown:
1892  return UndefinedMode;
1893  }
1894  llvm_unreachable("Unknown flags are requested.");
1895 }
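// Editor's note (summary of the switch above, not part of the original file):
// the resulting reserved_2 flag combinations are
//   SPMD, full runtime     -> KMP_IDENT_SPMD_MODE only                (0x01)
//   SPMD, simple runtime   -> KMP_IDENT_SPMD_MODE | SIMPLE_RT_MODE    (0x03)
//   non-SPMD, full runtime -> neither bit set                         (0x00)
//   unknown                -> UndefinedMode                           (0x02)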
1896 
1897 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1898  : CGOpenMPRuntime(CGM, "_", "$") {
1899  if (!CGM.getLangOpts().OpenMPIsDevice)
1900  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1901 }
1902 
1903 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1904  OpenMPProcBindClauseKind ProcBind,
1905  SourceLocation Loc) {
1906  // Do nothing in case of SPMD mode and L0 parallel.
1907  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1908  return;
1909 
1910  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1911 }
1912 
1913 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1914  llvm::Value *NumThreads,
1915  SourceLocation Loc) {
1916  // Do nothing in case of SPMD mode and L0 parallel.
1917  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1918  return;
1919 
1920  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1921 }
1922 
1923 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1924  const Expr *NumTeams,
1925  const Expr *ThreadLimit,
1926  SourceLocation Loc) {}
1927 
1928 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1929  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1930  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1931  // Emit target region as a standalone region.
1932  class NVPTXPrePostActionTy : public PrePostActionTy {
1933  bool &IsInParallelRegion;
1934  bool PrevIsInParallelRegion;
1935 
1936  public:
1937  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1938  : IsInParallelRegion(IsInParallelRegion) {}
1939  void Enter(CodeGenFunction &CGF) override {
1940  PrevIsInParallelRegion = IsInParallelRegion;
1941  IsInParallelRegion = true;
1942  }
1943  void Exit(CodeGenFunction &CGF) override {
1944  IsInParallelRegion = PrevIsInParallelRegion;
1945  }
1946  } Action(IsInParallelRegion);
1947  CodeGen.setAction(Action);
1948  bool PrevIsInTTDRegion = IsInTTDRegion;
1949  IsInTTDRegion = false;
1950  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1951  IsInTargetMasterThreadRegion = false;
1952  auto *OutlinedFun =
1953  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1954  D, ThreadIDVar, InnermostKind, CodeGen));
1955  if (CGM.getLangOpts().Optimize) {
1956  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1957  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1958  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1959  }
1960  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1961  IsInTTDRegion = PrevIsInTTDRegion;
1962  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1963  !IsInParallelRegion) {
1964  llvm::Function *WrapperFun =
1965  createParallelDataSharingWrapper(OutlinedFun, D);
1966  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1967  }
1968 
1969  return OutlinedFun;
1970 }
1971 
1972 /// Get list of lastprivate variables from the teams distribute ... or
1973 /// teams {distribute ...} directives.
1974 static void
1975 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1976  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1977  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1978  "expected teams directive.");
1979  const OMPExecutableDirective *Dir = &D;
1980  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1981  if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1982  Ctx,
1983  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1984  /*IgnoreCaptured=*/true))) {
1985  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1986  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1987  Dir = nullptr;
1988  }
1989  }
1990  if (!Dir)
1991  return;
1992  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1993  for (const Expr *E : C->getVarRefs())
1994  Vars.push_back(getPrivateItem(E));
1995  }
1996 }
1997 
1998 /// Get list of reduction variables from the teams ... directives.
1999 static void
2000 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2001  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2002  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2003  "expected teams directive.");
2004  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2005  for (const Expr *E : C->privates())
2006  Vars.push_back(getPrivateItem(E));
2007  }
2008 }
2009 
2010 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2011  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2012  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2013  SourceLocation Loc = D.getBeginLoc();
2014 
2015  const RecordDecl *GlobalizedRD = nullptr;
2016  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2017  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2018  // Globalize team reductions variable unconditionally in all modes.
2019  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2020  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2021  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2022  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2023  if (!LastPrivatesReductions.empty()) {
2024  GlobalizedRD = ::buildRecordForGlobalizedVars(
2025  CGM.getContext(), llvm::None, LastPrivatesReductions,
2026  MappedDeclsFields, WarpSize);
2027  }
2028  } else if (!LastPrivatesReductions.empty()) {
2029  assert(!TeamAndReductions.first &&
2030  "Previous team declaration is not expected.");
2031  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2032  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2033  }
2034 
2035  // Emit target region as a standalone region.
2036  class NVPTXPrePostActionTy : public PrePostActionTy {
2037  SourceLocation &Loc;
2038  const RecordDecl *GlobalizedRD;
2039  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2040  &MappedDeclsFields;
2041 
2042  public:
2043  NVPTXPrePostActionTy(
2044  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2045  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2046  &MappedDeclsFields)
2047  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2048  MappedDeclsFields(MappedDeclsFields) {}
2049  void Enter(CodeGenFunction &CGF) override {
2050  auto &Rt =
2051  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2052  if (GlobalizedRD) {
2053  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2054  I->getSecond().GlobalRecord = GlobalizedRD;
2055  I->getSecond().MappedParams =
2056  std::make_unique<CodeGenFunction::OMPMapVars>();
2057  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2058  for (const auto &Pair : MappedDeclsFields) {
2059  assert(Pair.getFirst()->isCanonicalDecl() &&
2060  "Expected canonical declaration");
2061  Data.insert(std::make_pair(Pair.getFirst(),
2062  MappedVarData(Pair.getSecond(),
2063  /*IsOnePerTeam=*/true)));
2064  }
2065  }
2066  Rt.emitGenericVarsProlog(CGF, Loc);
2067  }
2068  void Exit(CodeGenFunction &CGF) override {
2069  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2070  .emitGenericVarsEpilog(CGF);
2071  }
2072  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2073  CodeGen.setAction(Action);
2074  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2075  D, ThreadIDVar, InnermostKind, CodeGen);
2076  if (CGM.getLangOpts().Optimize) {
2077  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2078  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2079  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2080  }
2081 
2082  return OutlinedFun;
2083 }
2084 
2085 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2086  SourceLocation Loc,
2087  bool WithSPMDCheck) {
2088  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2089  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2090  return;
2091 
2092  CGBuilderTy &Bld = CGF.Builder;
2093 
2094  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2095  if (I == FunctionGlobalizedDecls.end())
2096  return;
2097  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2098  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2099  QualType SecGlobalRecTy;
2100 
2101  // Recover pointer to this function's global record. The runtime will
2102  // handle the specifics of the allocation of the memory.
2103  // Use actual memory size of the record including the padding
2104  // for alignment purposes.
2105  unsigned Alignment =
2106  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2107  unsigned GlobalRecordSize =
2108  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2109  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2110 
2111  llvm::PointerType *GlobalRecPtrTy =
2112  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2113  llvm::Value *GlobalRecCastAddr;
2114  llvm::Value *IsTTD = nullptr;
2115  if (!IsInTTDRegion &&
2116  (WithSPMDCheck ||
2117  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2118  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2119  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2120  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2121  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2122  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2123  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2124  llvm::Value *PL = CGF.EmitRuntimeCall(
2125  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2126  {RTLoc, ThreadID});
2127  IsTTD = Bld.CreateIsNull(PL);
2128  }
2129  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2130  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2131  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2132  // There is no need to emit line number for unconditional branch.
2133  (void)ApplyDebugLocation::CreateEmpty(CGF);
2134  CGF.EmitBlock(SPMDBB);
2135  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2136  CharUnits::fromQuantity(Alignment));
2137  CGF.EmitBranch(ExitBB);
2138  // There is no need to emit line number for unconditional branch.
2139  (void)ApplyDebugLocation::CreateEmpty(CGF);
2140  CGF.EmitBlock(NonSPMDBB);
2141  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2142  if (const RecordDecl *SecGlobalizedVarsRecord =
2143  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2144  SecGlobalRecTy =
2145  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2146 
2147  // Recover pointer to this function's global record. The runtime will
2148  // handle the specifics of the allocation of the memory.
2149  // Use actual memory size of the record including the padding
2150  // for alignment purposes.
2151  unsigned Alignment =
2152  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2153  unsigned GlobalRecordSize =
2154  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2155  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2156  Size = Bld.CreateSelect(
2157  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2158  }
2159  // TODO: allow the usage of shared memory to be controlled by
2160  // the user, for now, default to global.
2161  llvm::Value *GlobalRecordSizeArg[] = {
2162  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2163  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2164  createNVPTXRuntimeFunction(
2165  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2166  GlobalRecordSizeArg);
2167  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2168  GlobalRecValue, GlobalRecPtrTy);
2169  CGF.EmitBlock(ExitBB);
2170  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2171  /*NumReservedValues=*/2, "_select_stack");
2172  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2173  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2174  GlobalRecCastAddr = Phi;
2175  I->getSecond().GlobalRecordAddr = Phi;
2176  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2177  } else if (IsInTTDRegion) {
2178  assert(GlobalizedRecords.back().Records.size() < 2 &&
2179  "Expected less than 2 globalized records: one for target and one "
2180  "for teams.");
2181  unsigned Offset = 0;
2182  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2183  QualType RDTy = CGM.getContext().getRecordType(RD);
2184  unsigned Alignment =
2185  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2186  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2187  Offset =
2188  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2189  }
2190  unsigned Alignment =
2191  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2192  Offset = llvm::alignTo(Offset, Alignment);
2193  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2194  ++GlobalizedRecords.back().RegionCounter;
2195  if (GlobalizedRecords.back().Records.size() == 1) {
2196  assert(KernelStaticGlobalized &&
2197  "Kernel static pointer must be initialized already.");
2198  auto *UseSharedMemory = new llvm::GlobalVariable(
2199  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2201  "_openmp_static_kernel$is_shared");
2202  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2203  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2204  /*DestWidth=*/16, /*Signed=*/0);
2205  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2206  Address(UseSharedMemory,
2207  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2208  /*Volatile=*/false, Int16Ty, Loc);
2209  auto *StaticGlobalized = new llvm::GlobalVariable(
2210  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2211  llvm::GlobalValue::CommonLinkage, nullptr);
2212  auto *RecSize = new llvm::GlobalVariable(
2213  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2215  "_openmp_static_kernel$size");
2216  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2217  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2218  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2219  CGM.getContext().getSizeType(), Loc);
2220  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2221  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2222  llvm::Value *GlobalRecordSizeArg[] = {
2223  llvm::ConstantInt::get(
2224  CGM.Int16Ty,
2225  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2226  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2227  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2228  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2229  GlobalRecordSizeArg);
2230  GlobalizedRecords.back().Buffer = StaticGlobalized;
2231  GlobalizedRecords.back().RecSize = RecSize;
2232  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2233  GlobalizedRecords.back().Loc = Loc;
2234  }
2235  assert(KernelStaticGlobalized && "Global address must be set already.");
2236  Address FrameAddr = CGF.EmitLoadOfPointer(
2237  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2238  CGM.getContext()
2239  .getPointerType(CGM.getContext().VoidPtrTy)
2240  .castAs<PointerType>());
2241  llvm::Value *GlobalRecValue =
2242  Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2243  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2244  I->getSecond().IsInSPMDModeFlag = nullptr;
2245  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2246  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2247  } else {
2248  // TODO: allow the usage of shared memory to be controlled by
2249  // the user, for now, default to global.
2250  llvm::Value *GlobalRecordSizeArg[] = {
2251  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2252  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2253  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2254  createNVPTXRuntimeFunction(
2255  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2256  GlobalRecordSizeArg);
2257  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2258  GlobalRecValue, GlobalRecPtrTy);
2259  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2260  I->getSecond().IsInSPMDModeFlag = nullptr;
2261  }
2262  LValue Base =
2263  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2264 
2265  // Emit the "global alloca" which is a GEP from the global declaration
2266  // record using the pointer returned by the runtime.
2267  LValue SecBase;
2268  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2269  if (IsTTD) {
2270  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2271  llvm::PointerType *SecGlobalRecPtrTy =
2272  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2273  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2274  Bld.CreatePointerBitCastOrAddrSpaceCast(
2275  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2276  SecGlobalRecTy);
2277  }
2278  for (auto &Rec : I->getSecond().LocalVarData) {
2279  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2280  llvm::Value *ParValue;
2281  if (EscapedParam) {
2282  const auto *VD = cast<VarDecl>(Rec.first);
2283  LValue ParLVal =
2284  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2285  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2286  }
2287  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2288  // Emit VarAddr based on lane-id if required.
2289  QualType VarTy;
2290  if (Rec.second.IsOnePerTeam) {
2291  VarTy = Rec.second.FD->getType();
2292  } else {
2293  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2294  VarAddr.getAddress().getPointer(),
2295  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2296  VarTy =
2297  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2298  VarAddr = CGF.MakeAddrLValue(
2299  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2300  AlignmentSource::Decl);
2301  }
2302  Rec.second.PrivateAddr = VarAddr.getAddress();
2303  if (!IsInTTDRegion &&
2304  (WithSPMDCheck ||
2305  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2306  assert(I->getSecond().IsInSPMDModeFlag &&
2307  "Expected unknown execution mode or required SPMD check.");
2308  if (IsTTD) {
2309  assert(SecIt->second.IsOnePerTeam &&
2310  "Secondary glob data must be one per team.");
2311  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2312  VarAddr.setAddress(
2313  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2314  VarAddr.getPointer()),
2315  VarAddr.getAlignment()));
2316  Rec.second.PrivateAddr = VarAddr.getAddress();
2317  }
2318  Address GlobalPtr = Rec.second.PrivateAddr;
2319  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2320  Rec.second.PrivateAddr = Address(
2321  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2322  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2323  LocalAddr.getAlignment());
2324  }
2325  if (EscapedParam) {
2326  const auto *VD = cast<VarDecl>(Rec.first);
2327  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2328  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2329  }
2330  if (IsTTD)
2331  ++SecIt;
2332  }
2333  }
2334  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2335  // Recover pointer to this function's global record. The runtime will
2336  // handle the specifics of the allocation of the memory.
2337  // Use actual memory size of the record including the padding
2338  // for alignment purposes.
2339  CGBuilderTy &Bld = CGF.Builder;
2340  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2341  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2342  Size = Bld.CreateNUWAdd(
2343  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2344  llvm::Value *AlignVal =
2345  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2346  Size = Bld.CreateUDiv(Size, AlignVal);
2347  Size = Bld.CreateNUWMul(Size, AlignVal);
2348  // TODO: allow the usage of shared memory to be controlled by
2349  // the user, for now, default to global.
2350  llvm::Value *GlobalRecordSizeArg[] = {
2351  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2352  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2353  createNVPTXRuntimeFunction(
2354  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2355  GlobalRecordSizeArg);
2356  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2357  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2358  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2359  CGM.getContext().getDeclAlign(VD),
2360  AlignmentSource::Decl);
2361  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2362  Base.getAddress());
2363  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2364  }
2365  I->getSecond().MappedParams->apply(CGF);
2366 }
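// Editor's note (summary of the prolog above, not part of the original file):
// the globalized record is materialized in one of three ways:
//  - with a runtime SPMD check: a PHI selects between a null record (SPMD, no
//    globalization needed) and __kmpc_data_sharing_coalesced_push_stack;
//  - inside a target/teams distribute (TTD) region: a slot in the statically
//    allocated team buffer obtained through __kmpc_get_team_static_memory;
//  - otherwise: an unconditional __kmpc_data_sharing_coalesced_push_stack.
// Escaped variable-length declarations are always pushed individually, with
// their sizes rounded up to the declaration alignment.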
2367 
2368 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2369  bool WithSPMDCheck) {
2370  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2371  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2372  return;
2373 
2374  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2375  if (I != FunctionGlobalizedDecls.end()) {
2376  I->getSecond().MappedParams->restore(CGF);
2377  if (!CGF.HaveInsertPoint())
2378  return;
2379  for (llvm::Value *Addr :
2380  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2381  CGF.EmitRuntimeCall(
2382  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2383  Addr);
2384  }
2385  if (I->getSecond().GlobalRecordAddr) {
2386  if (!IsInTTDRegion &&
2387  (WithSPMDCheck ||
2388  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2389  CGBuilderTy &Bld = CGF.Builder;
2390  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2391  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2392  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2393  // There is no need to emit line number for unconditional branch.
2394  (void)ApplyDebugLocation::CreateEmpty(CGF);
2395  CGF.EmitBlock(NonSPMDBB);
2396  CGF.EmitRuntimeCall(
2397  createNVPTXRuntimeFunction(
2398  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2399  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2400  CGF.EmitBlock(ExitBB);
2401  } else if (IsInTTDRegion) {
2402  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2403  "region counter must be > 0.");
2404  --GlobalizedRecords.back().RegionCounter;
2405  // Emit the restore function only in the target region.
2406  if (GlobalizedRecords.back().RegionCounter == 0) {
2407  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2408  /*DestWidth=*/16, /*Signed=*/0);
2409  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2410  Address(GlobalizedRecords.back().UseSharedMemory,
2411  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2412  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2413  llvm::Value *Args[] = {
2414  llvm::ConstantInt::get(
2415  CGM.Int16Ty,
2416  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2417  IsInSharedMemory};
2418  CGF.EmitRuntimeCall(
2419  createNVPTXRuntimeFunction(
2420  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2421  Args);
2422  }
2423  } else {
2424  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2425  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2426  I->getSecond().GlobalRecordAddr);
2427  }
2428  }
2429  }
2430 }
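// Editor's note (summary, not part of the original file): the epilog mirrors
// the prolog: the variable-length allocations are popped first, in reverse
// order, and the record itself is then either popped, skipped under the SPMD
// check, or handed back through __kmpc_restore_team_static_memory once the
// last TTD region of the target has finished.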
2431 
2432 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2433  const OMPExecutableDirective &D,
2434  SourceLocation Loc,
2435  llvm::Function *OutlinedFn,
2436  ArrayRef<llvm::Value *> CapturedVars) {
2437  if (!CGF.HaveInsertPoint())
2438  return;
2439 
2440  Address ZeroAddr = CGF.CreateMemTemp(
2441  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2442  /*Name*/ ".zero.addr");
2443  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2444  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2445  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2446  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2447  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2448  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2449 }
2450 
2451 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2452  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2453  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2454  if (!CGF.HaveInsertPoint())
2455  return;
2456 
2457  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2458  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2459  else
2460  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2461 }
2462 
2463 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2464  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2465  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2466  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2467 
2468  // Force inline this outlined function at its call site.
2469  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2470 
2471  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2472  /*DestWidth=*/32, /*Signed=*/1),
2473  ".zero.addr");
2474  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2475  // ThreadId for serialized parallels is 0.
2476  Address ThreadIDAddr = ZeroAddr;
2477  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2478  CodeGenFunction &CGF, PrePostActionTy &Action) {
2479  Action.Enter(CGF);
2480 
2481  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2482  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2483  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2484  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2485  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2486  };
2487  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2488  PrePostActionTy &) {
2489 
2490  RegionCodeGenTy RCG(CodeGen);
2491  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2492  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2493  llvm::Value *Args[] = {RTLoc, ThreadID};
2494 
2495  NVPTXActionTy Action(
2496  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2497  Args,
2498  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2499  Args);
2500  RCG.setAction(Action);
2501  RCG(CGF);
2502  };
2503 
2504  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2505  PrePostActionTy &Action) {
2506  CGBuilderTy &Bld = CGF.Builder;
2507  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2508  assert(WFn && "Wrapper function does not exist!");
2509  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2510 
2511  // Prepare for parallel region. Indicate the outlined function.
2512  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2513  CGF.EmitRuntimeCall(
2514  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2515  Args);
2516 
2517  // Create a private scope that will globalize the arguments
2518  // passed from the outside of the target region.
2519  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2520 
2521  // There's something to share.
2522  if (!CapturedVars.empty()) {
2523  // Prepare for parallel region. Indicate the outlined function.
2524  Address SharedArgs =
2525  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2526  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2527 
2528  llvm::Value *DataSharingArgs[] = {
2529  SharedArgsPtr,
2530  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2531  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2532  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2533  DataSharingArgs);
2534 
2535  // Store variable address in a list of references to pass to workers.
2536  unsigned Idx = 0;
2537  ASTContext &Ctx = CGF.getContext();
2538  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2539  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2540  .castAs<PointerType>());
2541  for (llvm::Value *V : CapturedVars) {
2542  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2543  llvm::Value *PtrV;
2544  if (V->getType()->isIntegerTy())
2545  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2546  else
2547  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2548  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2549  Ctx.getPointerType(Ctx.VoidPtrTy));
2550  ++Idx;
2551  }
2552  }
2553 
2554  // Activate workers. This barrier is used by the master to signal
2555  // work for the workers.
2556  syncCTAThreads(CGF);
2557 
2558  // OpenMP [2.5, Parallel Construct, p.49]
2559  // There is an implied barrier at the end of a parallel region. After the
2560  // end of a parallel region, only the master thread of the team resumes
2561  // execution of the enclosing task region.
2562  //
2563  // The master waits at this barrier until all workers are done.
2564  syncCTAThreads(CGF);
2565 
2566  if (!CapturedVars.empty())
2567  CGF.EmitRuntimeCall(
2568  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2569 
2570  // Remember for post-processing in worker loop.
2571  Work.emplace_back(WFn);
2572  };
2573 
2574  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2575  CodeGenFunction &CGF, PrePostActionTy &Action) {
2576  if (IsInParallelRegion) {
2577  SeqGen(CGF, Action);
2578  } else if (IsInTargetMasterThreadRegion) {
2579  L0ParallelGen(CGF, Action);
2580  } else {
2581  // Check for master and then parallelism:
2582  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2583  // Serialized execution.
2584  // } else {
2585  // Worker call.
2586  // }
2587  CGBuilderTy &Bld = CGF.Builder;
2588  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2589  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2590  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2591  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2592  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2593  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2594  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2595  // There is no need to emit line number for unconditional branch.
2596  (void)ApplyDebugLocation::CreateEmpty(CGF);
2597  CGF.EmitBlock(ParallelCheckBB);
2598  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2599  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2600  llvm::Value *PL = CGF.EmitRuntimeCall(
2601  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2602  {RTLoc, ThreadID});
2603  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2604  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2605  CGF.EmitBlock(SeqBB);
2606  SeqGen(CGF, Action);
2607  CGF.EmitBranch(ExitBB);
2608  // There is no need to emit line number for unconditional branch.
2609  (void)ApplyDebugLocation::CreateEmpty(CGF);
2610  CGF.EmitBlock(MasterBB);
2611  L0ParallelGen(CGF, Action);
2612  CGF.EmitBranch(ExitBB);
2613  // There is no need to emit line number for unconditional branch.
2614  (void)ApplyDebugLocation::CreateEmpty(CGF);
2615  // Emit the continuation block for code after the if.
2616  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2617  }
2618  };
2619 
2620  if (IfCond) {
2621  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2622  } else {
2623  CodeGenFunction::RunCleanupsScope Scope(CGF);
2624  RegionCodeGenTy ThenRCG(LNParallelGen);
2625  ThenRCG(CGF);
2626  }
2627 }
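// Editor's note (illustrative, not part of the original file): for a level-0
// parallel region in non-SPMD mode the master thread emits, roughly,
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&args, n);  // only if there are captures
//   ... store the captured pointers into args ...
//   __kmpc_barrier_simple_spmd(...);           // release the workers
//   __kmpc_barrier_simple_spmd(...);           // wait for them to finish
//   __kmpc_end_sharing_variables();            // only if there are captures
//
// while the workers pick wrapper_fn up inside the worker loop emitted earlier.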
2628 
2629 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2630  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2631  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2632  // Just call the outlined function to execute the parallel region.
2633  // OutlinedFn(&GTid, &zero, CapturedStruct);
2634  //
2635  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2636 
2637  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2638  /*DestWidth=*/32, /*Signed=*/1),
2639  ".zero.addr");
2640  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2641  // ThreadId for serialized parallels is 0.
2642  Address ThreadIDAddr = ZeroAddr;
2643  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2644  &ThreadIDAddr](CodeGenFunction &CGF,
2645  PrePostActionTy &Action) {
2646  Action.Enter(CGF);
2647 
2648  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2649  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2650  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2651  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2652  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2653  };
2654  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2655  PrePostActionTy &) {
2656 
2657  RegionCodeGenTy RCG(CodeGen);
2658  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2659  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2660  llvm::Value *Args[] = {RTLoc, ThreadID};
2661 
2662  NVPTXActionTy Action(
2663  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2664  Args,
2665  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2666  Args);
2667  RCG.setAction(Action);
2668  RCG(CGF);
2669  };
2670 
2671  if (IsInTargetMasterThreadRegion) {
2672  // In the worker we need to use the real thread id.
2673  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2674  RegionCodeGenTy RCG(CodeGen);
2675  RCG(CGF);
2676  } else {
2677  // If we are not in the target region, it is definitely L2 parallelism or
2678  // more, because in SPMD mode we always have an L1 parallel level, so we
2679  // don't need to check for orphaned directives.
2680  RegionCodeGenTy RCG(SeqGen);
2681  RCG(CGF);
2682  }
2683 }
2684 
2685 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2686  // Always emit simple barriers!
2687  if (!CGF.HaveInsertPoint())
2688  return;
2689  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2690  // This function does not use parameters, so we can emit just default values.
2691  llvm::Value *Args[] = {
2692  llvm::ConstantPointerNull::get(
2693  cast<llvm::PointerType>(getIdentTyPointerTy())),
2694  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2695  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2696  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2697  Call->setConvergent();
2698 }
2699 
2700 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2701  SourceLocation Loc,
2702  OpenMPDirectiveKind Kind, bool,
2703  bool) {
2704  // Always emit simple barriers!
2705  if (!CGF.HaveInsertPoint())
2706  return;
2707  // Build call __kmpc_cancel_barrier(loc, thread_id);
2708  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2709  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2710  getThreadID(CGF, Loc)};
2711  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2712  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2713  Call->setConvergent();
2714 }
2715 
2716 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2717  CodeGenFunction &CGF, StringRef CriticalName,
2718  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2719  const Expr *Hint) {
2720  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2721  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2722  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2723  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2724  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2725 
2726  // Get the mask of active threads in the warp.
2727  llvm::Value *Mask = CGF.EmitRuntimeCall(
2728  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
2729  // Fetch team-local id of the thread.
2730  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2731 
2732  // Get the width of the team.
2733  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2734 
2735  // Initialize the counter variable for the loop.
2736  QualType Int32Ty =
2737  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2738  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2739  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2740  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2741  /*isInit=*/true);
2742 
2743  // Block checks if loop counter exceeds upper bound.
2744  CGF.EmitBlock(LoopBB);
2745  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2746  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2747  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2748 
2749  // Block tests which thread should execute the region, and which threads
2750  // should go straight to the synchronisation point.
2751  CGF.EmitBlock(TestBB);
2752  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2753  llvm::Value *CmpThreadToCounter =
2754  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2755  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2756 
2757  // Block emits the body of the critical region.
2758  CGF.EmitBlock(BodyBB);
2759 
2760  // Output the critical statement.
2761  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2762  Hint);
2763 
2764  // After the body surrounded by the critical region, the single executing
2765  // thread will jump to the synchronisation point.
2766  // Block waits for all threads in current team to finish then increments the
2767  // counter variable and returns to the loop.
2768  CGF.EmitBlock(SyncBB);
2769  // Reconverge active threads in the warp.
2770  (void)CGF.EmitRuntimeCall(
2771  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);
2772 
2773  llvm::Value *IncCounterVal =
2774  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2775  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2776  CGF.EmitBranch(LoopBB);
2777 
2778  // Block that is reached when all threads in the team complete the region.
2779  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2780 }
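// Editor's note (illustrative, not part of the original file): the blocks
// above implement, per team, roughly
//
//   mask = __kmpc_warp_active_thread_mask();
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical region body>;
//     __kmpc_syncwarp(mask);
//   }
//
// so threads take turns executing the critical region in thread-id order and
// the warp is re-converged after every turn.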
2781 
2782 /// Cast value to the specified type.
2783 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2784  QualType ValTy, QualType CastTy,
2785  SourceLocation Loc) {
2786  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2787  "Cast type must be sized.");
2788  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2789  "Val type must be sized.");
2790  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2791  if (ValTy == CastTy)
2792  return Val;
2793  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2794  CGF.getContext().getTypeSizeInChars(CastTy))
2795  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2796  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2797  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2798  CastTy->hasSignedIntegerRepresentation());
2799  Address CastItem = CGF.CreateMemTemp(CastTy);
2800  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2801  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2802  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2803  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2804 }
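// Editor's note (illustrative, not part of the original file): for example, a
// 4-byte float destined for the 32-bit shuffle type takes the same-size
// bitcast path above, an int8_t widened to int32_t takes the integer-cast
// path, and an odd-sized aggregate is stored into a temporary and reloaded
// with the requested type.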
2805 
2806 /// This function creates calls to one of two shuffle functions to copy
2807 /// variables between lanes in a warp.
2808 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2809  llvm::Value *Elem,
2810  QualType ElemType,
2811  llvm::Value *Offset,
2812  SourceLocation Loc) {
2813  CodeGenModule &CGM = CGF.CGM;
2814  CGBuilderTy &Bld = CGF.Builder;
2815  CGOpenMPRuntimeNVPTX &RT =
2816  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2817 
2818  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2819  assert(Size.getQuantity() <= 8 &&
2820  "Unsupported bitwidth in shuffle instruction.");
2821 
2822  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2823  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2824  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2825 
2826  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2827  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2828  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2829  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2830  llvm::Value *WarpSize =
2831  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2832 
2833  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2834  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2835 
2836  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2837 }
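// Editor's note (illustrative, not part of the original file): an 8-byte
// double is routed through __kmpc_shuffle_int64 after a 64-bit bitcast, while
// anything of 4 bytes or less (float, short, char) is widened to a 32-bit
// integer and goes through __kmpc_shuffle_int32.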
2838 
2839 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2840  Address DestAddr, QualType ElemType,
2841  llvm::Value *Offset, SourceLocation Loc) {
2842  CGBuilderTy &Bld = CGF.Builder;
2843 
2844  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2845  // Create the loop over the big sized data.
2846  // ptr = (void*)Elem;
2847  // ptrEnd = (void*) Elem + 1;
2848  // Step = 8;
2849  // while (ptr + Step < ptrEnd)
2850  // shuffle((int64_t)*ptr);
2851  // Step = 4;
2852  // while (ptr + Step < ptrEnd)
2853  // shuffle((int32_t)*ptr);
2854  // ...
2855  Address ElemPtr = DestAddr;
2856  Address Ptr = SrcAddr;
2857  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2858  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2859  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2860  if (Size < CharUnits::fromQuantity(IntSize))
2861  continue;
2862  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2863  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2864  /*Signed=*/1);
2865  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2866  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2867  ElemPtr =
2868  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2869  if (Size.getQuantity() / IntSize > 1) {
2870  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2871  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2872  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2873  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2874  CGF.EmitBlock(PreCondBB);
2875  llvm::PHINode *PhiSrc =
2876  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2877  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2878  llvm::PHINode *PhiDest =
2879  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2880  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2881  Ptr = Address(PhiSrc, Ptr.getAlignment());
2882  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2883  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2884  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2885  Ptr.getPointer(), CGF.VoidPtrTy));
2886  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2887  ThenBB, ExitBB);
2888  CGF.EmitBlock(ThenBB);
2889  llvm::Value *Res = createRuntimeShuffleFunction(
2890  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2891  IntType, Offset, Loc);
2892  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2893  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2894  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2895  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2896  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2897  CGF.EmitBranch(PreCondBB);
2898  CGF.EmitBlock(ExitBB);
2899  } else {
2900  llvm::Value *Res = createRuntimeShuffleFunction(
2901  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2902  IntType, Offset, Loc);
2903  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2904  Ptr = Bld.CreateConstGEP(Ptr, 1);
2905  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2906  }
2907  Size = Size % IntSize;
2908  }
2909 }
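// Editor's note (illustrative, not part of the original file): a 12-byte
// element is copied as one 8-byte shuffle followed by one 4-byte shuffle with
// no loop, while a 24-byte element (Size / 8 > 1) uses the pointer-walking
// loop above for its 8-byte chunks and then has no remainder left.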
2910 
2911 namespace {
2912 enum CopyAction : unsigned {
2913  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2914  // the warp using shuffle instructions.
2915  RemoteLaneToThread,
2916  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2917  ThreadCopy,
2918  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2919  ThreadToScratchpad,
2920  // ScratchpadToThread: Copy from a scratchpad array in global memory
2921  // containing team-reduced data to a thread's stack.
2922  ScratchpadToThread,
2923 };
2924 } // namespace
2925 
2926 struct CopyOptionsTy {
2927  llvm::Value *RemoteLaneOffset;
2928  llvm::Value *ScratchpadIndex;
2929  llvm::Value *ScratchpadWidth;
2930 };
2931 
2932 /// Emit instructions to copy a Reduce list, which contains partially
2933 /// aggregated values, in the specified direction.
2934 static void emitReductionListCopy(
2935  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2936  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2937  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2938 
2939  CodeGenModule &CGM = CGF.CGM;
2940  ASTContext &C = CGM.getContext();
2941  CGBuilderTy &Bld = CGF.Builder;
2942 
2943  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2944  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2945  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2946 
2947  // Iterate, element by element, through the source Reduce list and
2948  // make a copy of each element.
2949  unsigned Idx = 0;
2950  unsigned Size = Privates.size();
2951  for (const Expr *Private : Privates) {
2952  Address SrcElementAddr = Address::invalid();
2953  Address DestElementAddr = Address::invalid();
2954  Address DestElementPtrAddr = Address::invalid();
2955  // Should we shuffle in an element from a remote lane?
2956  bool ShuffleInElement = false;
2957  // Set to true to update the pointer in the dest Reduce list to a
2958  // newly created element.
2959  bool UpdateDestListPtr = false;
2960  // Increment the src or dest pointer to the scratchpad, for each
2961  // new element.
2962  bool IncrScratchpadSrc = false;
2963  bool IncrScratchpadDest = false;
2964 
2965  switch (Action) {
2966  case RemoteLaneToThread: {
2967  // Step 1.1: Get the address for the src element in the Reduce list.
2968  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2969  SrcElementAddr = CGF.EmitLoadOfPointer(
2970  SrcElementPtrAddr,
2971  C.getPointerType(Private->getType())->castAs<PointerType>());
2972 
2973  // Step 1.2: Create a temporary to store the element in the destination
2974  // Reduce list.
2975  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2976  DestElementAddr =
2977  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2978  ShuffleInElement = true;
2979  UpdateDestListPtr = true;
2980  break;
2981  }
2982  case ThreadCopy: {
2983  // Step 1.1: Get the address for the src element in the Reduce list.
2984  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2985  SrcElementAddr = CGF.EmitLoadOfPointer(
2986  SrcElementPtrAddr,
2987  C.getPointerType(Private->getType())->castAs<PointerType>());
2988 
2989  // Step 1.2: Get the address for dest element. The destination
2990  // element has already been created on the thread's stack.
2991  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2992  DestElementAddr = CGF.EmitLoadOfPointer(
2993  DestElementPtrAddr,
2994  C.getPointerType(Private->getType())->castAs<PointerType>());
2995  break;
2996  }
2997  case ThreadToScratchpad: {
2998  // Step 1.1: Get the address for the src element in the Reduce list.
2999  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3000  SrcElementAddr = CGF.EmitLoadOfPointer(
3001  SrcElementPtrAddr,
3002  C.getPointerType(Private->getType())->castAs<PointerType>());
3003 
3004  // Step 1.2: Get the address for dest element:
3005  // address = base + index * ElementSizeInChars.
3006  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3007  llvm::Value *CurrentOffset =
3008  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3009  llvm::Value *ScratchPadElemAbsolutePtrVal =
3010  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3011  ScratchPadElemAbsolutePtrVal =
3012  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3013  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3014  C.getTypeAlignInChars(Private->getType()));
3015  IncrScratchpadDest = true;
3016  break;
3017  }
3018  case ScratchpadToThread: {
3019  // Step 1.1: Get the address for the src element in the scratchpad.
3020  // address = base + index * ElementSizeInChars.
3021  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3022  llvm::Value *CurrentOffset =
3023  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3024  llvm::Value *ScratchPadElemAbsolutePtrVal =
3025  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3026  ScratchPadElemAbsolutePtrVal =
3027  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3028  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3029  C.getTypeAlignInChars(Private->getType()));
3030  IncrScratchpadSrc = true;
3031 
3032  // Step 1.2: Create a temporary to store the element in the destination
3033  // Reduce list.
3034  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3035  DestElementAddr =
3036  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3037  UpdateDestListPtr = true;
3038  break;
3039  }
3040  }
3041 
3042  // Regardless of the direction of the copy, we emit the load of the src
3043  // element, as it is required for all copy directions.
3044  SrcElementAddr = Bld.CreateElementBitCast(
3045  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3046  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3047  SrcElementAddr.getElementType());
3048 
3049  // Now that all active lanes have read the element in the
3050  // Reduce list, shuffle over the value from the remote lane.
3051  if (ShuffleInElement) {
3052  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3053  RemoteLaneOffset, Private->getExprLoc());
3054  } else {
3055  switch (CGF.getEvaluationKind(Private->getType())) {
3056  case TEK_Scalar: {
3057  llvm::Value *Elem =
3058  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3059  Private->getType(), Private->getExprLoc());
3060  // Store the source element value to the dest element address.
3061  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3062  Private->getType());
3063  break;
3064  }
3065  case TEK_Complex: {
3066  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3067  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3068  Private->getExprLoc());
3069  CGF.EmitStoreOfComplex(
3070  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3071  /*isInit=*/false);
3072  break;
3073  }
3074  case TEK_Aggregate:
3075  CGF.EmitAggregateCopy(
3076  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3077  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3078  Private->getType(), AggValueSlot::DoesNotOverlap);
3079  break;
3080  }
3081  }
3082 
3083  // Step 3.1: Modify reference in dest Reduce list as needed.
3084  // Modifying the reference in Reduce list to point to the newly
3085  // created element. The element is live in the current function
3086  // scope and that of functions it invokes (i.e., reduce_function).
3087  // RemoteReduceData[i] = (void*)&RemoteElem
3088  if (UpdateDestListPtr) {
3089  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3090  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3091  DestElementPtrAddr, /*Volatile=*/false,
3092  C.VoidPtrTy);
3093  }
3094 
3095  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3096  // address of the next element in scratchpad memory, unless we're currently
3097  // processing the last one. Memory alignment is also taken care of here.
3098  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3099  llvm::Value *ScratchpadBasePtr =
3100  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3101  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3102  ScratchpadBasePtr = Bld.CreateNUWAdd(
3103  ScratchpadBasePtr,
3104  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3105 
3106  // Take care of global memory alignment for performance
3107  ScratchpadBasePtr = Bld.CreateNUWSub(
3108  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3109  ScratchpadBasePtr = Bld.CreateUDiv(
3110  ScratchpadBasePtr,
3111  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3112  ScratchpadBasePtr = Bld.CreateNUWAdd(
3113  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3114  ScratchpadBasePtr = Bld.CreateNUWMul(
3115  ScratchpadBasePtr,
3116  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3117 
3118  if (IncrScratchpadDest)
3119  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3120  else /* IncrScratchpadSrc = true */
3121  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3122  }
3123 
3124  ++Idx;
3125  }
3126 }
3127 
3128 /// This function emits a helper that gathers Reduce lists from the first
3129 /// lane of every active warp to lanes in the first warp.
3130 ///
3131 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3132 /// shared smem[warp_size];
3133 /// For all data entries D in reduce_data:
3134 /// sync
3135 /// If (I am the first lane in each warp)
3136 /// Copy my local D to smem[warp_id]
3137 /// sync
3138 /// if (I am the first warp)
3139 /// Copy smem[thread_id] to my local D
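///
/// A rough CUDA-flavoured sketch of the emitted copy for one 32-bit entry
/// (illustrative names only; the real code loops over 4/2/1-byte chunks):
///
///   __shared__ int32_t medium[WARPSIZE];
///   __syncthreads();
///   if (lane_id == 0)                 // each warp master publishes its value
///     medium[warp_id] = my_elem;
///   __syncthreads();
///   if (thread_id < num_warps)        // lanes of warp 0 gather the values
///     my_elem = medium[thread_id];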
3140 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3141  ArrayRef<const Expr *> Privates,
3142  QualType ReductionArrayTy,
3143  SourceLocation Loc) {
3144  ASTContext &C = CGM.getContext();
3145  llvm::Module &M = CGM.getModule();
3146 
3147  // ReduceList: thread local Reduce list.
3148  // At the stage of the computation when this function is called, partially
3149  // aggregated values reside in the first lane of every active warp.
3150  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3151  C.VoidPtrTy, ImplicitParamDecl::Other);
3152  // NumWarps: number of warps active in the parallel region. This could
3153  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3154  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3155  C.getIntTypeForBitwidth(32, /* Signed */ true),
3156  ImplicitParamDecl::Other);
3157  FunctionArgList Args;
3158  Args.push_back(&ReduceListArg);
3159  Args.push_back(&NumWarpsArg);
3160 
3161  const CGFunctionInfo &CGFI =
3162  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3163  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3164  llvm::GlobalValue::InternalLinkage,
3165  "_omp_reduction_inter_warp_copy_func", &M);
3166  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3167  Fn->setDoesNotRecurse();
3168  CodeGenFunction CGF(CGM);
3169  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3170 
3171  CGBuilderTy &Bld = CGF.Builder;
3172 
3173  // This array is used as a medium to transfer, one reduce element at a time,
3174  // the data from the first lane of every warp to lanes in the first warp
3175  // in order to perform the final step of a reduction in a parallel region
3176  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3177  // for reduced latency, as well as to have a distinct copy for concurrently
3178  // executing target regions. The array is declared with common linkage so
3179  // as to be shared across compilation units.
3180  StringRef TransferMediumName =
3181  "__openmp_nvptx_data_transfer_temporary_storage";
3182  llvm::GlobalVariable *TransferMedium =
3183  M.getGlobalVariable(TransferMediumName);
3184  if (!TransferMedium) {
3185  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3186  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3187  TransferMedium = new llvm::GlobalVariable(
3188  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3189  llvm::Constant::getNullValue(Ty), TransferMediumName,
3190  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3191  SharedAddressSpace);
3192  CGM.addCompilerUsedGlobal(TransferMedium);
3193  }
3194 
3195  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3196  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3197  // nvptx_lane_id = nvptx_id % warpsize
3198  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3199  // nvptx_warp_id = nvptx_id / warpsize
3200  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3201 
3202  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3203  Address LocalReduceList(
3204  Bld.CreatePointerBitCastOrAddrSpaceCast(
3205  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3206  C.VoidPtrTy, Loc),
3207  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3208  CGF.getPointerAlign());
3209 
3210  unsigned Idx = 0;
3211  for (const Expr *Private : Privates) {
3212  //
3213  // Warp master copies reduce element to transfer medium in __shared__
3214  // memory.
3215  //
3216  unsigned RealTySize =
3217  C.getTypeSizeInChars(Private->getType())
3218  .alignTo(C.getTypeAlignInChars(Private->getType()))
3219  .getQuantity();
3220  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
3221  unsigned NumIters = RealTySize / TySize;
3222  if (NumIters == 0)
3223  continue;
3224  QualType CType = C.getIntTypeForBitwidth(
3225  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3226  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3227  CharUnits Align = CharUnits::fromQuantity(TySize);
3228  llvm::Value *Cnt = nullptr;
3229  Address CntAddr = Address::invalid();
3230  llvm::BasicBlock *PrecondBB = nullptr;
3231  llvm::BasicBlock *ExitBB = nullptr;
3232  if (NumIters > 1) {
3233  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3234  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3235  /*Volatile=*/false, C.IntTy);
3236  PrecondBB = CGF.createBasicBlock("precond");
3237  ExitBB = CGF.createBasicBlock("exit");
3238  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3239  // There is no need to emit line number for unconditional branch.
3240  (void)ApplyDebugLocation::CreateEmpty(CGF);
3241  CGF.EmitBlock(PrecondBB);
3242  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3243  llvm::Value *Cmp =
3244  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3245  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3246  CGF.EmitBlock(BodyBB);
3247  }
3248  // kmpc_barrier.
3249  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3250  /*EmitChecks=*/false,
3251  /*ForceSimpleCall=*/true);
3252  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3253  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3254  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3255 
3256  // if (lane_id == 0)
3257  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3258  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3259  CGF.EmitBlock(ThenBB);
3260 
3261  // Reduce element = LocalReduceList[i]
3262  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3263  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3264  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3265  // elemptr = ((CopyType*)(elemptrptr)) + I
3266  Address ElemPtr = Address(ElemPtrPtr, Align);
3267  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3268  if (NumIters > 1) {
3269  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3270  ElemPtr.getAlignment());
3271  }
3272 
3273  // Get pointer to location in transfer medium.
3274  // MediumPtr = &medium[warp_id]
3275  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3276  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3277  Address MediumPtr(MediumPtrVal, Align);
3278  // Casting to actual data type.
3279  // MediumPtr = (CopyType*)MediumPtrAddr;
3280  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3281 
3282  // elem = *elemptr
3283  //*MediumPtr = elem
3284  llvm::Value *Elem =
3285  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3286  // Store the source element value to the dest element address.
3287  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3288 
3289  Bld.CreateBr(MergeBB);
3290 
3291  CGF.EmitBlock(ElseBB);
3292  Bld.CreateBr(MergeBB);
3293 
3294  CGF.EmitBlock(MergeBB);
3295 
3296  // kmpc_barrier.
3297  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3298  /*EmitChecks=*/false,
3299  /*ForceSimpleCall=*/true);
3300 
3301  //
3302  // Warp 0 copies reduce element from transfer medium.
3303  //
3304  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3305  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3306  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3307 
3308  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3309  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3310  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3311 
3312  // Up to 32 threads in warp 0 are active.
3313  llvm::Value *IsActiveThread =
3314  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3315  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3316 
3317  CGF.EmitBlock(W0ThenBB);
3318 
3319  // SrcMediumPtr = &medium[tid]
3320  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3321  TransferMedium,
3322  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3323  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3324  // SrcMediumVal = *SrcMediumPtr;
3325  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3326 
3327  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3328  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3329  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3330  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3331  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3332  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3333  if (NumIters > 1) {
3334  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3335  TargetElemPtr.getAlignment());
3336  }
3337 
3338  // *TargetElemPtr = SrcMediumVal;
3339  llvm::Value *SrcMediumValue =
3340  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3341  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3342  CType);
3343  Bld.CreateBr(W0MergeBB);
3344 
3345  CGF.EmitBlock(W0ElseBB);
3346  Bld.CreateBr(W0MergeBB);
3347 
3348  CGF.EmitBlock(W0MergeBB);
3349 
3350  if (NumIters > 1) {
3351  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3352  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3353  CGF.EmitBranch(PrecondBB);
3354  (void)ApplyDebugLocation::CreateEmpty(CGF);
3355  CGF.EmitBlock(ExitBB);
3356  }
3357  RealTySize %= TySize;
3358  }
3359  ++Idx;
3360  }
3361 
3362  CGF.FinishFunction();
3363  return Fn;
3364 }
3365 
3366 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3367 /// in the same warp. It uses shuffle instructions to copy over data from
3368 /// a remote lane's stack. The reduction algorithm performed is specified
3369 /// by the fourth parameter.
3370 ///
3371 /// Algorithm Versions.
3372 /// Full Warp Reduce (argument value 0):
3373 /// This algorithm assumes that all 32 lanes are active and gathers
3374 /// data from these 32 lanes, producing a single resultant value.
3375 /// Contiguous Partial Warp Reduce (argument value 1):
3376 /// This algorithm assumes that only a *contiguous* subset of lanes
3377 /// are active. This happens for the last warp in a parallel region
3378 /// when the user specified num_threads is not an integer multiple of
3379 /// 32. This contiguous subset always starts with the zeroth lane.
3380 /// Partial Warp Reduce (argument value 2):
3381 /// This algorithm gathers data from any number of lanes at any position.
3382 /// All reduced values are stored in the lowest possible lane. The set
3383 /// of problems every algorithm addresses is a superset of those
3384 /// addressable by algorithms with a lower version number. Overhead
3385 /// increases as algorithm version increases.
3386 ///
3387 /// Terminology
3388 /// Reduce element:
3389 /// Reduce element refers to the individual data field with primitive
3390 /// data types to be combined and reduced across threads.
3391 /// Reduce list:
3392 /// Reduce list refers to a collection of local, thread-private
3393 /// reduce elements.
3394 /// Remote Reduce list:
3395 /// Remote Reduce list refers to a collection of remote (relative to
3396 /// the current thread) reduce elements.
3397 ///
3398 /// We distinguish between three states of threads that are important to
3399 /// the implementation of this function.
3400 /// Alive threads:
3401 /// Threads in a warp executing the SIMT instruction, as distinguished from
3402 /// threads that are inactive due to divergent control flow.
3403 /// Active threads:
3404 /// The minimal set of threads that has to be alive upon entry to this
3405 /// function. The computation is correct iff active threads are alive.
3406 /// Some threads are alive but they are not active because they do not
3407 /// contribute to the computation in any useful manner. Turning them off
3408 /// may introduce control flow overheads without any tangible benefits.
3409 /// Effective threads:
3410 /// In order to comply with the argument requirements of the shuffle
3411 /// function, we must keep all lanes holding data alive. But at most
3412 /// half of them perform value aggregation; we refer to this half of
3413 /// threads as effective. The other half is simply handing off their
3414 /// data.
3415 ///
3416 /// Procedure
3417 /// Value shuffle:
3418 /// In this step active threads transfer data from higher lane positions
3419 /// in the warp to lower lane positions, creating Remote Reduce list.
3420 /// Value aggregation:
3421 /// In this step, effective threads combine their thread local Reduce list
3422 /// with Remote Reduce list and store the result in the thread local
3423 /// Reduce list.
3424 /// Value copy:
3425 /// In this step, we deal with the assumption made by algorithm 2
3426 /// (i.e. contiguity assumption). When we have an odd number of lanes
3427 /// active, say 2k+1, only k threads will be effective and therefore k
3428 /// new values will be produced. However, the Reduce list owned by the
3429 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3430 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3431 /// that the contiguity assumption still holds.
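///
/// A condensed sketch of the helper emitted below (illustrative pseudocode):
///
///   shuffle_and_reduce(local_list, lane_id, offset, algo_ver) {
///     remote_list = shuffle_from_lane(lane_id + offset, local_list);
///     if (algo_ver == 0 ||
///         (algo_ver == 1 && lane_id < offset) ||
///         (algo_ver == 2 && lane_id % 2 == 0 && offset > 0))
///       reduce_function(local_list, remote_list);  // aggregate in place
///     if (algo_ver == 1 && lane_id >= offset)
///       local_list = remote_list;                  // restore contiguity
///   }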
3432 static llvm::Function *emitShuffleAndReduceFunction(
3433  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3434  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3435  ASTContext &C = CGM.getContext();
3436 
3437  // Thread local Reduce list used to host the values of data to be reduced.
3438  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3439  C.VoidPtrTy, ImplicitParamDecl::Other);
3440  // Current lane id; could be logical.
3441  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3442  ImplicitParamDecl::Other);
3443  // Offset of the remote source lane relative to the current lane.
3444  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3445  C.ShortTy, ImplicitParamDecl::Other);
3446  // Algorithm version. This is expected to be known at compile time.
3447  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3448  C.ShortTy, ImplicitParamDecl::Other);
3449  FunctionArgList Args;
3450  Args.push_back(&ReduceListArg);
3451  Args.push_back(&LaneIDArg);
3452  Args.push_back(&RemoteLaneOffsetArg);
3453  Args.push_back(&AlgoVerArg);
3454 
3455  const CGFunctionInfo &CGFI =
3456  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3457  auto *Fn = llvm::Function::Create(
3458  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3459  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3460  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3461  Fn->setDoesNotRecurse();
3462  if (CGM.getLangOpts().Optimize) {
3463  Fn->removeFnAttr(llvm::Attribute::NoInline);
3464  Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
3465  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
3466  }
3467 
3468  CodeGenFunction CGF(CGM);
3469  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3470 
3471  CGBuilderTy &Bld = CGF.Builder;
3472 
3473  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3474  Address LocalReduceList(
3475  Bld.CreatePointerBitCastOrAddrSpaceCast(
3476  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3477  C.VoidPtrTy, SourceLocation()),
3478  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3479  CGF.getPointerAlign());
3480 
3481  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3482  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3483  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3484 
3485  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3486  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3487  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3488 
3489  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3490  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3491  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3492 
3493  // Create a local thread-private variable to host the Reduce list
3494  // from a remote lane.
3495  Address RemoteReduceList =
3496  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3497 
3498  // This loop iterates through the list of reduce elements and copies,
3499  // element by element, from a remote lane in the warp to RemoteReduceList,
3500  // hosted on the thread's stack.
3501  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3502  LocalReduceList, RemoteReduceList,
3503  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3504  /*ScratchpadIndex=*/nullptr,
3505  /*ScratchpadWidth=*/nullptr});
3506 
3507  // The actions to be performed on the Remote Reduce list are dependent
3508  // on the algorithm version.
3509  //
3510  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3511  // LaneId % 2 == 0 && Offset > 0):
3512  // do the reduction value aggregation
3513  //
3514  // The thread local variable Reduce list is mutated in place to host the
3515  // reduced data, which is the aggregated value produced from local and
3516  // remote lanes.
3517  //
3518  // Note that AlgoVer is expected to be a constant integer known at compile
3519  // time.
3520  // When AlgoVer==0, the first conjunction evaluates to true, making
3521  // the entire predicate true at compile time.
3522  // When AlgoVer==1, only the second part of the second conjunction needs
3523  // to be evaluated at runtime; the other conjunctions evaluate to false
3524  // at compile time.
3525  // When AlgoVer==2, only the second part of the third conjunction needs
3526  // to be evaluated at runtime; the other conjunctions evaluate to false
3527  // at compile time.
3528  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3529 
3530  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3531  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3532  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3533 
3534  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3535  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3536  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3537  CondAlgo2 = Bld.CreateAnd(
3538  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3539 
3540  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3541  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3542 
3543  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3544  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3545  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3546  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3547 
3548  CGF.EmitBlock(ThenBB);
3549  // reduce_function(LocalReduceList, RemoteReduceList)
3550  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3551  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3552  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3553  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3554  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3555  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3556  Bld.CreateBr(MergeBB);
3557 
3558  CGF.EmitBlock(ElseBB);
3559  Bld.CreateBr(MergeBB);
3560 
3561  CGF.EmitBlock(MergeBB);
3562 
3563  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3564  // Reduce list.
3565  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3566  llvm::Value *CondCopy = Bld.CreateAnd(
3567  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3568 
3569  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3570  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3571  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3572  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3573 
3574  CGF.EmitBlock(CpyThenBB);
3575  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3576  RemoteReduceList, LocalReduceList);
3577  Bld.CreateBr(CpyMergeBB);
3578 
3579  CGF.EmitBlock(CpyElseBB);
3580  Bld.CreateBr(CpyMergeBB);
3581 
3582  CGF.EmitBlock(CpyMergeBB);
3583 
3584  CGF.FinishFunction();
3585  return Fn;
3586 }
3587 
3588 /// This function emits a helper that copies all the reduction variables from
3589 /// the team into the provided global buffer for the reduction variables.
3590 ///
3591 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3592 /// For all data entries D in reduce_data:
3593 /// Copy local D to buffer.D[Idx]
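///
/// Roughly (illustrative only), for the foo/bar example described later in
/// this file this amounts to:
///   buffer.foo[Idx] = *reduceData.foo;
///   buffer.bar[Idx] = *reduceData.bar;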
3594 static llvm::Value *emitListToGlobalCopyFunction(
3595  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3596  QualType ReductionArrayTy, SourceLocation Loc,
3597  const RecordDecl *TeamReductionRec,
3598  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3599  &VarFieldMap) {
3600  ASTContext &C = CGM.getContext();
3601 
3602  // Buffer: global reduction buffer.
3603  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3604  C.VoidPtrTy, ImplicitParamDecl::Other);
3605  // Idx: index of the buffer.
3606  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3607  ImplicitParamDecl::Other);
3608  // ReduceList: thread local Reduce list.
3609  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3610  C.VoidPtrTy, ImplicitParamDecl::Other);
3611  FunctionArgList Args;
3612  Args.push_back(&BufferArg);
3613  Args.push_back(&IdxArg);
3614  Args.push_back(&ReduceListArg);
3615 
3616  const CGFunctionInfo &CGFI =
3617  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3618  auto *Fn = llvm::Function::Create(
3619  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3620  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3621  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3622  Fn->setDoesNotRecurse();
3623  CodeGenFunction CGF(CGM);
3624  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3625 
3626  CGBuilderTy &Bld = CGF.Builder;
3627 
3628  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3629  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3630  Address LocalReduceList(
3631  Bld.CreatePointerBitCastOrAddrSpaceCast(
3632  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3633  C.VoidPtrTy, Loc),
3634  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3635  CGF.getPointerAlign());
3636  QualType StaticTy = C.getRecordType(TeamReductionRec);
3637  llvm::Type *LLVMReductionsBufferTy =
3638  CGM.getTypes().ConvertTypeForMem(StaticTy);
3639  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3640  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3641  LLVMReductionsBufferTy->getPointerTo());
3642  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3643  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3644  /*Volatile=*/false, C.IntTy,
3645  Loc)};
3646  unsigned Idx = 0;
3647  for (const Expr *Private : Privates) {
3648  // Reduce element = LocalReduceList[i]
3649  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3650  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3651  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3652  // elemptr = ((CopyType*)(elemptrptr)) + I
3653  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3654  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3655  Address ElemPtr =
3656  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3657  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3658  // Global = Buffer.VD[Idx];
3659  const FieldDecl *FD = VarFieldMap.lookup(VD);
3660  LValue GlobLVal = CGF.EmitLValueForField(
3661  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3662  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3663  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3664  switch (CGF.getEvaluationKind(Private->getType())) {
3665  case TEK_Scalar: {
3666  llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3667  Private->getType(), Loc);
3668  CGF.EmitStoreOfScalar(V, GlobLVal);
3669  break;
3670  }
3671  case TEK_Complex: {
3672  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3673  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3674  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3675  break;
3676  }
3677  case TEK_Aggregate:
3678  CGF.EmitAggregateCopy(GlobLVal,
3679  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3680  Private->getType(), AggValueSlot::DoesNotOverlap);
3681  break;
3682  }
3683  ++Idx;
3684  }
3685 
3686  CGF.FinishFunction();
3687  return Fn;
3688 }
3689 
3690 /// This function emits a helper that reduces all the reduction variables from
3691 /// the team into the provided global buffer for the reduction variables.
3692 ///
3693 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3694 /// void *GlobPtrs[];
3695 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3696 /// ...
3697 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3698 /// reduce_function(GlobPtrs, reduce_data);
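///
/// A minimal sketch (illustrative names) for the foo/bar example:
///   void *GlobPtrs[2] = { &buffer.foo[Idx], &buffer.bar[Idx] };
///   reduce_function(GlobPtrs, reduce_data);  // buffer slot op= local values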
3699 static llvm::Value *emitListToGlobalReduceFunction(
3700  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3701  QualType ReductionArrayTy, SourceLocation Loc,
3702  const RecordDecl *TeamReductionRec,
3703  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3704  &VarFieldMap,
3705  llvm::Function *ReduceFn) {
3706  ASTContext &C = CGM.getContext();
3707 
3708  // Buffer: global reduction buffer.
3709  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3710  C.VoidPtrTy, ImplicitParamDecl::Other);
3711  // Idx: index of the buffer.
3712  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3713  ImplicitParamDecl::Other);
3714  // ReduceList: thread local Reduce list.
3715  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3716  C.VoidPtrTy, ImplicitParamDecl::Other);
3717  FunctionArgList Args;
3718  Args.push_back(&BufferArg);
3719  Args.push_back(&IdxArg);
3720  Args.push_back(&ReduceListArg);
3721 
3722  const CGFunctionInfo &CGFI =
3723  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3724  auto *Fn = llvm::Function::Create(
3725  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3726  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3727  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3728  Fn->setDoesNotRecurse();
3729  CodeGenFunction CGF(CGM);
3730  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3731 
3732  CGBuilderTy &Bld = CGF.Builder;
3733 
3734  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3735  QualType StaticTy = C.getRecordType(TeamReductionRec);
3736  llvm::Type *LLVMReductionsBufferTy =
3737  CGM.getTypes().ConvertTypeForMem(StaticTy);
3738  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3739  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3740  LLVMReductionsBufferTy->getPointerTo());
3741 
3742  // 1. Build a list of reduction variables.
3743  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3744  Address ReductionList =
3745  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3746  auto IPriv = Privates.begin();
3747  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3748  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3749  /*Volatile=*/false, C.IntTy,
3750  Loc)};
3751  unsigned Idx = 0;
3752  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3753  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3754  // Global = Buffer.VD[Idx];
3755  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3756  const FieldDecl *FD = VarFieldMap.lookup(VD);
3757  LValue GlobLVal = CGF.EmitLValueForField(
3758  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3759  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3760  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3761  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3762  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3763  // Store array size.
3764  ++Idx;
3765  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3766  llvm::Value *Size = CGF.Builder.CreateIntCast(
3767  CGF.getVLASize(
3768  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3769  .NumElts,
3770  CGF.SizeTy, /*isSigned=*/false);
3771  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3772  Elem);
3773  }
3774  }
3775 
3776  // Call reduce_function(GlobalReduceList, ReduceList)
3777  llvm::Value *GlobalReduceList =
3778  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3779  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3780  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3781  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3782  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3783  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3784  CGF.FinishFunction();
3785  return Fn;
3786 }
3787 
3788 /// This function emits a helper that copies all the reduction variables back
3789 /// from the provided global buffer into the team's local reduction variables.
3790 ///
3791 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3792 /// For all data entries D in reduce_data:
3793 /// Copy buffer.D[Idx] to local D;
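///
/// Roughly (illustrative only), for the foo/bar example:
///   *reduceData.foo = buffer.foo[Idx];
///   *reduceData.bar = buffer.bar[Idx];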
3794 static llvm::Value *emitGlobalToListCopyFunction(
3795  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3796  QualType ReductionArrayTy, SourceLocation Loc,
3797  const RecordDecl *TeamReductionRec,
3798  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3799  &VarFieldMap) {
3800  ASTContext &C = CGM.getContext();
3801 
3802  // Buffer: global reduction buffer.
3803  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3804  C.VoidPtrTy, ImplicitParamDecl::Other);
3805  // Idx: index of the buffer.
3806  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3807  ImplicitParamDecl::Other);
3808  // ReduceList: thread local Reduce list.
3809  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3810  C.VoidPtrTy, ImplicitParamDecl::Other);
3811  FunctionArgList Args;
3812  Args.push_back(&BufferArg);
3813  Args.push_back(&IdxArg);
3814  Args.push_back(&ReduceListArg);
3815 
3816  const CGFunctionInfo &CGFI =
3817  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3818  auto *Fn = llvm::Function::Create(
3819  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3820  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3821  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3822  Fn->setDoesNotRecurse();
3823  CodeGenFunction CGF(CGM);
3824  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3825 
3826  CGBuilderTy &Bld = CGF.Builder;
3827 
3828  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3829  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3830  Address LocalReduceList(
3831  Bld.CreatePointerBitCastOrAddrSpaceCast(
3832  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3833  C.VoidPtrTy, Loc),
3834  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3835  CGF.getPointerAlign());
3836  QualType StaticTy = C.getRecordType(TeamReductionRec);
3837  llvm::Type *LLVMReductionsBufferTy =
3838  CGM.getTypes().ConvertTypeForMem(StaticTy);
3839  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3840  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3841  LLVMReductionsBufferTy->getPointerTo());
3842 
3843  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3844  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3845  /*Volatile=*/false, C.IntTy,
3846  Loc)};
3847  unsigned Idx = 0;
3848  for (const Expr *Private : Privates) {
3849  // Reduce element = LocalReduceList[i]
3850  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3851  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3852  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3853  // elemptr = ((CopyType*)(elemptrptr)) + I
3854  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3855  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3856  Address ElemPtr =
3857  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3858  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3859  // Global = Buffer.VD[Idx];
3860  const FieldDecl *FD = VarFieldMap.lookup(VD);
3861  LValue GlobLVal = CGF.EmitLValueForField(
3862  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3863  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3864  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3865  switch (CGF.getEvaluationKind(Private->getType())) {
3866  case TEK_Scalar: {
3867  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3868  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3869  break;
3870  }
3871  case TEK_Complex: {
3872  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3873  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3874  /*isInit=*/false);
3875  break;
3876  }
3877  case TEK_Aggregate:
3878  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3879  GlobLVal, Private->getType(),
3880  AggValueSlot::DoesNotOverlap);
3881  break;
3882  }
3883  ++Idx;
3884  }
3885 
3886  CGF.FinishFunction();
3887  return Fn;
3888 }
3889 
3890 /// This function emits a helper that reduces the reduction variables in the
3891 /// provided global buffer into the team's local reduction variables.
3892 ///
3893 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3894 /// void *GlobPtrs[];
3895 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3896 /// ...
3897 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3898 /// reduce_function(reduce_data, GlobPtrs);
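///
/// A minimal sketch (illustrative names) for the foo/bar example:
///   void *GlobPtrs[2] = { &buffer.foo[Idx], &buffer.bar[Idx] };
///   reduce_function(reduce_data, GlobPtrs);  // local values op= buffer slot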
3899 static llvm::Value *emitGlobalToListReduceFunction(
3900  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3901  QualType ReductionArrayTy, SourceLocation Loc,
3902  const RecordDecl *TeamReductionRec,
3903  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3904  &VarFieldMap,
3905  llvm::Function *ReduceFn) {
3906  ASTContext &C = CGM.getContext();
3907 
3908  // Buffer: global reduction buffer.
3909  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3910  C.VoidPtrTy, ImplicitParamDecl::Other);
3911  // Idx: index of the buffer.
3912  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3913  ImplicitParamDecl::Other);
3914  // ReduceList: thread local Reduce list.
3915  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3916  C.VoidPtrTy, ImplicitParamDecl::Other);
3917  FunctionArgList Args;
3918  Args.push_back(&BufferArg);
3919  Args.push_back(&IdxArg);
3920  Args.push_back(&ReduceListArg);
3921 
3922  const CGFunctionInfo &CGFI =
3923  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3924  auto *Fn = llvm::Function::Create(
3925  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3926  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3927  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3928  Fn->setDoesNotRecurse();
3929  CodeGenFunction CGF(CGM);
3930  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3931 
3932  CGBuilderTy &Bld = CGF.Builder;
3933 
3934  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3935  QualType StaticTy = C.getRecordType(TeamReductionRec);
3936  llvm::Type *LLVMReductionsBufferTy =
3937  CGM.getTypes().ConvertTypeForMem(StaticTy);
3938  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3939  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3940  LLVMReductionsBufferTy->getPointerTo());
3941 
3942  // 1. Build a list of reduction variables.
3943  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3944  Address ReductionList =
3945  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3946  auto IPriv = Privates.begin();
3947  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3948  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3949  /*Volatile=*/false, C.IntTy,
3950  Loc)};
3951  unsigned Idx = 0;
3952  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3953  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3954  // Global = Buffer.VD[Idx];
3955  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3956  const FieldDecl *FD = VarFieldMap.lookup(VD);
3957  LValue GlobLVal = CGF.EmitLValueForField(
3958  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3959  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3960  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3961  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3962  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3963  // Store array size.
3964  ++Idx;
3965  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3966  llvm::Value *Size = CGF.Builder.CreateIntCast(
3967  CGF.getVLASize(
3968  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3969  .NumElts,
3970  CGF.SizeTy, /*isSigned=*/false);
3971  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3972  Elem);
3973  }
3974  }
3975 
3976  // Call reduce_function(ReduceList, GlobalReduceList)
3977  llvm::Value *GlobalReduceList =
3978  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3979  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3980  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3981  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3982  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3983  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3984  CGF.FinishFunction();
3985  return Fn;
3986 }
3987 
3988 ///
3989 /// Design of OpenMP reductions on the GPU
3990 ///
3991 /// Consider a typical OpenMP program with one or more reduction
3992 /// clauses:
3993 ///
3994 /// float foo;
3995 /// double bar;
3996 /// #pragma omp target teams distribute parallel for \
3997 /// reduction(+:foo) reduction(*:bar)
3998 /// for (int i = 0; i < N; i++) {
3999 /// foo += A[i]; bar *= B[i];
4000 /// }
4001 ///
4002 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
4003 /// all teams. In our OpenMP implementation on the NVPTX device an
4004 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4005 /// within a team are mapped to CUDA threads within a threadblock.
4006 /// Our goal is to efficiently aggregate values across all OpenMP
4007 /// threads such that:
4008 ///
4009 /// - the compiler and runtime are logically concise, and
4010 /// - the reduction is performed efficiently in a hierarchical
4011 /// manner as follows: within OpenMP threads in the same warp,
4012 /// across warps in a threadblock, and finally across teams on
4013 /// the NVPTX device.
4014 ///
4015 /// Introduction to Decoupling
4016 ///
4017 /// We would like to decouple the compiler and the runtime so that the
4018 /// latter is ignorant of the reduction variables (number, data types)
4019 /// and the reduction operators. This allows a simpler interface
4020 /// and implementation while still attaining good performance.
4021 ///
4022 /// Pseudocode for the aforementioned OpenMP program generated by the
4023 /// compiler is as follows:
4024 ///
4025 /// 1. Create private copies of reduction variables on each OpenMP
4026 /// thread: 'foo_private', 'bar_private'
4027 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4028 /// to it and writes the result in 'foo_private' and 'bar_private'
4029 /// respectively.
4030 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4031 /// and store the result on the team master:
4032 ///
4033 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4034 /// reduceData, shuffleReduceFn, interWarpCpyFn)
4035 ///
4036 /// where:
4037 /// struct ReduceData {
4038 /// float *foo;
4039 /// double *bar;
4040 /// } reduceData
4041 /// reduceData.foo = &foo_private
4042 /// reduceData.bar = &bar_private
4043 ///
4044 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4045 /// auxiliary functions generated by the compiler that operate on
4046 /// variables of type 'ReduceData'. They aid the runtime in performing
4047 /// algorithmic steps in a data-agnostic manner.
4048 ///
4049 /// 'shuffleReduceFn' is a pointer to a function that reduces data
4050 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
4051 /// same warp. It takes the following arguments as input:
4052 ///
4053 /// a. variable of type 'ReduceData' on the calling lane,
4054 /// b. its lane_id,
4055 /// c. an offset relative to the current lane_id to generate a
4056 /// remote_lane_id. The remote lane contains the second
4057 /// variable of type 'ReduceData' that is to be reduced.
4058 /// d. an algorithm version parameter determining which reduction
4059 /// algorithm to use.
4060 ///
4061 /// 'shuffleReduceFn' retrieves data from the remote lane using
4062 /// efficient GPU shuffle intrinsics and reduces, using the
4063 /// algorithm specified by the 4th parameter, the two operands
4064 /// element-wise. The result is written to the first operand.
4065 ///
4066 /// Different reduction algorithms are implemented in different
4067 /// runtime functions, all calling 'shuffleReduceFn' to perform
4068 /// the essential reduction step. Therefore, based on the 4th
4069 /// parameter, this function behaves slightly differently to
4070 /// cooperate with the runtime to ensure correctness under
4071 /// different circumstances.
4072 ///
4073 /// 'InterWarpCpyFn' is a pointer to a function that transfers
4074 /// reduced variables across warps. It tunnels, through CUDA
4075 /// shared memory, the thread-private data of type 'ReduceData'
4076 /// from lane 0 of each warp to a lane in the first warp.
4077 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4078 /// The last team writes the global reduced value to memory.
4079 ///
4080 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
4081 /// reduceData, shuffleReduceFn, interWarpCpyFn,
4082 /// scratchpadCopyFn, loadAndReduceFn)
4083 ///
4084 /// 'scratchpadCopyFn' is a helper that stores reduced
4085 /// data from the team master to a scratchpad array in
4086 /// global memory.
4087 ///
4088 /// 'loadAndReduceFn' is a helper that loads data from
4089 /// the scratchpad array and reduces it with the input
4090 /// operand.
4091 ///
4092 /// These compiler generated functions hide address
4093 /// calculation and alignment information from the runtime.
4094 /// 5. if ret == 1:
4095 /// The team master of the last team stores the reduced
4096 /// result to the globals in memory.
4097 /// foo += reduceData.foo; bar *= reduceData.bar
4098 ///
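/// As a hedged illustration (not the verbatim generated code), the
/// 'shuffleReduceFn' emitted for this example behaves roughly like:
///
///   void shuffleReduceFn(ReduceData *data, int16_t lane_id,
///                        int16_t offset, int16_t algo) {
///     float  remote_foo = shuffle_down(*data->foo, offset);
///     double remote_bar = shuffle_down(*data->bar, offset);
///     if (should_reduce(lane_id, offset, algo)) {   // see predicates below
///       *data->foo += remote_foo;                   // '+' reduction
///       *data->bar *= remote_bar;                   // '*' reduction
///     } else if (algo == 1 && lane_id >= offset) {
///       *data->foo = remote_foo;                    // keep live data contiguous
///       *data->bar = remote_bar;
///     }
///   }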
4099 ///
4100 /// Warp Reduction Algorithms
4101 ///
4102 /// On the warp level, we have three algorithms implemented in the
4103 /// OpenMP runtime depending on the number of active lanes:
4104 ///
4105 /// Full Warp Reduction
4106 ///
4107 /// The reduce algorithm within a warp where all lanes are active
4108 /// is implemented in the runtime as follows:
4109 ///
4110 /// full_warp_reduce(void *reduce_data,
4111 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4112 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4113 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
4114 /// }
4115 ///
4116 /// The algorithm completes in log(2, WARPSIZE) steps.
4117 ///
4118 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4119 /// not used; we therefore save instructions by not retrieving lane_id
4120 /// from the corresponding special registers. The 4th parameter, which
4121 /// represents the version of the algorithm being used, is set to 0 to
4122 /// signify full warp reduction.
4123 ///
4124 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4125 ///
4126 /// #reduce_elem refers to an element in the local lane's data structure
4127 /// #remote_elem is retrieved from a remote lane
4128 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4129 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
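///
/// For WARPSIZE == 32 this unrolls to offsets 16, 8, 4, 2, 1; a minimal
/// CUDA illustration for a single '+'-reduced element would be:
///
///   val += __shfl_down_sync(0xffffffff, val, 16);
///   val += __shfl_down_sync(0xffffffff, val, 8);
///   val += __shfl_down_sync(0xffffffff, val, 4);
///   val += __shfl_down_sync(0xffffffff, val, 2);
///   val += __shfl_down_sync(0xffffffff, val, 1);
///   // lane 0 now holds the sum over the whole warp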
4130 ///
4131 /// Contiguous Partial Warp Reduction
4132 ///
4133 /// This reduce algorithm is used within a warp where only the first
4134 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
4135 /// number of OpenMP threads in a parallel region is not a multiple of
4136 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
4137 ///
4138 /// void
4139 /// contiguous_partial_reduce(void *reduce_data,
4140 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
4141 /// int size, int lane_id) {
4142 /// int curr_size;
4143 /// int offset;
4144 /// curr_size = size;
4145 /// offset = curr_size/2;
4146 /// while (offset>0) {
4147 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4148 /// curr_size = (curr_size+1)/2;
4149 /// offset = curr_size/2;
4150 /// }
4151 /// }
4152 ///
4153 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4154 ///
4155 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4156 /// if (lane_id < offset)
4157 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4158 /// else
4159 /// reduce_elem = remote_elem
4160 ///
4161 /// This algorithm assumes that the data to be reduced are located in a
4162 /// contiguous subset of lanes starting from the first. When there is
4163 /// an odd number of active lanes, the data in the last lane is not
4164 /// aggregated with any other lane's data but is instead copied over.
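///
/// For example (illustrative trace), with size == 5 active lanes the loop
/// performs ShuffleReduceFn with offset = 2, 1, 1 while curr_size shrinks
/// 5 -> 3 -> 2 -> 1, leaving the combined value of all five lanes in lane 0.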
4165 ///
4166 /// Dispersed Partial Warp Reduction
4167 ///
4168 /// This algorithm is used within a warp when any discontiguous subset of
4169 /// lanes are active. It is used to implement the reduction operation
4170 /// across lanes in an OpenMP simd region or in a nested parallel region.
4171 ///
4172 /// void
4173 /// dispersed_partial_reduce(void *reduce_data,
4174 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4175 /// int size, remote_id;
4176 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
4177 /// do {
4178 /// remote_id = next_active_lane_id_right_after_me();
4179 /// # the above function returns 0 if no active lane
4180 /// # is present right after the current lane.
4181 /// size = number_of_active_lanes_in_this_warp();
4182 /// logical_lane_id /= 2;
4183 /// ShuffleReduceFn(reduce_data, logical_lane_id,
4184 /// remote_id-1-threadIdx.x, 2);
4185 /// } while (logical_lane_id % 2 == 0 && size > 1);
4186 /// }
4187 ///
4188 /// There is no assumption made about the initial state of the reduction.
4189 /// Any number of lanes (>=1) could be active at any position. The reduction
4190 /// result is returned in the first active lane.
4191 ///
4192 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4193 ///
4194 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4195 /// if (lane_id % 2 == 0 && offset > 0)
4196 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4197 /// else
4198 /// reduce_elem = remote_elem
4199 ///
4200 ///
4201 /// Intra-Team Reduction
4202 ///
4203 /// This function, as implemented in the runtime call
4204 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4205 /// threads in a team. It first reduces within a warp using the
4206 /// aforementioned algorithms. We then proceed to gather all such
4207 /// reduced values at the first warp.
4208 ///
4209 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4210 /// data from each "warp master" (the zeroth lane of each warp, where
4211 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
4212 /// a mathematical sense) the problem of reduction across warp masters in
4213 /// a block to the problem of warp reduction.
4214 ///
4215 ///
4216 /// Inter-Team Reduction
4217 ///
4218 /// Once a team has reduced its data to a single value, it is stored in
4219 /// a global scratchpad array. Since each team has a distinct slot, this
4220 /// can be done without locking.
4221 ///
4222 /// The last team to write to the scratchpad array proceeds to reduce the
4223 /// scratchpad array. One or more workers in the last team use the helper
4224 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4225 /// the k'th worker reduces every k'th element.
4226 ///
4227 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4228 /// reduce across workers and compute a globally reduced value.
4229 ///
4230 void CGOpenMPRuntimeNVPTX::emitReduction(
4231  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4232  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4233  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4234  if (!CGF.HaveInsertPoint())
4235  return;
4236 
4237  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4238 #ifndef NDEBUG
4239  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4240 #endif
4241 
4242  if (Options.SimpleReduction) {
4243  assert(!TeamsReduction && !ParallelReduction &&
4244  "Invalid reduction selection in emitReduction.");
4245  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4246  ReductionOps, Options);
4247  return;
4248  }
4249 
4250  assert((TeamsReduction || ParallelReduction) &&
4251  "Invalid reduction selection in emitReduction.");
4252 
4253  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4254  // RedList, shuffle_reduce_func, interwarp_copy_func);
4255  // or
4256  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4257  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4258  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4259 
4260  llvm::Value *Res;
4261  ASTContext &C = CGM.getContext();
4262  // 1. Build a list of reduction variables.
4263  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4264  auto Size = RHSExprs.size();
4265  for (const Expr *E : Privates) {
4266  if (E->getType()->isVariablyModifiedType())
4267  // Reserve place for array size.
4268  ++Size;
4269  }
4270  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4271  QualType ReductionArrayTy =
4272  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
4273  /*IndexTypeQuals=*/0);
4274  Address ReductionList =
4275  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4276  auto IPriv = Privates.begin();
4277  unsigned Idx = 0;
4278  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4279  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4280  CGF.Builder.CreateStore(
4281         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4282             CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4283  Elem);
4284  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4285  // Store array size.
4286  ++Idx;
4287  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4288  llvm::Value *Size = CGF.Builder.CreateIntCast(
4289  CGF.getVLASize(
4290  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4291  .NumElts,
4292  CGF.SizeTy, /*isSigned=*/false);
4293  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4294  Elem);
4295  }
4296  }
4297 
4298   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4299       ReductionList.getPointer(), CGF.VoidPtrTy);
4300  llvm::Function *ReductionFn = emitReductionFunction(
4301  Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4302  LHSExprs, RHSExprs, ReductionOps);
4303  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4304  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4305  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4306  llvm::Value *InterWarpCopyFn =
4307  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4308 
4309  if (ParallelReduction) {
4310  llvm::Value *Args[] = {RTLoc,
4311  ThreadId,
4312  CGF.Builder.getInt32(RHSExprs.size()),
4313  ReductionArrayTySize,
4314  RL,
4315  ShuffleAndReduceFn,
4316  InterWarpCopyFn};
4317 
4318  Res = CGF.EmitRuntimeCall(
4319         createNVPTXRuntimeFunction(
4320             OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4321  Args);
4322  } else {
4323  assert(TeamsReduction && "expected teams reduction.");
4324  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4325  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4326  int Cnt = 0;
4327  for (const Expr *DRE : Privates) {
4328  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4329  ++Cnt;
4330  }
4331  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4332  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4333  C.getLangOpts().OpenMPCUDAReductionBufNum);
4334  TeamsReductions.push_back(TeamReductionRec);
4335  if (!KernelTeamsReductionPtr) {
4336  KernelTeamsReductionPtr = new llvm::GlobalVariable(
4337  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4339  "_openmp_teams_reductions_buffer_$_$ptr");
4340  }
4341  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4342  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4343  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4344  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4345  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4346  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4347  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4348  ReductionFn);
4349  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4350  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4351  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4352  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4353  ReductionFn);
4354 
4355  llvm::Value *Args[] = {
4356  RTLoc,
4357  ThreadId,
4358  GlobalBufferPtr,
4359  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4360  RL,
4361  ShuffleAndReduceFn,
4362  InterWarpCopyFn,
4363  GlobalToBufferCpyFn,
4364  GlobalToBufferRedFn,
4365  BufferToGlobalCpyFn,
4366  BufferToGlobalRedFn};
4367 
4368  Res = CGF.EmitRuntimeCall(
4369         createNVPTXRuntimeFunction(
4370             OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4371  Args);
4372  }
4373 
4374  // 5. Build if (res == 1)
4375  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4376  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4377  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4378  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4379  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4380 
4381  // 6. Build then branch: where we have reduced values in the master
4382  // thread in each team.
4383  // __kmpc_end_reduce{_nowait}(<gtid>);
4384  // break;
4385  CGF.EmitBlock(ThenBB);
4386 
4387  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4388  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4389  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4390  auto IPriv = Privates.begin();
4391  auto ILHS = LHSExprs.begin();
4392  auto IRHS = RHSExprs.begin();
4393  for (const Expr *E : ReductionOps) {
4394  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4395  cast<DeclRefExpr>(*IRHS));
4396  ++IPriv;
4397  ++ILHS;
4398  ++IRHS;
4399  }
4400  };
4401  llvm::Value *EndArgs[] = {ThreadId};
4402  RegionCodeGenTy RCG(CodeGen);
4403  NVPTXActionTy Action(
4404  nullptr, llvm::None,
4405  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4406  EndArgs);
4407  RCG.setAction(Action);
4408  RCG(CGF);
4409  // There is no need to emit line number for unconditional branch.
4410   (void)ApplyDebugLocation::CreateEmpty(CGF);
4411   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4412 }
4413 
4414 const VarDecl *
4415 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4416                                          const VarDecl *NativeParam) const {
4417  if (!NativeParam->getType()->isReferenceType())
4418  return NativeParam;
4419  QualType ArgType = NativeParam->getType();
4420  QualifierCollector QC;
4421  const Type *NonQualTy = QC.strip(ArgType);
4422  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4423  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4424  if (Attr->getCaptureKind() == OMPC_map) {
4425  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4427  } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4428  PointeeTy.isConstant(CGM.getContext())) {
4429  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4431  }
4432  }
4433  ArgType = CGM.getContext().getPointerType(PointeeTy);
4434  QC.addRestrict();
4435  enum { NVPTX_local_addr = 5 };
4436  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4437  ArgType = QC.apply(CGM.getContext(), ArgType);
4438  if (isa<ImplicitParamDecl>(NativeParam))
4439     return ImplicitParamDecl::Create(
4440         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4441  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4442  return ParmVarDecl::Create(
4443  CGM.getContext(),
4444  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4445  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4446  NativeParam->getIdentifier(), ArgType,
4447  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4448 }
4449 
4450 Address
4451 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4452                                           const VarDecl *NativeParam,
4453  const VarDecl *TargetParam) const {
4454  assert(NativeParam != TargetParam &&
4455  NativeParam->getType()->isReferenceType() &&
4456  "Native arg must not be the same as target arg.");
4457  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4458  QualType NativeParamType = NativeParam->getType();
4459  QualifierCollector QC;
4460  const Type *NonQualTy = QC.strip(NativeParamType);
4461  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4462  unsigned NativePointeeAddrSpace =
4463  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4464  QualType TargetTy = TargetParam->getType();
4465  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4466  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4467  // First cast to generic.
4468   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4469       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4470  /*AddrSpace=*/0));
4471  // Cast from generic to native address space.
4472   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4473       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4474  NativePointeeAddrSpace));
4475  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4476  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4477  NativeParamType);
4478  return NativeParamAddr;
4479 }
4480 
4481 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4482     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4483  ArrayRef<llvm::Value *> Args) const {
4484  SmallVector<llvm::Value *, 4> TargetArgs;
4485  TargetArgs.reserve(Args.size());
4486  auto *FnType = OutlinedFn.getFunctionType();
4487  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4488  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4489  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4490  break;
4491  }
4492  llvm::Type *TargetType = FnType->getParamType(I);
4493  llvm::Value *NativeArg = Args[I];
4494  if (!TargetType->isPointerTy()) {
4495  TargetArgs.emplace_back(NativeArg);
4496  continue;
4497  }
4498     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4499         NativeArg,
4500  NativeArg->getType()->getPointerElementType()->getPointerTo());
4501  TargetArgs.emplace_back(
4502  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4503  }
4504  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4505 }
4506 
4507 /// Emit a function that wraps the outlined parallel region
4508 /// and controls the arguments that are passed to it.
4509 /// The wrapper ensures that the outlined function is called
4510 /// with the correct arguments when data is shared.
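///
/// A simplified, illustrative sketch of the wrapper this emits (argument and
/// function names are hypothetical; the real code loads the shared arguments
/// via __kmpc_get_shared_variables, as shown in the body below):
///
///   void outlined_fn_wrapper(int16_t parallel_level, int32_t thread_id) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_fn(&thread_id, &zero, shared_args[0], ..., shared_args[N-1]);
///   }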
4511 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4512  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4513  ASTContext &Ctx = CGM.getContext();
4514  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4515 
4516  // Create a function that takes as argument the source thread.
4517  FunctionArgList WrapperArgs;
4518  QualType Int16QTy =
4519  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4520  QualType Int32QTy =
4521  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4522  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4523  /*Id=*/nullptr, Int16QTy,
4524                                      ImplicitParamDecl::Other);
4525   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4526  /*Id=*/nullptr, Int32QTy,
4527                                ImplicitParamDecl::Other);
4528   WrapperArgs.emplace_back(&ParallelLevelArg);
4529  WrapperArgs.emplace_back(&WrapperArg);
4530 
4531  const CGFunctionInfo &CGFI =
4532  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4533 
4534  auto *Fn = llvm::Function::Create(
4535       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4536       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4537  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4538  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4539  Fn->setDoesNotRecurse();
4540 
4541  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4542  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4543  D.getBeginLoc(), D.getBeginLoc());
4544 
4545  const auto *RD = CS.getCapturedRecordDecl();
4546  auto CurField = RD->field_begin();
4547 
4548  Address ZeroAddr = CGF.CreateMemTemp(
4549  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4550  /*Name*/ ".zero.addr");
4551  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4552  // Get the array of arguments.
4553   SmallVector<llvm::Value *, 8> Args;
4554 
4555  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4556  Args.emplace_back(ZeroAddr.getPointer());
4557 
4558  CGBuilderTy &Bld = CGF.Builder;
4559  auto CI = CS.capture_begin();
4560 
4561  // Use global memory for data sharing.
4562  // Handle passing of global args to workers.
4563  Address GlobalArgs =
4564  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4565  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4566  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4567  CGF.EmitRuntimeCall(
4568  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4569  DataSharingArgs);
4570 
4571  // Retrieve the shared variables from the list of references returned
4572  // by the runtime. Pass the variables to the outlined function.
4573  Address SharedArgListAddress = Address::invalid();
4574  if (CS.capture_size() > 0 ||
4575       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4576     SharedArgListAddress = CGF.EmitLoadOfPointer(
4577  GlobalArgs, CGF.getContext()
4578                         .getPointerType(CGF.getContext().getPointerType(
4579                             CGF.getContext().VoidPtrTy))
4580  .castAs<PointerType>());
4581  }
4582  unsigned Idx = 0;
4583   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4584     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4585  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4586  Src, CGF.SizeTy->getPointerTo());
4587  llvm::Value *LB = CGF.EmitLoadOfScalar(
4588  TypedAddress,
4589  /*Volatile=*/false,
4590         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4591         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4592  Args.emplace_back(LB);
4593  ++Idx;
4594  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4595  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4596  Src, CGF.SizeTy->getPointerTo());
4597  llvm::Value *UB = CGF.EmitLoadOfScalar(
4598  TypedAddress,
4599  /*Volatile=*/false,
4600         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4601         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4602  Args.emplace_back(UB);
4603  ++Idx;
4604  }
4605  if (CS.capture_size() > 0) {
4606  ASTContext &CGFContext = CGF.getContext();
4607  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4608  QualType ElemTy = CurField->getType();
4609  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4610  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4611  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4612  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4613  /*Volatile=*/false,
4614  CGFContext.getPointerType(ElemTy),
4615  CI->getLocation());
4616  if (CI->capturesVariableByCopy() &&
4617  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4618  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4619  CI->getLocation());
4620  }
4621  Args.emplace_back(Arg);
4622  }
4623  }
4624 
4625  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4626  CGF.FinishFunction();
4627  return Fn;
4628 }
4629 
4630 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4631                                               const Decl *D) {
4632   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4633     return;
4634 
4635  assert(D && "Expected function or captured|block decl.");
4636  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4637  "Function is registered already.");
4638  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4639  "Team is set but not processed.");
4640  const Stmt *Body = nullptr;
4641  bool NeedToDelayGlobalization = false;
4642  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4643  Body = FD->getBody();
4644  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4645  Body = BD->getBody();
4646  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4647  Body = CD->getBody();
4648  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4649  if (NeedToDelayGlobalization &&
4650  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4651  return;
4652  }
4653  if (!Body)
4654  return;
4655  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4656  VarChecker.Visit(Body);
4657  const RecordDecl *GlobalizedVarsRecord =
4658  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4659  TeamAndReductions.first = nullptr;
4660  TeamAndReductions.second.clear();
4661  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4662  VarChecker.getEscapedVariableLengthDecls();
4663  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4664  return;
4665  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4666  I->getSecond().MappedParams =
4667  std::make_unique<CodeGenFunction::OMPMapVars>();
4668  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4669  I->getSecond().EscapedParameters.insert(
4670  VarChecker.getEscapedParameters().begin(),
4671  VarChecker.getEscapedParameters().end());
4672  I->getSecond().EscapedVariableLengthDecls.append(
4673  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4674  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4675  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4676  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4677  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4678  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4679  }
4680  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4681  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4682  VarChecker.Visit(Body);
4683  I->getSecond().SecondaryGlobalRecord =
4684  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4685  I->getSecond().SecondaryLocalVarData.emplace();
4686  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4687  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4688  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4689  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4690  Data.insert(
4691  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4692  }
4693  }
4694  if (!NeedToDelayGlobalization) {
4695  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4696  struct GlobalizationScope final : EHScopeStack::Cleanup {
4697  GlobalizationScope() = default;
4698 
4699  void Emit(CodeGenFunction &CGF, Flags flags) override {
4700  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4701  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4702  }
4703  };
4704  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4705  }
4706 }
4707 
4708 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4709                                                         const VarDecl *VD) {
4710  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4711  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4712  switch (A->getAllocatorType()) {
4713     // Use the default allocator here, as by default local vars are
4714     // thread-local.
4715  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4716  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4717  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4718  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4719  // Follow the user decision - use default allocation.
4720  return Address::invalid();
4721  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4722       // TODO: implement support for user-defined allocators.
4723  return Address::invalid();
4724  case OMPAllocateDeclAttr::OMPConstMemAlloc: {
4725  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4726  auto *GV = new llvm::GlobalVariable(
4727           CGM.getModule(), VarTy, /*isConstant=*/false,
4728           llvm::GlobalValue::InternalLinkage,
4729           llvm::Constant::getNullValue(VarTy), VD->getName(),
4730           /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4731           CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
4732       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4733  GV->setAlignment(Align.getQuantity());
4734  return Address(GV, Align);
4735  }
4736  case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
4737  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4738  auto *GV = new llvm::GlobalVariable(
4739           CGM.getModule(), VarTy, /*isConstant=*/false,
4740           llvm::GlobalValue::InternalLinkage,
4741           llvm::Constant::getNullValue(VarTy), VD->getName(),
4742           /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4743           CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
4744       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4745  GV->setAlignment(Align.getQuantity());
4746  return Address(GV, Align);
4747  }
4748  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4749  case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
4750  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4751  auto *GV = new llvm::GlobalVariable(
4752           CGM.getModule(), VarTy, /*isConstant=*/false,
4753           llvm::GlobalValue::InternalLinkage,
4754           llvm::Constant::getNullValue(VarTy), VD->getName());
4755  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4756  GV->setAlignment(Align.getQuantity());
4757  return Address(GV, Align);
4758  }
4759  }
4760  }
4761 
4762   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4763     return Address::invalid();
4764 
4765  VD = VD->getCanonicalDecl();
4766  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4767  if (I == FunctionGlobalizedDecls.end())
4768  return Address::invalid();
4769  auto VDI = I->getSecond().LocalVarData.find(VD);
4770  if (VDI != I->getSecond().LocalVarData.end())
4771  return VDI->second.PrivateAddr;
4772  if (VD->hasAttrs()) {
4773     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4774                                                       E(VD->attr_end());
4775  IT != E; ++IT) {
4776  auto VDI = I->getSecond().LocalVarData.find(
4777  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4778  ->getCanonicalDecl());
4779  if (VDI != I->getSecond().LocalVarData.end())
4780  return VDI->second.PrivateAddr;
4781  }
4782  }
4783 
4784  return Address::invalid();
4785 }
4786 
4787 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4788   FunctionGlobalizedDecls.erase(CGF.CurFn);
4789   CGOpenMPRuntime::functionFinished(CGF);
4790 }
4791 
4792 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4793     CodeGenFunction &CGF, const OMPLoopDirective &S,
4794  OpenMPDistScheduleClauseKind &ScheduleKind,
4795  llvm::Value *&Chunk) const {
4796  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4797  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4798  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4799  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4800         S.getIterationVariable()->getType(), S.getBeginLoc());
4801     return;
4802  }
4803   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4804       CGF, S, ScheduleKind, Chunk);
4805 }
4806 
4807 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4808     CodeGenFunction &CGF, const OMPLoopDirective &S,
4809  OpenMPScheduleClauseKind &ScheduleKind,
4810  const Expr *&ChunkExpr) const {
4811  ScheduleKind = OMPC_SCHEDULE_static;
4812  // Chunk size is 1 in this case.
4813  llvm::APInt ChunkSize(32, 1);
4814  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4815  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4816  SourceLocation());
4817 }
4818 
4819 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4820     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4821   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4822          " Expected target-based directive.");
4823  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4824  for (const CapturedStmt::Capture &C : CS->captures()) {
4825  // Capture variables captured by reference in lambdas for target-based
4826  // directives.
4827  if (!C.capturesVariable())
4828  continue;
4829  const VarDecl *VD = C.getCapturedVar();
4830  const auto *RD = VD->getType()
4831  .getCanonicalType()
4832                          .getNonReferenceType()
4833                          ->getAsCXXRecordDecl();
4834  if (!RD || !RD->isLambda())
4835  continue;
4836  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4837  LValue VDLVal;
4838  if (VD->getType().getCanonicalType()->isReferenceType())
4839  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4840  else
4841  VDLVal = CGF.MakeAddrLValue(
4842  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4843  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4844  FieldDecl *ThisCapture = nullptr;
4845  RD->getCaptureFields(Captures, ThisCapture);
4846  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4847  LValue ThisLVal =
4848  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4849  llvm::Value *CXXThis = CGF.LoadCXXThis();
4850  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4851  }
4852  for (const LambdaCapture &LC : RD->captures()) {
4853  if (LC.getCaptureKind() != LCK_ByRef)
4854  continue;
4855  const VarDecl *VD = LC.getCapturedVar();
4856  if (!CS->capturesVariable(VD))
4857  continue;
4858  auto It = Captures.find(VD);
4859  assert(It != Captures.end() && "Found lambda capture without field.");
4860  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4861  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4862  if (VD->getType().getCanonicalType()->isReferenceType())
4863  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4864  VD->getType().getCanonicalType())
4865  .getAddress();
4866  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4867  }
4868  }
4869 }
4870 
4871 unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
4872   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4873 }
4874 
4875 bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4876                                                             LangAS &AS) {
4877  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4878  return false;
4879  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4880  switch(A->getAllocatorType()) {
4881  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4882  // Not supported, fallback to the default mem space.
4883  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4884  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4885  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4886  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4887  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4888  AS = LangAS::Default;
4889  return true;
4890  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4891  AS = LangAS::cuda_constant;
4892  return true;
4893  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4894  AS = LangAS::cuda_shared;
4895  return true;
4896  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4897  llvm_unreachable("Expected predefined allocator for the variables with the "
4898  "static storage.");
4899  }
4900  return false;
4901 }
4902 
4903 // Get the current CudaArch and ignore any unknown values.
4904 static CudaArch getCudaArch(CodeGenModule &CGM) {
4905   if (!CGM.getTarget().hasFeature("ptx"))
4906  return CudaArch::UNKNOWN;
4907  llvm::StringMap<bool> Features;
4908  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4909  CGM.getTarget().getTargetOpts().CPU,
4910  CGM.getTarget().getTargetOpts().Features);
4911  for (const auto &Feature : Features) {
4912  if (Feature.getValue()) {
4913  CudaArch Arch = StringToCudaArch(Feature.getKey());
4914  if (Arch != CudaArch::UNKNOWN)
4915  return Arch;
4916  }
4917  }
4918  return CudaArch::UNKNOWN;
4919 }
4920 
4921 /// Check whether the target architecture supports unified addressing, which
4922 /// is required by the OpenMP requires clause "unified_shared_memory".
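///
/// For example (based on the switch below), a translation unit containing
///
///   #pragma omp requires unified_shared_memory
///
/// is diagnosed when compiling for sm_35, but accepted for sm_70 and newer.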
4923 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4924     const OMPRequiresDecl *D) {
4925  for (const OMPClause *Clause : D->clauselists()) {
4926  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4927  switch (getCudaArch(CGM)) {
4928  case CudaArch::SM_20:
4929  case CudaArch::SM_21:
4930  case CudaArch::SM_30:
4931  case CudaArch::SM_32:
4932  case CudaArch::SM_35:
4933  case CudaArch::SM_37:
4934  case CudaArch::SM_50:
4935  case CudaArch::SM_52:
4936  case CudaArch::SM_53:
4937  case CudaArch::SM_60:
4938  case CudaArch::SM_61:
4939  case CudaArch::SM_62:
4940  CGM.Error(Clause->getBeginLoc(),
4941  "Target architecture does not support unified addressing");
4942  return;
4943  case CudaArch::SM_70:
4944  case CudaArch::SM_72:
4945  case CudaArch::SM_75:
4946  case CudaArch::GFX600:
4947  case CudaArch::GFX601:
4948  case CudaArch::GFX700:
4949  case CudaArch::GFX701:
4950  case CudaArch::GFX702:
4951  case CudaArch::GFX703:
4952  case CudaArch::GFX704:
4953  case CudaArch::GFX801:
4954  case CudaArch::GFX802:
4955  case CudaArch::GFX803:
4956  case CudaArch::GFX810:
4957  case CudaArch::GFX900:
4958  case CudaArch::GFX902:
4959  case CudaArch::GFX904:
4960  case CudaArch::GFX906:
4961  case CudaArch::GFX908:
4962  case CudaArch::GFX909:
4963  case CudaArch::GFX1010:
4964  case CudaArch::GFX1011:
4965  case CudaArch::GFX1012:
4966  case CudaArch::UNKNOWN:
4967  break;
4968  case CudaArch::LAST:
4969  llvm_unreachable("Unexpected Cuda arch.");
4970  }
4971  }
4972  }
4973   CGOpenMPRuntime::checkArchForUnifiedAddressing(D);
4974 }
4975 
4976 /// Get number of SMs and number of blocks per SM.
4977 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4978  std::pair<unsigned, unsigned> Data;
4979  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4980  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4981  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4982  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4983  if (Data.first && Data.second)
4984  return Data;
4985  switch (getCudaArch(CGM)) {
4986  case CudaArch::SM_20:
4987  case CudaArch::SM_21:
4988  case CudaArch::SM_30:
4989  case CudaArch::SM_32:
4990  case CudaArch::SM_35:
4991  case CudaArch::SM_37:
4992  case CudaArch::SM_50:
4993  case CudaArch::SM_52:
4994  case CudaArch::SM_53:
4995  return {16, 16};
4996  case CudaArch::SM_60:
4997  case CudaArch::SM_61:
4998  case CudaArch::SM_62:
4999  return {56, 32};
5000  case CudaArch::SM_70:
5001  case CudaArch::SM_72:
5002  case CudaArch::SM_75:
5003  return {84, 32};
5004  case CudaArch::GFX600:
5005  case CudaArch::GFX601:
5006  case CudaArch::GFX700:
5007  case CudaArch::GFX701:
5008  case CudaArch::GFX702:
5009  case CudaArch::GFX703:
5010  case CudaArch::GFX704:
5011  case CudaArch::GFX801:
5012  case CudaArch::GFX802:
5013  case CudaArch::GFX803:
5014  case CudaArch::GFX810:
5015  case CudaArch::GFX900:
5016  case CudaArch::GFX902:
5017  case CudaArch::GFX904:
5018  case CudaArch::GFX906:
5019  case CudaArch::GFX908:
5020  case CudaArch::GFX909:
5021  case CudaArch::GFX1010:
5022  case CudaArch::GFX1011:
5023  case CudaArch::GFX1012:
5024  case CudaArch::UNKNOWN:
5025  break;
5026  case CudaArch::LAST:
5027  llvm_unreachable("Unexpected Cuda arch.");
5028  }
5029  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5030 }
5031 
5032 void CGOpenMPRuntimeNVPTX::clear() {
5033   if (!GlobalizedRecords.empty()) {
5034  ASTContext &C = CGM.getContext();
5035     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5036     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5037     RecordDecl *StaticRD = C.buildImplicitRecord(
5038  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5039  StaticRD->startDefinition();
5040  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5041  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5042  SharedStaticRD->startDefinition();
5043  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5044  if (Records.Records.empty())
5045  continue;
5046  unsigned Size = 0;
5047  unsigned RecAlignment = 0;
5048  for (const RecordDecl *RD : Records.Records) {
5049  QualType RDTy = C.getRecordType(RD);
5050  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5051  RecAlignment = std::max(RecAlignment, Alignment);
5052  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5053  Size =
5054  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5055  }
5056  Size = llvm::alignTo(Size, RecAlignment);
5057  llvm::APInt ArySize(/*numBits=*/64, Size);
5058  QualType SubTy = C.getConstantArrayType(
5059  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5060  const bool UseSharedMemory = Size <= SharedMemorySize;
5061  auto *Field =
5062  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5063  SourceLocation(), SourceLocation(), nullptr, SubTy,
5065  /*BW=*/nullptr, /*Mutable=*/false,
5066  /*InitStyle=*/ICIS_NoInit);
5067  Field->setAccess(AS_public);
5068  if (UseSharedMemory) {
5069  SharedStaticRD->addDecl(Field);
5070  SharedRecs.push_back(&Records);
5071  } else {
5072  StaticRD->addDecl(Field);
5073  GlobalRecs.push_back(&Records);
5074  }
5075  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5076  Records.UseSharedMemory->setInitializer(
5077  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5078  }
5079  // Allocate SharedMemorySize buffer for the shared memory.
5080     // FIXME: nvlink does not handle weak linkage correctly (objects with
5081     // different sizes are reported as erroneous).
5082     // Restore this code as soon as nvlink is fixed.
5083  if (!SharedStaticRD->field_empty()) {
5084  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5085  QualType SubTy = C.getConstantArrayType(
5086  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5087  auto *Field = FieldDecl::Create(
5088  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5090  /*BW=*/nullptr, /*Mutable=*/false,
5091  /*InitStyle=*/ICIS_NoInit);
5092  Field->setAccess(AS_public);
5093  SharedStaticRD->addDecl(Field);
5094  }
5095  SharedStaticRD->completeDefinition();
5096  if (!SharedStaticRD->field_empty()) {
5097  QualType StaticTy = C.getRecordType(SharedStaticRD);
5098  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5099  auto *GV = new llvm::GlobalVariable(
5100  CGM.getModule(), LLVMStaticTy,
5101  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5102  llvm::Constant::getNullValue(LLVMStaticTy),
5103  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5104  llvm::GlobalValue::NotThreadLocal,
5105           C.getTargetAddressSpace(LangAS::cuda_shared));
5106       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5107  GV, CGM.VoidPtrTy);
5108  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5109  Rec->Buffer->replaceAllUsesWith(Replacement);
5110  Rec->Buffer->eraseFromParent();
5111  }
5112  }
5113  StaticRD->completeDefinition();
5114  if (!StaticRD->field_empty()) {
5115  QualType StaticTy = C.getRecordType(StaticRD);
5116  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5117  llvm::APInt Size1(32, SMsBlockPerSM.second);
5118  QualType Arr1Ty =
5119  C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5120  /*IndexTypeQuals=*/0);
5121  llvm::APInt Size2(32, SMsBlockPerSM.first);
5122  QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5123  /*IndexTypeQuals=*/0);
5124  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5125       // FIXME: nvlink does not handle weak linkage correctly (objects with
5126       // different sizes are reported as erroneous).
5127  // Restore CommonLinkage as soon as nvlink is fixed.
5128  auto *GV = new llvm::GlobalVariable(
5129  CGM.getModule(), LLVMArr2Ty,
5130  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5131  llvm::Constant::getNullValue(LLVMArr2Ty),
5132  "_openmp_static_glob_rd_$_");
5133  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5134  GV, CGM.VoidPtrTy);
5135  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5136  Rec->Buffer->replaceAllUsesWith(Replacement);
5137  Rec->Buffer->eraseFromParent();
5138  }
5139  }
5140  }
5141  if (!TeamsReductions.empty()) {
5142  ASTContext &C = CGM.getContext();
5143  RecordDecl *StaticRD = C.buildImplicitRecord(
5144  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5145  StaticRD->startDefinition();
5146  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5147  QualType RecTy = C.getRecordType(TeamReductionRec);
5148  auto *Field = FieldDecl::Create(
5149  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5151  /*BW=*/nullptr, /*Mutable=*/false,
5152  /*InitStyle=*/ICIS_NoInit);
5153  Field->setAccess(AS_public);
5154  StaticRD->addDecl(Field);
5155  }
5156  StaticRD->completeDefinition();
5157  QualType StaticTy = C.getRecordType(StaticRD);
5158  llvm::Type *LLVMReductionsBufferTy =
5159  CGM.getTypes().ConvertTypeForMem(StaticTy);
5160     // FIXME: nvlink does not handle weak linkage correctly (objects with
5161     // different sizes are reported as erroneous).
5162  // Restore CommonLinkage as soon as nvlink is fixed.
5163  auto *GV = new llvm::GlobalVariable(
5164  CGM.getModule(), LLVMReductionsBufferTy,
5165  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5166  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5167  "_openmp_teams_reductions_buffer_$_");
5168  KernelTeamsReductionPtr->setInitializer(
5169  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5170  CGM.VoidPtrTy));
5171  }
5172   CGOpenMPRuntime::clear();
5173 }