CGOpenMPRuntimeNVPTX.cpp (clang 8.0.0svn)
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for OpenMP runtime code generation specialized to NVPTX
11 // targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "CodeGenFunction.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/StmtOpenMP.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit();
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
64  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
65  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
66  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
67  /// lane_offset, int16_t shortCircuit),
68  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
69  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
70  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
71  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
72  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
73  /// lane_offset, int16_t shortCircuit),
74  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
75  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
76  /// int32_t index, int32_t width),
77  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
78  /// index, int32_t width, int32_t reduce))
79  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
80  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
81  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
82  /// Call to void __kmpc_data_sharing_init_stack();
83  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
84  /// Call to void __kmpc_data_sharing_init_stack_spmd();
85  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
86  /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
87  /// int16_t UseSharedMemory);
88  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
89  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
90  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
91  /// Call to void __kmpc_begin_sharing_variables(void ***args,
92  /// size_t n_args);
93  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
94  /// Call to void __kmpc_end_sharing_variables();
95  OMPRTL_NVPTX__kmpc_end_sharing_variables,
96  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
97  OMPRTL_NVPTX__kmpc_get_shared_variables,
98  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
99  /// global_tid);
100  OMPRTL_NVPTX__kmpc_parallel_level,
101  /// Call to int8_t __kmpc_is_spmd_exec_mode();
102  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
103 };
104 
105 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
106 class NVPTXActionTy final : public PrePostActionTy {
107  llvm::Value *EnterCallee = nullptr;
108  ArrayRef<llvm::Value *> EnterArgs;
109  llvm::Value *ExitCallee = nullptr;
110  ArrayRef<llvm::Value *> ExitArgs;
111  bool Conditional = false;
112  llvm::BasicBlock *ContBlock = nullptr;
113 
114 public:
115  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
116  llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
117  bool Conditional = false)
118  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
119  ExitArgs(ExitArgs), Conditional(Conditional) {}
120  void Enter(CodeGenFunction &CGF) override {
121  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
122  if (Conditional) {
123  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
124  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
125  ContBlock = CGF.createBasicBlock("omp_if.end");
126  // Generate the branch (If-stmt)
127  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
128  CGF.EmitBlock(ThenBlock);
129  }
130  }
131  void Done(CodeGenFunction &CGF) {
132  // Emit the rest of blocks/branches
133  CGF.EmitBranch(ContBlock);
134  CGF.EmitBlock(ContBlock, true);
135  }
136  void Exit(CodeGenFunction &CGF) override {
137  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
138  }
139 };
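// Illustration only (not part of the original file): a sketch of how the
// conditional form of this action is meant to be used; EnterFn/ExitFn stand
// for hypothetical runtime callees.
//   NVPTXActionTy Action(EnterFn, EnterArgs, ExitFn, ExitArgs,
//                        /*Conditional=*/true);
//   Action.Enter(CGF); // emits EnterFn(...) and branches on its result
//   ...                // emit the guarded region into omp_if.then
//   Action.Done(CGF);  // falls through into omp_if.end
//   Action.Exit(CGF);  // emits ExitFn(...)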
140 
141 /// A class to track the execution mode when codegening directives within
142 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
143 /// to the target region and used by containing directives such as 'parallel'
144 /// to emit optimized code.
145 class ExecutionModeRAII {
146 private:
147  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
148  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
149 
150 public:
151  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
152  : Mode(Mode) {
153  SavedMode = Mode;
154  Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
155  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
156  }
157  ~ExecutionModeRAII() { Mode = SavedMode; }
158 };
159 
160 /// GPU Configuration: This information can be derived from CUDA registers;
161 /// however, providing compile-time constants helps generate more efficient
162 /// code. For all practical purposes this is fine because the configuration
163 /// is the same for all known NVPTX architectures.
164 enum MachineConfiguration : unsigned {
165  WarpSize = 32,
166  /// Number of bits required to represent a lane identifier, which is
167  /// computed as log_2(WarpSize).
168  LaneIDBits = 5,
169  LaneIDMask = WarpSize - 1,
170 
171  /// Global memory alignment for performance.
172  GlobalMemoryAlignment = 256,
173 };
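// Illustration only (not part of the original file): how these constants
// decompose a linear thread id, assuming WarpSize == 32 as stated above.
static_assert((1u << LaneIDBits) == WarpSize,
              "LaneIDBits must equal log2(WarpSize)");
constexpr unsigned exampleWarpID(unsigned TID) { return TID >> LaneIDBits; }
constexpr unsigned exampleLaneID(unsigned TID) { return TID & LaneIDMask; }
static_assert(exampleWarpID(70) == 2 && exampleLaneID(70) == 6,
              "thread 70 is lane 6 of warp 2");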
174 
175 enum NamedBarrier : unsigned {
176  /// Synchronize on this barrier #ID using a named barrier primitive.
177  /// Only the subset of active threads in a parallel region arrive at the
178  /// barrier.
179  NB_Parallel = 1,
180 };
181 
182 /// Get the list of variables that can escape their declaration context.
183 class CheckVarsEscapingDeclContext final
184  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185  CodeGenFunction &CGF;
186  llvm::SetVector<const ValueDecl *> EscapedDecls;
187  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
189  RecordDecl *GlobalizedRD = nullptr;
190  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
191  bool AllEscaped = false;
192  bool IsForCombinedParallelRegion = false;
193 
194  void markAsEscaped(const ValueDecl *VD) {
195  // Do not globalize declare target variables.
196  if (!isa<VarDecl>(VD) ||
197  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
198  return;
199  VD = cast<ValueDecl>(VD->getCanonicalDecl());
200  // Variables captured by value must be globalized.
201  if (auto *CSI = CGF.CapturedStmtInfo) {
202  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
203  // Check if we need to capture the variable that was already captured by
204  // value in the outer region.
205  if (!IsForCombinedParallelRegion) {
206  if (!FD->hasAttrs())
207  return;
208  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
209  if (!Attr)
210  return;
211  if (!isOpenMPPrivate(
212  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
213  Attr->getCaptureKind() == OMPC_map)
214  return;
215  }
216  if (!FD->getType()->isReferenceType()) {
217  assert(!VD->getType()->isVariablyModifiedType() &&
218  "Parameter captured by value with variably modified type");
219  EscapedParameters.insert(VD);
220  } else if (!IsForCombinedParallelRegion) {
221  return;
222  }
223  }
224  }
225  if ((!CGF.CapturedStmtInfo ||
226  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
227  VD->getType()->isReferenceType())
228  // Do not globalize variables with reference type.
229  return;
230  if (VD->getType()->isVariablyModifiedType())
231  EscapedVariableLengthDecls.insert(VD);
232  else
233  EscapedDecls.insert(VD);
234  }
235 
236  void VisitValueDecl(const ValueDecl *VD) {
237  if (VD->getType()->isLValueReferenceType())
238  markAsEscaped(VD);
239  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
240  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
241  const bool SavedAllEscaped = AllEscaped;
242  AllEscaped = VD->getType()->isLValueReferenceType();
243  Visit(VarD->getInit());
244  AllEscaped = SavedAllEscaped;
245  }
246  }
247  }
248  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
249  ArrayRef<OMPClause *> Clauses,
250  bool IsCombinedParallelRegion) {
251  if (!S)
252  return;
253  for (const CapturedStmt::Capture &C : S->captures()) {
254  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
255  const ValueDecl *VD = C.getCapturedVar();
256  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
257  if (IsCombinedParallelRegion) {
258  // Check if the variable is privatized in the combined construct and
259  // whether those private copies must be shared in the inner parallel
260  // directive.
261  IsForCombinedParallelRegion = false;
262  for (const OMPClause *C : Clauses) {
263  if (!isOpenMPPrivate(C->getClauseKind()) ||
264  C->getClauseKind() == OMPC_reduction ||
265  C->getClauseKind() == OMPC_linear ||
266  C->getClauseKind() == OMPC_private)
267  continue;
268  ArrayRef<const Expr *> Vars;
269  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
270  Vars = PC->getVarRefs();
271  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
272  Vars = PC->getVarRefs();
273  else
274  llvm_unreachable("Unexpected clause.");
275  for (const auto *E : Vars) {
276  const Decl *D =
277  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
278  if (D == VD->getCanonicalDecl()) {
279  IsForCombinedParallelRegion = true;
280  break;
281  }
282  }
283  if (IsForCombinedParallelRegion)
284  break;
285  }
286  }
287  markAsEscaped(VD);
288  if (isa<OMPCapturedExprDecl>(VD))
289  VisitValueDecl(VD);
290  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
291  }
292  }
293  }
294 
295  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
296  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
297  return P1.first > P2.first;
298  }
299 
300  void buildRecordForGlobalizedVars() {
301  assert(!GlobalizedRD &&
302  "Record for globalized variables is built already.");
303  if (EscapedDecls.empty())
304  return;
305  ASTContext &C = CGF.getContext();
306  SmallVector<VarsDataTy, 4> GlobalizedVars;
307  for (const ValueDecl *D : EscapedDecls)
308  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
309  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
310  stable_sort_comparator);
311  // Build struct _globalized_locals_ty {
312  // /* globalized vars */
313  // };
314  GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
315  GlobalizedRD->startDefinition();
316  for (const auto &Pair : GlobalizedVars) {
317  const ValueDecl *VD = Pair.second;
318  QualType Type = VD->getType();
319  if (Type->isLValueReferenceType())
320  Type = C.getPointerType(Type.getNonReferenceType());
321  else
322  Type = Type.getNonReferenceType();
323  SourceLocation Loc = VD->getLocation();
324  auto *Field = FieldDecl::Create(
325  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
326  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
327  /*BW=*/nullptr, /*Mutable=*/false,
328  /*InitStyle=*/ICIS_NoInit);
329  Field->setAccess(AS_public);
330  GlobalizedRD->addDecl(Field);
331  if (VD->hasAttrs()) {
332  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
333  E(VD->getAttrs().end());
334  I != E; ++I)
335  Field->addAttr(*I);
336  }
337  MappedDeclsFields.try_emplace(VD, Field);
338  }
339  GlobalizedRD->completeDefinition();
340  }
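  // Illustration only (not part of the original file): for escaped locals
  //   char c; double d; int i;
  // the sort above visits them in decreasing alignment order, so the
  // implicit record is laid out without internal padding, roughly:
  //   struct _globalized_locals_ty { double d; int i; char c; };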
341 
342 public:
343  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
344  virtual ~CheckVarsEscapingDeclContext() = default;
345  void VisitDeclStmt(const DeclStmt *S) {
346  if (!S)
347  return;
348  for (const Decl *D : S->decls())
349  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
350  VisitValueDecl(VD);
351  }
352  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
353  if (!D)
354  return;
355  if (!D->hasAssociatedStmt())
356  return;
357  if (const auto *S =
358  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
359  // Do not analyze directives that do not actually require capturing,
360  // like `omp for` or `omp simd` directives.
361  SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
362  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
363  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
364  VisitStmt(S->getCapturedStmt());
365  return;
366  }
367  VisitOpenMPCapturedStmt(
368  S, D->clauses(),
369  CaptureRegions.back() == OMPD_parallel &&
370  isOpenMPDistributeDirective(D->getDirectiveKind()));
371  }
372  }
373  void VisitCapturedStmt(const CapturedStmt *S) {
374  if (!S)
375  return;
376  for (const CapturedStmt::Capture &C : S->captures()) {
377  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
378  const ValueDecl *VD = C.getCapturedVar();
379  markAsEscaped(VD);
380  if (isa<OMPCapturedExprDecl>(VD))
381  VisitValueDecl(VD);
382  }
383  }
384  }
385  void VisitLambdaExpr(const LambdaExpr *E) {
386  if (!E)
387  return;
388  for (const LambdaCapture &C : E->captures()) {
389  if (C.capturesVariable()) {
390  if (C.getCaptureKind() == LCK_ByRef) {
391  const ValueDecl *VD = C.getCapturedVar();
392  markAsEscaped(VD);
393  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
394  VisitValueDecl(VD);
395  }
396  }
397  }
398  }
399  void VisitBlockExpr(const BlockExpr *E) {
400  if (!E)
401  return;
402  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
403  if (C.isByRef()) {
404  const VarDecl *VD = C.getVariable();
405  markAsEscaped(VD);
406  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
407  VisitValueDecl(VD);
408  }
409  }
410  }
411  void VisitCallExpr(const CallExpr *E) {
412  if (!E)
413  return;
414  for (const Expr *Arg : E->arguments()) {
415  if (!Arg)
416  continue;
417  if (Arg->isLValue()) {
418  const bool SavedAllEscaped = AllEscaped;
419  AllEscaped = true;
420  Visit(Arg);
421  AllEscaped = SavedAllEscaped;
422  } else {
423  Visit(Arg);
424  }
425  }
426  Visit(E->getCallee());
427  }
428  void VisitDeclRefExpr(const DeclRefExpr *E) {
429  if (!E)
430  return;
431  const ValueDecl *VD = E->getDecl();
432  if (AllEscaped)
433  markAsEscaped(VD);
434  if (isa<OMPCapturedExprDecl>(VD))
435  VisitValueDecl(VD);
436  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
437  if (VarD->isInitCapture())
438  VisitValueDecl(VD);
439  }
440  void VisitUnaryOperator(const UnaryOperator *E) {
441  if (!E)
442  return;
443  if (E->getOpcode() == UO_AddrOf) {
444  const bool SavedAllEscaped = AllEscaped;
445  AllEscaped = true;
446  Visit(E->getSubExpr());
447  AllEscaped = SavedAllEscaped;
448  } else {
449  Visit(E->getSubExpr());
450  }
451  }
452  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
453  if (!E)
454  return;
455  if (E->getCastKind() == CK_ArrayToPointerDecay) {
456  const bool SavedAllEscaped = AllEscaped;
457  AllEscaped = true;
458  Visit(E->getSubExpr());
459  AllEscaped = SavedAllEscaped;
460  } else {
461  Visit(E->getSubExpr());
462  }
463  }
464  void VisitExpr(const Expr *E) {
465  if (!E)
466  return;
467  bool SavedAllEscaped = AllEscaped;
468  if (!E->isLValue())
469  AllEscaped = false;
470  for (const Stmt *Child : E->children())
471  if (Child)
472  Visit(Child);
473  AllEscaped = SavedAllEscaped;
474  }
475  void VisitStmt(const Stmt *S) {
476  if (!S)
477  return;
478  for (const Stmt *Child : S->children())
479  if (Child)
480  Visit(Child);
481  }
482 
483  /// Returns the record that handles all the escaped local variables and is
484  /// used instead of their original storage.
485  const RecordDecl *getGlobalizedRecord() {
486  if (!GlobalizedRD)
487  buildRecordForGlobalizedVars();
488  return GlobalizedRD;
489  }
490 
491  /// Returns the field in the globalized record for the escaped variable.
492  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
493  assert(GlobalizedRD &&
494  "Record for globalized variables must be generated already.");
495  auto I = MappedDeclsFields.find(VD);
496  if (I == MappedDeclsFields.end())
497  return nullptr;
498  return I->getSecond();
499  }
500 
501  /// Returns the list of the escaped local variables/parameters.
502  ArrayRef<const ValueDecl *> getEscapedDecls() const {
503  return EscapedDecls.getArrayRef();
504  }
505 
506  /// Returns the set of escaped local variables that are actually parameters
507  /// passed by value.
508  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
509  return EscapedParameters;
510  }
511 
512  /// Returns the list of the escaped variables with the variably modified
513  /// types.
514  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
515  return EscapedVariableLengthDecls.getArrayRef();
516  }
517 };
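// Illustration only (not part of the original file): a sketch of what this
// visitor flags. In a target region such as
//   int x = 0;
//   #pragma omp parallel
//   x += 1;               // x is captured by reference and escapes
// 'x' must be globalized so worker threads can access the master's copy,
// while locals used by only one thread keep their private storage.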
518 } // anonymous namespace
519 
520 /// Get the GPU warp size.
521 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
522  return CGF.EmitRuntimeCall(
523  llvm::Intrinsic::getDeclaration(
524  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
525  "nvptx_warp_size");
526 }
527 
528 /// Get the id of the current thread on the GPU.
529 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
530  return CGF.EmitRuntimeCall(
531  llvm::Intrinsic::getDeclaration(
532  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
533  "nvptx_tid");
534 }
535 
536 /// Get the id of the warp in the block.
537 /// We assume that the warp size is 32, which is always the case
538 /// on the NVPTX device, to generate more efficient code.
539 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
540  CGBuilderTy &Bld = CGF.Builder;
541  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542 }
543 
544 /// Get the id of the current lane in the Warp.
545 /// We assume that the warp size is 32, which is always the case
546 /// on the NVPTX device, to generate more efficient code.
547 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548  CGBuilderTy &Bld = CGF.Builder;
549  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
550  "nvptx_lane_id");
551 }
552 
553 /// Get the maximum number of threads in a block of the GPU.
554 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
555  return CGF.EmitRuntimeCall(
556  llvm::Intrinsic::getDeclaration(
557  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
558  "nvptx_num_threads");
559 }
560 
561 /// Get barrier to synchronize all threads in a block.
562 static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
563  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
564  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
565 }
566 
567 /// Get barrier #ID to synchronize selected (multiple of warp size) threads in
568 /// a CTA.
569 static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
570  llvm::Value *NumThreads) {
571  CGBuilderTy &Bld = CGF.Builder;
572  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
573  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
574  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
575  Args);
576 }
577 
578 /// Synchronize all GPU threads in a block.
579 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
580 
581 /// Synchronize worker threads in a parallel region.
582 static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
583  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
584 }
585 
586 /// Get the value of the thread_limit clause in the teams directive.
587 /// For the 'generic' execution mode, the runtime encodes thread_limit in
588 /// the launch parameters, always starting thread_limit+warpSize threads per
589 /// CTA. The threads in the last warp are reserved for master execution.
590 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
591 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
592  bool IsInSPMDExecutionMode = false) {
593  CGBuilderTy &Bld = CGF.Builder;
594  return IsInSPMDExecutionMode
595  ? getNVPTXNumThreads(CGF)
596  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
597  "thread_limit");
598 }
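// Illustration only (not part of the original file): the same computation on
// plain integers, under the launch convention stated above.
static unsigned exampleThreadLimit(unsigned NTIDx, bool IsSPMD,
                                   unsigned WS = 32) {
  return IsSPMD ? NTIDx : NTIDx - WS; // e.g. 160 launched threads -> 128
}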
599 
600 /// Get the thread id of the OMP master thread.
601 /// The master thread id is the first thread (lane) of the last warp in the
602 /// GPU block. Warp size is assumed to be some power of 2.
603 /// Thread id is 0 indexed.
604 /// E.g: If NumThreads is 33, master id is 32.
605 /// If NumThreads is 64, master id is 32.
606 /// If NumThreads is 1024, master id is 992.
607 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
608  CGBuilderTy &Bld = CGF.Builder;
609  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
610 
611  // We assume that the warp size is a power of 2.
612  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
613 
614  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
615  Bld.CreateNot(Mask), "master_tid");
616 }
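// Illustration only (not part of the original file): the masked arithmetic
// above as plain integer code; it rounds NumThreads - 1 down to a multiple
// of the warp size.
static unsigned exampleMasterTID(unsigned NumThreads, unsigned WS = 32) {
  return (NumThreads - 1) & ~(WS - 1); // 33 -> 32, 64 -> 32, 1024 -> 992
}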
617 
618 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
619  CodeGenModule &CGM, SourceLocation Loc)
620  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
621  Loc(Loc) {
622  createWorkerFunction(CGM);
623 }
624 
625 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
626  CodeGenModule &CGM) {
627  // Create a worker function with no arguments.
628 
629  WorkerFn = llvm::Function::Create(
630  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
631  /*placeholder=*/"_worker", &CGM.getModule());
632  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
633  WorkerFn->setDoesNotRecurse();
634 }
635 
636 CGOpenMPRuntimeNVPTX::ExecutionMode
637 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
638  return CurrentExecutionMode;
639 }
640 
641 CGOpenMPRuntimeNVPTX::DataSharingMode
642 CGOpenMPRuntimeNVPTX::getDataSharingMode(CodeGenModule &CGM) {
643  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
644  : CGOpenMPRuntimeNVPTX::Generic;
645 }
646 
647 /// If \p Body is a \a CompoundStmt with exactly one child statement, returns
648 /// that child; otherwise returns \p Body itself.
649 static const Stmt *getSingleCompoundChild(const Stmt *Body) {
650  if (const auto *C = dyn_cast<CompoundStmt>(Body))
651  if (C->size() == 1)
652  return C->body_front();
653  return Body;
654 }
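// Illustration only (not part of the original file): given a target body like
//   #pragma omp target
//   {
//     #pragma omp parallel for
//     for (int i = 0; i < N; ++i) ...
//   }
// the CompoundStmt has exactly one child, so the nested directive is returned
// and can be pattern-matched by the hasNested*Directive helpers below.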
655 
656 /// Check if the parallel directive has an 'if' clause with a non-constant or
657 /// false condition, or strictly specifies the number of threads; such
658 /// directives must be run in non-SPMD mode.
659 static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
660  const OMPExecutableDirective &D) {
661  if (D.hasClausesOfKind<OMPNumThreadsClause>())
662  return true;
663  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
664  OpenMPDirectiveKind NameModifier = C->getNameModifier();
665  if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
666  continue;
667  const Expr *Cond = C->getCondition();
668  bool Result;
669  if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
670  return true;
671  }
672  return false;
673 }
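// Illustration only (not part of the original file): directives this
// predicate rejects for SPMD mode:
//   #pragma omp target parallel if(n > 1024)   // non-constant condition
//   #pragma omp target parallel num_threads(4) // thread count is specified
// whereas a constant-true 'if(1)', or no such clause at all, keeps SPMD
// execution possible.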
674 
675 /// Checks if the directive is a distribute directive with lastprivate
676 /// clauses. This construct does not support SPMD execution mode.
677 static bool hasDistributeWithLastprivateClauses(const OMPExecutableDirective &D) {
678  return isOpenMPDistributeDirective(D.getDirectiveKind()) &&
679  D.hasClausesOfKind<OMPLastprivateClause>();
680 }
681 
682 /// Check for inner (nested) SPMD construct, if any
683 static bool hasNestedSPMDDirective(ASTContext &Ctx,
684  const OMPExecutableDirective &D) {
685  const auto *CS = D.getInnermostCapturedStmt();
686  const auto *Body =
687  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
688  const Stmt *ChildStmt = getSingleCompoundChild(Body);
689 
690  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
691  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
692  switch (D.getDirectiveKind()) {
693  case OMPD_target:
694  if (isOpenMPParallelDirective(DKind) &&
695  !hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
696  !hasDistributeWithLastprivateClauses(*NestedDir))
697  return true;
698  if (DKind == OMPD_teams) {
699  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
700  /*IgnoreCaptured=*/true);
701  if (!Body)
702  return false;
703  ChildStmt = getSingleCompoundChild(Body);
704  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
705  DKind = NND->getDirectiveKind();
706  if (isOpenMPParallelDirective(DKind) &&
707  !hasParallelIfNumThreadsClause(Ctx, *NND) &&
708  !hasDistributeWithLastprivateClauses(*NND))
709  return true;
710  }
711  }
712  return false;
713  case OMPD_target_teams:
714  return isOpenMPParallelDirective(DKind) &&
715  !hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
716  !hasDistributeWithLastprivateClauses(*NestedDir);
717  case OMPD_target_simd:
718  case OMPD_target_parallel:
719  case OMPD_target_parallel_for:
720  case OMPD_target_parallel_for_simd:
721  case OMPD_target_teams_distribute:
722  case OMPD_target_teams_distribute_simd:
723  case OMPD_target_teams_distribute_parallel_for:
724  case OMPD_target_teams_distribute_parallel_for_simd:
725  case OMPD_parallel:
726  case OMPD_for:
727  case OMPD_parallel_for:
728  case OMPD_parallel_sections:
729  case OMPD_for_simd:
730  case OMPD_parallel_for_simd:
731  case OMPD_cancel:
732  case OMPD_cancellation_point:
733  case OMPD_ordered:
734  case OMPD_threadprivate:
735  case OMPD_task:
736  case OMPD_simd:
737  case OMPD_sections:
738  case OMPD_section:
739  case OMPD_single:
740  case OMPD_master:
741  case OMPD_critical:
742  case OMPD_taskyield:
743  case OMPD_barrier:
744  case OMPD_taskwait:
745  case OMPD_taskgroup:
746  case OMPD_atomic:
747  case OMPD_flush:
748  case OMPD_teams:
749  case OMPD_target_data:
750  case OMPD_target_exit_data:
751  case OMPD_target_enter_data:
752  case OMPD_distribute:
753  case OMPD_distribute_simd:
754  case OMPD_distribute_parallel_for:
755  case OMPD_distribute_parallel_for_simd:
756  case OMPD_teams_distribute:
757  case OMPD_teams_distribute_simd:
758  case OMPD_teams_distribute_parallel_for:
759  case OMPD_teams_distribute_parallel_for_simd:
760  case OMPD_target_update:
761  case OMPD_declare_simd:
762  case OMPD_declare_target:
763  case OMPD_end_declare_target:
764  case OMPD_declare_reduction:
765  case OMPD_taskloop:
766  case OMPD_taskloop_simd:
767  case OMPD_unknown:
768  llvm_unreachable("Unexpected directive.");
769  }
770  }
771 
772  return false;
773 }
774 
775 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
776  const OMPExecutableDirective &D) {
777  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
778  switch (DirectiveKind) {
779  case OMPD_target:
780  case OMPD_target_teams:
781  return hasNestedSPMDDirective(Ctx, D);
782  case OMPD_target_parallel:
783  case OMPD_target_parallel_for:
784  case OMPD_target_parallel_for_simd:
785  return !hasParallelIfNumThreadsClause(Ctx, D);
786  case OMPD_target_teams_distribute_parallel_for:
787  case OMPD_target_teams_distribute_parallel_for_simd:
788  // Distribute with lastprivates requires non-SPMD execution mode.
789  return !hasParallelIfNumThreadsClause(Ctx, D) &&
790  !hasDistributeWithLastprivateClauses(D);
791  case OMPD_target_simd:
792  case OMPD_target_teams_distribute:
793  case OMPD_target_teams_distribute_simd:
794  return false;
795  case OMPD_parallel:
796  case OMPD_for:
797  case OMPD_parallel_for:
798  case OMPD_parallel_sections:
799  case OMPD_for_simd:
800  case OMPD_parallel_for_simd:
801  case OMPD_cancel:
802  case OMPD_cancellation_point:
803  case OMPD_ordered:
804  case OMPD_threadprivate:
805  case OMPD_task:
806  case OMPD_simd:
807  case OMPD_sections:
808  case OMPD_section:
809  case OMPD_single:
810  case OMPD_master:
811  case OMPD_critical:
812  case OMPD_taskyield:
813  case OMPD_barrier:
814  case OMPD_taskwait:
815  case OMPD_taskgroup:
816  case OMPD_atomic:
817  case OMPD_flush:
818  case OMPD_teams:
819  case OMPD_target_data:
820  case OMPD_target_exit_data:
821  case OMPD_target_enter_data:
822  case OMPD_distribute:
823  case OMPD_distribute_simd:
824  case OMPD_distribute_parallel_for:
825  case OMPD_distribute_parallel_for_simd:
826  case OMPD_teams_distribute:
827  case OMPD_teams_distribute_simd:
828  case OMPD_teams_distribute_parallel_for:
829  case OMPD_teams_distribute_parallel_for_simd:
830  case OMPD_target_update:
831  case OMPD_declare_simd:
832  case OMPD_declare_target:
833  case OMPD_end_declare_target:
834  case OMPD_declare_reduction:
835  case OMPD_taskloop:
836  case OMPD_taskloop_simd:
837  case OMPD_unknown:
838  break;
839  }
840  llvm_unreachable(
841  "Unknown programming model for OpenMP directive on NVPTX target.");
842 }
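// Illustration only (not part of the original file): how the dispatch above
// classifies a few directives (absent if/num_threads/lastprivate clauses):
//   #pragma omp target teams distribute parallel for  // SPMD
//   #pragma omp target parallel                       // SPMD
//   #pragma omp target teams distribute               // never SPMD
//   #pragma omp target                                // SPMD only if a nested
//                                                     // parallel region qualifies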
843 
844 /// Check if the directive is loop-based and either has no schedule clause
845 /// at all or uses static scheduling.
846 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
847  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
848  isOpenMPLoopDirective(D.getDirectiveKind()) &&
849  "Expected loop-based directive.");
850  return !D.hasClausesOfKind<OMPOrderedClause>() &&
851  (!D.hasClausesOfKind<OMPScheduleClause>() ||
852  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
853  [](const OMPScheduleClause *C) {
854  return C->getScheduleKind() == OMPC_SCHEDULE_static;
855  }));
856 }
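// Illustration only (not part of the original file):
//   #pragma omp parallel for                      // true: no schedule clause
//   #pragma omp parallel for schedule(static, 4)  // true: static scheduling
//   #pragma omp parallel for schedule(dynamic)    // false
//   #pragma omp parallel for ordered              // false: ordered clause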
857 
858 /// Check for inner (nested) lightweight runtime construct, if any
859 static bool hasNestedLightweightDirective(ASTContext &Ctx,
860  const OMPExecutableDirective &D) {
861  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
862  const auto *CS = D.getInnermostCapturedStmt();
863  const auto *Body =
864  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
865  const Stmt *ChildStmt = getSingleCompoundChild(Body);
866 
867  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
868  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
869  switch (D.getDirectiveKind()) {
870  case OMPD_target:
871  if (isOpenMPParallelDirective(DKind) &&
872  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
873  hasStaticScheduling(*NestedDir))
874  return true;
875  if (DKind == OMPD_parallel) {
876  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
877  /*IgnoreCaptured=*/true);
878  if (!Body)
879  return false;
880  ChildStmt = getSingleCompoundChild(Body);
881  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
882  DKind = NND->getDirectiveKind();
883  if (isOpenMPWorksharingDirective(DKind) &&
884  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
885  return true;
886  }
887  } else if (DKind == OMPD_teams) {
888  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
889  /*IgnoreCaptured=*/true);
890  if (!Body)
891  return false;
892  ChildStmt = getSingleCompoundChild(Body);
893  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
894  DKind = NND->getDirectiveKind();
895  if (isOpenMPParallelDirective(DKind) &&
896  isOpenMPWorksharingDirective(DKind) &&
897  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
898  return true;
899  if (DKind == OMPD_parallel) {
900  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
901  /*IgnoreCaptured=*/true);
902  if (!Body)
903  return false;
904  ChildStmt = getSingleCompoundChild(Body);
905  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
906  DKind = NND->getDirectiveKind();
907  if (isOpenMPWorksharingDirective(DKind) &&
908  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
909  return true;
910  }
911  }
912  }
913  }
914  return false;
915  case OMPD_target_teams:
916  if (isOpenMPParallelDirective(DKind) &&
917  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
918  hasStaticScheduling(*NestedDir))
919  return true;
920  if (DKind == OMPD_parallel) {
921  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
922  /*IgnoreCaptured=*/true);
923  if (!Body)
924  return false;
925  ChildStmt = getSingleCompoundChild(Body);
926  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
927  DKind = NND->getDirectiveKind();
928  if (isOpenMPWorksharingDirective(DKind) &&
929  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
930  return true;
931  }
932  }
933  return false;
934  case OMPD_target_parallel:
935  return isOpenMPWorksharingDirective(DKind) &&
936  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
937  case OMPD_target_teams_distribute:
938  case OMPD_target_simd:
939  case OMPD_target_parallel_for:
940  case OMPD_target_parallel_for_simd:
941  case OMPD_target_teams_distribute_simd:
942  case OMPD_target_teams_distribute_parallel_for:
943  case OMPD_target_teams_distribute_parallel_for_simd:
944  case OMPD_parallel:
945  case OMPD_for:
946  case OMPD_parallel_for:
947  case OMPD_parallel_sections:
948  case OMPD_for_simd:
949  case OMPD_parallel_for_simd:
950  case OMPD_cancel:
951  case OMPD_cancellation_point:
952  case OMPD_ordered:
953  case OMPD_threadprivate:
954  case OMPD_task:
955  case OMPD_simd:
956  case OMPD_sections:
957  case OMPD_section:
958  case OMPD_single:
959  case OMPD_master:
960  case OMPD_critical:
961  case OMPD_taskyield:
962  case OMPD_barrier:
963  case OMPD_taskwait:
964  case OMPD_taskgroup:
965  case OMPD_atomic:
966  case OMPD_flush:
967  case OMPD_teams:
968  case OMPD_target_data:
969  case OMPD_target_exit_data:
970  case OMPD_target_enter_data:
971  case OMPD_distribute:
972  case OMPD_distribute_simd:
973  case OMPD_distribute_parallel_for:
974  case OMPD_distribute_parallel_for_simd:
975  case OMPD_teams_distribute:
976  case OMPD_teams_distribute_simd:
977  case OMPD_teams_distribute_parallel_for:
978  case OMPD_teams_distribute_parallel_for_simd:
979  case OMPD_target_update:
980  case OMPD_declare_simd:
981  case OMPD_declare_target:
982  case OMPD_end_declare_target:
983  case OMPD_declare_reduction:
984  case OMPD_taskloop:
985  case OMPD_taskloop_simd:
986  case OMPD_unknown:
987  llvm_unreachable("Unexpected directive.");
988  }
989  }
990 
991  return false;
992 }
993 
994 /// Checks if the construct supports lightweight runtime. It must be SPMD
995 /// construct + inner loop-based construct with static scheduling.
996 static bool supportsLightweightRuntime(ASTContext &Ctx,
997  const OMPExecutableDirective &D) {
998  if (!supportsSPMDExecutionMode(Ctx, D))
999  return false;
1000  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1001  switch (DirectiveKind) {
1002  case OMPD_target:
1003  case OMPD_target_teams:
1004  case OMPD_target_parallel:
1005  return hasNestedLightweightDirective(Ctx, D);
1006  case OMPD_target_parallel_for:
1007  case OMPD_target_parallel_for_simd:
1008  case OMPD_target_teams_distribute_parallel_for:
1009  case OMPD_target_teams_distribute_parallel_for_simd:
1010  // (Last|First)-privates must be shared in parallel region.
1011  return hasStaticScheduling(D);
1012  case OMPD_target_simd:
1013  case OMPD_target_teams_distribute:
1014  case OMPD_target_teams_distribute_simd:
1015  return false;
1016  case OMPD_parallel:
1017  case OMPD_for:
1018  case OMPD_parallel_for:
1019  case OMPD_parallel_sections:
1020  case OMPD_for_simd:
1021  case OMPD_parallel_for_simd:
1022  case OMPD_cancel:
1023  case OMPD_cancellation_point:
1024  case OMPD_ordered:
1025  case OMPD_threadprivate:
1026  case OMPD_task:
1027  case OMPD_simd:
1028  case OMPD_sections:
1029  case OMPD_section:
1030  case OMPD_single:
1031  case OMPD_master:
1032  case OMPD_critical:
1033  case OMPD_taskyield:
1034  case OMPD_barrier:
1035  case OMPD_taskwait:
1036  case OMPD_taskgroup:
1037  case OMPD_atomic:
1038  case OMPD_flush:
1039  case OMPD_teams:
1040  case OMPD_target_data:
1041  case OMPD_target_exit_data:
1042  case OMPD_target_enter_data:
1043  case OMPD_distribute:
1044  case OMPD_distribute_simd:
1045  case OMPD_distribute_parallel_for:
1046  case OMPD_distribute_parallel_for_simd:
1047  case OMPD_teams_distribute:
1048  case OMPD_teams_distribute_simd:
1049  case OMPD_teams_distribute_parallel_for:
1050  case OMPD_teams_distribute_parallel_for_simd:
1051  case OMPD_target_update:
1052  case OMPD_declare_simd:
1053  case OMPD_declare_target:
1054  case OMPD_end_declare_target:
1055  case OMPD_declare_reduction:
1056  case OMPD_taskloop:
1057  case OMPD_taskloop_simd:
1058  case OMPD_unknown:
1059  break;
1060  }
1061  llvm_unreachable(
1062  "Unknown programming model for OpenMP directive on NVPTX target.");
1063 }
1064 
1065 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1066  StringRef ParentName,
1067  llvm::Function *&OutlinedFn,
1068  llvm::Constant *&OutlinedFnID,
1069  bool IsOffloadEntry,
1070  const RegionCodeGenTy &CodeGen) {
1071  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
1072  EntryFunctionState EST;
1073  WorkerFunctionState WST(CGM, D.getBeginLoc());
1074  Work.clear();
1075  WrapperFunctionsMap.clear();
1076 
1077  // Emit target region as a standalone region.
1078  class NVPTXPrePostActionTy : public PrePostActionTy {
1079  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1080  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1081 
1082  public:
1083  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1084  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1085  : EST(EST), WST(WST) {}
1086  void Enter(CodeGenFunction &CGF) override {
1087  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1088  .emitNonSPMDEntryHeader(CGF, EST, WST);
1089  }
1090  void Exit(CodeGenFunction &CGF) override {
1091  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1092  .emitNonSPMDEntryFooter(CGF, EST);
1093  }
1094  } Action(EST, WST);
1095  CodeGen.setAction(Action);
1096  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1097  IsOffloadEntry, CodeGen);
1098 
1099  // Now change the name of the worker function to correspond to this target
1100  // region's entry function.
1101  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1102 
1103  // Create the worker function
1104  emitWorkerFunction(WST);
1105 }
1106 
1107 // Setup NVPTX threads for master-worker OpenMP scheme.
1108 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1109  EntryFunctionState &EST,
1110  WorkerFunctionState &WST) {
1111  CGBuilderTy &Bld = CGF.Builder;
1112 
1113  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1114  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1115  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1116  EST.ExitBB = CGF.createBasicBlock(".exit");
1117 
1118  llvm::Value *IsWorker =
1119  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1120  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1121 
1122  CGF.EmitBlock(WorkerBB);
1123  emitCall(CGF, WST.Loc, WST.WorkerFn);
1124  CGF.EmitBranch(EST.ExitBB);
1125 
1126  CGF.EmitBlock(MasterCheckBB);
1127  llvm::Value *IsMaster =
1128  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1129  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1130 
1131  CGF.EmitBlock(MasterBB);
1132  IsInTargetMasterThreadRegion = true;
1133  // SEQUENTIAL (MASTER) REGION START
1134  // First action in sequential region:
1135  // Initialize the state of the OpenMP runtime library on the GPU.
1136  // TODO: Optimize runtime initialization and pass in correct value.
1137  llvm::Value *Args[] = {getThreadLimit(CGF),
1138  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1139  CGF.EmitRuntimeCall(
1140  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1141 
1142  // For data sharing, we need to initialize the stack.
1143  CGF.EmitRuntimeCall(
1144  createNVPTXRuntimeFunction(
1145  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1146 
1147  emitGenericVarsProlog(CGF, WST.Loc);
1148 }
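// Illustration only (not part of the original file): the header above shapes
// each generic-mode kernel roughly like this CUDA-style pseudocode:
//   if (tid < thread_limit) { worker(); goto exit; } // worker warps
//   if (tid != master_tid) goto exit;                // surplus last-warp threads
//   __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//   __kmpc_data_sharing_init_stack();
//   ... sequential master region, handing parallel work to the workers ...
// exit: ;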
1149 
1150 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1151  EntryFunctionState &EST) {
1152  IsInTargetMasterThreadRegion = false;
1153  if (!CGF.HaveInsertPoint())
1154  return;
1155 
1156  emitGenericVarsEpilog(CGF);
1157 
1158  if (!EST.ExitBB)
1159  EST.ExitBB = CGF.createBasicBlock(".exit");
1160 
1161  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1162  CGF.EmitBranch(TerminateBB);
1163 
1164  CGF.EmitBlock(TerminateBB);
1165  // Signal termination condition.
1166  // TODO: Optimize runtime initialization and pass in correct value.
1167  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1168  CGF.EmitRuntimeCall(
1169  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1170  // Barrier to terminate worker threads.
1171  syncCTAThreads(CGF);
1172  // Master thread jumps to exit point.
1173  CGF.EmitBranch(EST.ExitBB);
1174 
1175  CGF.EmitBlock(EST.ExitBB);
1176  EST.ExitBB = nullptr;
1177 }
1178 
1179 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1180  StringRef ParentName,
1181  llvm::Function *&OutlinedFn,
1182  llvm::Constant *&OutlinedFnID,
1183  bool IsOffloadEntry,
1184  const RegionCodeGenTy &CodeGen) {
1185  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
1186  EntryFunctionState EST;
1187 
1188  // Emit target region as a standalone region.
1189  class NVPTXPrePostActionTy : public PrePostActionTy {
1190  CGOpenMPRuntimeNVPTX &RT;
1191  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1192  const OMPExecutableDirective &D;
1193 
1194  public:
1195  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1196  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1197  const OMPExecutableDirective &D)
1198  : RT(RT), EST(EST), D(D) {}
1199  void Enter(CodeGenFunction &CGF) override {
1200  RT.emitSPMDEntryHeader(CGF, EST, D);
1201  }
1202  void Exit(CodeGenFunction &CGF) override {
1203  RT.emitSPMDEntryFooter(CGF, EST);
1204  }
1205  } Action(*this, EST, D);
1206  CodeGen.setAction(Action);
1207  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1208  IsOffloadEntry, CodeGen);
1209 }
1210 
1211 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1212  CodeGenFunction &CGF, EntryFunctionState &EST,
1213  const OMPExecutableDirective &D) {
1214  CGBuilderTy &Bld = CGF.Builder;
1215 
1216  // Setup BBs in entry function.
1217  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1218  EST.ExitBB = CGF.createBasicBlock(".exit");
1219 
1220  // Initialize the OMP state in the runtime; called by all active threads.
1221  bool RequiresFullRuntime = CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1222  !supportsLightweightRuntime(CGF.getContext(), D);
1223  llvm::Value *Args[] = {
1224  getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1225  /*RequiresOMPRuntime=*/
1226  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1227  /*RequiresDataSharing=*/Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
1228  CGF.EmitRuntimeCall(
1229  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1230 
1231  if (RequiresFullRuntime) {
1232  // For data sharing, we need to initialize the stack.
1233  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1234  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1235  }
1236 
1237  CGF.EmitBranch(ExecuteBB);
1238 
1239  CGF.EmitBlock(ExecuteBB);
1240 
1241  IsInTargetMasterThreadRegion = true;
1242 }
1243 
1244 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1245  EntryFunctionState &EST) {
1246  IsInTargetMasterThreadRegion = false;
1247  if (!CGF.HaveInsertPoint())
1248  return;
1249 
1250  if (!EST.ExitBB)
1251  EST.ExitBB = CGF.createBasicBlock(".exit");
1252 
1253  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1254  CGF.EmitBranch(OMPDeInitBB);
1255 
1256  CGF.EmitBlock(OMPDeInitBB);
1257  // DeInitialize the OMP state in the runtime; called by all active threads.
1258  CGF.EmitRuntimeCall(
1259  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
1260  CGF.EmitBranch(EST.ExitBB);
1261 
1262  CGF.EmitBlock(EST.ExitBB);
1263  EST.ExitBB = nullptr;
1264 }
1265 
1266 // Create a unique global variable to indicate the execution mode of this target
1267 // region. The execution mode is either 'generic', or 'spmd' depending on the
1268 // target directive. This variable is picked up by the offload library to setup
1269 // the device appropriately before kernel launch. If the execution mode is
1270 // 'generic', the runtime reserves one warp for the master, otherwise, all
1271 // warps participate in parallel work.
1272 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1273  bool Mode) {
1274  auto *GVMode =
1275  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1276  llvm::GlobalValue::WeakAnyLinkage,
1277  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1278  Twine(Name, "_exec_mode"));
1279  CGM.addCompilerUsedGlobal(GVMode);
1280 }
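// Illustration only (not part of the original file): for a hypothetical
// kernel named __omp_offloading_xx_main_l10 compiled in generic mode, this
// emits roughly the following IR (i8 0 would mean SPMD):
//   @__omp_offloading_xx_main_l10_exec_mode = weak constant i8 1
// which libomptarget reads before configuring the kernel launch.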
1281 
1282 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1283  ASTContext &Ctx = CGM.getContext();
1284 
1285  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1286  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1287  WST.Loc, WST.Loc);
1288  emitWorkerLoop(CGF, WST);
1289  CGF.FinishFunction();
1290 }
1291 
1292 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1293  WorkerFunctionState &WST) {
1294  //
1295  // The workers enter this loop and wait for parallel work from the master.
1296  // When the master encounters a parallel region it sets up the work + variable
1297  // arguments, and wakes up the workers. The workers first check to see if
1298  // they are required for the parallel region, i.e., within the # of requested
1299  // parallel threads. The activated workers load the variable arguments and
1300  // execute the parallel work.
1301  //
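  // Illustration only (not part of the original file): the loop emitted below,
  // as CUDA-style pseudocode executed by every worker thread:
  //   for (;;) {
  //     __syncthreads();                   // wait for work from the master
  //     bool active = __kmpc_kernel_parallel(&fn, /*RequiresOMPRuntime=*/1);
  //     if (fn == nullptr) break;          // termination signal
  //     if (active) { fn(/*level=*/0, tid); __kmpc_kernel_end_parallel(); }
  //     __syncthreads();                   // rejoin before the next region
  //   }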
1302 
1303  CGBuilderTy &Bld = CGF.Builder;
1304 
1305  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1306  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1307  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1308  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1309  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1310  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1311 
1312  CGF.EmitBranch(AwaitBB);
1313 
1314  // Workers wait for work from master.
1315  CGF.EmitBlock(AwaitBB);
1316  // Wait for parallel work
1317  syncCTAThreads(CGF);
1318 
1319  Address WorkFn =
1320  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1321  Address ExecStatus =
1322  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1323  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1324  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1325 
1326  // TODO: Optimize runtime initialization and pass in correct value.
1327  llvm::Value *Args[] = {WorkFn.getPointer(),
1328  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1329  llvm::Value *Ret = CGF.EmitRuntimeCall(
1330  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1331  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1332 
1333  // On termination condition (workid == 0), exit loop.
1334  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1335  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1336  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1337 
1338  // Activate requested workers.
1339  CGF.EmitBlock(SelectWorkersBB);
1340  llvm::Value *IsActive =
1341  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1342  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1343 
1344  // Signal start of parallel region.
1345  CGF.EmitBlock(ExecuteBB);
1346 
1347  // Process work items: outlined parallel functions.
1348  for (llvm::Function *W : Work) {
1349  // Try to match this outlined function.
1350  llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1351 
1352  llvm::Value *WorkFnMatch =
1353  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1354 
1355  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1356  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1357  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1358 
1359  // Execute this outlined function.
1360  CGF.EmitBlock(ExecuteFNBB);
1361 
1362  // Insert call to work function via shared wrapper. The shared
1363  // wrapper takes two arguments:
1364  // - the parallelism level;
1365  // - the thread ID;
1366  emitCall(CGF, WST.Loc, W,
1367  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1368 
1369  // Go to end of parallel region.
1370  CGF.EmitBranch(TerminateBB);
1371 
1372  CGF.EmitBlock(CheckNextBB);
1373  }
1374  // Default case: call to outlined function through pointer if the target
1375  // region makes a declare target call that may contain an orphaned parallel
1376  // directive.
1377  auto *ParallelFnTy =
1378  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1379  /*isVarArg=*/false)
1380  ->getPointerTo();
1381  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
1382  // Insert call to work function via shared wrapper. The shared
1383  // wrapper takes two arguments:
1384  // - the parallelism level;
1385  // - the thread ID;
1386  emitCall(CGF, WST.Loc, WorkFnCast,
1387  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1388  // Go to end of parallel region.
1389  CGF.EmitBranch(TerminateBB);
1390 
1391  // Signal end of parallel region.
1392  CGF.EmitBlock(TerminateBB);
1393  CGF.EmitRuntimeCall(
1394  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1395  llvm::None);
1396  CGF.EmitBranch(BarrierBB);
1397 
1398  // All active and inactive workers wait at a barrier after parallel region.
1399  CGF.EmitBlock(BarrierBB);
1400  // Barrier after parallel region.
1401  syncCTAThreads(CGF);
1402  CGF.EmitBranch(AwaitBB);
1403 
1404  // Exit target region.
1405  CGF.EmitBlock(ExitBB);
1406 }
1407 
1408 /// Returns the specified OpenMP runtime function for the current OpenMP
1409 /// implementation. Specialized for the NVPTX device.
1410 /// \param Function OpenMP runtime function.
1411 /// \return Specified function.
1412 llvm::Constant *
1413 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1414  llvm::Constant *RTLFn = nullptr;
1415  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1416  case OMPRTL_NVPTX__kmpc_kernel_init: {
1417  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1418  // RequiresOMPRuntime);
1419  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1420  auto *FnTy =
1421  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1422  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1423  break;
1424  }
1425  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1426  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1427  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1428  auto *FnTy =
1429  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1430  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1431  break;
1432  }
1433  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1434  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1435  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1436  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1437  auto *FnTy =
1438  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1439  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1440  break;
1441  }
1442  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
1443  // Build void __kmpc_spmd_kernel_deinit();
1444  auto *FnTy =
1445  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1446  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
1447  break;
1448  }
1449  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1450  /// Build void __kmpc_kernel_prepare_parallel(
1451  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1452  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1453  auto *FnTy =
1454  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1455  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1456  break;
1457  }
1458  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1459  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1460  /// int16_t IsOMPRuntimeInitialized);
1461  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1462  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1463  auto *FnTy =
1464  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1465  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1466  break;
1467  }
1468  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1469  /// Build void __kmpc_kernel_end_parallel();
1470  auto *FnTy =
1471  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1472  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1473  break;
1474  }
1475  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1476  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1477  // global_tid);
1478  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1479  auto *FnTy =
1480  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1481  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1482  break;
1483  }
1484  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1485  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1486  // global_tid);
1487  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1488  auto *FnTy =
1489  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1490  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1491  break;
1492  }
1493  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1494  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1495  // int16_t lane_offset, int16_t warp_size);
1496  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1497  auto *FnTy =
1498  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1499  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1500  break;
1501  }
1502  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1503  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1504  // int16_t lane_offset, int16_t warp_size);
1505  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1506  auto *FnTy =
1507  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1508  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1509  break;
1510  }
1511  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
1512  // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
1513  // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1514  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1515  // lane_offset, int16_t Algorithm Version),
1516  // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1517  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1518  CGM.Int16Ty, CGM.Int16Ty};
1519  auto *ShuffleReduceFnTy =
1520  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1521  /*isVarArg=*/false);
1522  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1523  auto *InterWarpCopyFnTy =
1524  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1525  /*isVarArg=*/false);
1526  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1527  CGM.Int32Ty,
1528  CGM.SizeTy,
1529  CGM.VoidPtrTy,
1530  ShuffleReduceFnTy->getPointerTo(),
1531  InterWarpCopyFnTy->getPointerTo()};
1532  auto *FnTy =
1533  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1534  RTLFn = CGM.CreateRuntimeFunction(
1535  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
1536  break;
1537  }
1538  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
1539  // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
1540  // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1541  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1542  // lane_offset, int16_t Algorithm Version),
1543  // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1544  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1545  CGM.Int16Ty, CGM.Int16Ty};
1546  auto *ShuffleReduceFnTy =
1547  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1548  /*isVarArg=*/false);
1549  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1550  auto *InterWarpCopyFnTy =
1551  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1552  /*isVarArg=*/false);
1553  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1554  CGM.Int32Ty,
1555  CGM.SizeTy,
1556  CGM.VoidPtrTy,
1557  ShuffleReduceFnTy->getPointerTo(),
1558  InterWarpCopyFnTy->getPointerTo()};
1559  auto *FnTy =
1560  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1561  RTLFn = CGM.CreateRuntimeFunction(
1562  FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
1563  break;
1564  }
1565  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
1566  // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
1567  // int32_t num_vars, size_t reduce_size, void *reduce_data,
1568  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1569  // lane_offset, int16_t shortCircuit),
1570  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
1571  // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
1572  // int32_t index, int32_t width),
1573  // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
1574  // int32_t index, int32_t width, int32_t reduce))
1575  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1576  CGM.Int16Ty, CGM.Int16Ty};
1577  auto *ShuffleReduceFnTy =
1578  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1579  /*isVarArg=*/false);
1580  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1581  auto *InterWarpCopyFnTy =
1582  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1583  /*isVarArg=*/false);
1584  llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
1585  CGM.Int32Ty, CGM.Int32Ty};
1586  auto *CopyToScratchpadFnTy =
1587  llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
1588  /*isVarArg=*/false);
1589  llvm::Type *LoadReduceTypeParams[] = {
1590  CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
1591  auto *LoadReduceFnTy =
1592  llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
1593  /*isVarArg=*/false);
1594  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1595  CGM.Int32Ty,
1596  CGM.SizeTy,
1597  CGM.VoidPtrTy,
1598  ShuffleReduceFnTy->getPointerTo(),
1599  InterWarpCopyFnTy->getPointerTo(),
1600  CopyToScratchpadFnTy->getPointerTo(),
1601  LoadReduceFnTy->getPointerTo()};
1602  auto *FnTy =
1603  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1604  RTLFn = CGM.CreateRuntimeFunction(
1605  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
1606  break;
1607  }
1608  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1609  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1610  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1611  auto *FnTy =
1612  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1613  RTLFn = CGM.CreateRuntimeFunction(
1614  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1615  break;
1616  }
1617  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1618  /// Build void __kmpc_data_sharing_init_stack();
1619  auto *FnTy =
1620  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1621  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1622  break;
1623  }
1624  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1625  /// Build void __kmpc_data_sharing_init_stack_spmd();
1626  auto *FnTy =
1627  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1628  RTLFn =
1629  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1630  break;
1631  }
1632  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
1633  // Build void *__kmpc_data_sharing_push_stack(size_t size,
1634  // int16_t UseSharedMemory);
1635  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1636  auto *FnTy =
1637  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1638  RTLFn = CGM.CreateRuntimeFunction(
1639  FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
1640  break;
1641  }
1642  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1643  // Build void __kmpc_data_sharing_pop_stack(void *a);
1644  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1645  auto *FnTy =
1646  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1647  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1648  /*Name=*/"__kmpc_data_sharing_pop_stack");
1649  break;
1650  }
1651  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1652  /// Build void __kmpc_begin_sharing_variables(void ***args,
1653  /// size_t n_args);
1654  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1655  auto *FnTy =
1656  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1657  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1658  break;
1659  }
1660  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1661  /// Build void __kmpc_end_sharing_variables();
1662  auto *FnTy =
1663  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1664  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1665  break;
1666  }
1667  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1668  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1669  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1670  auto *FnTy =
1671  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1672  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1673  break;
1674  }
1675  case OMPRTL_NVPTX__kmpc_parallel_level: {
1676  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1677  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1678  auto *FnTy =
1679  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1680  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1681  break;
1682  }
1683  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1684  // Build int8_t __kmpc_is_spmd_exec_mode();
1685  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1686  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1687  break;
1688  }
1689  }
1690  return RTLFn;
1691 }
1692 
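// Illustrative sketch (hypothetical caller, not a line of this file): the
// declarations above are consumed with EmitRuntimeCall, e.g. for the 32-bit
// shuffle:
//
//   llvm::Value *Args[] = {ElemCast, Offset, WarpSize};
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_shuffle_int32), Args);
//
// CreateRuntimeFunction only declares the symbol; the definitions come from
// the NVPTX device runtime library.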
1693 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1694  llvm::Constant *Addr,
1695  uint64_t Size, int32_t,
1696  llvm::GlobalValue::LinkageTypes) {
1697  // TODO: Add support for global variables on the device after declare target
1698  // support.
1699  if (!isa<llvm::Function>(Addr))
1700  return;
1701  llvm::Module &M = CGM.getModule();
1702  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1703 
1704  // Get "nvvm.annotations" metadata node
1705  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1706 
1707  llvm::Metadata *MDVals[] = {
1708  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1709  llvm::ConstantAsMetadata::get(
1710  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1711  // Append metadata to nvvm.annotations
1712  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1713 }
1714 
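// Illustrative sketch of the resulting module-level IR, assuming a kernel
// named @foo:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @foo, !"kernel", i32 1}
//
// The NVPTX backend reads this annotation and emits @foo as a kernel entry
// point rather than a plain device function.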
1715 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1716  const OMPExecutableDirective &D, StringRef ParentName,
1717  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1718  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1719  if (!IsOffloadEntry) // Nothing to do.
1720  return;
1721 
1722  assert(!ParentName.empty() && "Invalid target region parent name!");
1723 
1724  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1725  if (Mode)
1726  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1727  CodeGen);
1728  else
1729  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1730  CodeGen);
1731 
1732  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1733 }
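// Illustrative example (hypothetical source; classification is performed by
// supportsSPMDExecutionMode): a combined construct such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i) a[i] += b[i];
//
// is typically emitted as an SPMD kernel, while a '#pragma omp target' whose
// parallelism cannot be proven statically takes the non-SPMD path with the
// master/worker state machine.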
1734 
1735 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1736  : CGOpenMPRuntime(CGM, "_", "$") {
1737  if (!CGM.getLangOpts().OpenMPIsDevice)
1738  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1739 }
1740 
1742  OpenMPProcBindClauseKind ProcBind,
1743  SourceLocation Loc) {
1744  // Do nothing in case of SPMD mode and L0 parallel.
1745  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1746  return;
1747 
1748  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1749 }
1750 
1752  llvm::Value *NumThreads,
1753  SourceLocation Loc) {
1754  // Do nothing in case of SPMD mode and L0 parallel.
1755  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1756  return;
1757 
1758  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1759 }
1760 
1762  const Expr *NumTeams,
1763  const Expr *ThreadLimit,
1764  SourceLocation Loc) {}
1765 
1766 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1767  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1768  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1769  // Emit target region as a standalone region.
1770  class NVPTXPrePostActionTy : public PrePostActionTy {
1771  bool &IsInParallelRegion;
1772  bool PrevIsInParallelRegion;
1773 
1774  public:
1775  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1776  : IsInParallelRegion(IsInParallelRegion) {}
1777  void Enter(CodeGenFunction &CGF) override {
1778  PrevIsInParallelRegion = IsInParallelRegion;
1779  IsInParallelRegion = true;
1780  }
1781  void Exit(CodeGenFunction &CGF) override {
1782  IsInParallelRegion = PrevIsInParallelRegion;
1783  }
1784  } Action(IsInParallelRegion);
1785  CodeGen.setAction(Action);
1786  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1787  IsInTargetMasterThreadRegion = false;
1788  auto *OutlinedFun =
1789  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1790  D, ThreadIDVar, InnermostKind, CodeGen));
1791  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1792  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1793  !IsInParallelRegion) {
1794  llvm::Function *WrapperFun =
1795  createParallelDataSharingWrapper(OutlinedFun, D);
1796  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1797  }
1798 
1799  return OutlinedFun;
1800 }
1801 
1802 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
1803  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1804  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1805  SourceLocation Loc = D.getBeginLoc();
1806 
1807  // Emit target region as a standalone region.
1808  class NVPTXPrePostActionTy : public PrePostActionTy {
1809  SourceLocation &Loc;
1810 
1811  public:
1812  NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1813  void Enter(CodeGenFunction &CGF) override {
1814  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1815  .emitGenericVarsProlog(CGF, Loc);
1816  }
1817  void Exit(CodeGenFunction &CGF) override {
1818  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1819  .emitGenericVarsEpilog(CGF);
1820  }
1821  } Action(Loc);
1822  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
1823  CodeGen.setAction(Action);
1824  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1825  D, ThreadIDVar, InnermostKind, CodeGen);
1826  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1827  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1828  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1829  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1830 
1831  return OutlinedFun;
1832 }
1833 
1834 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1835  SourceLocation Loc,
1836  bool WithSPMDCheck) {
1837  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1838  return;
1839 
1840  CGBuilderTy &Bld = CGF.Builder;
1841 
1842  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1843  if (I == FunctionGlobalizedDecls.end())
1844  return;
1845  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1846  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1847 
1848  // Recover pointer to this function's global record. The runtime will
1849  // handle the specifics of the allocation of the memory.
1850  // Use actual memory size of the record including the padding
1851  // for alignment purposes.
1852  unsigned Alignment =
1853  CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1854  unsigned GlobalRecordSize =
1855  CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1856  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1857 
1858  llvm::Value *GlobalRecCastAddr;
1859  if (WithSPMDCheck ||
1860  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
1861  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1862  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1863  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1864  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
1865  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
1866  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1867  // There is no need to emit line number for unconditional branch.
1868  (void)ApplyDebugLocation::CreateEmpty(CGF);
1869  CGF.EmitBlock(SPMDBB);
1870  Address RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack");
1871  CGF.EmitBranch(ExitBB);
1872  // There is no need to emit line number for unconditional branch.
1873  (void)ApplyDebugLocation::CreateEmpty(CGF);
1874  CGF.EmitBlock(NonSPMDBB);
1875  // TODO: allow the usage of shared memory to be controlled by
1876  // the user, for now, default to global.
1877  llvm::Value *GlobalRecordSizeArg[] = {
1878  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1879  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1880  llvm::Value *GlobalRecValue =
1881  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1882  OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1883  GlobalRecordSizeArg);
1884  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1885  GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1886  CGF.EmitBlock(ExitBB);
1887  auto *Phi = Bld.CreatePHI(GlobalRecCastAddr->getType(),
1888  /*NumReservedValues=*/2, "_select_stack");
1889  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1890  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1891  GlobalRecCastAddr = Phi;
1892  I->getSecond().GlobalRecordAddr = Phi;
1893  I->getSecond().IsInSPMDModeFlag = IsSPMD;
1894  } else {
1895  assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD &&
1896  "Expected Non-SPMD construct.");
1897  // TODO: allow the usage of shared memory to be controlled by
1898  // the user, for now, default to global.
1899  llvm::Value *GlobalRecordSizeArg[] = {
1900  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1901  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1902  llvm::Value *GlobalRecValue =
1903  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1904  OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1905  GlobalRecordSizeArg);
1906  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1907  GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1908  I->getSecond().GlobalRecordAddr = GlobalRecValue;
1909  I->getSecond().IsInSPMDModeFlag = nullptr;
1910  }
1911  LValue Base =
1912  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1913 
1914  // Emit the "global alloca" which is a GEP from the global declaration
1915  // record using the pointer returned by the runtime.
1916  for (auto &Rec : I->getSecond().LocalVarData) {
1917  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1918  llvm::Value *ParValue;
1919  if (EscapedParam) {
1920  const auto *VD = cast<VarDecl>(Rec.first);
1921  LValue ParLVal =
1922  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1923  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1924  }
1925  const FieldDecl *FD = Rec.second.first;
1926  LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1927  Rec.second.second = VarAddr.getAddress();
1928  if (EscapedParam) {
1929  const auto *VD = cast<VarDecl>(Rec.first);
1930  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1931  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1932  }
1933  }
1934  }
1935  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1936  // Recover pointer to this function's global record. The runtime will
1937  // handle the specifics of the allocation of the memory.
1938  // Use actual memory size of the record including the padding
1939  // for alignment purposes.
1940  CGBuilderTy &Bld = CGF.Builder;
1941  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1942  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1943  Size = Bld.CreateNUWAdd(
1944  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1945  llvm::Value *AlignVal =
1946  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1947  Size = Bld.CreateUDiv(Size, AlignVal);
1948  Size = Bld.CreateNUWMul(Size, AlignVal);
1949  // TODO: allow the usage of shared memory to be controlled by
1950  // the user, for now, default to global.
1951  llvm::Value *GlobalRecordSizeArg[] = {
1952  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1953  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1954  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1955  GlobalRecordSizeArg);
1956  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1957  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1958  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1959  CGM.getContext().getDeclAlign(VD),
1960  AlignmentSource::Decl);
1961  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1962  Base.getAddress());
1963  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1964  }
1965  I->getSecond().MappedParams->apply(CGF);
1966 }
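// Worked example of the rounding above (hypothetical numbers): for an escaped
// variable-length declaration of 10 bytes with 8-byte alignment,
//
//   Size = 10 + (8 - 1) = 17;  Size = (17 / 8) * 8 = 16
//
// so __kmpc_data_sharing_push_stack always receives a size rounded up to the
// declaration's alignment.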
1967 
1968 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
1969  bool WithSPMDCheck) {
1970  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1971  return;
1972 
1973  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1974  if (I != FunctionGlobalizedDecls.end()) {
1975  I->getSecond().MappedParams->restore(CGF);
1976  if (!CGF.HaveInsertPoint())
1977  return;
1978  for (llvm::Value *Addr :
1979  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1980  CGF.EmitRuntimeCall(
1981  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1982  Addr);
1983  }
1984  if (I->getSecond().GlobalRecordAddr) {
1985  if (WithSPMDCheck ||
1986  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
1987  CGBuilderTy &Bld = CGF.Builder;
1988  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1989  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1990  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
1991  // There is no need to emit line number for unconditional branch.
1992  (void)ApplyDebugLocation::CreateEmpty(CGF);
1993  CGF.EmitBlock(NonSPMDBB);
1994  CGF.EmitRuntimeCall(
1995  createNVPTXRuntimeFunction(
1996  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1997  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
1998  CGF.EmitBlock(ExitBB);
1999  } else {
2000  assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD &&
2001  "Expected Non-SPMD mode.");
2003  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2004  I->getSecond().GlobalRecordAddr);
2005  }
2006  }
2007  }
2008 }
2009 
2010 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2011  const OMPExecutableDirective &D,
2012  SourceLocation Loc,
2013  llvm::Value *OutlinedFn,
2014  ArrayRef<llvm::Value *> CapturedVars) {
2015  if (!CGF.HaveInsertPoint())
2016  return;
2017 
2018  Address ZeroAddr = CGF.CreateMemTemp(
2019  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2020  /*Name*/ ".zero.addr");
2021  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2022  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2023  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2024  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2025  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2026  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2027 }
2028 
2029 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2030  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2031  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2032  if (!CGF.HaveInsertPoint())
2033  return;
2034 
2035  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2036  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2037  else
2038  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2039 }
2040 
2041 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2042  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2043  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2044  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2045 
2046  // Force inline this outlined function at its call site.
2047  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2048 
2049  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2050  /*DestWidth=*/32, /*Signed=*/1),
2051  ".zero.addr");
2052  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2053  // ThreadId for serialized parallels is 0.
2054  Address ThreadIDAddr = ZeroAddr;
2055  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2056  CodeGenFunction &CGF, PrePostActionTy &Action) {
2057  Action.Enter(CGF);
2058 
2059  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2060  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2061  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2062  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2063  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2064  };
2065  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2066  PrePostActionTy &) {
2067 
2068  RegionCodeGenTy RCG(CodeGen);
2069  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2070  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2071  llvm::Value *Args[] = {RTLoc, ThreadID};
2072 
2073  NVPTXActionTy Action(
2074  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2075  Args,
2076  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2077  Args);
2078  RCG.setAction(Action);
2079  RCG(CGF);
2080  };
2081 
2082  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2083  PrePostActionTy &Action) {
2084  CGBuilderTy &Bld = CGF.Builder;
2085  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2086  assert(WFn && "Wrapper function does not exist!");
2087  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2088 
2089  // Prepare for parallel region. Indicate the outlined function.
2090  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2091  CGF.EmitRuntimeCall(
2092  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2093  Args);
2094 
2095  // Create a private scope that will globalize the arguments
2096  // passed from the outside of the target region.
2097  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2098 
2099  // There's something to share.
2100  if (!CapturedVars.empty()) {
2101  // Share the addresses of the captured variables with the workers.
2102  Address SharedArgs =
2103  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2104  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2105 
2106  llvm::Value *DataSharingArgs[] = {
2107  SharedArgsPtr,
2108  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2109  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2110  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2111  DataSharingArgs);
2112 
2113  // Store variable address in a list of references to pass to workers.
2114  unsigned Idx = 0;
2115  ASTContext &Ctx = CGF.getContext();
2116  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2117  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2118  .castAs<PointerType>());
2119  for (llvm::Value *V : CapturedVars) {
2120  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2121  CGF.getPointerSize());
2122  llvm::Value *PtrV;
2123  if (V->getType()->isIntegerTy())
2124  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2125  else
2126  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2127  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2128  Ctx.getPointerType(Ctx.VoidPtrTy));
2129  ++Idx;
2130  }
2131  }
2132 
2133  // Activate workers. This barrier is used by the master to signal
2134  // work for the workers.
2135  syncCTAThreads(CGF);
2136 
2137  // OpenMP [2.5, Parallel Construct, p.49]
2138  // There is an implied barrier at the end of a parallel region. After the
2139  // end of a parallel region, only the master thread of the team resumes
2140  // execution of the enclosing task region.
2141  //
2142  // The master waits at this barrier until all workers are done.
2143  syncCTAThreads(CGF);
2144 
2145  if (!CapturedVars.empty())
2146  CGF.EmitRuntimeCall(
2147  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2148 
2149  // Remember for post-processing in worker loop.
2150  Work.emplace_back(WFn);
2151  };
2152 
2153  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen,
2154  &ThreadIDAddr](CodeGenFunction &CGF,
2155  PrePostActionTy &Action) {
2156  RegionCodeGenTy RCG(CodeGen);
2157  if (IsInParallelRegion) {
2158  SeqGen(CGF, Action);
2159  } else if (IsInTargetMasterThreadRegion) {
2160  L0ParallelGen(CGF, Action);
2161  } else if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD) {
2162  RCG(CGF);
2163  } else {
2164  // Check for master and then parallelism:
2165  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2166  // Serialized execution.
2167  // } else if (master) {
2168  // Worker call.
2169  // } else {
2170  // Outlined function call.
2171  // }
2172  CGBuilderTy &Bld = CGF.Builder;
2173  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2174  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2175  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2176  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
2177  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2178  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2179  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2180  // There is no need to emit line number for unconditional branch.
2181  (void)ApplyDebugLocation::CreateEmpty(CGF);
2182  CGF.EmitBlock(ParallelCheckBB);
2183  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2184  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2185  llvm::Value *PL = CGF.EmitRuntimeCall(
2186  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2187  {RTLoc, ThreadID});
2188  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2189  Bld.CreateCondBr(Res, SeqBB, MasterCheckBB);
2190  CGF.EmitBlock(SeqBB);
2191  SeqGen(CGF, Action);
2192  CGF.EmitBranch(ExitBB);
2193  // There is no need to emit line number for unconditional branch.
2194  (void)ApplyDebugLocation::CreateEmpty(CGF);
2195  CGF.EmitBlock(MasterCheckBB);
2196  llvm::BasicBlock *MasterThenBB = CGF.createBasicBlock("master.then");
2197  llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
2198  llvm::Value *IsMaster =
2199  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
2200  Bld.CreateCondBr(IsMaster, MasterThenBB, ElseBlock);
2201  CGF.EmitBlock(MasterThenBB);
2202  L0ParallelGen(CGF, Action);
2203  CGF.EmitBranch(ExitBB);
2204  // There is no need to emit line number for unconditional branch.
2205  (void)ApplyDebugLocation::CreateEmpty(CGF);
2206  CGF.EmitBlock(ElseBlock);
2207  // In the worker, we need to use the real thread id.
2208  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2209  RCG(CGF);
2210  // There is no need to emit line number for unconditional branch.
2211  (void)ApplyDebugLocation::CreateEmpty(CGF);
2212  // Emit the continuation block for code after the if.
2213  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2214  }
2215  };
2216 
2217  if (IfCond) {
2218  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2219  } else {
2220  CodeGenFunction::RunCleanupsScope Cleanups(CGF);
2221  RegionCodeGenTy ThenRCG(LNParallelGen);
2222  ThenRCG(CGF);
2223  }
2224 }
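// Illustrative sketch of the master-side handshake emitted above (pseudo
// code; the matching worker loop is emitted elsewhere in this file):
//
//   __kmpc_kernel_prepare_parallel(wrapper, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&args, n);  // only with captures
//   <CTA barrier>   // release the workers
//   <CTA barrier>   // wait for the workers to finish
//   __kmpc_end_sharing_variables();
//
// Each worker obtains 'wrapper' via __kmpc_kernel_parallel and announces
// completion with __kmpc_kernel_end_parallel.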
2225 
2226 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2227  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2228  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2229  // Just call the outlined function to execute the parallel region.
2230  // OutlinedFn(&GTid, &zero, CapturedStruct);
2231  //
2232  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2233 
2234  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2235  /*DestWidth=*/32, /*Signed=*/1),
2236  ".zero.addr");
2237  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2238  // ThreadId for serialized parallels is 0.
2239  Address ThreadIDAddr = ZeroAddr;
2240  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2241  &ThreadIDAddr](CodeGenFunction &CGF,
2242  PrePostActionTy &Action) {
2243  Action.Enter(CGF);
2244 
2245  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2246  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2247  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2248  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2249  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2250  };
2251  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2252  PrePostActionTy &) {
2253 
2254  RegionCodeGenTy RCG(CodeGen);
2255  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2256  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2257  llvm::Value *Args[] = {RTLoc, ThreadID};
2258 
2259  NVPTXActionTy Action(
2260  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2261  Args,
2262  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2263  Args);
2264  RCG.setAction(Action);
2265  RCG(CGF);
2266  };
2267 
2268  if (IsInTargetMasterThreadRegion) {
2269  // In the worker, we need to use the real thread id.
2270  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2271  RegionCodeGenTy RCG(CodeGen);
2272  RCG(CGF);
2273  } else {
2274  // If we are not in the target region, it is definitely L2 parallelism or
2275  // more, because in SPMD mode we always have an L1 parallel level, so we
2276  // don't need to check for orphaned directives.
2277  RegionCodeGenTy RCG(SeqGen);
2278  RCG(CGF);
2279  }
2280 }
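// Note: in SPMD mode any nested (L2 or deeper) parallel region is therefore
// serialized; the SeqGen path brackets the outlined body between
// __kmpc_serialized_parallel and __kmpc_end_serialized_parallel and runs it
// on the encountering thread with a zero thread id.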
2281 
2282 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2283  CodeGenFunction &CGF, StringRef CriticalName,
2284  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2285  const Expr *Hint) {
2286  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2287  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2288  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2289  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2290  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2291 
2292  // Fetch team-local id of the thread.
2293  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2294 
2295  // Get the width of the team.
2296  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2297 
2298  // Initialize the counter variable for the loop.
2299  QualType Int32Ty =
2300  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2301  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2302  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2303  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2304  /*isInit=*/true);
2305 
2306  // Block checks if loop counter exceeds upper bound.
2307  CGF.EmitBlock(LoopBB);
2308  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2309  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2310  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2311 
2312  // Block tests which single thread should execute region, and which threads
2313  // should go straight to synchronisation point.
2314  CGF.EmitBlock(TestBB);
2315  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2316  llvm::Value *CmpThreadToCounter =
2317  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2318  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2319 
2320  // Block emits the body of the critical region.
2321  CGF.EmitBlock(BodyBB);
2322 
2323  // Output the critical statement.
2324  CriticalOpGen(CGF);
2325 
2326  // After the body surrounded by the critical region, the single executing
2327  // thread will jump to the synchronisation point.
2328  // Block waits for all threads in current team to finish then increments the
2329  // counter variable and returns to the loop.
2330  CGF.EmitBlock(SyncBB);
2331  getNVPTXCTABarrier(CGF);
2332 
2333  llvm::Value *IncCounterVal =
2334  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2335  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2336  CGF.EmitBranch(LoopBB);
2337 
2338  // Block that is reached when all threads in the team complete the region.
2339  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2340 }
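// Illustrative usage (hypothetical source): the lowering above serializes a
// device-side region such as
//
//   #pragma omp critical
//   ++counter;
//
// by looping a counter from 0 to the team width; in each iteration exactly
// the thread whose id equals the counter runs the body, and all threads meet
// at the CTA barrier before the counter is incremented.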
2341 
2342 /// Cast value to the specified type.
2343 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2344  QualType ValTy, QualType CastTy,
2345  SourceLocation Loc) {
2346  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2347  "Cast type must sized.");
2348  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2349  "Val type must sized.");
2350  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2351  if (ValTy == CastTy)
2352  return Val;
2353  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2354  CGF.getContext().getTypeSizeInChars(CastTy))
2355  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2356  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2357  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2358  CastTy->hasSignedIntegerRepresentation());
2359  Address CastItem = CGF.CreateMemTemp(CastTy);
2360  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2361  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2362  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2363  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2364 }
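// Worked example (hypothetical types): a 'float' shuffled as 'int32_t' takes
// the same-size CreateBitCast path; a 'short' widened to 'int32_t' takes the
// CreateIntCast path; types that are neither same-sized nor both integral
// round-trip through the stack temporary via the store/load pair above.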
2365 
2366 /// This function creates calls to one of two shuffle functions to copy
2367 /// variables between lanes in a warp.
2368 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2369  llvm::Value *Elem,
2370  QualType ElemType,
2371  llvm::Value *Offset,
2372  SourceLocation Loc) {
2373  CodeGenModule &CGM = CGF.CGM;
2374  CGBuilderTy &Bld = CGF.Builder;
2375  CGOpenMPRuntimeNVPTX &RT =
2376  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2377 
2378  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2379  assert(Size.getQuantity() <= 8 &&
2380  "Unsupported bitwidth in shuffle instruction.");
2381 
2382  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2383  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2384  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2385 
2386  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2387  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2388  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2389  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2390  llvm::Value *WarpSize =
2391  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2392 
2393  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2394  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2395 
2396  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2397 }
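// Illustrative sketch of a caller's pattern (pseudo code, assuming the
// 32-lane NVPTX warp): with halving offsets this helper forms the classic
// butterfly reduction,
//
//   for (int delta = 16; delta > 0; delta /= 2)
//     val += __kmpc_shuffle_int32(val, delta, 32);
//
// which leaves the fully reduced value in lane 0 of the warp.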
2398 
2399 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2400  Address DestAddr, QualType ElemType,
2401  llvm::Value *Offset, SourceLocation Loc) {
2402  CGBuilderTy &Bld = CGF.Builder;
2403 
2404  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2405  // Create the loop over the big sized data.
2406  // ptr = (void*)Elem;
2407  // ptrEnd = (void*) Elem + 1;
2408  // Step = 8;
2409  // while (ptr + Step < ptrEnd)
2410  // shuffle((int64_t)*ptr);
2411  // Step = 4;
2412  // while (ptr + Step < ptrEnd)
2413  // shuffle((int32_t)*ptr);
2414  // ...
2415  Address ElemPtr = DestAddr;
2416  Address Ptr = SrcAddr;
2417  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2418  Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2419  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2420  if (Size < CharUnits::fromQuantity(IntSize))
2421  continue;
2422  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2423  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2424  /*Signed=*/1);
2425  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2426  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2427  ElemPtr =
2428  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2429  if (Size.getQuantity() / IntSize > 1) {
2430  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2431  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2432  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2433  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2434  CGF.EmitBlock(PreCondBB);
2435  llvm::PHINode *PhiSrc =
2436  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2437  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2438  llvm::PHINode *PhiDest =
2439  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2440  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2441  Ptr = Address(PhiSrc, Ptr.getAlignment());
2442  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2443  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2444  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2445  Ptr.getPointer(), CGF.VoidPtrTy));
2446  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2447  ThenBB, ExitBB);
2448  CGF.EmitBlock(ThenBB);
2449  llvm::Value *Res = createRuntimeShuffleFunction(
2450  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2451  IntType, Offset, Loc);
2452  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2453  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2454  ElemPtr =
2455  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2456  PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2457  PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2458  CGF.EmitBranch(PreCondBB);
2459  CGF.EmitBlock(ExitBB);
2460  } else {
2461  llvm::Value *Res = createRuntimeShuffleFunction(
2462  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2463  IntType, Offset, Loc);
2464  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2465  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2466  ElemPtr =
2467  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2468  }
2469  Size = Size % IntSize;
2470  }
2471 }
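// Worked example of the decomposition above (hypothetical 7-byte element):
// the loop emits one int32 shuffle for bytes 0-3, one int16 shuffle for bytes
// 4-5, and one int8 shuffle for byte 6. The PHI-based while-loop form is only
// used when more than one chunk of a given width is required, e.g. a 16-byte
// element shuffled as two int64 chunks.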
2472 
2473 namespace {
2474 enum CopyAction : unsigned {
2475  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2476  // the warp using shuffle instructions.
2477  RemoteLaneToThread,
2478  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2479  ThreadCopy,
2480  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2481  ThreadToScratchpad,
2482  // ScratchpadToThread: Copy from a scratchpad array in global memory
2483  // containing team-reduced data to a thread's stack.
2484  ScratchpadToThread,
2485 };
2486 } // namespace
2487 
2488 struct CopyOptionsTy {
2489  llvm::Value *RemoteLaneOffset;
2490  llvm::Value *ScratchpadIndex;
2491  llvm::Value *ScratchpadWidth;
2492 };
2493 
2494 /// Emit instructions to copy a Reduce list, which contains partially
2495 /// aggregated values, in the specified direction.
2496 static void emitReductionListCopy(
2497  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2498  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2499  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2500 
2501  CodeGenModule &CGM = CGF.CGM;
2502  ASTContext &C = CGM.getContext();
2503  CGBuilderTy &Bld = CGF.Builder;
2504 
2505  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2506  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2507  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2508 
2509  // Iterate, element by element, through the source Reduce list and
2510  // make a copy.
2511  unsigned Idx = 0;
2512  unsigned Size = Privates.size();
2513  for (const Expr *Private : Privates) {
2514  Address SrcElementAddr = Address::invalid();
2515  Address DestElementAddr = Address::invalid();
2516  Address DestElementPtrAddr = Address::invalid();
2517  // Should we shuffle in an element from a remote lane?
2518  bool ShuffleInElement = false;
2519  // Set to true to update the pointer in the dest Reduce list to a
2520  // newly created element.
2521  bool UpdateDestListPtr = false;
2522  // Increment the src or dest pointer to the scratchpad, for each
2523  // new element.
2524  bool IncrScratchpadSrc = false;
2525  bool IncrScratchpadDest = false;
2526 
2527  switch (Action) {
2528  case RemoteLaneToThread: {
2529  // Step 1.1: Get the address for the src element in the Reduce list.
2530  Address SrcElementPtrAddr =
2531  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2532  SrcElementAddr = CGF.EmitLoadOfPointer(
2533  SrcElementPtrAddr,
2534  C.getPointerType(Private->getType())->castAs<PointerType>());
2535 
2536  // Step 1.2: Create a temporary to store the element in the destination
2537  // Reduce list.
2538  DestElementPtrAddr =
2539  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2540  DestElementAddr =
2541  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2542  ShuffleInElement = true;
2543  UpdateDestListPtr = true;
2544  break;
2545  }
2546  case ThreadCopy: {
2547  // Step 1.1: Get the address for the src element in the Reduce list.
2548  Address SrcElementPtrAddr =
2549  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2550  SrcElementAddr = CGF.EmitLoadOfPointer(
2551  SrcElementPtrAddr,
2552  C.getPointerType(Private->getType())->castAs<PointerType>());
2553 
2554  // Step 1.2: Get the address for dest element. The destination
2555  // element has already been created on the thread's stack.
2556  DestElementPtrAddr =
2557  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2558  DestElementAddr = CGF.EmitLoadOfPointer(
2559  DestElementPtrAddr,
2560  C.getPointerType(Private->getType())->castAs<PointerType>());
2561  break;
2562  }
2563  case ThreadToScratchpad: {
2564  // Step 1.1: Get the address for the src element in the Reduce list.
2565  Address SrcElementPtrAddr =
2566  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2567  SrcElementAddr = CGF.EmitLoadOfPointer(
2568  SrcElementPtrAddr,
2569  C.getPointerType(Private->getType())->castAs<PointerType>());
2570 
2571  // Step 1.2: Get the address for dest element:
2572  // address = base + index * ElementSizeInChars.
2573  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2574  llvm::Value *CurrentOffset =
2575  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2576  llvm::Value *ScratchPadElemAbsolutePtrVal =
2577  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2578  ScratchPadElemAbsolutePtrVal =
2579  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2580  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2581  C.getTypeAlignInChars(Private->getType()));
2582  IncrScratchpadDest = true;
2583  break;
2584  }
2585  case ScratchpadToThread: {
2586  // Step 1.1: Get the address for the src element in the scratchpad.
2587  // address = base + index * ElementSizeInChars.
2588  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2589  llvm::Value *CurrentOffset =
2590  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2591  llvm::Value *ScratchPadElemAbsolutePtrVal =
2592  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2593  ScratchPadElemAbsolutePtrVal =
2594  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2595  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2596  C.getTypeAlignInChars(Private->getType()));
2597  IncrScratchpadSrc = true;
2598 
2599  // Step 1.2: Create a temporary to store the element in the destination
2600  // Reduce list.
2601  DestElementPtrAddr =
2602  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2603  DestElementAddr =
2604  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2605  UpdateDestListPtr = true;
2606  break;
2607  }
2608  }
2609 
2610  // Regardless of the source and destination of the copy, we emit the
2611  // load of the source element, as this is required in all directions.
2612  SrcElementAddr = Bld.CreateElementBitCast(
2613  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2614  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2615  SrcElementAddr.getElementType());
2616 
2617  // Now that all active lanes have read the element in the
2618  // Reduce list, shuffle over the value from the remote lane.
2619  if (ShuffleInElement) {
2620  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2621  RemoteLaneOffset, Private->getExprLoc());
2622  } else {
2623  if (Private->getType()->isScalarType()) {
2624  llvm::Value *Elem =
2625  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
2626  Private->getType(), Private->getExprLoc());
2627  // Store the source element value to the dest element address.
2628  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
2629  Private->getType());
2630  } else {
2631  CGF.EmitAggregateCopy(
2632  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2633  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2634  Private->getType(), AggValueSlot::DoesNotOverlap);
2635  }
2636  }
2637 
2638  // Step 3.1: Modify reference in dest Reduce list as needed.
2639  // Modifying the reference in Reduce list to point to the newly
2640  // created element. The element is live in the current function
2641  // scope and that of functions it invokes (i.e., reduce_function).
2642  // RemoteReduceData[i] = (void*)&RemoteElem
2643  if (UpdateDestListPtr) {
2644  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2645  DestElementAddr.getPointer(), CGF.VoidPtrTy),
2646  DestElementPtrAddr, /*Volatile=*/false,
2647  C.VoidPtrTy);
2648  }
2649 
2650  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2651  // address of the next element in scratchpad memory, unless we're currently
2652  // processing the last one. Memory alignment is also taken care of here.
2653  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2654  llvm::Value *ScratchpadBasePtr =
2655  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2656  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2657  ScratchpadBasePtr = Bld.CreateNUWAdd(
2658  ScratchpadBasePtr,
2659  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2660 
2661  // Take care of global memory alignment for performance
2662  ScratchpadBasePtr = Bld.CreateNUWSub(
2663  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2664  ScratchpadBasePtr = Bld.CreateUDiv(
2665  ScratchpadBasePtr,
2666  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2667  ScratchpadBasePtr = Bld.CreateNUWAdd(
2668  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2669  ScratchpadBasePtr = Bld.CreateNUWMul(
2670  ScratchpadBasePtr,
2671  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2672 
2673  if (IncrScratchpadDest)
2674  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2675  else /* IncrScratchpadSrc = true */
2676  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2677  }
2678 
2679  ++Idx;
2680  }
2681 }
2682 
2683 /// This function emits a helper that loads data from the scratchpad array
2684 /// and (optionally) reduces it with the input operand.
2685 ///
2686 /// load_and_reduce(local, scratchpad, index, width, should_reduce)
2687 /// reduce_data remote;
2688 /// for elem in remote:
2689 /// remote.elem = Scratchpad[elem_id][index]
2690 /// if (should_reduce)
2691 /// local = local @ remote
2692 /// else
2693 /// local = remote
2694 static llvm::Value *emitReduceScratchpadFunction(
2695  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2696  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
2697  ASTContext &C = CGM.getContext();
2698  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2699 
2700  // Destination of the copy.
2701  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2702  C.VoidPtrTy, ImplicitParamDecl::Other);
2703  // Base address of the scratchpad array, with each element storing a
2704  // Reduce list per team.
2705  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2706  C.VoidPtrTy, ImplicitParamDecl::Other);
2707  // A source index into the scratchpad array.
2708  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2709  ImplicitParamDecl::Other);
2710  // Row width of an element in the scratchpad array, typically
2711  // the number of teams.
2712  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2713  ImplicitParamDecl::Other);
2714  // If should_reduce == 1, then it's load AND reduce,
2715  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
2716  // The latter case is used for initialization.
2717  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2718  Int32Ty, ImplicitParamDecl::Other);
2719 
2720  FunctionArgList Args;
2721  Args.push_back(&ReduceListArg);
2722  Args.push_back(&ScratchPadArg);
2723  Args.push_back(&IndexArg);
2724  Args.push_back(&WidthArg);
2725  Args.push_back(&ShouldReduceArg);
2726 
2727  const CGFunctionInfo &CGFI =
2728  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2729  auto *Fn = llvm::Function::Create(
2730  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2731  "_omp_reduction_load_and_reduce", &CGM.getModule());
2732  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2733  Fn->setDoesNotRecurse();
2734  CodeGenFunction CGF(CGM);
2735  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2736 
2737  CGBuilderTy &Bld = CGF.Builder;
2738 
2739  // Get local Reduce list pointer.
2740  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2741  Address ReduceListAddr(
2742  Bld.CreatePointerBitCastOrAddrSpaceCast(
2743  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2744  C.VoidPtrTy, Loc),
2745  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2746  CGF.getPointerAlign());
2747 
2748  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2749  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2750  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2751 
2752  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2753  llvm::Value *IndexVal = Bld.CreateIntCast(
2754  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2755  CGM.SizeTy, /*isSigned=*/true);
2756 
2757  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2758  llvm::Value *WidthVal = Bld.CreateIntCast(
2759  CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
2760  CGM.SizeTy, /*isSigned=*/true);
2761 
2762  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
2763  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
2764  AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
2765 
2766  // The absolute ptr address to the base addr of the next element to copy.
2767  llvm::Value *CumulativeElemBasePtr =
2768  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2769  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2770 
2771  // Create a Remote Reduce list to store the elements read from the
2772  // scratchpad array.
2773  Address RemoteReduceList =
2774  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
2775 
2776  // Assemble remote Reduce list from scratchpad array.
2777  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
2778  SrcDataAddr, RemoteReduceList,
2779  {/*RemoteLaneOffset=*/nullptr,
2780  /*ScratchpadIndex=*/IndexVal,
2781  /*ScratchpadWidth=*/WidthVal});
2782 
2783  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2784  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2785  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2786 
2787  llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
2788  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2789 
2790  CGF.EmitBlock(ThenBB);
2791  // We should reduce with the local Reduce list.
2792  // reduce_function(LocalReduceList, RemoteReduceList)
2793  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2794  ReduceListAddr.getPointer(), CGF.VoidPtrTy);
2795  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2796  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2797  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2798  CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
2799  Bld.CreateBr(MergeBB);
2800 
2801  CGF.EmitBlock(ElseBB);
2802  // No reduction; just copy:
2803  // Local Reduce list = Remote Reduce list.
2804  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2805  RemoteReduceList, ReduceListAddr);
2806  Bld.CreateBr(MergeBB);
2807 
2808  CGF.EmitBlock(MergeBB);
2809 
2810  CGF.FinishFunction();
2811  return Fn;
2812 }
2813 
2814 /// This function emits a helper that stores reduced data from the team
2815 /// master to a scratchpad array in global memory.
2816 ///
2817 /// for elem in Reduce List:
2818 /// scratchpad[elem_id][index] = elem
2819 ///
2820 static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
2821  ArrayRef<const Expr *> Privates,
2822  QualType ReductionArrayTy,
2823  SourceLocation Loc) {
2824 
2825  ASTContext &C = CGM.getContext();
2826  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2827 
2828  // Source of the copy.
2829  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2830  C.VoidPtrTy, ImplicitParamDecl::Other);
2831  // Base address of the scratchpad array, with each element storing a
2832  // Reduce list per team.
2833  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2834  C.VoidPtrTy, ImplicitParamDecl::Other);
2835  // A destination index into the scratchpad array, typically the team
2836  // identifier.
2837  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2838  ImplicitParamDecl::Other);
2839  // Row width of an element in the scratchpad array, typically
2840  // the number of teams.
2841  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2842  ImplicitParamDecl::Other);
2843 
2844  FunctionArgList Args;
2845  Args.push_back(&ReduceListArg);
2846  Args.push_back(&ScratchPadArg);
2847  Args.push_back(&IndexArg);
2848  Args.push_back(&WidthArg);
2849 
2850  const CGFunctionInfo &CGFI =
2851  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2852  auto *Fn = llvm::Function::Create(
2853  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2854  "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
2855  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2856  Fn->setDoesNotRecurse();
2857  CodeGenFunction CGF(CGM);
2858  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2859 
2860  CGBuilderTy &Bld = CGF.Builder;
2861 
2862  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2863  Address SrcDataAddr(
2864  Bld.CreatePointerBitCastOrAddrSpaceCast(
2865  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2866  C.VoidPtrTy, Loc),
2867  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2868  CGF.getPointerAlign());
2869 
2870  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2871  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2872  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2873 
2874  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2875  llvm::Value *IndexVal = Bld.CreateIntCast(
2876  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2877  CGF.SizeTy, /*isSigned=*/true);
2878 
2879  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2880  llvm::Value *WidthVal =
2881  Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
2882  Int32Ty, SourceLocation()),
2883  CGF.SizeTy, /*isSigned=*/true);
2884 
2885  // The absolute ptr address to the base addr of the next element to copy.
2886  llvm::Value *CumulativeElemBasePtr =
2887  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2888  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2889 
2890  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
2891  SrcDataAddr, DestDataAddr,
2892  {/*RemoteLaneOffset=*/nullptr,
2893  /*ScratchpadIndex=*/IndexVal,
2894  /*ScratchpadWidth=*/WidthVal});
2895 
2896  CGF.FinishFunction();
2897  return Fn;
2898 }
2899 
2900 /// This function emits a helper that gathers Reduce lists from the first
2901 /// lane of every active warp to lanes in the first warp.
2902 ///
2903 /// void inter_warp_copy_func(void* reduce_data, num_warps)
2904 /// shared smem[warp_size];
2905 /// For all data entries D in reduce_data:
2906 /// If (I am the first lane in each warp)
2907 /// Copy my local D to smem[warp_id]
2908 /// sync
2909 /// if (I am the first warp)
2910 /// Copy smem[thread_id] to my local D
2911 /// sync
2912 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2913  ArrayRef<const Expr *> Privates,
2914  QualType ReductionArrayTy,
2915  SourceLocation Loc) {
2916  ASTContext &C = CGM.getContext();
2917  llvm::Module &M = CGM.getModule();
2918 
2919  // ReduceList: thread local Reduce list.
2920  // At the stage of the computation when this function is called, partially
2921  // aggregated values reside in the first lane of every active warp.
2922  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2923  C.VoidPtrTy, ImplicitParamDecl::Other);
2924  // NumWarps: number of warps active in the parallel region. This could
2925  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2926  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2927  C.getIntTypeForBitwidth(32, /* Signed */ true),
2928  ImplicitParamDecl::Other);
2929  FunctionArgList Args;
2930  Args.push_back(&ReduceListArg);
2931  Args.push_back(&NumWarpsArg);
2932 
2933  const CGFunctionInfo &CGFI =
2934  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2935  auto *Fn = llvm::Function::Create(
2936  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2937  "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
2938  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2939  Fn->setDoesNotRecurse();
2940  CodeGenFunction CGF(CGM);
2941  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2942 
2943  CGBuilderTy &Bld = CGF.Builder;
2944 
2945  // This array is used as a medium to transfer, one reduce element at a time,
2946  // the data from the first lane of every warp to lanes in the first warp
2947  // in order to perform the final step of a reduction in a parallel region
2948  // (reduction across warps). The array is placed in NVPTX __shared__ memory
2949  // for reduced latency, as well as to have a distinct copy for concurrently
2950  // executing target regions. The array is declared with common linkage so
2951  // as to be shared across compilation units.
2952  StringRef TransferMediumName =
2953  "__openmp_nvptx_data_transfer_temporary_storage";
2954  llvm::GlobalVariable *TransferMedium =
2955  M.getGlobalVariable(TransferMediumName);
2956  if (!TransferMedium) {
2957  auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2958  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2959  TransferMedium = new llvm::GlobalVariable(
2960  M, Ty,
2961  /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2962  llvm::Constant::getNullValue(Ty), TransferMediumName,
2963  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2964  SharedAddressSpace);
2965  CGM.addCompilerUsedGlobal(TransferMedium);
2966  }
2967 
2968  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2969  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2970  // nvptx_lane_id = nvptx_id % warpsize
2971  llvm::Value *LaneID = getNVPTXLaneID(CGF);
2972  // nvptx_warp_id = nvptx_id / warpsize
2973  llvm::Value *WarpID = getNVPTXWarpID(CGF);
2974 
2975  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2976  Address LocalReduceList(
2977  Bld.CreatePointerBitCastOrAddrSpaceCast(
2978  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2979  C.VoidPtrTy, SourceLocation()),
2980  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2981  CGF.getPointerAlign());
2982 
2983  unsigned Idx = 0;
2984  for (const Expr *Private : Privates) {
2985  //
2986  // Warp master copies reduce element to transfer medium in __shared__
2987  // memory.
2988  //
2989  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2990  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2991  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2992 
2993  // if (lane_id == 0)
2994  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2995  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2996  CGF.EmitBlock(ThenBB);
2997 
2998  // Reduce element = LocalReduceList[i]
2999  Address ElemPtrPtrAddr =
3000  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3001  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3002  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3003  // elemptr = (type[i]*)(elemptrptr)
3004  Address ElemPtr =
3005  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3006  ElemPtr = Bld.CreateElementBitCast(
3007  ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
3008 
3009  // Get pointer to location in transfer medium.
3010  // MediumPtr = &medium[warp_id]
3011  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3012  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3013  Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
3014  // Casting to actual data type.
3015  // MediumPtr = (type[i]*)MediumPtrAddr;
3016  MediumPtr = Bld.CreateElementBitCast(
3017  MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
3018 
3019  // elem = *elemptr
3020  //*MediumPtr = elem
3021  if (Private->getType()->isScalarType()) {
3022  llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3023  Private->getType(), Loc);
3024  // Store the source element value to the dest element address.
3025  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
3026  Private->getType());
3027  } else {
3028  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3029  CGF.MakeAddrLValue(MediumPtr, Private->getType()),
3030  Private->getType(), AggValueSlot::DoesNotOverlap);
3031  }
3032 
3033  Bld.CreateBr(MergeBB);
3034 
3035  CGF.EmitBlock(ElseBB);
3036  Bld.CreateBr(MergeBB);
3037 
3038  CGF.EmitBlock(MergeBB);
3039 
3040  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3041  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3042  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
3043 
3044  llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
3045  NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
3046  // named_barrier_sync(ParallelBarrierID, num_active_threads)
3047  syncParallelThreads(CGF, NumActiveThreads);
3048 
3049  //
3050  // Warp 0 copies reduce element from transfer medium.
3051  //
3052  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3053  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3054  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3055 
3056  // Up to 32 threads in warp 0 are active.
3057  llvm::Value *IsActiveThread =
3058  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3059  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3060 
3061  CGF.EmitBlock(W0ThenBB);
3062 
3063  // SrcMediumPtr = &medium[tid]
3064  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3065  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3066  Address SrcMediumPtr(SrcMediumPtrVal,
3067  C.getTypeAlignInChars(Private->getType()));
3068  // SrcMediumVal = *SrcMediumPtr;
3069  SrcMediumPtr = Bld.CreateElementBitCast(
3070  SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
3071 
3072  // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
3073  Address TargetElemPtrPtr =
3074  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3075  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3076  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3077  Address TargetElemPtr =
3078  Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
3079  TargetElemPtr = Bld.CreateElementBitCast(
3080  TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
3081 
3082  // *TargetElemPtr = SrcMediumVal;
3083  if (Private->getType()->isScalarType()) {
3084  llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
3085  SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
3086  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3087  Private->getType());
3088  } else {
3089  CGF.EmitAggregateCopy(
3090  CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
3091  CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
3092  Private->getType(), AggValueSlot::DoesNotOverlap);
3093  }
3094  Bld.CreateBr(W0MergeBB);
3095 
3096  CGF.EmitBlock(W0ElseBB);
3097  Bld.CreateBr(W0MergeBB);
3098 
3099  CGF.EmitBlock(W0MergeBB);
3100 
3101  // While warp 0 copies values from transfer medium, all other warps must
3102  // wait.
3103  syncParallelThreads(CGF, NumActiveThreads);
3104  ++Idx;
3105  }
3106 
3107  CGF.FinishFunction();
3108  return Fn;
3109 }
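// A minimal CUDA-style sketch of the copy pattern the helper above emits,
// assuming a Reduce list with a single int element and illustrative helpers
// lane_id()/warp_id()/named_barrier_sync(); the emitted IR operates on the
// opaque Reduce list and the Int64Ty transfer medium instead:
//
//   __shared__ int transfer_medium[WARPSIZE];
//
//   __device__ void inter_warp_copy(int *elem, int num_warps) {
//     if (lane_id() == 0)                      // warp master publishes its
//       transfer_medium[warp_id()] = *elem;    // warp-reduced value
//     named_barrier_sync(num_warps * WARPSIZE);
//     if (threadIdx.x < num_warps)             // first warp gathers values
//       *elem = transfer_medium[threadIdx.x];
//     named_barrier_sync(num_warps * WARPSIZE);
//   }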
3110 
3111 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3112 /// in the same warp. It uses shuffle instructions to copy over data from
3113 /// a remote lane's stack. The reduction algorithm performed is specified
3114 /// by the fourth parameter.
3115 ///
3116 /// Algorithm Versions.
3117 /// Full Warp Reduce (argument value 0):
3118 /// This algorithm assumes that all 32 lanes are active and gathers
3119 /// data from these 32 lanes, producing a single resultant value.
3120 /// Contiguous Partial Warp Reduce (argument value 1):
3121 /// This algorithm assumes that only a *contiguous* subset of lanes
3122 /// are active. This happens for the last warp in a parallel region
3123 /// when the user-specified num_threads is not an integer multiple of
3124 /// 32. This contiguous subset always starts with the zeroth lane.
3125 /// Partial Warp Reduce (argument value 2):
3126 /// This algorithm gathers data from any number of lanes at any position.
3127 /// All reduced values are stored in the lowest possible lane. The set
3128 /// of problems every algorithm addresses is a superset of those
3129 /// addressable by algorithms with a lower version number. Overhead
3130 /// increases as the algorithm version increases.
3131 ///
3132 /// Terminology
3133 /// Reduce element:
3134 /// Reduce element refers to the individual data field with primitive
3135 /// data types to be combined and reduced across threads.
3136 /// Reduce list:
3137 /// Reduce list refers to a collection of local, thread-private
3138 /// reduce elements.
3139 /// Remote Reduce list:
3140 /// Remote Reduce list refers to a collection of remote (relative to
3141 /// the current thread) reduce elements.
3142 ///
3143 /// We distinguish between three states of threads that are important to
3144 /// the implementation of this function.
3145 /// Alive threads:
3146 /// Threads in a warp executing the SIMT instruction, as distinguished from
3147 /// threads that are inactive due to divergent control flow.
3148 /// Active threads:
3149 /// The minimal set of threads that has to be alive upon entry to this
3150 /// function. The computation is correct iff active threads are alive.
3151 /// Some threads are alive but they are not active because they do not
3152 /// contribute to the computation in any useful manner. Turning them off
3153 /// may introduce control flow overheads without any tangible benefits.
3154 /// Effective threads:
3155 /// In order to comply with the argument requirements of the shuffle
3156 /// function, we must keep all lanes holding data alive. But at most
3157 /// half of them perform value aggregation; we refer to this half of
3158 /// threads as effective. The other half is simply handing off their
3159 /// data.
3160 ///
3161 /// Procedure
3162 /// Value shuffle:
3163 /// In this step active threads transfer data from higher lane positions
3164 /// in the warp to lower lane positions, creating the Remote Reduce list.
3165 /// Value aggregation:
3166 /// In this step, effective threads combine their thread local Reduce list
3167 /// with Remote Reduce list and store the result in the thread local
3168 /// Reduce list.
3169 /// Value copy:
3170 /// In this step, we deal with the contiguity assumption made by
3171 /// algorithm version 1. When we have an odd number of lanes
3172 /// active, say 2k+1, only k threads will be effective and therefore k
3173 /// new values will be produced. However, the Reduce list owned by the
3174 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3175 /// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
3176 /// that the contiguity assumption still holds.
3177 static llvm::Value *emitShuffleAndReduceFunction(
3178  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3179  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
3180  ASTContext &C = CGM.getContext();
3181 
3182  // Thread local Reduce list used to host the values of data to be reduced.
3183  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3184  C.VoidPtrTy, ImplicitParamDecl::Other);
3185  // Current lane id; could be logical.
3186  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3187  ImplicitParamDecl::Other);
3188  // Offset of the remote source lane relative to the current lane.
3189  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3190  C.ShortTy, ImplicitParamDecl::Other);
3191  // Algorithm version. This is expected to be known at compile time.
3192  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3193  C.ShortTy, ImplicitParamDecl::Other);
3194  FunctionArgList Args;
3195  Args.push_back(&ReduceListArg);
3196  Args.push_back(&LaneIDArg);
3197  Args.push_back(&RemoteLaneOffsetArg);
3198  Args.push_back(&AlgoVerArg);
3199 
3200  const CGFunctionInfo &CGFI =
3201  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3202  auto *Fn = llvm::Function::Create(
3203  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3204  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3205  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3206  Fn->setDoesNotRecurse();
3207  CodeGenFunction CGF(CGM);
3208  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3209 
3210  CGBuilderTy &Bld = CGF.Builder;
3211 
3212  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3213  Address LocalReduceList(
3214  Bld.CreatePointerBitCastOrAddrSpaceCast(
3215  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3216  C.VoidPtrTy, SourceLocation()),
3217  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3218  CGF.getPointerAlign());
3219 
3220  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3221  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3222  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3223 
3224  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3225  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3226  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3227 
3228  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3229  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3230  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3231 
3232  // Create a local thread-private variable to host the Reduce list
3233  // from a remote lane.
3234  Address RemoteReduceList =
3235  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3236 
3237  // This loop iterates through the list of reduce elements and copies,
3238  // element by element, from a remote lane in the warp to RemoteReduceList,
3239  // hosted on the thread's stack.
3240  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3241  LocalReduceList, RemoteReduceList,
3242  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3243  /*ScratchpadIndex=*/nullptr,
3244  /*ScratchpadWidth=*/nullptr});
3245 
3246  // The actions to be performed on the Remote Reduce list depend
3247  // on the algorithm version.
3248  //
3249  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3250  // LaneId % 2 == 0 && Offset > 0):
3251  // do the reduction value aggregation
3252  //
3253  // The thread local variable Reduce list is mutated in place to host the
3254  // reduced data, which is the aggregated value produced from local and
3255  // remote lanes.
3256  //
3257  // Note that AlgoVer is expected to be a constant integer known at compile
3258  // time.
3259  // When AlgoVer==0, the first conjunction evaluates to true, making
3260  // the entire predicate true at compile time.
3261  // When AlgoVer==1, only the second part of the second conjunction is
3262  // evaluated at run time; the other conjunctions evaluate to false
3263  // at compile time.
3264  // When AlgoVer==2, only the second part of the third conjunction is
3265  // evaluated at run time; the other conjunctions evaluate to false
3266  // at compile time.
3267  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3268 
3269  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3270  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3271  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3272 
3273  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3274  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3275  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3276  CondAlgo2 = Bld.CreateAnd(
3277  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3278 
3279  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3280  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3281 
3282  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3283  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3284  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3285  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3286 
3287  CGF.EmitBlock(ThenBB);
3288  // reduce_function(LocalReduceList, RemoteReduceList)
3289  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3290  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3291  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3292  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3293  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3294  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3295  Bld.CreateBr(MergeBB);
3296 
3297  CGF.EmitBlock(ElseBB);
3298  Bld.CreateBr(MergeBB);
3299 
3300  CGF.EmitBlock(MergeBB);
3301 
3302  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3303  // Reduce list.
3304  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3305  llvm::Value *CondCopy = Bld.CreateAnd(
3306  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3307 
3308  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3309  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3310  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3311  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3312 
3313  CGF.EmitBlock(CpyThenBB);
3314  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3315  RemoteReduceList, LocalReduceList);
3316  Bld.CreateBr(CpyMergeBB);
3317 
3318  CGF.EmitBlock(CpyElseBB);
3319  Bld.CreateBr(CpyMergeBB);
3320 
3321  CGF.EmitBlock(CpyMergeBB);
3322 
3323  CGF.FinishFunction();
3324  return Fn;
3325 }
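// A worked example of the two guards computed above (illustrative values,
// not emitted code). For AlgoVer==1 with five contiguous active lanes and
// RemoteLaneOffset==2:
//   - lanes 0 and 1 satisfy (LaneId < Offset) and aggregate the shuffled
//     Remote Reduce list into their local Reduce list;
//   - lanes 2..4 satisfy (LaneId >= Offset) and only copy the Remote Reduce
//     list over their own, which preserves the contiguity assumption for
//     the next round.
// For AlgoVer==0 the reduce guard folds to 'true' and the copy guard to
// 'false' at compile time, so both conditional branches are trivially
// simplified.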
3326 
3327 ///
3328 /// Design of OpenMP reductions on the GPU
3329 ///
3330 /// Consider a typical OpenMP program with one or more reduction
3331 /// clauses:
3332 ///
3333 ///   float foo;
3334 ///   double bar;
3335 ///   #pragma omp target teams distribute parallel for \
3336 ///               reduction(+:foo) reduction(*:bar)
3337 ///   for (int i = 0; i < N; i++) {
3338 ///     foo += A[i]; bar *= B[i];
3339 ///   }
3340 ///
3341 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3342 /// all teams. In our OpenMP implementation on the NVPTX device an
3343 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3344 /// within a team are mapped to CUDA threads within a threadblock.
3345 /// Our goal is to efficiently aggregate values across all OpenMP
3346 /// threads such that:
3347 ///
3348 /// - the compiler and runtime are logically concise, and
3349 /// - the reduction is performed efficiently in a hierarchical
3350 /// manner as follows: within OpenMP threads in the same warp,
3351 /// across warps in a threadblock, and finally across teams on
3352 /// the NVPTX device.
3353 ///
3354 /// Introduction to Decoupling
3355 ///
3356 /// We would like to decouple the compiler and the runtime so that the
3357 /// latter is ignorant of the reduction variables (number, data types)
3358 /// and the reduction operators. This allows a simpler interface
3359 /// and implementation while still attaining good performance.
3360 ///
3361 /// Pseudocode for the aforementioned OpenMP program generated by the
3362 /// compiler is as follows:
3363 ///
3364 /// 1. Create private copies of reduction variables on each OpenMP
3365 /// thread: 'foo_private', 'bar_private'
3366 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3367 /// to it and writes the result in 'foo_private' and 'bar_private'
3368 /// respectively.
3369 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3370 /// and store the result on the team master:
3371 ///
3372 /// __kmpc_nvptx_parallel_reduce_nowait(...,
3373 /// reduceData, shuffleReduceFn, interWarpCpyFn)
3374 ///
3375 /// where:
3376 ///   struct ReduceData {
3377 ///     float *foo;
3378 ///     double *bar;
3379 ///   } reduceData;
3380 ///   reduceData.foo = &foo_private;
3381 ///   reduceData.bar = &bar_private;
3382 ///
3383 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3384 /// auxiliary functions generated by the compiler that operate on
3385 /// variables of type 'ReduceData'. They aid the runtime in performing
3386 /// algorithmic steps in a data-agnostic manner.
3387 ///
3388 /// 'shuffleReduceFn' is a pointer to a function that reduces data
3389 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3390 /// same warp. It takes the following arguments as input:
3391 ///
3392 /// a. variable of type 'ReduceData' on the calling lane,
3393 /// b. its lane_id,
3394 /// c. an offset relative to the current lane_id to generate a
3395 /// remote_lane_id. The remote lane contains the second
3396 /// variable of type 'ReduceData' that is to be reduced.
3397 /// d. an algorithm version parameter determining which reduction
3398 /// algorithm to use.
3399 ///
3400 /// 'shuffleReduceFn' retrieves data from the remote lane using
3401 /// efficient GPU shuffle intrinsics and reduces, using the
3402 /// algorithm specified by the 4th parameter, the two operands
3403 /// element-wise. The result is written to the first operand.
3404 ///
3405 /// Different reduction algorithms are implemented in different
3406 /// runtime functions, all calling 'shuffleReduceFn' to perform
3407 /// the essential reduction step. Therefore, based on the 4th
3408 /// parameter, this function behaves slightly differently to
3409 /// cooperate with the runtime to ensure correctness under
3410 /// different circumstances.
3411 ///
3412 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3413 /// reduced variables across warps. It tunnels, through CUDA
3414 /// shared memory, the thread-private data of type 'ReduceData'
3415 /// from lane 0 of each warp to a lane in the first warp.
3416 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3417 /// The last team writes the global reduced value to memory.
3418 ///
3419 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3420 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3421 /// scratchpadCopyFn, loadAndReduceFn)
3422 ///
3423 /// 'scratchpadCopyFn' is a helper that stores reduced
3424 /// data from the team master to a scratchpad array in
3425 /// global memory.
3426 ///
3427 /// 'loadAndReduceFn' is a helper that loads data from
3428 /// the scratchpad array and reduces it with the input
3429 /// operand.
3430 ///
3431 /// These compiler generated functions hide address
3432 /// calculation and alignment information from the runtime.
3433 /// 5. if ret == 1:
3434 /// The team master of the last team stores the reduced
3435 /// result to the globals in memory.
3436 /// foo += reduceData.foo; bar *= reduceData.bar
3437 ///
3438 ///
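/// Putting steps 1-5 together, the pseudocode generated for the example
/// program above is roughly (an illustrative sketch; the actual helper
/// symbols are the outlined functions emitted in this file):
///
///   float foo_private = 0; double bar_private = 1;
///   for (i in this thread's chunk) {
///     foo_private += A[i]; bar_private *= B[i];
///   }
///   reduceData.foo = &foo_private; reduceData.bar = &bar_private;
///   ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn);
///   if (ret == 1) {
///     foo += reduceData.foo; bar *= reduceData.bar;
///   }
///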
3439 /// Warp Reduction Algorithms
3440 ///
3441 /// On the warp level, we have three algorithms implemented in the
3442 /// OpenMP runtime depending on the number of active lanes:
3443 ///
3444 /// Full Warp Reduction
3445 ///
3446 /// The reduce algorithm within a warp where all lanes are active
3447 /// is implemented in the runtime as follows:
3448 ///
3449 ///   full_warp_reduce(void *reduce_data,
3450 ///                    kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3451 ///     for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3452 ///       ShuffleReduceFn(reduce_data, 0, offset, 0);
3453 ///   }
3454 ///
3455 /// The algorithm completes in log2(WARPSIZE) steps.
3456 ///
3457 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3458 /// not used; we therefore save instructions by not retrieving lane_id
3459 /// from the corresponding special registers. The 4th parameter, which
3460 /// represents the version of the algorithm being used, is set to 0 to
3461 /// signify full warp reduction.
3462 ///
3463 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3464 ///
3465 ///   #reduce_elem refers to an element in the local lane's data structure
3466 ///   #remote_elem is retrieved from a remote lane
3467 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3468 ///   reduce_elem = reduce_elem REDUCE_OP remote_elem;
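///
/// For example, with WARPSIZE == 32 and REDUCE_OP == '+', the loop issues
/// shuffles with offsets 16, 8, 4, 2 and 1; after the final step, lane 0
/// holds the sum of the reduce_elem values of all 32 lanes.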
3469 ///
3470 /// Contiguous Partial Warp Reduction
3471 ///
3472 /// This reduce algorithm is used within a warp where only the first
3473 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3474 /// number of OpenMP threads in a parallel region is not a multiple of
3475 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3476 ///
3477 ///   void
3478 ///   contiguous_partial_reduce(void *reduce_data,
3479 ///                             kmp_ShuffleReductFctPtr ShuffleReduceFn,
3480 ///                             int size, int lane_id) {
3481 ///     int curr_size;
3482 ///     int offset;
3483 ///     curr_size = size;
3484 ///     offset = curr_size/2;
3485 ///     while (offset>0) {
3486 ///       ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3487 ///       curr_size = (curr_size+1)/2;
3488 ///       offset = curr_size/2;
3489 ///     }
3490 ///   }
3491 ///
3492 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3493 ///
3494 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3495 ///   if (lane_id < offset)
3496 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3497 ///   else
3498 ///     reduce_elem = remote_elem
3499 ///
3500 /// This algorithm assumes that the data to be reduced are located in a
3501 /// contiguous subset of lanes starting from the first. When there is
3502 /// an odd number of active lanes, the data in the last lane is not
3503 /// aggregated with any other lane's data but is instead copied over.
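///
/// For example, with size == 5 the loop performs three rounds with
/// offset = 2, 1, 1 while curr_size shrinks 5 -> 3 -> 2 -> 1; lane 0
/// ends up holding the value aggregated across all five lanes.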
3504 ///
3505 /// Dispersed Partial Warp Reduction
3506 ///
3507 /// This algorithm is used within a warp when any discontiguous subset of
3508 /// lanes are active. It is used to implement the reduction operation
3509 /// across lanes in an OpenMP simd region or in a nested parallel region.
3510 ///
3511 ///   void
3512 ///   dispersed_partial_reduce(void *reduce_data,
3513 ///                            kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3514 ///     int size, remote_id;
3515 ///     int logical_lane_id = number_of_active_lanes_before_me() * 2;
3516 ///     do {
3517 ///       remote_id = next_active_lane_id_right_after_me();
3518 ///       # the above function returns 0 if no active lane
3519 ///       # is present right after the current lane.
3520 ///       size = number_of_active_lanes_in_this_warp();
3521 ///       logical_lane_id /= 2;
3522 ///       ShuffleReduceFn(reduce_data, logical_lane_id,
3523 ///                       remote_id-1-threadIdx.x, 2);
3524 ///     } while (logical_lane_id % 2 == 0 && size > 1);
3525 ///   }
3526 ///
3527 /// There is no assumption made about the initial state of the reduction.
3528 /// Any number of lanes (>=1) could be active at any position. The reduction
3529 /// result is returned in the first active lane.
3530 ///
3531 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3532 ///
3533 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3534 ///   if (lane_id % 2 == 0 && offset > 0)
3535 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
3536 ///   else
3537 ///     reduce_elem = remote_elem
3538 ///
3539 ///
3540 /// Intra-Team Reduction
3541 ///
3542 /// This function, as implemented in the runtime call
3543 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3544 /// threads in a team. It first reduces within a warp using the
3545 /// aforementioned algorithms. We then proceed to gather all such
3546 /// reduced values at the first warp.
3547 ///
3548 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3549 /// data from each "warp master" (zeroth lane of each warp, where
3550 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3551 /// a mathematical sense) the problem of reduction across warp masters in
3552 /// a block to the problem of warp reduction.
3553 ///
3554 ///
3555 /// Inter-Team Reduction
3556 ///
3557 /// Once a team has reduced its data to a single value, it is stored in
3558 /// a global scratchpad array. Since each team has a distinct slot, this
3559 /// can be done without locking.
3560 ///
3561 /// The last team to write to the scratchpad array proceeds to reduce the
3562 /// scratchpad array. One or more workers in the last team use the helper
3563 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3564 /// the k'th worker reduces every k'th element.
3565 ///
3566 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3567 /// reduce across workers and compute a globally reduced value.
3568 ///
3569 void CGOpenMPRuntimeNVPTX::emitReduction(
3570  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3571  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3572  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3573  if (!CGF.HaveInsertPoint())
3574  return;
3575 
3576  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3577  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3578  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
3579  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
3580  "Invalid reduction selection in emitReduction.");
3581 
3582  if (Options.SimpleReduction) {
3583  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3584  ReductionOps, Options);
3585  return;
3586  }
3587 
3588  ASTContext &C = CGM.getContext();
3589 
3590  // 1. Build a list of reduction variables.
3591  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3592  auto Size = RHSExprs.size();
3593  for (const Expr *E : Privates) {
3594  if (E->getType()->isVariablyModifiedType())
3595  // Reserve place for array size.
3596  ++Size;
3597  }
3598  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3599  QualType ReductionArrayTy =
3600  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3601  /*IndexTypeQuals=*/0);
3602  Address ReductionList =
3603  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3604  auto IPriv = Privates.begin();
3605  unsigned Idx = 0;
3606  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3607  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3608  CGF.getPointerSize());
3609  CGF.Builder.CreateStore(
3610  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3611  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3612  Elem);
3613  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3614  // Store array size.
3615  ++Idx;
3616  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3617  CGF.getPointerSize());
3618  llvm::Value *Size = CGF.Builder.CreateIntCast(
3619  CGF.getVLASize(
3620  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3621  .NumElts,
3622  CGF.SizeTy, /*isSigned=*/false);
3623  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3624  Elem);
3625  }
3626  }
3627 
3628  // 2. Emit reduce_func().
3629  llvm::Value *ReductionFn = emitReductionFunction(
3630  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3631  Privates, LHSExprs, RHSExprs, ReductionOps);
3632 
3633  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3634  // RedList, shuffle_reduce_func, interwarp_copy_func);
3635  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3636  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3637  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3638  ReductionList.getPointer(), CGF.VoidPtrTy);
3639 
3640  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3641  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3642  llvm::Value *InterWarpCopyFn =
3643  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3644 
3645  llvm::Value *Args[] = {ThreadId,
3646  CGF.Builder.getInt32(RHSExprs.size()),
3647  ReductionArrayTySize,
3648  RL,
3649  ShuffleAndReduceFn,
3650  InterWarpCopyFn};
3651 
3652  llvm::Value *Res = nullptr;
3653  if (ParallelReduction)
3654  Res = CGF.EmitRuntimeCall(
3655  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
3656  Args);
3657  else if (SimdReduction)
3658  Res = CGF.EmitRuntimeCall(
3659  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
3660  Args);
3661 
3662  if (TeamsReduction) {
3663  llvm::Value *ScratchPadCopyFn =
3664  emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
3665  llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
3666  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3667 
3668  llvm::Value *Args[] = {ThreadId,
3669  CGF.Builder.getInt32(RHSExprs.size()),
3670  ReductionArrayTySize,
3671  RL,
3672  ShuffleAndReduceFn,
3673  InterWarpCopyFn,
3674  ScratchPadCopyFn,
3675  LoadAndReduceFn};
3676  Res = CGF.EmitRuntimeCall(
3677  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
3678  Args);
3679  }
3680 
3681  // 4. Build switch(res)
3682  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
3683  llvm::SwitchInst *SwInst =
3684  CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
3685 
3686  // 5. Build case 1: where we have reduced values in the master
3687  // thread in each team.
3688  // __kmpc_end_reduce{_nowait}(<gtid>);
3689  // break;
3690  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
3691  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
3692  CGF.EmitBlock(Case1BB);
3693 
3694  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3695  llvm::Value *EndArgs[] = {ThreadId};
3696  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3697  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3698  auto IPriv = Privates.begin();
3699  auto ILHS = LHSExprs.begin();
3700  auto IRHS = RHSExprs.begin();
3701  for (const Expr *E : ReductionOps) {
3702  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3703  cast<DeclRefExpr>(*IRHS));
3704  ++IPriv;
3705  ++ILHS;
3706  ++IRHS;
3707  }
3708  };
3709  RegionCodeGenTy RCG(CodeGen);
3710  NVPTXActionTy Action(
3711  nullptr, llvm::None,
3712  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
3713  EndArgs);
3714  RCG.setAction(Action);
3715  RCG(CGF);
3716  CGF.EmitBranch(DefaultBB);
3717  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
3718 }
3719 
3720 const VarDecl *
3721 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
3722  const VarDecl *NativeParam) const {
3723  if (!NativeParam->getType()->isReferenceType())
3724  return NativeParam;
3725  QualType ArgType = NativeParam->getType();
3726  QualifierCollector QC;
3727  const Type *NonQualTy = QC.strip(ArgType);
3728  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3729  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3730  if (Attr->getCaptureKind() == OMPC_map) {
3731  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3732  LangAS::opencl_global);
3733  }
3734  }
3735  ArgType = CGM.getContext().getPointerType(PointeeTy);
3736  QC.addRestrict();
3737  enum { NVPTX_local_addr = 5 };
3738  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3739  ArgType = QC.apply(CGM.getContext(), ArgType);
3740  if (isa<ImplicitParamDecl>(NativeParam))
3741  return ImplicitParamDecl::Create(
3742  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3743  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3744  return ParmVarDecl::Create(
3745  CGM.getContext(),
3746  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3747  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3748  NativeParam->getIdentifier(), ArgType,
3749  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3750 }
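// For example (illustrative only, not emitted source): a map-captured
// native parameter of type 'int &' is translated to a '__restrict__'
// pointer parameter whose pointee carries the global address space and
// whose pointer qualifier carries NVPTX address space 5 (local), roughly:
//
//   __attribute__((address_space(1))) int *
//       __attribute__((address_space(5))) __restrict__ x;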
3751 
3752 Address
3753 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
3754  const VarDecl *NativeParam,
3755  const VarDecl *TargetParam) const {
3756  assert(NativeParam != TargetParam &&
3757  NativeParam->getType()->isReferenceType() &&
3758  "Native arg must not be the same as target arg.");
3759  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3760  QualType NativeParamType = NativeParam->getType();
3761  QualifierCollector QC;
3762  const Type *NonQualTy = QC.strip(NativeParamType);
3763  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3764  unsigned NativePointeeAddrSpace =
3765  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3766  QualType TargetTy = TargetParam->getType();
3767  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3768  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3769  // First cast to generic.
3770  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3771  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3772  /*AddrSpace=*/0));
3773  // Cast from generic to native address space.
3774  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3775  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3776  NativePointeeAddrSpace));
3777  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3778  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3779  NativeParamType);
3780  return NativeParamAddr;
3781 }
3782 
3783 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
3784  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
3785  ArrayRef<llvm::Value *> Args) const {
3786  SmallVector<llvm::Value *, 4> TargetArgs;
3787  TargetArgs.reserve(Args.size());
3788  auto *FnType =
3789  cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
3790  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3791  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3792  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3793  break;
3794  }
3795  llvm::Type *TargetType = FnType->getParamType(I);
3796  llvm::Value *NativeArg = Args[I];
3797  if (!TargetType->isPointerTy()) {
3798  TargetArgs.emplace_back(NativeArg);
3799  continue;
3800  }
3801  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3802  NativeArg,
3803  NativeArg->getType()->getPointerElementType()->getPointerTo());
3804  TargetArgs.emplace_back(
3805  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3806  }
3807  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3808 }
3809 
3810 /// Emit function which wraps the outlined parallel region
3811 /// and controls the arguments which are passed to this function.
3812 /// The wrapper ensures that the outlined function is called
3813 /// with the correct arguments when data is shared.
3814 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
3815  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3816  ASTContext &Ctx = CGM.getContext();
3817  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3818 
3819  // Create a function that takes as argument the source thread.
3820  FunctionArgList WrapperArgs;
3821  QualType Int16QTy =
3822  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3823  QualType Int32QTy =
3824  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3825  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3826  /*Id=*/nullptr, Int16QTy,
3827  ImplicitParamDecl::Other);
3828  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3829  /*Id=*/nullptr, Int32QTy,
3830  ImplicitParamDecl::Other);
3831  WrapperArgs.emplace_back(&ParallelLevelArg);
3832  WrapperArgs.emplace_back(&WrapperArg);
3833 
3834  const CGFunctionInfo &CGFI =
3835  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3836 
3837  auto *Fn = llvm::Function::Create(
3838  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3839  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3840  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3841  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3842  Fn->setDoesNotRecurse();
3843 
3844  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3845  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3846  D.getBeginLoc(), D.getBeginLoc());
3847 
3848  const auto *RD = CS.getCapturedRecordDecl();
3849  auto CurField = RD->field_begin();
3850 
3851  Address ZeroAddr = CGF.CreateMemTemp(
3852  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
3853  /*Name*/ ".zero.addr");
3854  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
3856  // Get the array of arguments.
3857  SmallVector<llvm::Value *, 8> Args;
3857 
3858  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3859  Args.emplace_back(ZeroAddr.getPointer());
3860 
3861  CGBuilderTy &Bld = CGF.Builder;
3862  auto CI = CS.capture_begin();
3863 
3864  // Use global memory for data sharing.
3865  // Handle passing of global args to workers.
3866  Address GlobalArgs =
3867  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3868  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3869  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3870  CGF.EmitRuntimeCall(
3871  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
3872  DataSharingArgs);
3873 
3874  // Retrieve the shared variables from the list of references returned
3875  // by the runtime. Pass the variables to the outlined function.
3876  Address SharedArgListAddress = Address::invalid();
3877  if (CS.capture_size() > 0 ||
3878  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3879  SharedArgListAddress = CGF.EmitLoadOfPointer(
3880  GlobalArgs, CGF.getContext()
3881  .getPointerType(CGF.getContext().getPointerType(
3882  CGF.getContext().VoidPtrTy))
3883  .castAs<PointerType>());
3884  }
3885  unsigned Idx = 0;
3886  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3887  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3888  CGF.getPointerSize());
3889  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3890  Src, CGF.SizeTy->getPointerTo());
3891  llvm::Value *LB = CGF.EmitLoadOfScalar(
3892  TypedAddress,
3893  /*Volatile=*/false,
3894  CGF.getContext().getSizeType(),
3895  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3896  Args.emplace_back(LB);
3897  ++Idx;
3898  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3899  CGF.getPointerSize());
3900  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3901  Src, CGF.SizeTy->getPointerTo());
3902  llvm::Value *UB = CGF.EmitLoadOfScalar(
3903  TypedAddress,
3904  /*Volatile=*/false,
3905  CGF.getContext().getSizeType(),
3906  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3907  Args.emplace_back(UB);
3908  ++Idx;
3909  }
3910  if (CS.capture_size() > 0) {
3911  ASTContext &CGFContext = CGF.getContext();
3912  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3913  QualType ElemTy = CurField->getType();
3914  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
3915  CGF.getPointerSize());
3916  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3917  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3918  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3919  /*Volatile=*/false,
3920  CGFContext.getPointerType(ElemTy),
3921  CI->getLocation());
3922  if (CI->capturesVariableByCopy() &&
3923  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3924  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3925  CI->getLocation());
3926  }
3927  Args.emplace_back(Arg);
3928  }
3929  }
3930 
3931  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3932  CGF.FinishFunction();
3933  return Fn;
3934 }
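// Conceptually, the wrapper emitted above corresponds to this C-like sketch
// (illustrative names; the trailing argument list depends on the captured
// variables and, for loop-bound-sharing directives, the shared bounds):
//
//   void outlined_parallel_fn_wrapper(uint16_t parallel_level,
//                                     uint32_t thread_id) {
//     int32_t zero = 0;
//     void **shared_args;
//     __kmpc_get_shared_variables(&shared_args);
//     outlined_parallel_fn(&thread_id, &zero,
//                          /* values loaded from shared_args */ ...);
//   }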
3935 
3936 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
3937  const Decl *D) {
3938  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3939  return;
3940 
3941  assert(D && "Expected function or captured|block decl.");
3942  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3943  "Function is registered already.");
3944  const Stmt *Body = nullptr;
3945  bool NeedToDelayGlobalization = false;
3946  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3947  Body = FD->getBody();
3948  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3949  Body = BD->getBody();
3950  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3951  Body = CD->getBody();
3952  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3953  }
3954  if (!Body)
3955  return;
3956  CheckVarsEscapingDeclContext VarChecker(CGF);
3957  VarChecker.Visit(Body);
3958  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
3959  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3960  VarChecker.getEscapedVariableLengthDecls();
3961  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3962  return;
3963  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3964  I->getSecond().MappedParams =
3965  llvm::make_unique<CodeGenFunction::OMPMapVars>();
3966  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
3967  I->getSecond().EscapedParameters.insert(
3968  VarChecker.getEscapedParameters().begin(),
3969  VarChecker.getEscapedParameters().end());
3970  I->getSecond().EscapedVariableLengthDecls.append(
3971  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3972  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3973  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3974  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3975  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
3976  Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
3977  }
3978  if (!NeedToDelayGlobalization) {
3979  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3980  struct GlobalizationScope final : EHScopeStack::Cleanup {
3981  GlobalizationScope() = default;
3982 
3983  void Emit(CodeGenFunction &CGF, Flags flags) override {
3984  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
3985  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3986  }
3987  };
3988  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3989  }
3990 }
3991 
3992 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
3993  const VarDecl *VD) {
3994  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3995  return Address::invalid();
3996 
3997  VD = VD->getCanonicalDecl();
3998  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3999  if (I == FunctionGlobalizedDecls.end())
4000  return Address::invalid();
4001  auto VDI = I->getSecond().LocalVarData.find(VD);
4002  if (VDI != I->getSecond().LocalVarData.end())
4003  return VDI->second.second;
4004  if (VD->hasAttrs()) {
4005  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4006  E(VD->attr_end());
4007  IT != E; ++IT) {
4008  auto VDI = I->getSecond().LocalVarData.find(
4009  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4010  ->getCanonicalDecl());
4011  if (VDI != I->getSecond().LocalVarData.end())
4012  return VDI->second.second;
4013  }
4014  }
4015  return Address::invalid();
4016 }
4017 
4018 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4019  FunctionGlobalizedDecls.erase(CGF.CurFn);
4020  CGOpenMPRuntime::functionFinished(CGF);
4021 }
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:653
RecordDecl * buildImplicitRecord(StringRef Name, RecordDecl::TagKind TK=TTK_Struct) const
Create a new implicit TU-level CXXRecordDecl or RecordDecl declaration.
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
const BlockDecl * getBlockDecl() const
Definition: Expr.h:4988
static const Decl * getCanonicalDecl(const Decl *D)
LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T)
Given a value of type T* that may not be to a complete object, construct an l-value with the natural ...
static llvm::Value * emitCopyToScratchpad(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that stores reduced data from the team master to a scratchpad array in g...
Other implicit parameter.
Definition: Decl.h:1496
A class which contains all the information about a particular captured value.
Definition: Decl.h:3841
if(T->getSizeExpr()) TRY_TO(TraverseStmt(T -> getSizeExpr()))
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2511
CanQualType VoidPtrTy
Definition: ASTContext.h:1053
A (possibly-)qualified type.
Definition: Type.h:642
ArrayRef< OMPClause * > clauses()
Definition: StmtOpenMP.h:260
llvm::Type * ConvertTypeForMem(QualType T)
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx, const OMPExecutableDirective &D)
Check if the parallel directive has an &#39;if&#39; clause with non-constant or false condition.
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:139
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::LLVMContext & getLLVMContext()
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Address CreateConstGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:226
attr_iterator attr_begin() const
Definition: DeclBase.h:499
Stmt - This represents one statement.
Definition: Stmt.h:66
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads)
Get barrier #ID to synchronize selected (multiple of warp size) threads in a CTA. ...
llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:54
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:168
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:409
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter...
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
static bool stable_sort_comparator(const PrivateDataTy P1, const PrivateDataTy P2)
This represents &#39;if&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:240
llvm::Value * ScratchpadIndex
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:226
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6126
The base class of the type hierarchy.
Definition: Type.h:1415
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Definition: CGExpr.cpp:2306
static bool hasStaticScheduling(const OMPExecutableDirective &D)
Check if the directive is loops based and has schedule clause at all or has static scheduling...
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:26
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Represents a variable declaration or definition.
Definition: Decl.h:812
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:67
This represents &#39;num_threads&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:382
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Value * getPointer() const
Definition: Address.h:38
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:57
SPMD execution mode (all threads are worker threads).
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:269
Represents a struct/union/class.
Definition: Decl.h:3570
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Address getAddress() const
Definition: CGValue.h:327
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:154
attr_iterator attr_end() const
Definition: DeclBase.h:502
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Represents a member of a struct/union/class.
Definition: Decl.h:2556
This represents clause &#39;lastprivate&#39; in the &#39;#pragma omp ...&#39; directives.
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive...
Definition: StmtOpenMP.h:209
static llvm::Value * getMasterThreadID(CodeGenFunction &CGF)
Get the thread id of the OMP master thread.
llvm::CallInst * EmitRuntimeCall(llvm::Value *callee, const Twine &name="")
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:3855
bool isReferenceType() const
Definition: Type.h:6189
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:244
Expr * getSubExpr()
Definition: Expr.h:2855
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:738
static bool hasNestedLightweightDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) lightweight runtime construct, if any.
void InitTempAlloca(Address Alloca, llvm::Value *Value)
InitTempAlloca - Provide an initial value for the given alloca which will be observable at all locati...
Definition: CGExpr.cpp:126
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
bool EvaluateAsBooleanCondition(bool &Result, const ASTContext &Ctx) const
EvaluateAsBooleanCondition - Return true if this is a constant which we can fold and convert to a boo...
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:157
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
uint32_t Offset
Definition: CacheTokens.cpp:43
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:67
child_range children()
Definition: Stmt.cpp:229
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:40
static CGOpenMPRuntimeNVPTX::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
llvm::Value * emitReductionFunction(CodeGenModule &CGM, SourceLocation Loc, llvm::Type *ArgsType, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps)
Emits reduction function.
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
bool isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a worksharing directive.
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1611
virtual llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda&#39;s captures is an init-capture.
Definition: ExprCXX.cpp:971
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
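A sketch of the selection this helper performs, assuming CGF and ElemType are in scope (the 8-byte bound in the assert is an assumption): elements of up to 4 bytes go through the 32-bit shuffle entry point, larger ones through the 64-bit one.

    // Illustrative only: pick the runtime shuffle by element size.
    CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
    assert(Size.getQuantity() <= 8 && "unsupported element size for shuffle");
    OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
                                           ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                           : OMPRTL_NVPTX__kmpc_shuffle_int64;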
static bool hasDistributeWithLastprivateClauses(const OMPExecutableDirective &D)
Checks if the directive is a distribute directive with lastprivate clauses.
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:875
static const Stmt * getSingleCompoundChild(const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one...
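A minimal sketch of what this check amounts to (using CompoundStmt's size()/body_front() interface):

    // If Body is a CompoundStmt wrapping exactly one statement, return that
    // statement; otherwise return Body itself.
    static const Stmt *getSingleCompoundChild(const Stmt *Body) {
      if (const auto *C = dyn_cast<CompoundStmt>(Body))
        if (C->size() == 1)
          return C->body_front();
      return Body;
    }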
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:3836
llvm::Constant * CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create a new runtime function with the specified type and name.
static void syncCTAThreads(CodeGenFunction &CGF)
Synchronize all GPU threads in a block.
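On NVPTX a block-wide synchronization of this kind typically lowers to PTX "bar.sync 0"; a sketch via the corresponding NVVM intrinsic (the emission path in this file may differ in detail):

    // All threads of the CTA (block) wait until every thread has arrived.
    static void syncCTAThreads(CodeGenFunction &CGF) {
      CGF.EmitNounwindRuntimeCall(llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
    }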
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
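Together with EmitStoreOfScalar above, this is the usual round trip through a temporary; a sketch in which Val, Ty, Loc, and the "omp.tmp" name are assumed/illustrative:

    // Store a scalar into a fresh temporary, then load it back with the
    // same QualType so memory/value conversions stay consistent.
    Address Tmp = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, "omp.tmp");
    CGF.EmitStoreOfScalar(Val, Tmp, /*Volatile=*/false, Ty);
    llvm::Value *Reloaded =
        CGF.EmitLoadOfScalar(Tmp, /*Volatile=*/false, Ty, Loc);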
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4382
Unknown execution mode (orphaned directive).
llvm::CallInst * EmitNounwindRuntimeCall(llvm::Value *callee, const Twine &name="")
Describes the capture of either a variable, or &#39;this&#39;, or variable-length array type.
Definition: Stmt.h:2090
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of the private clauses, such as 'private', 'firstprivate', 'reduction', etc.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
TypeSourceInfo * getTrivialTypeSourceInfo(QualType T, SourceLocation Loc=SourceLocation()) const
Allocate a TypeSourceInfo where all locations have been initialized to a given location, which defaults to the empty location.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:119
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:196
Represent the declaration of a variable (in which case it is an lvalue), a function (in which case it ...
Definition: Decl.h:636
This represents one expression.
Definition: Expr.h:105
virtual llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
static Address invalid()
Definition: Address.h:35
Enters a new scope for capturing cleanups, all of which will be executed once the scope is exited...
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:139
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
const CGFunctionInfo & arrangeNullaryFunction()
A nullary function is a freestanding function of type 'void ()'.
Definition: CGCall.cpp:695
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:4974
const Expr * getCallee() const
Definition: Expr.h:2325
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements, of a variable length array type, plus the largest non-variably-sized element type.
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:44
CharUnits getTypeAlignInChars(QualType T) const
Return the ABI-specified alignment of a (complete) type T, in characters.
DeclContext * getDeclContext()
Definition: DeclBase.h:426
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause *> Clauses)
Definition: StmtOpenMP.h:130
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
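A small worked example of the CharUnits accessors listed above (Ctx is an assumed ASTContext reference; the 8-byte quantity is arbitrary):

    CharUnits Size = CharUnits::fromQuantity(8); // wrap the raw count 8
    int64_t Raw  = Size.getQuantity();           // back to the raw integer: 8
    int64_t Bits = Ctx.toBits(Size);             // 64 on a target with 8-bit chars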
CanQualType ShortTy
Definition: ASTContext.h:1034
This represents the 'ordered' clause in the '#pragma omp ...' directive.
Definition: OpenMPClause.h:925
QualType getConstantArrayType(QualType EltTy, const llvm::APInt &ArySize, ArrayType::ArraySizeModifier ASM, unsigned IndexTypeQuals) const
Return the unique reference to the type for a constant array of the specified element type...
QualType getRecordType(const RecordDecl *Decl) const
UnaryOperator - This represents the unary-expression's (except sizeof and alignof), the postinc/postdec operators from postfix-expression, and various extensions.
Definition: Expr.h:1784
MachineConfiguration
GPU configuration: this information can be derived from CUDA registers; however, providing compile ti...
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
Definition: CGExpr.cpp:50
ValueDecl * getDecl()
Definition: Expr.h:1056
const LangOptions & getLangOpts() const
ASTContext & getContext() const
OpenMPProcBindClauseKind
OpenMP attributes for the 'proc_bind' clause.
Definition: OpenMPKinds.h:51
static llvm::Value * emitReduceScratchpadFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
This function emits a helper that loads data from the scratchpad array and (optionally) reduces it wi...
Non-SPMD execution mode (1 master thread, others are workers).
llvm::Value * ScratchpadWidth
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2016
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:35
bool hasClausesOfKind() const
Returns true if the current directive has one or more clauses of a specific kind. ...
Definition: StmtOpenMP.h:162
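Typical usage of this together with getClausesOfKind when inspecting a directive D (the choice of OMPScheduleClause is just an example):

    // Cheap existence test first, then visit every clause of that kind.
    if (D.hasClausesOfKind<OMPScheduleClause>())
      for (const auto *C : D.getClausesOfKind<OMPScheduleClause>())
        (void)C->getScheduleKind(); // inspect each 'schedule' clause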
AttrVec & getAttrs()
Definition: DeclBase.h:478
bool hasAttrs() const
Definition: DeclBase.h:472
The l-value was considered opaque, so the alignment was determined from a type.
llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams directive.
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:142
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
This captures a statement into a function.
Definition: Stmt.h:2077
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen)
Emits code for the OpenMP 'if' clause using the specified CodeGen functions.
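A sketch of the shape of this helper, assuming CGF, Cond, ThenGen, and ElseGen are in scope (block names are illustrative): fold the condition when possible, otherwise branch around the two region generators.

    // If Cond folds to a constant, emit only the live arm.
    bool CondConstant;
    if (CGF.ConstantFoldsToSimpleInteger(Cond, CondConstant)) {
      if (CondConstant)
        ThenGen(CGF);
      else
        ElseGen(CGF);
    } else {
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("omp_if.then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("omp_if.else");
      llvm::BasicBlock *EndBB  = CGF.createBasicBlock("omp_if.end");
      CGF.EmitBranchOnBoolExpr(Cond, ThenBB, ElseBB, /*TrueCount=*/0);
      CGF.EmitBlock(ThenBB);
      ThenGen(CGF);
      CGF.EmitBranch(EndBB);
      CGF.EmitBlock(ElseBB);
      ElseGen(CGF);
      CGF.EmitBranch(EndBB);
      CGF.EmitBlock(EndBB, /*IsFinished=*/true);
    }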
Encodes a location in the source.
static llvm::Value * getThreadLimit(CodeGenFunction &CGF, bool IsInSPMDExecutionMode=false)
Get the value of the thread_limit clause in the teams directive.
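A sketch of the distinction this helper draws, relying on the getNVPTXNumThreads and getNVPTXWarpSize helpers listed elsewhere on this page: in SPMD mode the whole block executes the region, while in non-SPMD mode the master warp is reserved and subtracted out.

    static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                       bool IsInSPMDExecutionMode = false) {
      CGBuilderTy &Bld = CGF.Builder;
      // Non-SPMD: workers = all threads minus the master warp.
      return IsInSPMDExecutionMode
                 ? getNVPTXNumThreads(CGF)
                 : Bld.CreateNUWSub(getNVPTXNumThreads(CGF),
                                    getNVPTXWarpSize(CGF), "thread_limit");
    }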
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Expr * getSubExpr() const
Definition: Expr.h:1811
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2072
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:33
CastKind getCastKind() const
Definition: Expr.h:2849
This represents &#39;schedule&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:744
DeclStmt - Adaptor class for mixing declarations with statements and expressions. ...
Definition: Stmt.h:505
OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:23
This file defines OpenMP nodes for declarative directives.
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:51
CanQualType VoidTy
Definition: ASTContext.h:1025
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
arg_range arguments()
Definition: Expr.h:2379
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
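A hedged sketch of recording a kernel's execution mode: emit a small constant global keyed off the kernel name and keep it alive via addCompilerUsedGlobal (the "_exec_mode" suffix and the 0/1 encoding are assumptions here, not confirmed by this page):

    // Hypothetical: "<kernel>_exec_mode" lets the device runtime query how
    // the kernel was compiled; compiler-used so it is not stripped.
    static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                         bool Mode) {
      auto *GVMode = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
          llvm::GlobalValue::WeakAnyLinkage,
          llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
          llvm::Twine(Name, "_exec_mode"));
      CGM.addCompilerUsedGlobal(GVMode);
    }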
An aligned address.
Definition: Address.h:25
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:2924
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:2178
OpenMPRTLFunctionNVPTX
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language...
Definition: Expr.h:248
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
QualType getType() const
Definition: CGValue.h:264
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in the finished function.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of the outlined function if this is required for the target.
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
FunctionArgList - Type for representing both the decl and type of parameters to a function...
Definition: CGCall.h:356
void setAction(PrePostActionTy &Action) const
CGFunctionInfo - Class to encapsulate the information about a function definition.
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2517
Dataflow Directional Tag Classes.
This class provides a way to call the simple version of codegen for an OpenMP region, or an advanced one with possibl...
static void getNVPTXCTABarrier(CodeGenFunction &CGF)
Get barrier to synchronize all threads in a block.
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:5920
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1267
ArrayRef< Capture > captures() const
Definition: Decl.h:3954
A basic class for pre/post-actions in the advanced codegen sequence for an OpenMP region.
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition: CGBuilder.h:70
bool isOpenMPSimdDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a simd directive.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr *> Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values, in the specified direction.
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:5927
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:211
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:108
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1390
llvm::Module & getModule() const
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3330
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits the address of the word in memory where the current thread id is stored.
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads)
Synchronize worker threads in a parallel region.
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
llvm::Constant * createNVPTXRuntimeFunction(unsigned Function)
Returns the specified OpenMP runtime function for the current OpenMP implementation.
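The declarations behind this helper follow the usual pattern of building an llvm::FunctionType and handing it to CreateRuntimeFunction; a sketch for one entry, with the parameter types mirroring the documented kmp_int32/int16_t signature (CGM is assumed in scope):

    // Declare: void __kmpc_kernel_init(kmp_int32 thread_limit,
    //                                  int16_t RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    llvm::Constant *RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");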
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override
Emit code for the reduction clause.
This file defines OpenMP AST classes for executable directives and clauses.
Address CreateConstArrayGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:195
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:6438
void addRestrict()
Definition: Type.h:277
T * getAttr() const
Definition: DeclBase.h:532
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:52
Opcode getOpcode() const
Definition: Expr.h:1808
decl_range decls()
Definition: Stmt.h:550
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:32
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:445
bool hasAssociatedStmt() const
Returns true if the directive has an associated statement.
Definition: StmtOpenMP.h:193
ExecutionMode
Defines the execution mode.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
bool isLValueReferenceType() const
Definition: Type.h:6193
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static llvm::Value * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options)
Emit code for the reduction clause.
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:1866
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:465
Privates[]
Gets the list of initial values for linear variables.
Definition: OpenMPClause.h:151
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Capturing by reference.
Definition: Lambda.h:38
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression...
Definition: CGExpr.cpp:1232
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
capture_range captures() const
Retrieve this lambda&#39;s captures.
Definition: ExprCXX.cpp:984
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2285
static llvm::Value * getNVPTXThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
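On NVPTX the thread id comes from a PTX special register; a sketch reading it through the NVVM intrinsic (the "nvptx_tid" value name is illustrative):

    // Read %tid.x via llvm.nvvm.read.ptx.sreg.tid.x.
    static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
      return CGF.EmitNounwindRuntimeCall(
          llvm::Intrinsic::getDeclaration(
              &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
          "nvptx_tid");
    }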
CGCapturedStmtInfo * CapturedStmtInfo
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2422
static llvm::Value * getNVPTXWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
CanQualType IntTy
Definition: ASTContext.h:1034
llvm::Value * RemoteLaneOffset
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:1818
capture_range captures()
Definition: Stmt.h:2212
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:971
void addAddressSpace(LangAS space)
Definition: Type.h:382
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:713
QualType getType() const
Definition: Decl.h:647
bool isOpenMPLoopDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a directive with an associated loop construct.
LValue - This represents an lvalue reference.
Definition: CGValue.h:167
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:147
CanQualType BoolTy
Definition: ASTContext.h:1026
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.h:2505
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:3740
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:164
static llvm::Value * getNVPTXNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
No in-class initializer.
Definition: Specifiers.h:230
llvm::Value * getPointer() const
Definition: CGValue.h:323
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Attr - This represents one attribute.
Definition: Attr.h:43
SourceLocation getLocation() const
Definition: DeclBase.h:417
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - Sets the integer QualType according to the specified details: bitwidth, signed/unsigned.
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1544
static bool supportsLightweightRuntime(ASTContext &Ctx, const OMPExecutableDirective &D)
Checks if the construct supports lightweight runtime.