//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;
namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (if-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
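
// A minimal usage sketch (illustrative only; EnterFn/ExitFn are placeholder
// callees, not a specific runtime API). In practice the action is driven by
// the RegionCodeGenTy machinery rather than called by hand:
//
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args, /*Conditional=*/true);
//   Action.Enter(CGF); // emits EnterFn call + branch to "omp_if.then"
//   // ... emit the guarded region ...
//   Action.Done(CGF);  // closes the conditional region at "omp_if.end"
//   Action.Exit(CGF);  // emits the ExitFn call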

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};
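
// For illustration: with WarpSize = 32, a thread ID splits into a warp ID and
// a lane ID using the constants above (see getNVPTXWarpID/getNVPTXLaneID
// below):
//   warp_id = nvptx_tid >> LaneIDBits; // tid / 32
//   lane_id = nvptx_tid & LaneIDMask;  // tid % 32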

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForParallelRegion = false;

  static llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy>
  isDeclareTargetDeclaration(const ValueDecl *VD) {
    for (const Decl *D : VD->redecls()) {
      if (!D->hasAttrs())
        continue;
      if (const auto *Attr = D->getAttr<OMPDeclareTargetDeclAttr>())
        return Attr->getMapType();
    }
    return llvm::None;
  }

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (!isOpenMPPrivate(
                  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
              Attr->getCaptureKind() == OMPC_map)
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S, bool IsParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsParallelRegion = IsForParallelRegion;
        IsForParallelRegion = IsParallelRegion;
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForParallelRegion = SavedIsParallelRegion;
      }
    }
  }

  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
    return P1.first > P2.first;
  }

  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    if (EscapedDecls.empty())
      return;
    ASTContext &C = CGF.getContext();
    SmallVector<VarsDataTy, 4> GlobalizedVars;
    for (const ValueDecl *D : EscapedDecls)
      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                     stable_sort_comparator);
    // Build struct _globalized_locals_ty {
    //   /* globalized vars */
    // };
    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
    GlobalizedRD->startDefinition();
    for (const auto &Pair : GlobalizedVars) {
      const ValueDecl *VD = Pair.second;
      QualType Type = VD->getType();
      if (Type->isLValueReferenceType())
        Type = C.getPointerType(Type.getNonReferenceType());
      else
        Type = Type.getNonReferenceType();
      SourceLocation Loc = VD->getLocation();
      auto *Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      GlobalizedRD->addDecl(Field);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
      MappedDeclsFields.try_emplace(VD, Field);
    }
    GlobalizedRD->completeDefinition();
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(S, CaptureRegions.back() == OMPD_parallel);
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped local variables that are actually parameters
  /// passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
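
// For illustration, a sketch of a variable this analysis flags as escaping:
// 'a' is a team-local variable captured by reference in the inner parallel
// region, so its storage must be globalized.
//
//   #pragma omp target
//   {
//     int a = 0;
//     #pragma omp parallel
//     a += 1; // 'a' escapes into the parallel region
//   }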

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}
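
// For illustration: in generic mode a CTA launched with 128 threads yields
// thread_limit = 128 - 32 = 96, reserving the last warp (threads 96..127)
// for the master; in SPMD mode the same CTA yields thread_limit = 128.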

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
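
// For illustration, the IR emitted above computes the equivalent of:
//   master_tid = (NumThreads - 1) & ~(WarpSize - 1);
// e.g. NumThreads = 1024: 1023 & ~31 = 992, the first lane of the last warp.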

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}
609 
611 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
612  return CurrentExecutionMode;
613 }
614 
617  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
619 }

/// Checks if \p Body is a CompoundStmt and returns its single child statement
/// iff there is exactly one; otherwise returns \p Body itself.
static const Stmt *getSingleCompoundChild(const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body))
    if (C->size() == 1)
      return C->body_front();
  return Body;
}

/// Check if the parallel directive has an 'if' clause with a non-constant or
/// false condition, or an explicit num_threads clause; such directives are
/// run in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
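
// For illustration, each of these directives is forced into non-SPMD mode
// (sketch; 'cond' and 'n' are not compile-time constants):
//
//   #pragma omp target parallel if(parallel: cond) // non-constant condition
//   #pragma omp target parallel if(0)              // constant false condition
//   #pragma omp target parallel num_threads(n)     // explicit num_threads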

/// Check for an inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams || DKind == OMPD_teams_distribute) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
          if (DKind == OMPD_distribute) {
            Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Body);
            if (!ChildStmt)
              return false;
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              return isOpenMPParallelDirective(DKind) &&
                     !hasParallelIfNumThreadsClause(Ctx, *NND);
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_distribute) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          return isOpenMPParallelDirective(DKind) &&
                 !hasParallelIfNumThreadsClause(Ctx, *NND);
        }
      }
      return false;
    case OMPD_target_teams_distribute:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
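
// For illustration, a nesting this function recognizes as SPMD (sketch):
//
//   #pragma omp target          // OMPD_target
//   #pragma omp teams           // single nested teams region ...
//   #pragma omp parallel for    // ... whose only child is a parallel
//   for (int i = 0; i < N; ++i) //     directive (no if/num_threads clause)
//     ...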

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_teams_distribute:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitNonSPMDEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
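
// For illustration, the entry header emitted above is roughly (pseudocode
// sketch):
//
//   if (nvptx_tid < thread_limit)    // .worker
//     { worker(); goto exit; }
//   if (nvptx_tid != master_tid)     // .mastercheck
//     goto exit;
//   __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1); // .master
//   __kmpc_data_sharing_init_stack();
//   // ... sequential master region ...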

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // Deinitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}
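
// For illustration, for a kernel entry named __omp_offloading_<...> this
// emits roughly the following IR (sketch; note that SPMD mode, Mode == true,
// is encoded as 0 and generic mode as 1):
//
//   @__omp_offloading_<...>_exec_mode = weak constant i8 0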

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  // For data sharing, we need to initialize the stack for workers.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID.
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false)
          ->getPointerTo();
  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID.
  emitCall(CGF, WST.Loc, WorkFnCast,
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
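
// For illustration, the worker state machine emitted above is roughly
// (pseudocode sketch):
//
//   for (;;) {
//     syncCTAThreads();                                  // .await.work
//     __kmpc_data_sharing_init_stack();
//     exec_status = __kmpc_kernel_parallel(&work_fn, 1);
//     if (!work_fn) break;                               // termination
//     if (exec_status) {                                 // .execute.parallel
//       // dispatch to a known outlined wrapper, or call through the pointer
//       __kmpc_kernel_end_parallel();
//     }
//     syncCTAThreads();                                  // .barrier.parallel
//   }                                                    // .exit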

/// Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    // Build void __kmpc_kernel_prepare_parallel(
    // void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    // Build bool __kmpc_kernel_parallel(void **outlined_function,
    // int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    auto *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    // Build void __kmpc_kernel_end_parallel();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    // Build void __kmpc_data_sharing_init_stack();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    // Build void __kmpc_begin_sharing_variables(void ***args,
    // size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    // Build void __kmpc_end_sharing_variables();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    // Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_level: {
    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
    break;
  }
  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
    // Build int8_t __kmpc_is_spmd_exec_mode();
    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t,
                                              llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
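
// For illustration (sketch): a combined construct such as
//   #pragma omp target teams distribute parallel for
// supports SPMD execution (all CTA threads run the region), while a bare
//   #pragma omp target
// with no suitable nested parallel construct is emitted as a generic
// (master-worker) kernel.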

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
1494 
1496  OpenMPProcBindClauseKind ProcBind,
1497  SourceLocation Loc) {
1498  // Do nothing in case of SPMD mode and L0 parallel.
1499  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1500  return;
1501 
1502  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1503 }
1504 
1505 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1506  llvm::Value *NumThreads,
1507  SourceLocation Loc) {
1508  // Do nothing in case of SPMD mode and L0 parallel.
1509  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1510  return;
1511 
1512  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1513 }
1514 
1515 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1516  const Expr *NumTeams,
1517  const Expr *ThreadLimit,
1518  SourceLocation Loc) {}
1519 
1520 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1521  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1522  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1523  SourceLocation Loc = D.getLocStart();
1524 
1525  // Emit target region as a standalone region.
1526  class NVPTXPrePostActionTy : public PrePostActionTy {
1527  SourceLocation &Loc;
1528  bool &IsInParallelRegion;
1529  bool PrevIsInParallelRegion;
1530 
1531  public:
1532  NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion)
1533  : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {}
1534  void Enter(CodeGenFunction &CGF) override {
1535  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1536  .emitGenericVarsProlog(CGF, Loc);
1537  PrevIsInParallelRegion = IsInParallelRegion;
1538  IsInParallelRegion = true;
1539  }
1540  void Exit(CodeGenFunction &CGF) override {
1541  IsInParallelRegion = PrevIsInParallelRegion;
1542  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1543  .emitGenericVarsEpilog(CGF);
1544  }
1545  } Action(Loc, IsInParallelRegion);
1546  CodeGen.setAction(Action);
1547  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1548  IsInTargetMasterThreadRegion = false;
1549  auto *OutlinedFun =
1550  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1551  D, ThreadIDVar, InnermostKind, CodeGen));
1552  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1553  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1554  !IsInParallelRegion) {
1555  llvm::Function *WrapperFun =
1556  createParallelDataSharingWrapper(OutlinedFun, D);
1557  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1558  }
1559 
1560  return OutlinedFun;
1561 }
1562 
1563 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
1564  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1565  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1566  SourceLocation Loc = D.getLocStart();
1567 
1568  // Emit target region as a standalone region.
1569  class NVPTXPrePostActionTy : public PrePostActionTy {
1570  SourceLocation &Loc;
1571 
1572  public:
1573  NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1574  void Enter(CodeGenFunction &CGF) override {
1575  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1576  .emitGenericVarsProlog(CGF, Loc);
1577  }
1578  void Exit(CodeGenFunction &CGF) override {
1579  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1580  .emitGenericVarsEpilog(CGF);
1581  }
1582  } Action(Loc);
1583  CodeGen.setAction(Action);
1584  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1585  D, ThreadIDVar, InnermostKind, CodeGen);
1586  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1587  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1588  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1589  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1590 
1591  return OutlinedFun;
1592 }
1593 
1594 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1595  SourceLocation Loc) {
1596  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1597  return;
1598 
1599  CGBuilderTy &Bld = CGF.Builder;
1600 
1601  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1602  if (I == FunctionGlobalizedDecls.end())
1603  return;
1604  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1605  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1606 
1607  // Recover pointer to this function's global record. The runtime will
1608  // handle the specifics of the allocation of the memory.
1609  // Use actual memory size of the record including the padding
1610  // for alignment purposes.
1611  unsigned Alignment =
1612  CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1613  unsigned GlobalRecordSize =
1614  CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1615  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1616  // TODO: allow the usage of shared memory to be controlled by
1617  // the user, for now, default to global.
1618  llvm::Value *GlobalRecordSizeArg[] = {
1619  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1620  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1621  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1622  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1623  GlobalRecordSizeArg);
1624  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1625  GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1626  LValue Base =
1627  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1628  I->getSecond().GlobalRecordAddr = GlobalRecValue;
1629 
1630  // Emit the "global alloca" which is a GEP from the global declaration
1631  // record using the pointer returned by the runtime.
1632  for (auto &Rec : I->getSecond().LocalVarData) {
1633  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1634  llvm::Value *ParValue;
1635  if (EscapedParam) {
1636  const auto *VD = cast<VarDecl>(Rec.first);
1637  LValue ParLVal =
1638  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1639  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1640  }
1641  const FieldDecl *FD = Rec.second.first;
1642  LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1643  Rec.second.second = VarAddr.getAddress();
1644  if (EscapedParam) {
1645  const auto *VD = cast<VarDecl>(Rec.first);
1646  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1647  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1648  }
1649  }
1650  }
1651  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1652  // Recover a pointer to this variable's globalized storage. The runtime
1653  // will handle the specifics of the allocation of the memory.
1654  // Use the actual memory size of the variable, including padding
1655  // for alignment purposes.
1656  CGBuilderTy &Bld = CGF.Builder;
1657  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1658  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1659  Size = Bld.CreateNUWAdd(
1660  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1661  llvm::Value *AlignVal =
1662  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1663  Size = Bld.CreateUDiv(Size, AlignVal);
1664  Size = Bld.CreateNUWMul(Size, AlignVal);
1665  // TODO: allow the usage of shared memory to be controlled by
1666  // the user, for now, default to global.
1667  llvm::Value *GlobalRecordSizeArg[] = {
1668  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1669  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1670  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1671  GlobalRecordSizeArg);
1672  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1673  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1674  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1675  CGM.getContext().getDeclAlign(VD),
1676  AlignmentSource::Decl);
1677  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1678  Base.getAddress());
1679  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1680  }
1681  I->getSecond().MappedParams->apply(CGF);
1682 }
1683 
1684 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1685  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1686  return;
1687 
1688  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1689  if (I != FunctionGlobalizedDecls.end()) {
1690  I->getSecond().MappedParams->restore(CGF);
1691  if (!CGF.HaveInsertPoint())
1692  return;
1693  for (llvm::Value *Addr :
1694  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1695  CGF.EmitRuntimeCall(
1696  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1697  Addr);
1698  }
1699  if (I->getSecond().GlobalRecordAddr) {
1700  CGF.EmitRuntimeCall(
1701  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1702  I->getSecond().GlobalRecordAddr);
1703  }
1704  }
1705 }
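 
 // Taken together, emitGenericVarsProlog/Epilog bracket the region body with
 // a stack discipline; a rough sketch of the runtime calls emitted:
 //
 //   rec = __kmpc_data_sharing_push_stack(RecordSize, /*UseSharedMemory=*/0);
 //   ... one push per escaped variable-length declaration ...
 //   <region body>
 //   ... pops for the variable-length allocations, in reverse order ...
 //   __kmpc_data_sharing_pop_stack(rec);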
1706 
1707 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
1708  const OMPExecutableDirective &D,
1709  SourceLocation Loc,
1710  llvm::Value *OutlinedFn,
1711  ArrayRef<llvm::Value *> CapturedVars) {
1712  if (!CGF.HaveInsertPoint())
1713  return;
1714 
1715  Address ZeroAddr = CGF.CreateMemTemp(
1716  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1717  /*Name*/ ".zero.addr");
1718  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1719  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1720  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1721  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1722  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1723  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1724 }
1725 
1726 void CGOpenMPRuntimeNVPTX::emitParallelCall(
1727  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1728  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1729  if (!CGF.HaveInsertPoint())
1730  return;
1731 
1732  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1733  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1734  else
1735  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1736 }
1737 
1738 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
1739  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1740  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1741  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
1742 
1743  // Force inline this outlined function at its call site.
1744  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1745 
1746  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
1747  /*DestWidth=*/32, /*Signed=*/1),
1748  ".zero.addr");
1749  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1750  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
1751  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr](
1752  CodeGenFunction &CGF, PrePostActionTy &Action) {
1753  Action.Enter(CGF);
1754 
1755  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1756  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
1757  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1758  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1759  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
1760  };
1761  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
1762  PrePostActionTy &) {
1763 
1764  RegionCodeGenTy RCG(CodeGen);
1765  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1766  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1767  llvm::Value *Args[] = {RTLoc, ThreadID};
1768 
1769  NVPTXActionTy Action(
1770  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1771  Args,
1772  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1773  Args);
1774  RCG.setAction(Action);
1775  RCG(CGF);
1776  };
1777 
1778  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
1779  PrePostActionTy &Action) {
1780  CGBuilderTy &Bld = CGF.Builder;
1781  llvm::Function *WFn = WrapperFunctionsMap[Fn];
1782  assert(WFn && "Wrapper function does not exist!");
1783  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1784 
1785  // Prepare for parallel region. Indicate the outlined function.
1786  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1787  CGF.EmitRuntimeCall(
1788  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
1789  Args);
1790 
1791  // Create a private scope that will globalize the arguments
1792  // passed from the outside of the target region.
1793  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1794 
1795  // There's something to share.
1796  if (!CapturedVars.empty()) {
1797  // Prepare the buffer of references to the variables to be shared.
1798  Address SharedArgs =
1799  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
1800  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
1801 
1802  llvm::Value *DataSharingArgs[] = {
1803  SharedArgsPtr,
1804  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1805  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1806  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
1807  DataSharingArgs);
1808 
1809  // Store variable address in a list of references to pass to workers.
1810  unsigned Idx = 0;
1811  ASTContext &Ctx = CGF.getContext();
1812  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
1813  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
1814  .castAs<PointerType>());
1815  for (llvm::Value *V : CapturedVars) {
1816  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
1817  CGF.getPointerSize());
1818  llvm::Value *PtrV;
1819  if (V->getType()->isIntegerTy())
1820  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1821  else
1822  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1823  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1824  Ctx.getPointerType(Ctx.VoidPtrTy));
1825  ++Idx;
1826  }
1827  }
1828 
1829  // Activate workers. This barrier is used by the master to signal
1830  // work for the workers.
1831  syncCTAThreads(CGF);
1832 
1833  // OpenMP [2.5, Parallel Construct, p.49]
1834  // There is an implied barrier at the end of a parallel region. After the
1835  // end of a parallel region, only the master thread of the team resumes
1836  // execution of the enclosing task region.
1837  //
1838  // The master waits at this barrier until all workers are done.
1839  syncCTAThreads(CGF);
1840 
1841  if (!CapturedVars.empty())
1842  CGF.EmitRuntimeCall(
1843  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
1844 
1845  // Remember for post-processing in worker loop.
1846  Work.emplace_back(WFn);
1847  };
1848 
1849  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen](
1850  CodeGenFunction &CGF, PrePostActionTy &Action) {
1851  RegionCodeGenTy RCG(CodeGen);
1852  if (IsInParallelRegion) {
1853  SeqGen(CGF, Action);
1854  } else if (IsInTargetMasterThreadRegion) {
1855  L0ParallelGen(CGF, Action);
1856  } else if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD) {
1857  RCG(CGF);
1858  } else {
1859  // Check for master and then parallelism:
1860  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
1861  // Serialized execution.
1862  // } else if (master) {
1863  // Worker call.
1864  // } else {
1865  // Outlined function call.
1866  // }
1867  CGBuilderTy &Bld = CGF.Builder;
1868  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1869  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
1870  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
1871  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1872  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
1873  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
1874  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
1875  // There is no need to emit line number for unconditional branch.
1876  (void)ApplyDebugLocation::CreateEmpty(CGF);
1877  CGF.EmitBlock(ParallelCheckBB);
1878  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1879  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1880  llvm::Value *PL = CGF.EmitRuntimeCall(
1881  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
1882  {RTLoc, ThreadID});
1883  llvm::Value *Res = Bld.CreateIsNotNull(PL);
1884  Bld.CreateCondBr(Res, SeqBB, MasterCheckBB);
1885  CGF.EmitBlock(SeqBB);
1886  SeqGen(CGF, Action);
1887  CGF.EmitBranch(ExitBB);
1888  // There is no need to emit line number for unconditional branch.
1889  (void)ApplyDebugLocation::CreateEmpty(CGF);
1890  CGF.EmitBlock(MasterCheckBB);
1891  llvm::BasicBlock *MasterThenBB = CGF.createBasicBlock("master.then");
1892  llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
1893  llvm::Value *IsMaster =
1894  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1895  Bld.CreateCondBr(IsMaster, MasterThenBB, ElseBlock);
1896  CGF.EmitBlock(MasterThenBB);
1897  L0ParallelGen(CGF, Action);
1898  CGF.EmitBranch(ExitBB);
1899  // There is no need to emit line number for unconditional branch.
1900  (void)ApplyDebugLocation::CreateEmpty(CGF);
1901  CGF.EmitBlock(ElseBlock);
1902  RCG(CGF);
1903  // There is no need to emit line number for unconditional branch.
1904  (void)ApplyDebugLocation::CreateEmpty(CGF);
1905  // Emit the continuation block for code after the if.
1906  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1907  }
1908  };
1909 
1910  if (IfCond) {
1911  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
1912  } else {
1913  CodeGenFunction::RunCleanupsScope Scope(CGF);
1914  RegionCodeGenTy ThenRCG(LNParallelGen);
1915  ThenRCG(CGF);
1916  }
1917 }
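 
 // The L0 path above reduces to the following master/worker handshake
 // (sketch; WFn is the data-sharing wrapper created for the outlined
 // function, nargs the number of captured variables):
 //
 //   __kmpc_kernel_prepare_parallel(WFn, /*RequiresOMPRuntime=*/1);
 //   __kmpc_begin_sharing_variables(&args, nargs);  // only if nargs > 0
 //   bar.sync                                       // release the workers
 //   bar.sync                                       // wait for the workers
 //   __kmpc_end_sharing_variables();                // only if nargs > 0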
1918 
1919 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
1920  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1921  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1922  // Just call the outlined function to execute the parallel region.
1923  // OutlinedFn(&GTid, &zero, CapturedStruct);
1924  //
1925  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1926 
1927  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
1928  /*DestWidth=*/32, /*Signed=*/1),
1929  ".zero.addr");
1930  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1931  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
1932  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
1933  ThreadIDAddr](CodeGenFunction &CGF,
1934  PrePostActionTy &Action) {
1935  Action.Enter(CGF);
1936 
1937  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1938  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
1939  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1940  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1941  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1942  };
1943  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
1944  PrePostActionTy &) {
1945 
1946  RegionCodeGenTy RCG(CodeGen);
1947  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1948  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1949  llvm::Value *Args[] = {RTLoc, ThreadID};
1950 
1951  NVPTXActionTy Action(
1952  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1953  Args,
1954  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1955  Args);
1956  RCG.setAction(Action);
1957  RCG(CGF);
1958  };
1959 
1960  if (IsInTargetMasterThreadRegion) {
1961  RegionCodeGenTy RCG(CodeGen);
1962  RCG(CGF);
1963  } else {
1964  // If we are not in the target region, it is definitely L2 parallelism or
1965  // more, because SPMD mode always has an L1 parallel level, so we don't
1966  // need to check for orphaned directives.
1967  RegionCodeGenTy RCG(SeqGen);
1968  RCG(CGF);
1969  }
1970 }
1971 
1972 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
1973  CodeGenFunction &CGF, StringRef CriticalName,
1974  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1975  const Expr *Hint) {
1976  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1977  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1978  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1979  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1980  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1981 
1982  // Fetch team-local id of the thread.
1983  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
1984 
1985  // Get the width of the team.
1986  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
1987 
1988  // Initialize the counter variable for the loop.
1989  QualType Int32Ty =
1990  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1991  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1992  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1993  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1994  /*isInit=*/true);
1995 
1996  // Block checks if loop counter exceeds upper bound.
1997  CGF.EmitBlock(LoopBB);
1998  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1999  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2000  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2001 
2002  // This block tests which single thread should execute the region, and
2003  // which threads should go straight to the synchronisation point.
2004  CGF.EmitBlock(TestBB);
2005  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2006  llvm::Value *CmpThreadToCounter =
2007  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2008  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2009 
2010  // Block emits the body of the critical region.
2011  CGF.EmitBlock(BodyBB);
2012 
2013  // Output the critical statement.
2014  CriticalOpGen(CGF);
2015 
2016  // After the body surrounded by the critical region, the single executing
2017  // thread will jump to the synchronisation point.
2018  // Block waits for all threads in current team to finish then increments the
2019  // counter variable and returns to the loop.
2020  CGF.EmitBlock(SyncBB);
2021  getNVPTXCTABarrier(CGF);
2022 
2023  llvm::Value *IncCounterVal =
2024  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2025  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2026  CGF.EmitBranch(LoopBB);
2027 
2028  // Block that is reached when all threads in the team complete the region.
2029  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2030 }
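 
 // The control flow built above serializes the team; in pseudocode:
 //
 //   for (counter = 0; counter < team_width; ++counter) {
 //     if (thread_id == counter)
 //       <critical region body>
 //     bar.sync   // everyone waits before the next thread's turn
 //   }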
2031 
2032 /// Cast value to the specified type.
2033 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2034  QualType ValTy, QualType CastTy,
2035  SourceLocation Loc) {
2036  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2037  "Cast type must be sized.");
2038  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2039  "Val type must be sized.");
2040  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2041  if (ValTy == CastTy)
2042  return Val;
2043  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2044  CGF.getContext().getTypeSizeInChars(CastTy))
2045  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2046  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2047  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2048  CastTy->hasSignedIntegerRepresentation());
2049  Address CastItem = CGF.CreateMemTemp(CastTy);
2050  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2051  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2052  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2053  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2054 }
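 
 // Example: for ValTy == float and CastTy == int32_t the sizes match, so the
 // value is converted with a single bitcast rather than the memory
 // round-trip:
 //
 //   %cast = bitcast float %val to i32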
2055 
2056 /// This function creates calls to one of two shuffle functions to copy
2057 /// variables between lanes in a warp.
2058 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2059  llvm::Value *Elem,
2060  QualType ElemType,
2061  llvm::Value *Offset,
2062  SourceLocation Loc) {
2063  CodeGenModule &CGM = CGF.CGM;
2064  CGBuilderTy &Bld = CGF.Builder;
2065  CGOpenMPRuntimeNVPTX &RT =
2066  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2067 
2068  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2069  assert(Size.getQuantity() <= 8 &&
2070  "Unsupported bitwidth in shuffle instruction.");
2071 
2072  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2073  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2074  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2075 
2076  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2077  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2078  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2079  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2080  llvm::Value *WarpSize =
2081  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2082 
2083  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2084  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2085 
2086  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2087 }
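 
 // The runtime shuffle behaves like CUDA's __shfl_down: each lane receives
 // the element held by the lane at (lane_id + lane_offset). A sketch of the
 // 4-byte exchange emitted above:
 //
 //   %r = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %warpsz)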
2088 
2089 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2090  Address DestAddr, QualType ElemType,
2091  llvm::Value *Offset, SourceLocation Loc) {
2092  CGBuilderTy &Bld = CGF.Builder;
2093 
2094  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2095  // Create the loop over the big sized data.
2096  // ptr = (void*)Elem;
2097  // ptrEnd = (void*) Elem + 1;
2098  // Step = 8;
2099  // while (ptr + Step < ptrEnd)
2100  // shuffle((int64_t)*ptr);
2101  // Step = 4;
2102  // while (ptr + Step < ptrEnd)
2103  // shuffle((int32_t)*ptr);
2104  // ...
2105  Address ElemPtr = DestAddr;
2106  Address Ptr = SrcAddr;
2107  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2108  Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2109  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2110  if (Size < CharUnits::fromQuantity(IntSize))
2111  continue;
2112  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2113  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2114  /*Signed=*/1);
2115  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2116  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2117  ElemPtr =
2118  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2119  if (Size.getQuantity() / IntSize > 1) {
2120  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2121  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2122  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2123  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2124  CGF.EmitBlock(PreCondBB);
2125  llvm::PHINode *PhiSrc =
2126  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2127  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2128  llvm::PHINode *PhiDest =
2129  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2130  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2131  Ptr = Address(PhiSrc, Ptr.getAlignment());
2132  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2133  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2134  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2135  Ptr.getPointer(), CGF.VoidPtrTy));
2136  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2137  ThenBB, ExitBB);
2138  CGF.EmitBlock(ThenBB);
2139  llvm::Value *Res = createRuntimeShuffleFunction(
2140  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2141  IntType, Offset, Loc);
2142  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2143  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2144  ElemPtr =
2145  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2146  PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2147  PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2148  CGF.EmitBranch(PreCondBB);
2149  CGF.EmitBlock(ExitBB);
2150  } else {
2151  llvm::Value *Res = createRuntimeShuffleFunction(
2152  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2153  IntType, Offset, Loc);
2154  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2155  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2156  ElemPtr =
2157  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2158  }
2159  Size = Size % IntSize;
2160  }
2161 }
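 
 // Worked example: a 12-byte element is moved by one 8-byte shuffle followed
 // by one 4-byte shuffle (12 % 8 == 4), with Ptr/ElemPtr advanced past each
 // chunk:
 //
 //   shuffle64(bytes [0,8));  shuffle32(bytes [8,12));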
2162 
2163 namespace {
2164 enum CopyAction : unsigned {
2165  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2166  // the warp using shuffle instructions.
2167  RemoteLaneToThread,
2168  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2169  ThreadCopy,
2170  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2171  ThreadToScratchpad,
2172  // ScratchpadToThread: Copy from a scratchpad array in global memory
2173  // containing team-reduced data to a thread's stack.
2174  ScratchpadToThread,
2175 };
2176 } // namespace
2177 
2178 struct CopyOptionsTy {
2179  llvm::Value *RemoteLaneOffset;
2180  llvm::Value *ScratchpadIndex;
2181  llvm::Value *ScratchpadWidth;
2182 };
2183 
2184 /// Emit instructions to copy a Reduce list, which contains partially
2185 /// aggregated values, in the specified direction.
2186 static void emitReductionListCopy(
2187  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2188  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2189  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2190 
2191  CodeGenModule &CGM = CGF.CGM;
2192  ASTContext &C = CGM.getContext();
2193  CGBuilderTy &Bld = CGF.Builder;
2194 
2195  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2196  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2197  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2198 
2199  // Iterate, element by element, through the source Reduce list and
2200  // make a copy.
2201  unsigned Idx = 0;
2202  unsigned Size = Privates.size();
2203  for (const Expr *Private : Privates) {
2204  Address SrcElementAddr = Address::invalid();
2205  Address DestElementAddr = Address::invalid();
2206  Address DestElementPtrAddr = Address::invalid();
2207  // Should we shuffle in an element from a remote lane?
2208  bool ShuffleInElement = false;
2209  // Set to true to update the pointer in the dest Reduce list to a
2210  // newly created element.
2211  bool UpdateDestListPtr = false;
2212  // Increment the src or dest pointer to the scratchpad, for each
2213  // new element.
2214  bool IncrScratchpadSrc = false;
2215  bool IncrScratchpadDest = false;
2216 
2217  switch (Action) {
2218  case RemoteLaneToThread: {
2219  // Step 1.1: Get the address for the src element in the Reduce list.
2220  Address SrcElementPtrAddr =
2221  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2222  SrcElementAddr = CGF.EmitLoadOfPointer(
2223  SrcElementPtrAddr,
2224  C.getPointerType(Private->getType())->castAs<PointerType>());
2225 
2226  // Step 1.2: Create a temporary to store the element in the destination
2227  // Reduce list.
2228  DestElementPtrAddr =
2229  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2230  DestElementAddr =
2231  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2232  ShuffleInElement = true;
2233  UpdateDestListPtr = true;
2234  break;
2235  }
2236  case ThreadCopy: {
2237  // Step 1.1: Get the address for the src element in the Reduce list.
2238  Address SrcElementPtrAddr =
2239  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2240  SrcElementAddr = CGF.EmitLoadOfPointer(
2241  SrcElementPtrAddr,
2242  C.getPointerType(Private->getType())->castAs<PointerType>());
2243 
2244  // Step 1.2: Get the address for dest element. The destination
2245  // element has already been created on the thread's stack.
2246  DestElementPtrAddr =
2247  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2248  DestElementAddr = CGF.EmitLoadOfPointer(
2249  DestElementPtrAddr,
2250  C.getPointerType(Private->getType())->castAs<PointerType>());
2251  break;
2252  }
2253  case ThreadToScratchpad: {
2254  // Step 1.1: Get the address for the src element in the Reduce list.
2255  Address SrcElementPtrAddr =
2256  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2257  SrcElementAddr = CGF.EmitLoadOfPointer(
2258  SrcElementPtrAddr,
2259  C.getPointerType(Private->getType())->castAs<PointerType>());
2260 
2261  // Step 1.2: Get the address for dest element:
2262  // address = base + index * ElementSizeInChars.
2263  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2264  llvm::Value *CurrentOffset =
2265  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2266  llvm::Value *ScratchPadElemAbsolutePtrVal =
2267  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2268  ScratchPadElemAbsolutePtrVal =
2269  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2270  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2271  C.getTypeAlignInChars(Private->getType()));
2272  IncrScratchpadDest = true;
2273  break;
2274  }
2275  case ScratchpadToThread: {
2276  // Step 1.1: Get the address for the src element in the scratchpad.
2277  // address = base + index * ElementSizeInChars.
2278  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2279  llvm::Value *CurrentOffset =
2280  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2281  llvm::Value *ScratchPadElemAbsolutePtrVal =
2282  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2283  ScratchPadElemAbsolutePtrVal =
2284  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2285  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2286  C.getTypeAlignInChars(Private->getType()));
2287  IncrScratchpadSrc = true;
2288 
2289  // Step 1.2: Create a temporary to store the element in the destination
2290  // Reduce list.
2291  DestElementPtrAddr =
2292  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2293  DestElementAddr =
2294  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2295  UpdateDestListPtr = true;
2296  break;
2297  }
2298  }
2299 
2300  // Regardless of the direction of the copy, we emit the load of the src
2301  // element, as this is required in all directions.
2302  SrcElementAddr = Bld.CreateElementBitCast(
2303  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2304  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2305  SrcElementAddr.getElementType());
2306 
2307  // Now that all active lanes have read the element in the
2308  // Reduce list, shuffle over the value from the remote lane.
2309  if (ShuffleInElement) {
2310  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2311  RemoteLaneOffset, Private->getExprLoc());
2312  } else {
2313  if (Private->getType()->isScalarType()) {
2314  llvm::Value *Elem =
2315  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
2316  Private->getType(), Private->getExprLoc());
2317  // Store the source element value to the dest element address.
2318  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
2319  Private->getType());
2320  } else {
2321  CGF.EmitAggregateCopy(
2322  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2323  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2324  Private->getType(), AggValueSlot::DoesNotOverlap);
2325  }
2326  }
2327 
2328  // Step 3.1: Modify reference in dest Reduce list as needed.
2329  // Modifying the reference in Reduce list to point to the newly
2330  // created element. The element is live in the current function
2331  // scope and that of functions it invokes (i.e., reduce_function).
2332  // RemoteReduceData[i] = (void*)&RemoteElem
2333  if (UpdateDestListPtr) {
2334  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2335  DestElementAddr.getPointer(), CGF.VoidPtrTy),
2336  DestElementPtrAddr, /*Volatile=*/false,
2337  C.VoidPtrTy);
2338  }
2339 
2340  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2341  // address of the next element in scratchpad memory, unless we're currently
2342  // processing the last one. Memory alignment is also taken care of here.
2343  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2344  llvm::Value *ScratchpadBasePtr =
2345  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2346  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2347  ScratchpadBasePtr = Bld.CreateNUWAdd(
2348  ScratchpadBasePtr,
2349  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2350 
2351  // Take care of global memory alignment for performance
2352  ScratchpadBasePtr = Bld.CreateNUWSub(
2353  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2354  ScratchpadBasePtr = Bld.CreateUDiv(
2355  ScratchpadBasePtr,
2356  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2357  ScratchpadBasePtr = Bld.CreateNUWAdd(
2358  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2359  ScratchpadBasePtr = Bld.CreateNUWMul(
2360  ScratchpadBasePtr,
2361  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2362 
2363  if (IncrScratchpadDest)
2364  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2365  else /* IncrScratchpadSrc = true */
2366  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2367  }
2368 
2369  ++Idx;
2370  }
2371 }
2372 
2373 /// This function emits a helper that loads data from the scratchpad array
2374 /// and (optionally) reduces it with the input operand.
2375 ///
2376 /// load_and_reduce(local, scratchpad, index, width, should_reduce)
2377 /// reduce_data remote;
2378 /// for elem in remote:
2379 /// remote.elem = Scratchpad[elem_id][index]
2380 /// if (should_reduce)
2381 /// local = local @ remote
2382 /// else
2383 /// local = remote
2384 static llvm::Value *emitReduceScratchpadFunction(
2385  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2386  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
2387  ASTContext &C = CGM.getContext();
2388  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2389 
2390  // Destination of the copy.
2391  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2392  C.VoidPtrTy, ImplicitParamDecl::Other);
2393  // Base address of the scratchpad array, with each element storing a
2394  // Reduce list per team.
2395  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2396  C.VoidPtrTy, ImplicitParamDecl::Other);
2397  // A source index into the scratchpad array.
2398  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2399  ImplicitParamDecl::Other);
2400  // Row width of an element in the scratchpad array, typically
2401  // the number of teams.
2402  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2403  ImplicitParamDecl::Other);
2404  // If should_reduce == 1, then it's load AND reduce,
2405  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
2406  // The latter case is used for initialization.
2407  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2408  Int32Ty, ImplicitParamDecl::Other);
2409 
2410  FunctionArgList Args;
2411  Args.push_back(&ReduceListArg);
2412  Args.push_back(&ScratchPadArg);
2413  Args.push_back(&IndexArg);
2414  Args.push_back(&WidthArg);
2415  Args.push_back(&ShouldReduceArg);
2416 
2417  const CGFunctionInfo &CGFI =
2418  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2419  auto *Fn = llvm::Function::Create(
2420  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2421  "_omp_reduction_load_and_reduce", &CGM.getModule());
2422  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2423  Fn->setDoesNotRecurse();
2424  CodeGenFunction CGF(CGM);
2425  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2426 
2427  CGBuilderTy &Bld = CGF.Builder;
2428 
2429  // Get local Reduce list pointer.
2430  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2431  Address ReduceListAddr(
2432  Bld.CreatePointerBitCastOrAddrSpaceCast(
2433  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2434  C.VoidPtrTy, Loc),
2435  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2436  CGF.getPointerAlign());
2437 
2438  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2439  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2440  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2441 
2442  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2443  llvm::Value *IndexVal = Bld.CreateIntCast(
2444  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2445  CGM.SizeTy, /*isSigned=*/true);
2446 
2447  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2448  llvm::Value *WidthVal = Bld.CreateIntCast(
2449  CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
2450  CGM.SizeTy, /*isSigned=*/true);
2451 
2452  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
2453  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
2454  AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
2455 
2456  // The absolute ptr address to the base addr of the next element to copy.
2457  llvm::Value *CumulativeElemBasePtr =
2458  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2459  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2460 
2461  // Create a Remote Reduce list to store the elements read from the
2462  // scratchpad array.
2463  Address RemoteReduceList =
2464  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
2465 
2466  // Assemble remote Reduce list from scratchpad array.
2467  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
2468  SrcDataAddr, RemoteReduceList,
2469  {/*RemoteLaneOffset=*/nullptr,
2470  /*ScratchpadIndex=*/IndexVal,
2471  /*ScratchpadWidth=*/WidthVal});
2472 
2473  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2474  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2475  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2476 
2477  llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
2478  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2479 
2480  CGF.EmitBlock(ThenBB);
2481  // We should reduce with the local Reduce list.
2482  // reduce_function(LocalReduceList, RemoteReduceList)
2483  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2484  ReduceListAddr.getPointer(), CGF.VoidPtrTy);
2485  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2486  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2487  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2488  CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
2489  Bld.CreateBr(MergeBB);
2490 
2491  CGF.EmitBlock(ElseBB);
2492  // No reduction; just copy:
2493  // Local Reduce list = Remote Reduce list.
2494  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2495  RemoteReduceList, ReduceListAddr);
2496  Bld.CreateBr(MergeBB);
2497 
2498  CGF.EmitBlock(MergeBB);
2499 
2500  CGF.FinishFunction();
2501  return Fn;
2502 }
2503 
2504 /// This function emits a helper that stores reduced data from the team
2505 /// master to a scratchpad array in global memory.
2506 ///
2507 /// for elem in Reduce List:
2508 /// scratchpad[elem_id][index] = elem
2509 ///
2510 static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
2511  ArrayRef<const Expr *> Privates,
2512  QualType ReductionArrayTy,
2513  SourceLocation Loc) {
2514 
2515  ASTContext &C = CGM.getContext();
2516  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2517 
2518  // Source of the copy.
2519  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2520  C.VoidPtrTy, ImplicitParamDecl::Other);
2521  // Base address of the scratchpad array, with each element storing a
2522  // Reduce list per team.
2523  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2524  C.VoidPtrTy, ImplicitParamDecl::Other);
2525  // A destination index into the scratchpad array, typically the team
2526  // identifier.
2527  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2528  ImplicitParamDecl::Other);
2529  // Row width of an element in the scratchpad array, typically
2530  // the number of teams.
2531  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2532  ImplicitParamDecl::Other);
2533 
2534  FunctionArgList Args;
2535  Args.push_back(&ReduceListArg);
2536  Args.push_back(&ScratchPadArg);
2537  Args.push_back(&IndexArg);
2538  Args.push_back(&WidthArg);
2539 
2540  const CGFunctionInfo &CGFI =
2541  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2542  auto *Fn = llvm::Function::Create(
2543  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2544  "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
2545  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2546  Fn->setDoesNotRecurse();
2547  CodeGenFunction CGF(CGM);
2548  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2549 
2550  CGBuilderTy &Bld = CGF.Builder;
2551 
2552  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2553  Address SrcDataAddr(
2554  Bld.CreatePointerBitCastOrAddrSpaceCast(
2555  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2556  C.VoidPtrTy, Loc),
2557  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2558  CGF.getPointerAlign());
2559 
2560  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2561  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2562  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2563 
2564  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2565  llvm::Value *IndexVal = Bld.CreateIntCast(
2566  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2567  CGF.SizeTy, /*isSigned=*/true);
2568 
2569  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2570  llvm::Value *WidthVal =
2571  Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
2572  Int32Ty, SourceLocation()),
2573  CGF.SizeTy, /*isSigned=*/true);
2574 
2575  // The absolute ptr address to the base addr of the next element to copy.
2576  llvm::Value *CumulativeElemBasePtr =
2577  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2578  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2579 
2580  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
2581  SrcDataAddr, DestDataAddr,
2582  {/*RemoteLaneOffset=*/nullptr,
2583  /*ScratchpadIndex=*/IndexVal,
2584  /*ScratchpadWidth=*/WidthVal});
2585 
2586  CGF.FinishFunction();
2587  return Fn;
2588 }
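 
 // Scratchpad layout, as addressed by emitReductionListCopy above: one row
 // per reduce element, one column per team, so element i of team `index`
 // lives at
 //
 //   RowBase_i + index * sizeof(elem_i)
 //
 // where RowBase_i advances by width * sizeof(elem_i) per element and is
 // rounded up to GlobalMemoryAlignment between rows.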
2589 
2590 /// This function emits a helper that gathers Reduce lists from the first
2591 /// lane of every active warp to lanes in the first warp.
2592 ///
2593 /// void inter_warp_copy_func(void* reduce_data, num_warps)
2594 /// shared smem[warp_size];
2595 /// For all data entries D in reduce_data:
2596 /// If (I am the first lane in each warp)
2597 /// Copy my local D to smem[warp_id]
2598 /// sync
2599 /// if (I am the first warp)
2600 /// Copy smem[thread_id] to my local D
2601 /// sync
2602 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2603  ArrayRef<const Expr *> Privates,
2604  QualType ReductionArrayTy,
2605  SourceLocation Loc) {
2606  ASTContext &C = CGM.getContext();
2607  llvm::Module &M = CGM.getModule();
2608 
2609  // ReduceList: thread local Reduce list.
2610  // At the stage of the computation when this function is called, partially
2611  // aggregated values reside in the first lane of every active warp.
2612  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2613  C.VoidPtrTy, ImplicitParamDecl::Other);
2614  // NumWarps: number of warps active in the parallel region. This could
2615  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2616  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2617  C.getIntTypeForBitwidth(32, /* Signed */ true),
2618  ImplicitParamDecl::Other);
2619  FunctionArgList Args;
2620  Args.push_back(&ReduceListArg);
2621  Args.push_back(&NumWarpsArg);
2622 
2623  const CGFunctionInfo &CGFI =
2624  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2625  auto *Fn = llvm::Function::Create(
2626  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2627  "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
2628  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2629  Fn->setDoesNotRecurse();
2630  CodeGenFunction CGF(CGM);
2631  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2632 
2633  CGBuilderTy &Bld = CGF.Builder;
2634 
2635  // This array is used as a medium to transfer, one reduce element at a time,
2636  // the data from the first lane of every warp to lanes in the first warp
2637  // in order to perform the final step of a reduction in a parallel region
2638  // (reduction across warps). The array is placed in NVPTX __shared__ memory
2639  // for reduced latency, as well as to have a distinct copy for concurrently
2640  // executing target regions. The array is declared with common linkage so
2641  // as to be shared across compilation units.
2642  StringRef TransferMediumName =
2643  "__openmp_nvptx_data_transfer_temporary_storage";
2644  llvm::GlobalVariable *TransferMedium =
2645  M.getGlobalVariable(TransferMediumName);
2646  if (!TransferMedium) {
2647  auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2648  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2649  TransferMedium = new llvm::GlobalVariable(
2650  M, Ty,
2651  /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2652  llvm::Constant::getNullValue(Ty), TransferMediumName,
2653  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2654  SharedAddressSpace);
2655  CGM.addCompilerUsedGlobal(TransferMedium);
2656  }
2657 
2658  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2659  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2660  // nvptx_lane_id = nvptx_id % warpsize
2661  llvm::Value *LaneID = getNVPTXLaneID(CGF);
2662  // nvptx_warp_id = nvptx_id / warpsize
2663  llvm::Value *WarpID = getNVPTXWarpID(CGF);
2664 
2665  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2666  Address LocalReduceList(
2667  Bld.CreatePointerBitCastOrAddrSpaceCast(
2668  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2669  C.VoidPtrTy, SourceLocation()),
2670  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2671  CGF.getPointerAlign());
2672 
2673  unsigned Idx = 0;
2674  for (const Expr *Private : Privates) {
2675  //
2676  // Warp master copies reduce element to transfer medium in __shared__
2677  // memory.
2678  //
2679  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2680  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2681  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2682 
2683  // if (lane_id == 0)
2684  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2685  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2686  CGF.EmitBlock(ThenBB);
2687 
2688  // Reduce element = LocalReduceList[i]
2689  Address ElemPtrPtrAddr =
2690  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2691  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2692  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2693  // elemptr = (type[i]*)(elemptrptr)
2694  Address ElemPtr =
2695  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2696  ElemPtr = Bld.CreateElementBitCast(
2697  ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2698 
2699  // Get pointer to location in transfer medium.
2700  // MediumPtr = &medium[warp_id]
2701  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2702  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2703  Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
2704  // Casting to actual data type.
2705  // MediumPtr = (type[i]*)MediumPtrAddr;
2706  MediumPtr = Bld.CreateElementBitCast(
2707  MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2708 
2709  // elem = *elemptr
2710  // *MediumPtr = elem
2711  if (Private->getType()->isScalarType()) {
2712  llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
2713  Private->getType(), Loc);
2714  // Store the source element value to the dest element address.
2715  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
2716  Private->getType());
2717  } else {
2718  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2719  CGF.MakeAddrLValue(MediumPtr, Private->getType()),
2720  Private->getType(), AggValueSlot::DoesNotOverlap);
2721  }
2722 
2723  Bld.CreateBr(MergeBB);
2724 
2725  CGF.EmitBlock(ElseBB);
2726  Bld.CreateBr(MergeBB);
2727 
2728  CGF.EmitBlock(MergeBB);
2729 
2730  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2731  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2732  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
2733 
2734  llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
2735  NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
2736  // named_barrier_sync(ParallelBarrierID, num_active_threads)
2737  syncParallelThreads(CGF, NumActiveThreads);
2738 
2739  //
2740  // Warp 0 copies reduce element from transfer medium.
2741  //
2742  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2743  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2744  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2745 
2746  // Up to 32 threads in warp 0 are active.
2747  llvm::Value *IsActiveThread =
2748  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2749  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2750 
2751  CGF.EmitBlock(W0ThenBB);
2752 
2753  // SrcMediumPtr = &medium[tid]
2754  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2755  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2756  Address SrcMediumPtr(SrcMediumPtrVal,
2757  C.getTypeAlignInChars(Private->getType()));
2758  // SrcMediumVal = *SrcMediumPtr;
2759  SrcMediumPtr = Bld.CreateElementBitCast(
2760  SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2761 
2762  // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
2763  Address TargetElemPtrPtr =
2764  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2765  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2766  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2767  Address TargetElemPtr =
2768  Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
2769  TargetElemPtr = Bld.CreateElementBitCast(
2770  TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2771 
2772  // *TargetElemPtr = SrcMediumVal;
2773  if (Private->getType()->isScalarType()) {
2774  llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
2775  SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
2776  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2777  Private->getType());
2778  } else {
2779  CGF.EmitAggregateCopy(
2780  CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
2781  CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
2782  Private->getType(), AggValueSlot::DoesNotOverlap);
2783  }
2784  Bld.CreateBr(W0MergeBB);
2785 
2786  CGF.EmitBlock(W0ElseBB);
2787  Bld.CreateBr(W0MergeBB);
2788 
2789  CGF.EmitBlock(W0MergeBB);
2790 
2791  // While warp 0 copies values from transfer medium, all other warps must
2792  // wait.
2793  syncParallelThreads(CGF, NumActiveThreads);
2794  ++Idx;
2795  }
2796 
2797  CGF.FinishFunction();
2798  return Fn;
2799 }
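 
 // Walkthrough with num_warps == 4: lane 0 of warps 0..3 stores its element
 // to smem[0..3]; after the first named barrier, threads 0..3 (all in warp
 // 0) load smem[thread_id] back into their Reduce lists; the second barrier
 // keeps the medium intact until everyone is done with this element.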
2800 
2801 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2802 /// in the same warp. It uses shuffle instructions to copy over data from
2803 /// a remote lane's stack. The reduction algorithm performed is specified
2804 /// by the fourth parameter.
2805 ///
2806 /// Algorithm Versions.
2807 /// Full Warp Reduce (argument value 0):
2808 /// This algorithm assumes that all 32 lanes are active and gathers
2809 /// data from these 32 lanes, producing a single resultant value.
2810 /// Contiguous Partial Warp Reduce (argument value 1):
2811 /// This algorithm assumes that only a *contiguous* subset of lanes
2812 /// are active. This happens for the last warp in a parallel region
2813 /// when the user specified num_threads is not an integer multiple of
2814 /// 32. This contiguous subset always starts with the zeroth lane.
2815 /// Partial Warp Reduce (argument value 2):
2816 /// This algorithm gathers data from any number of lanes at any position.
2817 /// All reduced values are stored in the lowest possible lane. The set
2818  /// of problems every algorithm addresses is a superset of those
2819 /// addressable by algorithms with a lower version number. Overhead
2820 /// increases as algorithm version increases.
2821 ///
2822 /// Terminology
2823 /// Reduce element:
2824 /// Reduce element refers to the individual data field with primitive
2825 /// data types to be combined and reduced across threads.
2826 /// Reduce list:
2827 /// Reduce list refers to a collection of local, thread-private
2828 /// reduce elements.
2829 /// Remote Reduce list:
2830 /// Remote Reduce list refers to a collection of remote (relative to
2831 /// the current thread) reduce elements.
2832 ///
2833 /// We distinguish between three states of threads that are important to
2834 /// the implementation of this function.
2835 /// Alive threads:
2836 /// Threads in a warp executing the SIMT instruction, as distinguished from
2837 /// threads that are inactive due to divergent control flow.
2838 /// Active threads:
2839 /// The minimal set of threads that has to be alive upon entry to this
2840 /// function. The computation is correct iff active threads are alive.
2841 /// Some threads are alive but they are not active because they do not
2842 /// contribute to the computation in any useful manner. Turning them off
2843 /// may introduce control flow overheads without any tangible benefits.
2844 /// Effective threads:
2845 /// In order to comply with the argument requirements of the shuffle
2846 /// function, we must keep all lanes holding data alive. But at most
2847 /// half of them perform value aggregation; we refer to this half of
2848 /// threads as effective. The other half is simply handing off their
2849 /// data.
2850 ///
2851 /// Procedure
2852 /// Value shuffle:
2853 /// In this step active threads transfer data from higher lane positions
2854 /// in the warp to lower lane positions, creating Remote Reduce list.
2855 /// Value aggregation:
2856 /// In this step, effective threads combine their thread local Reduce list
2857 /// with Remote Reduce list and store the result in the thread local
2858 /// Reduce list.
2859 /// Value copy:
2860 /// In this step, we deal with the assumption made by algorithm 2
2861 /// (i.e. contiguity assumption). When we have an odd number of lanes
2862 /// active, say 2k+1, only k threads will be effective and therefore k
2863 /// new values will be produced. However, the Reduce list owned by the
2864 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2865 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2866 /// that the contiguity assumption still holds.
2867 static llvm::Value *emitShuffleAndReduceFunction(
2868  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2869  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
2870  ASTContext &C = CGM.getContext();
2871 
2872  // Thread local Reduce list used to host the values of data to be reduced.
2873  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2874  C.VoidPtrTy, ImplicitParamDecl::Other);
2875  // Current lane id; could be logical.
2876  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2877  ImplicitParamDecl::Other);
2878  // Offset of the remote source lane relative to the current lane.
2879  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2880  C.ShortTy, ImplicitParamDecl::Other);
2881  // Algorithm version. This is expected to be known at compile time.
2882  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2883  C.ShortTy, ImplicitParamDecl::Other);
2884  FunctionArgList Args;
2885  Args.push_back(&ReduceListArg);
2886  Args.push_back(&LaneIDArg);
2887  Args.push_back(&RemoteLaneOffsetArg);
2888  Args.push_back(&AlgoVerArg);
2889 
2890  const CGFunctionInfo &CGFI =
2891  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2892  auto *Fn = llvm::Function::Create(
2893  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2894  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2895  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2896  Fn->setDoesNotRecurse();
2897  CodeGenFunction CGF(CGM);
2898  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2899 
2900  CGBuilderTy &Bld = CGF.Builder;
2901 
2902  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2903  Address LocalReduceList(
2904  Bld.CreatePointerBitCastOrAddrSpaceCast(
2905  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2906  C.VoidPtrTy, SourceLocation()),
2907  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2908  CGF.getPointerAlign());
2909 
2910  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2911  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2912  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2913 
2914  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2915  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2916  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2917 
2918  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2919  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2920  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2921 
2922  // Create a local thread-private variable to host the Reduce list
2923  // from a remote lane.
2924  Address RemoteReduceList =
2925  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2926 
2927  // This loop iterates through the list of reduce elements and copies,
2928  // element by element, from a remote lane in the warp to RemoteReduceList,
2929  // hosted on the thread's stack.
2930  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2931  LocalReduceList, RemoteReduceList,
2932  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2933  /*ScratchpadIndex=*/nullptr,
2934  /*ScratchpadWidth=*/nullptr});
2935 
2936  // The actions to be performed on the Remote Reduce list are dependent
2937  // on the algorithm version.
2938  //
2939  // if ((AlgoVer==0) || (AlgoVer==1 && LaneId < Offset) ||
2940  // (AlgoVer==2 && LaneId % 2 == 0 && Offset > 0)):
2941  // do the reduction value aggregation
2942  //
2943  // The thread local variable Reduce list is mutated in place to host the
2944  // reduced data, which is the aggregated value produced from local and
2945  // remote lanes.
2946  //
2947  // Note that AlgoVer is expected to be a constant integer known at compile
2948  // time.
2949  // When AlgoVer==0, the first conjunction evaluates to true, making
2950  // the entire predicate true at compile time.
2951  // When AlgoVer==1, only the second part of the second conjunction
2952  // needs to be evaluated at runtime; the other conjunctions evaluate
2953  // to false at compile time.
2954  // When AlgoVer==2, only the second part of the third conjunction
2955  // needs to be evaluated at runtime; the other conjunctions evaluate
2956  // to false at compile time.
2957  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2958 
2959  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2960  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2961  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2962 
2963  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2964  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2965  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2966  CondAlgo2 = Bld.CreateAnd(
2967  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2968 
2969  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2970  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2971 
2972  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2973  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2974  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2975  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2976 
2977  CGF.EmitBlock(ThenBB);
2978  // reduce_function(LocalReduceList, RemoteReduceList)
2979  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2980  LocalReduceList.getPointer(), CGF.VoidPtrTy);
2981  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2982  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2983  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2984  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2985  Bld.CreateBr(MergeBB);
2986 
2987  CGF.EmitBlock(ElseBB);
2988  Bld.CreateBr(MergeBB);
2989 
2990  CGF.EmitBlock(MergeBB);
2991 
2992  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2993  // Reduce list.
2994  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2995  llvm::Value *CondCopy = Bld.CreateAnd(
2996  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2997 
2998  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2999  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3000  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3001  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3002 
3003  CGF.EmitBlock(CpyThenBB);
3004  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3005  RemoteReduceList, LocalReduceList);
3006  Bld.CreateBr(CpyMergeBB);
3007 
3008  CGF.EmitBlock(CpyElseBB);
3009  Bld.CreateBr(CpyMergeBB);
3010 
3011  CGF.EmitBlock(CpyMergeBB);
3012 
3013  CGF.FinishFunction();
3014  return Fn;
3015 }
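// For orientation, the helper emitted above behaves roughly as the
// following sketch (illustrative pseudocode, not the verbatim emitted IR;
// 'shuffle_copy' and 'list_copy' stand for the element-by-element copy
// loops produced by emitReductionListCopy, and 'reduce_function' is the
// ReduceFn combiner passed in):
//
//   void _omp_reduction_shuffle_and_reduce_func(
//       void *reduce_list, int16_t lane_id, int16_t remote_lane_offset,
//       int16_t algo_ver) {
//     void *remote_list[n]; // thread-private Remote Reduce list
//     shuffle_copy(remote_list, reduce_list, remote_lane_offset);
//     if (algo_ver == 0 ||
//         (algo_ver == 1 && lane_id < remote_lane_offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && remote_lane_offset > 0))
//       reduce_function(reduce_list, remote_list); // aggregate in place
//     if (algo_ver == 1 && lane_id >= remote_lane_offset)
//       list_copy(reduce_list, remote_list); // restore contiguity
//   }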
3016 
3017 ///
3018 /// Design of OpenMP reductions on the GPU
3019 ///
3020 /// Consider a typical OpenMP program with one or more reduction
3021 /// clauses:
3022 ///
3023 /// float foo;
3024 /// double bar;
3025 /// #pragma omp target teams distribute parallel for \
3026 /// reduction(+:foo) reduction(*:bar)
3027 /// for (int i = 0; i < N; i++) {
3028 /// foo += A[i]; bar *= B[i];
3029 /// }
3030 ///
3031 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3032 /// all teams. In our OpenMP implementation on the NVPTX device an
3033 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3034 /// within a team are mapped to CUDA threads within a threadblock.
3035 /// Our goal is to efficiently aggregate values across all OpenMP
3036 /// threads such that:
3037 ///
3038 /// - the compiler and runtime are logically concise, and
3039 /// - the reduction is performed efficiently in a hierarchical
3040 /// manner as follows: within OpenMP threads in the same warp,
3041 /// across warps in a threadblock, and finally across teams on
3042 /// the NVPTX device.
3043 ///
3044 /// Introduction to Decoupling
3045 ///
3046 /// We would like to decouple the compiler and the runtime so that the
3047 /// latter is ignorant of the reduction variables (number, data types)
3048 /// and the reduction operators. This allows a simpler interface
3049 /// and implementation while still attaining good performance.
3050 ///
3051 /// Pseudocode for the aforementioned OpenMP program generated by the
3052 /// compiler is as follows:
3053 ///
3054 /// 1. Create private copies of reduction variables on each OpenMP
3055 /// thread: 'foo_private', 'bar_private'
3056 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3057 /// to it and writes the result in 'foo_private' and 'bar_private'
3058 /// respectively.
3059 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3060 /// and store the result on the team master:
3061 ///
3062 /// __kmpc_nvptx_parallel_reduce_nowait(...,
3063 /// reduceData, shuffleReduceFn, interWarpCpyFn)
3064 ///
3065 /// where:
3066 /// struct ReduceData {
3067 /// float *foo;
3068 /// double *bar;
3069 /// } reduceData;
3070 /// reduceData.foo = &foo_private;
3071 /// reduceData.bar = &bar_private;
3072 ///
3073 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3074 /// auxiliary functions generated by the compiler that operate on
3075 /// variables of type 'ReduceData'. They help the runtime perform
3076 /// algorithmic steps in a data-agnostic manner.
3077 ///
3078 /// 'shuffleReduceFn' is a pointer to a function that reduces data
3079 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3080 /// same warp. It takes the following arguments as input:
3081 ///
3082 /// a. variable of type 'ReduceData' on the calling lane,
3083 /// b. its lane_id,
3084 /// c. an offset relative to the current lane_id to generate a
3085 /// remote_lane_id. The remote lane contains the second
3086 /// variable of type 'ReduceData' that is to be reduced.
3087 /// d. an algorithm version parameter determining which reduction
3088 /// algorithm to use.
3089 ///
3090 /// 'shuffleReduceFn' retrieves data from the remote lane using
3091 /// efficient GPU shuffle intrinsics and reduces, using the
3092 /// algorithm specified by the 4th parameter, the two operands
3093 /// element-wise. The result is written to the first operand.
3094 ///
3095 /// Different reduction algorithms are implemented in different
3096 /// runtime functions, all calling 'shuffleReduceFn' to perform
3097 /// the essential reduction step. Therefore, based on the 4th
3098 /// parameter, this function behaves slightly differently to
3099 /// cooperate with the runtime to ensure correctness under
3100 /// different circumstances.
3101 ///
3102 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3103 /// reduced variables across warps. It tunnels, through CUDA
3104 /// shared memory, the thread-private data of type 'ReduceData'
3105 /// from lane 0 of each warp to a lane in the first warp.
3106 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3107 /// The last team writes the global reduced value to memory.
3108 ///
3109 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3110 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3111 /// scratchpadCopyFn, loadAndReduceFn)
3112 ///
3113 /// 'scratchpadCopyFn' is a helper that stores reduced
3114 /// data from the team master to a scratchpad array in
3115 /// global memory.
3116 ///
3117 /// 'loadAndReduceFn' is a helper that loads data from
3118 /// the scratchpad array and reduces it with the input
3119 /// operand.
3120 ///
3121 /// These compiler generated functions hide address
3122 /// calculation and alignment information from the runtime.
3123 /// 5. if ret == 1:
3124 /// The team master of the last team stores the reduced
3125 /// result to the globals in memory.
3126 /// foo += reduceData.foo; bar *= reduceData.bar
3127 ///
3128 ///
3129 /// Warp Reduction Algorithms
3130 ///
3131 /// On the warp level, we have three algorithms implemented in the
3132 /// OpenMP runtime depending on the number of active lanes:
3133 ///
3134 /// Full Warp Reduction
3135 ///
3136 /// The reduce algorithm within a warp where all lanes are active
3137 /// is implemented in the runtime as follows:
3138 ///
3139 /// full_warp_reduce(void *reduce_data,
3140 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3141 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3142 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3143 /// }
3144 ///
3145 /// The algorithm completes in log(2, WARPSIZE) steps.
3146 ///
3147 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3148 /// not used; we therefore save instructions by not retrieving lane_id
3149 /// from the corresponding special registers. The 4th parameter, which
3150 /// represents the version of the algorithm being used, is set to 0 to
3151 /// signify full warp reduction.
3152 ///
3153 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3154 ///
3155 /// #reduce_elem refers to an element in the local lane's data structure
3156 /// #remote_elem is retrieved from a remote lane
3157 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3158 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3159 ///
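/// As a concrete illustration (assuming WARPSIZE == 32), the loop above
/// runs with offset = 16, 8, 4, 2, 1. After the first step, lane i
/// (for i < 16) holds reduce_elem[i] REDUCE_OP reduce_elem[i+16]; after
/// five steps lane 0 holds the reduction over all 32 lanes.
///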
3160 /// Contiguous Partial Warp Reduction
3161 ///
3162 /// This reduce algorithm is used within a warp where only the first
3163 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3164 /// number of OpenMP threads in a parallel region is not a multiple of
3165 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3166 ///
3167 /// void
3168 /// contiguous_partial_reduce(void *reduce_data,
3169 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3170 /// int size, int lane_id) {
3171 /// int curr_size;
3172 /// int offset;
3173 /// curr_size = size;
3174 /// offset = curr_size/2;
3175 /// while (offset > 0) {
3176 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3177 /// curr_size = (curr_size+1)/2;
3178 /// offset = curr_size/2;
3179 /// }
3180 /// }
3181 ///
3182 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3183 ///
3184 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3185 /// if (lane_id < offset)
3186 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3187 /// else
3188 /// reduce_elem = remote_elem
3189 ///
3190 /// This algorithm assumes that the data to be reduced are located in a
3191 /// contiguous subset of lanes starting from the first. When there is
3192 /// an odd number of active lanes, the data in the last lane is not
3193 /// aggregated with any other lane's data but is instead copied over.
3194 ///
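/// As a concrete illustration (assuming size == 5), the loop above runs
/// with offset = 2, 1, 1:
/// offset 2: lanes 0 and 1 reduce with lanes 2 and 3; lane 2 receives
/// lane 4's data (the copy-over case), leaving values in lanes 0..2.
/// offset 1: lane 0 reduces with lane 1; lane 1 receives lane 2's data.
/// offset 1: lane 0 reduces with lane 1, producing the final value.
///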
3195 /// Dispersed Partial Warp Reduction
3196 ///
3197 /// This algorithm is used within a warp when any discontiguous subset
3198 /// of lanes is active. It is used to implement the reduction operation
3199 /// across lanes in an OpenMP simd region or in a nested parallel region.
3200 ///
3201 /// void
3202 /// dispersed_partial_reduce(void *reduce_data,
3203 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3204 /// int size, remote_id;
3205 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3206 /// do {
3207 /// remote_id = next_active_lane_id_right_after_me();
3208 /// # the above function returns 0 if no active lane
3209 /// # is present right after the current lane.
3210 /// size = number_of_active_lanes_in_this_warp();
3211 /// logical_lane_id /= 2;
3212 /// ShuffleReduceFn(reduce_data, logical_lane_id,
3213 /// remote_id-1-threadIdx.x, 2);
3214 /// } while (logical_lane_id % 2 == 0 && size > 1);
3215 /// }
3216 ///
3217 /// There is no assumption made about the initial state of the reduction.
3218 /// Any number of lanes (>=1) could be active at any position. The reduction
3219 /// result is returned in the first active lane.
3220 ///
3221 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3222 ///
3223 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3224 /// if (lane_id % 2 == 0 && offset > 0)
3225 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3226 /// else
3227 /// reduce_elem = remote_elem
3228 ///
3229 ///
3230 /// Intra-Team Reduction
3231 ///
3232 /// This function, as implemented in the runtime call
3233 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3234 /// threads in a team. It first reduces within a warp using the
3235 /// aforementioned algorithms. We then proceed to gather all such
3236 /// reduced values at the first warp.
3237 ///
3238 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3239 /// data from each of the "warp master" (zeroth lane of each warp, where
3240 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3241 /// a mathematical sense) the problem of reduction across warp masters in
3242 /// a block to the problem of warp reduction.
3243 ///
3244 ///
3245 /// Inter-Team Reduction
3246 ///
3247 /// Once a team has reduced its data to a single value, it is stored in
3248 /// a global scratchpad array. Since each team has a distinct slot, this
3249 /// can be done without locking.
3250 ///
3251 /// The last team to write to the scratchpad array proceeds to reduce the
3252 /// scratchpad array. One or more workers in the last team use the helper
3253 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3254 /// the k'th worker reduces every k'th element.
3255 ///
3256 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3257 /// reduce across workers and compute a globally reduced value.
3258 ///
3259 void CGOpenMPRuntimeNVPTX::emitReduction(
3260  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3261  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3262  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3263  if (!CGF.HaveInsertPoint())
3264  return;
3265 
3266  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3267  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3268  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
3269  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
3270  "Invalid reduction selection in emitReduction.");
3271 
3272  if (Options.SimpleReduction) {
3273  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3274  ReductionOps, Options);
3275  return;
3276  }
3277 
3278  ASTContext &C = CGM.getContext();
3279 
3280  // 1. Build a list of reduction variables.
3281  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3282  auto Size = RHSExprs.size();
3283  for (const Expr *E : Privates) {
3284  if (E->getType()->isVariablyModifiedType())
3285  // Reserve a place for the array size.
3286  ++Size;
3287  }
3288  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3289  QualType ReductionArrayTy =
3290  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3291  /*IndexTypeQuals=*/0);
3292  Address ReductionList =
3293  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3294  auto IPriv = Privates.begin();
3295  unsigned Idx = 0;
3296  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3297  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3298  CGF.getPointerSize());
3299  CGF.Builder.CreateStore(
3300  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3301  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3302  Elem);
3303  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3304  // Store array size.
3305  ++Idx;
3306  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3307  CGF.getPointerSize());
3308  llvm::Value *Size = CGF.Builder.CreateIntCast(
3309  CGF.getVLASize(
3310  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3311  .NumElts,
3312  CGF.SizeTy, /*isSigned=*/false);
3313  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3314  Elem);
3315  }
3316  }
3317 
3318  // 2. Emit reduce_func().
3319  llvm::Value *ReductionFn = emitReductionFunction(
3320  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3321  Privates, LHSExprs, RHSExprs, ReductionOps);
3322 
3323  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3324  // RedList, shuffle_reduce_func, interwarp_copy_func);
3325  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3326  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3327  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3328  ReductionList.getPointer(), CGF.VoidPtrTy);
3329 
3330  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3331  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3332  llvm::Value *InterWarpCopyFn =
3333  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3334 
3335  llvm::Value *Args[] = {ThreadId,
3336  CGF.Builder.getInt32(RHSExprs.size()),
3337  ReductionArrayTySize,
3338  RL,
3339  ShuffleAndReduceFn,
3340  InterWarpCopyFn};
3341 
3342  llvm::Value *Res = nullptr;
3343  if (ParallelReduction)
3344  Res = CGF.EmitRuntimeCall(
3345  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
3346  Args);
3347  else if (SimdReduction)
3348  Res = CGF.EmitRuntimeCall(
3349  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
3350  Args);
3351 
3352  if (TeamsReduction) {
3353  llvm::Value *ScratchPadCopyFn =
3354  emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
3355  llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
3356  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3357 
3358  llvm::Value *Args[] = {ThreadId,
3359  CGF.Builder.getInt32(RHSExprs.size()),
3360  ReductionArrayTySize,
3361  RL,
3362  ShuffleAndReduceFn,
3363  InterWarpCopyFn,
3364  ScratchPadCopyFn,
3365  LoadAndReduceFn};
3366  Res = CGF.EmitRuntimeCall(
3367  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
3368  Args);
3369  }
3370 
3371  // 4. Build switch(res)
3372  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
3373  llvm::SwitchInst *SwInst =
3374  CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
3375 
3376  // 5. Build case 1: where we have reduced values in the master
3377  // thread in each team.
3378  // __kmpc_end_reduce{_nowait}(<gtid>);
3379  // break;
3380  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
3381  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
3382  CGF.EmitBlock(Case1BB);
3383 
3384  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3385  llvm::Value *EndArgs[] = {ThreadId};
3386  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3387  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3388  auto IPriv = Privates.begin();
3389  auto ILHS = LHSExprs.begin();
3390  auto IRHS = RHSExprs.begin();
3391  for (const Expr *E : ReductionOps) {
3392  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3393  cast<DeclRefExpr>(*IRHS));
3394  ++IPriv;
3395  ++ILHS;
3396  ++IRHS;
3397  }
3398  };
3399  RegionCodeGenTy RCG(CodeGen);
3400  NVPTXActionTy Action(
3401  nullptr, llvm::None,
3402  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
3403  EndArgs);
3404  RCG.setAction(Action);
3405  RCG(CGF);
3406  CGF.EmitBranch(DefaultBB);
3407  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
3408 }
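// For orientation, the parallel-reduction path emitted above has roughly
// the following source-level shape (an illustrative sketch, not verbatim
// output; 'combine' stands for the per-variable combiners emitted in the
// case-1 region, and 'inter_warp_copy_fn' for the helper returned by
// emitInterWarpCopyFunction):
//
//   void *red_list[n] = {&foo_private, &bar_private};
//   int32_t res = __kmpc_nvptx_parallel_reduce_nowait(
//       gtid, n, sizeof(red_list), red_list,
//       _omp_reduction_shuffle_and_reduce_func, inter_warp_copy_fn);
//   switch (res) {
//   case 1:
//     combine(); // e.g. foo += foo_private; bar *= bar_private;
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//     break;
//   default:
//     break;
//   }
//
// The teams path additionally passes the scratchpad-copy and
// load-and-reduce helpers and calls the teams variant of the runtime
// entry point instead.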
3409 
3410 const VarDecl *
3411 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
3412  const VarDecl *NativeParam) const {
3413  if (!NativeParam->getType()->isReferenceType())
3414  return NativeParam;
3415  QualType ArgType = NativeParam->getType();
3416  QualifierCollector QC;
3417  const Type *NonQualTy = QC.strip(ArgType);
3418  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3419  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3420  if (Attr->getCaptureKind() == OMPC_map) {
3421  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3422  LangAS::opencl_global);
3423  }
3424  }
3425  ArgType = CGM.getContext().getPointerType(PointeeTy);
3426  QC.addRestrict();
3427  enum { NVPTX_local_addr = 5 };
3428  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3429  ArgType = QC.apply(CGM.getContext(), ArgType);
3430  if (isa<ImplicitParamDecl>(NativeParam))
3431  return ImplicitParamDecl::Create(
3432  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3433  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3434  return ParmVarDecl::Create(
3435  CGM.getContext(),
3436  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3437  NativeParam->getLocStart(), NativeParam->getLocation(),
3438  NativeParam->getIdentifier(), ArgType,
3439  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3440 }
3441 
3442 Address
3443 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
3444  const VarDecl *NativeParam,
3445  const VarDecl *TargetParam) const {
3446  assert(NativeParam != TargetParam &&
3447  NativeParam->getType()->isReferenceType() &&
3448  "Native arg must not be the same as target arg.");
3449  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3450  QualType NativeParamType = NativeParam->getType();
3451  QualifierCollector QC;
3452  const Type *NonQualTy = QC.strip(NativeParamType);
3453  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3454  unsigned NativePointeeAddrSpace =
3455  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3456  QualType TargetTy = TargetParam->getType();
3457  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3458  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3459  // First cast to generic.
3460  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3461  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3462  /*AddrSpace=*/0));
3463  // Cast from generic to native address space.
3464  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3465  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3466  NativePointeeAddrSpace));
3467  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3468  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3469  NativeParamType);
3470  return NativeParamAddr;
3471 }
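// Conceptually, the sequence above amounts to (illustrative pseudocode;
// AS(n) marks an address-space-qualified pointer, with n the native
// pointee address space computed above):
//
//   T *AS(t) target = load(target_param_addr); // target-specific pointer
//   T *generic = addrspacecast(target, /*AS=*/0); // first cast to generic
//   T *AS(n) native = addrspacecast(generic, n);  // then to native AS
//   store(native, native_param_addr); // spill; the temp's address is
//                                     // returned as the native address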
3472 
3473 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
3474  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
3475  ArrayRef<llvm::Value *> Args) const {
3476  SmallVector<llvm::Value *, 4> TargetArgs;
3477  TargetArgs.reserve(Args.size());
3478  auto *FnType =
3479  cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
3480  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3481  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3482  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3483  break;
3484  }
3485  llvm::Type *TargetType = FnType->getParamType(I);
3486  llvm::Value *NativeArg = Args[I];
3487  if (!TargetType->isPointerTy()) {
3488  TargetArgs.emplace_back(NativeArg);
3489  continue;
3490  }
3491  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3492  NativeArg,
3493  NativeArg->getType()->getPointerElementType()->getPointerTo());
3494  TargetArgs.emplace_back(
3495  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3496  }
3497  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3498 }
3499 
3500 /// Emit function which wraps the outlined parallel region
3501 /// and controls the arguments which are passed to this function.
3502 /// The wrapper ensures that the outlined function is called
3503 /// with the correct arguments when data is shared.
3504 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
3505  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3506  ASTContext &Ctx = CGM.getContext();
3507  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3508 
3509  // Create a function that takes as argument the source thread.
3510  FunctionArgList WrapperArgs;
3511  QualType Int16QTy =
3512  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3513  QualType Int32QTy =
3514  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3515  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
3516  /*Id=*/nullptr, Int16QTy,
3517  ImplicitParamDecl::Other);
3518  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
3519  /*Id=*/nullptr, Int32QTy,
3520  ImplicitParamDecl::Other);
3521  WrapperArgs.emplace_back(&ParallelLevelArg);
3522  WrapperArgs.emplace_back(&WrapperArg);
3523 
3524  const CGFunctionInfo &CGFI =
3525  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3526 
3527  auto *Fn = llvm::Function::Create(
3528  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3529  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3530  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3531  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3532  Fn->setDoesNotRecurse();
3533 
3534  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3535  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3536  D.getLocStart(), D.getLocStart());
3537 
3538  const auto *RD = CS.getCapturedRecordDecl();
3539  auto CurField = RD->field_begin();
3540 
3541  Address ZeroAddr = CGF.CreateMemTemp(
3542  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
3543  /*Name*/ ".zero.addr");
3544  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
3545  // Get the array of arguments.
3546  SmallVector<llvm::Value *, 8> Args;
3547 
3548  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3549  Args.emplace_back(ZeroAddr.getPointer());
3550 
3551  CGBuilderTy &Bld = CGF.Builder;
3552  auto CI = CS.capture_begin();
3553 
3554  // Use global memory for data sharing.
3555  // Handle passing of global args to workers.
3556  Address GlobalArgs =
3557  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3558  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3559  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3560  CGF.EmitRuntimeCall(
3561  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
3562  DataSharingArgs);
3563 
3564  // Retrieve the shared variables from the list of references returned
3565  // by the runtime. Pass the variables to the outlined function.
3566  Address SharedArgListAddress = Address::invalid();
3567  if (CS.capture_size() > 0 ||
3568  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3569  SharedArgListAddress = CGF.EmitLoadOfPointer(
3570  GlobalArgs, CGF.getContext()
3571  .getPointerType(CGF.getContext().getPointerType(
3572  CGF.getContext().VoidPtrTy))
3573  .castAs<PointerType>());
3574  }
3575  unsigned Idx = 0;
3576  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3577  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3578  CGF.getPointerSize());
3579  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3580  Src, CGF.SizeTy->getPointerTo());
3581  llvm::Value *LB = CGF.EmitLoadOfScalar(
3582  TypedAddress,
3583  /*Volatile=*/false,
3584  CGF.getContext().getSizeType(),
3585  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3586  Args.emplace_back(LB);
3587  ++Idx;
3588  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3589  CGF.getPointerSize());
3590  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3591  Src, CGF.SizeTy->getPointerTo());
3592  llvm::Value *UB = CGF.EmitLoadOfScalar(
3593  TypedAddress,
3594  /*Volatile=*/false,
3595  CGF.getContext().getSizeType(),
3596  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3597  Args.emplace_back(UB);
3598  ++Idx;
3599  }
3600  if (CS.capture_size() > 0) {
3601  ASTContext &CGFContext = CGF.getContext();
3602  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3603  QualType ElemTy = CurField->getType();
3604  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
3605  CGF.getPointerSize());
3606  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3607  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3608  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3609  /*Volatile=*/false,
3610  CGFContext.getPointerType(ElemTy),
3611  CI->getLocation());
3612  if (CI->capturesVariableByCopy() &&
3613  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3614  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3615  CI->getLocation());
3616  }
3617  Args.emplace_back(Arg);
3618  }
3619  }
3620 
3621  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
3622  CGF.FinishFunction();
3623  return Fn;
3624 }
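// For orientation, the generated wrapper has roughly this source-level
// shape (an illustrative sketch; the exact unpacking of loop bounds and
// captures depends on the directive, as coded above):
//
//   void outlined_parallel_fn_wrapper(uint16_t parallel_level,
//                                     uint32_t thread_id) {
//     int32_t zero = 0;
//     void **shared_args;
//     __kmpc_get_shared_variables(&shared_args);
//     // For loop-bound-sharing directives, the first two slots carry the
//     // lower/upper bounds; remaining slots carry the captured variables.
//     outlined_parallel_fn(&thread_id, &zero,
//                          /* bounds and captures unpacked from
//                             shared_args... */);
//   }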
3625 
3626 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
3627  const Decl *D) {
3628  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3629  return;
3630 
3631  assert(D && "Expected function or captured|block decl.");
3632  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3633  "Function is registered already.");
3634  const Stmt *Body = nullptr;
3635  bool NeedToDelayGlobalization = false;
3636  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3637  Body = FD->getBody();
3638  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3639  Body = BD->getBody();
3640  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3641  Body = CD->getBody();
3642  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3643  }
3644  if (!Body)
3645  return;
3646  CheckVarsEscapingDeclContext VarChecker(CGF);
3647  VarChecker.Visit(Body);
3648  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
3649  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3650  VarChecker.getEscapedVariableLengthDecls();
3651  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3652  return;
3653  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3654  I->getSecond().MappedParams =
3655  llvm::make_unique<CodeGenFunction::OMPMapVars>();
3656  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
3657  I->getSecond().EscapedParameters.insert(
3658  VarChecker.getEscapedParameters().begin(),
3659  VarChecker.getEscapedParameters().end());
3660  I->getSecond().EscapedVariableLengthDecls.append(
3661  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3662  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3663  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3664  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3665  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
3666  Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
3667  }
3668  if (!NeedToDelayGlobalization) {
3669  emitGenericVarsProlog(CGF, D->getLocStart());
3670  struct GlobalizationScope final : EHScopeStack::Cleanup {
3671  GlobalizationScope() = default;
3672 
3673  void Emit(CodeGenFunction &CGF, Flags flags) override {
3674  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
3675  .emitGenericVarsEpilog(CGF);
3676  }
3677  };
3678  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3679  }
3680 }
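// For example, given a target region with 'int a' that escapes into a
// nested parallel region, 'a' is recorded in LocalVarData here, and later
// codegen materializes it as a field of a globalized record instead of a
// private alloca, conceptually:
//
//   struct _globalized_locals_ty { int a; }; // shared with the workers
//
// (Illustrative; the record is the one built by the
// CheckVarsEscapingDeclContext visitor used above.)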
3681 
3682 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
3683  const VarDecl *VD) {
3684  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3685  return Address::invalid();
3686 
3687  VD = VD->getCanonicalDecl();
3688  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3689  if (I == FunctionGlobalizedDecls.end())
3690  return Address::invalid();
3691  auto VDI = I->getSecond().LocalVarData.find(VD);
3692  if (VDI != I->getSecond().LocalVarData.end())
3693  return VDI->second.second;
3694  if (VD->hasAttrs()) {
3695  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3696  E(VD->attr_end());
3697  IT != E; ++IT) {
3698  auto VDI = I->getSecond().LocalVarData.find(
3699  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3700  ->getCanonicalDecl());
3701  if (VDI != I->getSecond().LocalVarData.end())
3702  return VDI->second.second;
3703  }
3704  }
3705  return Address::invalid();
3706 }
3707 
3708 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
3709  FunctionGlobalizedDecls.erase(CGF.CurFn);
3710  CGOpenMPRuntime::functionFinished(CGF);
3711 }
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:653
RecordDecl * buildImplicitRecord(StringRef Name, RecordDecl::TagKind TK=TTK_Struct) const
Create a new implicit TU-level CXXRecordDecl or RecordDecl declaration.
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
const BlockDecl * getBlockDecl() const
Definition: Expr.h:4945
static const Decl * getCanonicalDecl(const Decl *D)
LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T)
Given a value of type T* that may not be to a complete object, construct an l-value with the natural ...
static llvm::Value * emitCopyToScratchpad(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that stores reduced data from the team master to a scratchpad array in g...
Other implicit parameter.
Definition: Decl.h:1493
A class which contains all the information about a particular captured value.
Definition: Decl.h:3842
if(T->getSizeExpr()) TRY_TO(TraverseStmt(T -> getSizeExpr()))
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2375
CanQualType VoidPtrTy
Definition: ASTContext.h:1025
A (possibly-)qualified type.
Definition: Type.h:655
llvm::Type * ConvertTypeForMem(QualType T)
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx, const OMPExecutableDirective &D)
Check if the parallel directive has an &#39;if&#39; clause with non-constant or false condition.
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:139
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::LLVMContext & getLLVMContext()
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Address CreateConstGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:226
attr_iterator attr_begin() const
Definition: DeclBase.h:499
Stmt - This represents one statement.
Definition: Stmt.h:66
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads)
Get barrier #ID to synchronize selected (multiple of warp size) threads in a CTA. ...
llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:54
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter...
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
static bool stable_sort_comparator(const PrivateDataTy P1, const PrivateDataTy P2)
This represents &#39;if&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:240
llvm::Value * ScratchpadIndex
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:226
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:5998
The base class of the type hierarchy.
Definition: Type.h:1421
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Definition: CGExpr.cpp:2277
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:26
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Represents a variable declaration or definition.
Definition: Decl.h:812
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:67
This represents &#39;num_threads&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:382
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Value * getPointer() const
Definition: Address.h:38
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:57
SPMD execution mode (all threads are worker threads).
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:269
Represents a struct/union/class.
Definition: Decl.h:3548
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Address getAddress() const
Definition: CGValue.h:327
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:150
attr_iterator attr_end() const
Definition: DeclBase.h:502
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Represents a member of a struct/union/class.
Definition: Decl.h:2521
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive...
Definition: StmtOpenMP.h:209
static llvm::Value * getMasterThreadID(CodeGenFunction &CGF)
Get the thread id of the OMP master thread.
llvm::CallInst * EmitRuntimeCall(llvm::Value *callee, const Twine &name="")
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:3789
bool isReferenceType() const
Definition: Type.h:6061
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:244
Expr * getSubExpr()
Definition: Expr.h:2841
void InitTempAlloca(Address Alloca, llvm::Value *Value)
InitTempAlloca - Provide an initial value for the given alloca which will be observable at all locati...
Definition: CGExpr.cpp:126
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
bool EvaluateAsBooleanCondition(bool &Result, const ASTContext &Ctx) const
EvaluateAsBooleanCondition - Return true if this is a constant which we can fold and convert to a boo...
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:157
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
uint32_t Offset
Definition: CacheTokens.cpp:43
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:67
child_range children()
Definition: Stmt.cpp:227
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:40
static CGOpenMPRuntimeNVPTX::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
llvm::Value * emitReductionFunction(CodeGenModule &CGM, SourceLocation Loc, llvm::Type *ArgsType, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps)
Emits reduction function.
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1583
virtual llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda&#39;s captures is an init-capture.
Definition: ExprCXX.cpp:955
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static llvm::Optional< OMPDeclareTargetDeclAttr::MapTypeTy > isDeclareTargetDeclaration(const ValueDecl *VD)
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:875
static const Stmt * getSingleCompoundChild(const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one...
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:3806
llvm::Constant * CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create a new runtime function with the specified type and name.
SourceLocation getLocStart() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:168
static void syncCTAThreads(CodeGenFunction &CGF)
Synchronize all GPU threads in a block.
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4277
Describes the capture of either a variable, or &#39;this&#39;, or variable-length array type.
Definition: Stmt.h:2083
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like &#39;private&#39;, &#39;firstprivate&#39;, &#39;reduction&#39; etc.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
TypeSourceInfo * getTrivialTypeSourceInfo(QualType T, SourceLocation Loc=SourceLocation()) const
Allocate a TypeSourceInfo where all locations have been initialized to a given location, which defaults to the empty location.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:119
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:196
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:636
Expr - This represents one expression.
Definition: Expr.h:106
virtual llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
static Address invalid()
Definition: Address.h:35
Enters a new scope for capturing cleanups, all of which will be executed once the scope is exited...
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:133
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
const CGFunctionInfo & arrangeNullaryFunction()
A nullary function is a freestanding function of type &#39;void ()&#39;.
Definition: CGCall.cpp:695
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:4931
const Expr * getCallee() const
Definition: Expr.h:2323
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements, of a variable length array type, plus that largest non-variably-sized element type.
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:44
CharUnits getTypeAlignInChars(QualType T) const
Return the ABI-specified alignment of a (complete) type T, in characters.
DeclContext * getDeclContext()
Definition: DeclBase.h:426
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause *> Clauses)
Definition: StmtOpenMP.h:130
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
CanQualType ShortTy
Definition: ASTContext.h:1006
QualType getConstantArrayType(QualType EltTy, const llvm::APInt &ArySize, ArrayType::ArraySizeModifier ASM, unsigned IndexTypeQuals) const
Return the unique reference to the type for a constant array of the specified element type...
QualType getRecordType(const RecordDecl *Decl) const
UnaryOperator - This represents the unary-expression&#39;s (except sizeof and alignof), the postinc/postdec operators from postfix-expression, and various extensions.
Definition: Expr.h:1782
MachineConfiguration
GPU Configuration: This information can be derived from cuda registers, however, providing compile ti...
ValueDecl * getDecl()
Definition: Expr.h:1057
const LangOptions & getLangOpts() const
ASTContext & getContext() const
OpenMPProcBindClauseKind
OpenMP attributes for &#39;proc_bind&#39; clause.
Definition: OpenMPKinds.h:51
static llvm::Value * emitReduceScratchpadFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
This function emits a helper that loads data from the scratchpad array and (optionally) reduces it wi...
Non-SPMD execution mode (1 master thread, others are workers).
llvm::Value * ScratchpadWidth
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2006
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:35
bool hasClausesOfKind() const
Returns true if the current directive has one or more clauses of a specific kind. ...
Definition: StmtOpenMP.h:162
AttrVec & getAttrs()
Definition: DeclBase.h:478
bool hasAttrs() const
Definition: DeclBase.h:472
The l-value was considered opaque, so the alignment was determined from a type.
llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
SourceLocation getLocStart() const LLVM_READONLY
Definition: DeclBase.h:409
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:142
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
This captures a statement into a function.
Definition: Stmt.h:2070
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen)
Emits code for OpenMP &#39;if&#39; clause using specified CodeGen function.
Encodes a location in the source.
static llvm::Value * getThreadLimit(CodeGenFunction &CGF, bool IsInSPMDExecutionMode=false)
Get the value of the thread_limit clause in the teams directive.
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Expr * getSubExpr() const
Definition: Expr.h:1809
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:1946
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:33
CastKind getCastKind() const
Definition: Expr.h:2835
DeclStmt - Adaptor class for mixing declarations with statements and expressions. ...
Definition: Stmt.h:499
OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:23
This file defines OpenMP nodes for declarative directives.
CanQualType VoidTy
Definition: ASTContext.h:997
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
arg_range arguments()
Definition: Expr.h:2377
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
An aligned address.
Definition: Address.h:25
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:2902
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:2171
OpenMPRTLFunctionNVPTX
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language...
Definition: Expr.h:249
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
QualType getType() const
Definition: CGValue.h:264
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
FunctionArgList - Type for representing both the decl and type of parameters to a function...
Definition: CGCall.h:356
void setAction(PrePostActionTy &Action) const
CGFunctionInfo - Class to encapsulate the information about a function definition.
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
redecl_range redecls() const
Returns an iterator range for all the redeclarations of the same decl.
Definition: DeclBase.h:946
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2501
Dataflow Directional Tag Classes.
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
static void getNVPTXCTABarrier(CodeGenFunction &CGF)
Get barrier to synchronize all threads in a block.
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:5792
DeclContext - This is used only as a base class of specific decl types that can act as declaration contexts.
Definition: DeclBase.h:1262
ArrayRef< Capture > captures() const
Definition: Decl.h:3968
A basic class for pre- and post-actions used in an advanced codegen sequence for an OpenMP region.
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition: CGBuilder.h:70
bool isOpenMPSimdDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a simd directive.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr *> Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values, in the specified direction.
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:5799
SourceLocation getLocStart() const LLVM_READONLY
Definition: Decl.h:738
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:211
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:108
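The load/store and GEP wrappers above operate on aligned Address values rather than raw pointers. A minimal sketch of the usual copy pattern, assuming a live CodeGenFunction CGF and valid SrcBase/DestBase addresses of [n x i8*] type (all names here are illustrative, not from this file):

  CharUnits PtrSize =
      CGF.getContext().getTypeSizeInChars(CGF.getContext().VoidPtrTy);
  // Address of element 0 in each list; EltSize keeps the result aligned.
  Address SrcElem = CGF.Builder.CreateConstArrayGEP(SrcBase, /*Index=*/0, PtrSize);
  Address DestElem = CGF.Builder.CreateConstArrayGEP(DestBase, /*Index=*/0, PtrSize);
  // Load through the aligned source address and store into the copy.
  llvm::Value *Elem = CGF.Builder.CreateLoad(SrcElem);
  CGF.Builder.CreateStore(Elem, DestElem);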
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1390
llvm::Module & getModule() const
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3342
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads)
Synchronize worker threads in a parallel region.
llvm::Constant * createNVPTXRuntimeFunction(unsigned Function)
Returns the specified OpenMP runtime function for the current OpenMP implementation.
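For context, a hedged sketch of the pattern behind such helpers, declaring and invoking one of the entry points listed at the top of this file (__kmpc_kernel_deinit); assumes CGM and CGF are in scope:

  // void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  llvm::Type *Params[] = {CGM.Int16Ty};
  auto *FnTy = llvm::FunctionType::get(CGM.VoidTy, Params, /*isVarArg=*/false);
  llvm::Constant *RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
  // Emit the call at the current insertion point.
  CGF.EmitRuntimeCall(RTLFn, {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)});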
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override
Emit code for the reduction clause.
This file defines OpenMP AST classes for executable directives and clauses.
Address CreateConstArrayGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:195
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:6304
void addRestrict()
Definition: Type.h:290
T * getAttr() const
Definition: DeclBase.h:532
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:52
Opcode getOpcode() const
Definition: Expr.h:1806
decl_range decls()
Definition: Stmt.h:546
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Internal linkage, which indicates that the entity can be referred to from within the translation unit, but not from other translation units.
Definition: Linkage.h:32
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:445
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:193
ExecutionMode
Defines the execution mode.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
bool isLValueReferenceType() const
Definition: Type.h:6065
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static llvm::Value * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
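Conceptually, the emitted helper walks a distance-halving reduction tree across the lanes of a warp. A host-side C++ sketch of that algorithm for a sum reduction, written against the __kmpc_shuffle_int32 signature documented at the top of this file (illustrative only; the extern shim resolves only when linked against the NVPTX OpenMP runtime):

  extern "C" int32_t __kmpc_shuffle_int32(int32_t element, int16_t lane_offset,
                                          int16_t warp_size);

  int32_t warpReduceSum(int32_t Val, int16_t WarpSize) {
    // Each step pulls the value from the lane WarpSize/2, WarpSize/4, ...
    // positions away and folds it in, leaving the full sum in lane 0.
    for (int16_t Offset = WarpSize / 2; Offset > 0; Offset /= 2)
      Val += __kmpc_shuffle_int32(Val, Offset, WarpSize);
    return Val;
  }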
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options)
Emit code for the reduction clause.
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:1837
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:465
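EmitBlock and EmitBranch are typically paired to build the then/exit diamonds these NVPTX helpers emit. A minimal sketch, assuming a live CodeGenFunction CGF and an i1 value Cond (block names are arbitrary):

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".then");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
  CGF.EmitBlock(ThenBB);
  // ... emit the guarded work here ...
  CGF.EmitBranch(ExitBB); // avoids creating branches from dummy blocks
  CGF.EmitBlock(ExitBB);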
Privates[]
Gets the list of initial values for linear variables.
Definition: OpenMPClause.h:141
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generate code for the 'proc_bind' clause.
Capturing by reference.
Definition: Lambda.h:38
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression...
Definition: CGExpr.cpp:1199
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:968
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2283
static llvm::Value * getNVPTXThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
CGCapturedStmtInfo * CapturedStmtInfo
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2357
static llvm::Value * getNVPTXWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
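Getters like getNVPTXThreadID and getNVPTXWarpSize above usually lower to NVVM special-register intrinsics. A sketch of that lowering, assuming CGF targets NVPTX:

  CGBuilderTy &Bld = CGF.Builder;
  // %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  llvm::Value *Tid = Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x));
  // %ws = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
  llvm::Value *WarpSize = Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize));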
CanQualType IntTy
Definition: ASTContext.h:1006
llvm::Value * RemoteLaneOffset
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:1818
capture_range captures()
Definition: Stmt.h:2205
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:972
void addAddressSpace(LangAS space)
Definition: Type.h:395
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to lanes in the first warp.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
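getTypeSizeInChars pairs with toBits (listed above) when a size is needed in both units; a small sketch, assuming an ASTContext C and a QualType Ty in scope:

  CharUnits Size = C.getTypeSizeInChars(Ty); // e.g. 8 for a 64-bit integer
  int64_t Bits = C.toBits(Size);             // 64 on a target with 8-bit chars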
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:689
QualType getType() const
Definition: Decl.h:647
LValue - This represents an lvalue reference.
Definition: CGValue.h:167
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:147
CanQualType BoolTy
Definition: ASTContext.h:998
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.h:2440
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:3690
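A hedged sketch of the FieldDecl::Create call above: appending an unnamed char* field to a record RD (assumes an ASTContext C; RD and PtrTy are illustrative, and ICIS_NoInit is the "no in-class initializer" value listed below):

  QualType PtrTy = C.getPointerType(C.CharTy);
  FieldDecl *FD = FieldDecl::Create(
      C, RD, SourceLocation(), SourceLocation(), /*Id=*/nullptr, PtrTy,
      C.getTrivialTypeSourceInfo(PtrTy, SourceLocation()), /*BW=*/nullptr,
      /*Mutable=*/false, ICIS_NoInit);
  FD->setImplicit();
  RD->addDecl(FD);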
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:164
static llvm::Value * getNVPTXNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
No in-class initializer.
Definition: Specifiers.h:230
llvm::Value * getPointer() const
Definition: CGValue.h:323
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads) to generate code for the 'num_threads' clause.
Attr - This represents one attribute.
Definition: Attr.h:43
SourceLocation getLocation() const
Definition: DeclBase.h:417
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - Returns the integer QualType matching the specified details: bitwidth and signed/unsigned.
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for the given function info.
Definition: CGCall.cpp:1544