clang  8.0.0svn
CGOpenMPRuntimeNVPTX.cpp
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for OpenMP runtime code generation specialized to NVPTX
11 // targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "CodeGenFunction.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/StmtOpenMP.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "clang/Basic/Cuda.h"
21 #include "llvm/ADT/SmallPtrSet.h"
22 
23 using namespace clang;
24 using namespace CodeGen;
25 
26 namespace {
27 enum OpenMPRTLFunctionNVPTX {
28  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
29  /// int16_t RequiresOMPRuntime);
30  OMPRTL_NVPTX__kmpc_kernel_init,
31  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
32  OMPRTL_NVPTX__kmpc_kernel_deinit,
33  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
34  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
35  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
36  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
37  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
38  /// Call to void __kmpc_kernel_prepare_parallel(void
39  /// *outlined_function, int16_t
40  /// IsOMPRuntimeInitialized);
41  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
42  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
43  /// int16_t IsOMPRuntimeInitialized);
44  OMPRTL_NVPTX__kmpc_kernel_parallel,
45  /// Call to void __kmpc_kernel_end_parallel();
46  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
47  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
48  /// global_tid);
49  OMPRTL_NVPTX__kmpc_serialized_parallel,
50  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
51  /// global_tid);
52  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
53  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
54  /// int16_t lane_offset, int16_t warp_size);
55  OMPRTL_NVPTX__kmpc_shuffle_int32,
56  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
57  /// int16_t lane_offset, int16_t warp_size);
58  OMPRTL_NVPTX__kmpc_shuffle_int64,
59  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
60  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
61  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
62  /// lane_offset, int16_t shortCircuit),
63  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
64  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
65  /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
66  /// global_tid, kmp_critical_name *lck)
67  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple,
68  /// Call to __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc,
69  /// kmp_int32 global_tid, kmp_critical_name *lck)
70  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple,
71  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
72  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
73  /// Call to void __kmpc_data_sharing_init_stack();
74  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
75  /// Call to void __kmpc_data_sharing_init_stack_spmd();
76  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
77  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
78  /// int16_t UseSharedMemory);
79  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
80  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
81  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
82  /// Call to void __kmpc_begin_sharing_variables(void ***args,
83  /// size_t n_args);
84  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
85  /// Call to void __kmpc_end_sharing_variables();
86  OMPRTL_NVPTX__kmpc_end_sharing_variables,
87  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
88  OMPRTL_NVPTX__kmpc_get_shared_variables,
89  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
90  /// global_tid);
91  OMPRTL_NVPTX__kmpc_parallel_level,
92  /// Call to int8_t __kmpc_is_spmd_exec_mode();
93  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
94  /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size,
95  /// int16_t is_shared, const void **res);
96  OMPRTL_NVPTX__kmpc_get_team_static_memory,
97  /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
98  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
99  // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
100  OMPRTL__kmpc_barrier,
101 };
102 
103 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
104 class NVPTXActionTy final : public PrePostActionTy {
105  llvm::Value *EnterCallee = nullptr;
106  ArrayRef<llvm::Value *> EnterArgs;
107  llvm::Value *ExitCallee = nullptr;
108  ArrayRef<llvm::Value *> ExitArgs;
109  bool Conditional = false;
110  llvm::BasicBlock *ContBlock = nullptr;
111 
112 public:
113  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
114  llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
115  bool Conditional = false)
116  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
117  ExitArgs(ExitArgs), Conditional(Conditional) {}
118  void Enter(CodeGenFunction &CGF) override {
119  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
120  if (Conditional) {
121  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
122  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
123  ContBlock = CGF.createBasicBlock("omp_if.end");
124  // Generate the branch (If-stmt)
125  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
126  CGF.EmitBlock(ThenBlock);
127  }
128  }
129  void Done(CodeGenFunction &CGF) {
130  // Emit the rest of blocks/branches
131  CGF.EmitBranch(ContBlock);
132  CGF.EmitBlock(ContBlock, true);
133  }
134  void Exit(CodeGenFunction &CGF) override {
135  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
136  }
137 };
138 
139 /// A class to track the execution mode when codegening directives within
140 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
141 /// to the target region and used by containing directives such as 'parallel'
142 /// to emit optimized code.
143 class ExecutionRuntimeModesRAII {
144 private:
145  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
146      CGOpenMPRuntimeNVPTX::EM_Unknown;
147  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
148  bool SavedRuntimeMode = false;
149  bool *RuntimeMode = nullptr;
150 
151 public:
152  /// Constructor for Non-SPMD mode.
153  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
154  : ExecMode(ExecMode) {
155  SavedExecMode = ExecMode;
156  ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
157  }
158  /// Constructor for SPMD mode.
159  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
160  bool &RuntimeMode, bool FullRuntimeMode)
161  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
162  SavedExecMode = ExecMode;
163  SavedRuntimeMode = RuntimeMode;
164  ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
165  RuntimeMode = FullRuntimeMode;
166  }
167  ~ExecutionRuntimeModesRAII() {
168  ExecMode = SavedExecMode;
169  if (RuntimeMode)
170  *RuntimeMode = SavedRuntimeMode;
171  }
172 };
173 
174 /// GPU Configuration: This information can be derived from CUDA registers;
175 /// however, providing compile-time constants helps generate more efficient
176 /// code. For all practical purposes this is fine because the configuration
177 /// is the same for all known NVPTX architectures.
178 enum MachineConfiguration : unsigned {
179  WarpSize = 32,
180  /// Number of bits required to represent a lane identifier, which is
181  /// computed as log_2(WarpSize).
182  LaneIDBits = 5,
183  LaneIDMask = WarpSize - 1,
184 
185  /// Global memory alignment for performance.
186  GlobalMemoryAlignment = 128,
187 
188  /// Maximal size of the shared memory buffer.
189  SharedMemorySize = 128,
190 };
191 
192 enum NamedBarrier : unsigned {
193  /// Synchronize on this barrier #ID using a named barrier primitive.
194  /// Only the subset of active threads in a parallel region arrive at the
195  /// barrier.
196  NB_Parallel = 1,
197 };
198 
199 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
200  RefExpr = RefExpr->IgnoreParens();
201  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
202  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
203  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
204  Base = TempASE->getBase()->IgnoreParenImpCasts();
205  RefExpr = Base;
206  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
207  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
208  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
209  Base = TempOASE->getBase()->IgnoreParenImpCasts();
210  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
211  Base = TempASE->getBase()->IgnoreParenImpCasts();
212  RefExpr = Base;
213  }
214  RefExpr = RefExpr->IgnoreParenImpCasts();
215  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
216  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
217  const auto *ME = cast<MemberExpr>(RefExpr);
218  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
219 }
220 
221 typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
222 static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
223  return P1.first > P2.first;
224 }
225 
226 static RecordDecl *buildRecordForGlobalizedVars(
227  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
228  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
229  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
230  &MappedDeclsFields) {
231  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
232  return nullptr;
233  SmallVector<VarsDataTy, 4> GlobalizedVars;
234  for (const ValueDecl *D : EscapedDecls)
235  GlobalizedVars.emplace_back(
236      CharUnits::fromQuantity(std::max(
237          C.getDeclAlign(D).getQuantity(),
238          static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
239      D);
240  for (const ValueDecl *D : EscapedDeclsForTeams)
241  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
242  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
243                   stable_sort_comparator);
244  // Build struct _globalized_locals_ty {
245  //  /* globalized vars */[WarpSize] align (max(decl_align,
246  // GlobalMemoryAlignment))
247  // /* globalized vars */ for EscapedDeclsForTeams
248  // };
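 //  For example, an escaped local 'int i' in a parallel region becomes a
 //  field 'int i[32]' (one slot per lane of the warp) aligned to
 //  max(alignof(int), GlobalMemoryAlignment), while variables escaping only
 //  to the teams region keep their scalar type.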
249  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
250  GlobalizedRD->startDefinition();
251  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
252  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
253  for (const auto &Pair : GlobalizedVars) {
254  const ValueDecl *VD = Pair.second;
255  QualType Type = VD->getType();
256  if (Type->isLValueReferenceType())
257  Type = C.getPointerType(Type.getNonReferenceType());
258  else
259  Type = Type.getNonReferenceType();
260  SourceLocation Loc = VD->getLocation();
261  FieldDecl *Field;
262  if (SingleEscaped.count(VD)) {
263  Field = FieldDecl::Create(
264  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
265  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
266  /*BW=*/nullptr, /*Mutable=*/false,
267  /*InitStyle=*/ICIS_NoInit);
268  Field->setAccess(AS_public);
269  if (VD->hasAttrs()) {
270  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
271  E(VD->getAttrs().end());
272  I != E; ++I)
273  Field->addAttr(*I);
274  }
275  } else {
276  llvm::APInt ArraySize(32, WarpSize);
277  Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
278  Field = FieldDecl::Create(
279  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
280  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
281  /*BW=*/nullptr, /*Mutable=*/false,
282  /*InitStyle=*/ICIS_NoInit);
283  Field->setAccess(AS_public);
284  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
285  static_cast<CharUnits::QuantityType>(
286  GlobalMemoryAlignment)));
287  Field->addAttr(AlignedAttr::CreateImplicit(
288  C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
289  IntegerLiteral::Create(C, Align,
290  C.getIntTypeForBitwidth(32, /*Signed=*/0),
291  SourceLocation())));
292  }
293  GlobalizedRD->addDecl(Field);
294  MappedDeclsFields.try_emplace(VD, Field);
295  }
296  GlobalizedRD->completeDefinition();
297  return GlobalizedRD;
298 }
299 
300 /// Get the list of variables that can escape their declaration context.
301 class CheckVarsEscapingDeclContext final
302  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
303  CodeGenFunction &CGF;
304  llvm::SetVector<const ValueDecl *> EscapedDecls;
305  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
306  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
307  RecordDecl *GlobalizedRD = nullptr;
308  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
309  bool AllEscaped = false;
310  bool IsForCombinedParallelRegion = false;
311 
312  void markAsEscaped(const ValueDecl *VD) {
313  // Do not globalize declare target variables.
314  if (!isa<VarDecl>(VD) ||
315  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
316  return;
317  VD = cast<ValueDecl>(VD->getCanonicalDecl());
318  // Variables captured by value must be globalized.
319  if (auto *CSI = CGF.CapturedStmtInfo) {
320  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
321  // Check if need to capture the variable that was already captured by
322  // value in the outer region.
323  if (!IsForCombinedParallelRegion) {
324  if (!FD->hasAttrs())
325  return;
326  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
327  if (!Attr)
328  return;
329  if (((Attr->getCaptureKind() != OMPC_map) &&
330       !isOpenMPPrivate(
331           static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
332  ((Attr->getCaptureKind() == OMPC_map) &&
333  !FD->getType()->isAnyPointerType()))
334  return;
335  }
336  if (!FD->getType()->isReferenceType()) {
337  assert(!VD->getType()->isVariablyModifiedType() &&
338  "Parameter captured by value with variably modified type");
339  EscapedParameters.insert(VD);
340  } else if (!IsForCombinedParallelRegion) {
341  return;
342  }
343  }
344  }
345  if ((!CGF.CapturedStmtInfo ||
346  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
347  VD->getType()->isReferenceType())
348  // Do not globalize variables with reference type.
349  return;
350  if (VD->getType()->isVariablyModifiedType())
351  EscapedVariableLengthDecls.insert(VD);
352  else
353  EscapedDecls.insert(VD);
354  }
355 
356  void VisitValueDecl(const ValueDecl *VD) {
357  if (VD->getType()->isLValueReferenceType())
358  markAsEscaped(VD);
359  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
360  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
361  const bool SavedAllEscaped = AllEscaped;
362  AllEscaped = VD->getType()->isLValueReferenceType();
363  Visit(VarD->getInit());
364  AllEscaped = SavedAllEscaped;
365  }
366  }
367  }
368  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
369  ArrayRef<OMPClause *> Clauses,
370  bool IsCombinedParallelRegion) {
371  if (!S)
372  return;
373  for (const CapturedStmt::Capture &C : S->captures()) {
374  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
375  const ValueDecl *VD = C.getCapturedVar();
376  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
377  if (IsCombinedParallelRegion) {
378  // Check if the variable is privatized in the combined construct and
379  // those private copies must be shared in the inner parallel
380  // directive.
381  IsForCombinedParallelRegion = false;
382  for (const OMPClause *C : Clauses) {
383  if (!isOpenMPPrivate(C->getClauseKind()) ||
384  C->getClauseKind() == OMPC_reduction ||
385  C->getClauseKind() == OMPC_linear ||
386  C->getClauseKind() == OMPC_private)
387  continue;
388  ArrayRef<const Expr *> Vars;
389  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
390  Vars = PC->getVarRefs();
391  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
392  Vars = PC->getVarRefs();
393  else
394  llvm_unreachable("Unexpected clause.");
395  for (const auto *E : Vars) {
396  const Decl *D =
397  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
398  if (D == VD->getCanonicalDecl()) {
399  IsForCombinedParallelRegion = true;
400  break;
401  }
402  }
403  if (IsForCombinedParallelRegion)
404  break;
405  }
406  }
407  markAsEscaped(VD);
408  if (isa<OMPCapturedExprDecl>(VD))
409  VisitValueDecl(VD);
410  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
411  }
412  }
413  }
414 
415  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
416  assert(!GlobalizedRD &&
417  "Record for globalized variables is built already.");
418  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
419  if (IsInTTDRegion)
420  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
421  else
422  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
423  GlobalizedRD = ::buildRecordForGlobalizedVars(
424  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
425  MappedDeclsFields);
426  }
427 
428 public:
429  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
430  ArrayRef<const ValueDecl *> TeamsReductions)
431  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
432  }
433  virtual ~CheckVarsEscapingDeclContext() = default;
434  void VisitDeclStmt(const DeclStmt *S) {
435  if (!S)
436  return;
437  for (const Decl *D : S->decls())
438  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
439  VisitValueDecl(VD);
440  }
441  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
442  if (!D)
443  return;
444  if (!D->hasAssociatedStmt())
445  return;
446  if (const auto *S =
447  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
448  // Do not analyze directives that do not actually require capturing,
449  // like `omp for` or `omp simd` directives.
450  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
451  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
452  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
453  VisitStmt(S->getCapturedStmt());
454  return;
455  }
456  VisitOpenMPCapturedStmt(
457  S, D->clauses(),
458  CaptureRegions.back() == OMPD_parallel &&
459      isOpenMPDistributeDirective(D->getDirectiveKind()));
460  }
461  }
462  void VisitCapturedStmt(const CapturedStmt *S) {
463  if (!S)
464  return;
465  for (const CapturedStmt::Capture &C : S->captures()) {
466  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
467  const ValueDecl *VD = C.getCapturedVar();
468  markAsEscaped(VD);
469  if (isa<OMPCapturedExprDecl>(VD))
470  VisitValueDecl(VD);
471  }
472  }
473  }
474  void VisitLambdaExpr(const LambdaExpr *E) {
475  if (!E)
476  return;
477  for (const LambdaCapture &C : E->captures()) {
478  if (C.capturesVariable()) {
479  if (C.getCaptureKind() == LCK_ByRef) {
480  const ValueDecl *VD = C.getCapturedVar();
481  markAsEscaped(VD);
482  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
483  VisitValueDecl(VD);
484  }
485  }
486  }
487  }
488  void VisitBlockExpr(const BlockExpr *E) {
489  if (!E)
490  return;
491  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
492  if (C.isByRef()) {
493  const VarDecl *VD = C.getVariable();
494  markAsEscaped(VD);
495  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
496  VisitValueDecl(VD);
497  }
498  }
499  }
500  void VisitCallExpr(const CallExpr *E) {
501  if (!E)
502  return;
503  for (const Expr *Arg : E->arguments()) {
504  if (!Arg)
505  continue;
506  if (Arg->isLValue()) {
507  const bool SavedAllEscaped = AllEscaped;
508  AllEscaped = true;
509  Visit(Arg);
510  AllEscaped = SavedAllEscaped;
511  } else {
512  Visit(Arg);
513  }
514  }
515  Visit(E->getCallee());
516  }
517  void VisitDeclRefExpr(const DeclRefExpr *E) {
518  if (!E)
519  return;
520  const ValueDecl *VD = E->getDecl();
521  if (AllEscaped)
522  markAsEscaped(VD);
523  if (isa<OMPCapturedExprDecl>(VD))
524  VisitValueDecl(VD);
525  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
526  if (VarD->isInitCapture())
527  VisitValueDecl(VD);
528  }
529  void VisitUnaryOperator(const UnaryOperator *E) {
530  if (!E)
531  return;
532  if (E->getOpcode() == UO_AddrOf) {
533  const bool SavedAllEscaped = AllEscaped;
534  AllEscaped = true;
535  Visit(E->getSubExpr());
536  AllEscaped = SavedAllEscaped;
537  } else {
538  Visit(E->getSubExpr());
539  }
540  }
541  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
542  if (!E)
543  return;
544  if (E->getCastKind() == CK_ArrayToPointerDecay) {
545  const bool SavedAllEscaped = AllEscaped;
546  AllEscaped = true;
547  Visit(E->getSubExpr());
548  AllEscaped = SavedAllEscaped;
549  } else {
550  Visit(E->getSubExpr());
551  }
552  }
553  void VisitExpr(const Expr *E) {
554  if (!E)
555  return;
556  bool SavedAllEscaped = AllEscaped;
557  if (!E->isLValue())
558  AllEscaped = false;
559  for (const Stmt *Child : E->children())
560  if (Child)
561  Visit(Child);
562  AllEscaped = SavedAllEscaped;
563  }
564  void VisitStmt(const Stmt *S) {
565  if (!S)
566  return;
567  for (const Stmt *Child : S->children())
568  if (Child)
569  Visit(Child);
570  }
571 
572  /// Returns the record that handles all the escaped local variables and used
573  /// instead of their original storage.
574  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
575  if (!GlobalizedRD)
576  buildRecordForGlobalizedVars(IsInTTDRegion);
577  return GlobalizedRD;
578  }
579 
580  /// Returns the field in the globalized record for the escaped variable.
581  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
582  assert(GlobalizedRD &&
583  "Record for globalized variables must be generated already.");
584  auto I = MappedDeclsFields.find(VD);
585  if (I == MappedDeclsFields.end())
586  return nullptr;
587  return I->getSecond();
588  }
589 
590  /// Returns the list of the escaped local variables/parameters.
591  ArrayRef<const ValueDecl *> getEscapedDecls() const {
592  return EscapedDecls.getArrayRef();
593  }
594 
595  /// Checks if the escaped local variable is actually a parameter passed by
596  /// value.
597  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
598  return EscapedParameters;
599  }
600 
601  /// Returns the list of the escaped variables with the variably modified
602  /// types.
603  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
604  return EscapedVariableLengthDecls.getArrayRef();
605  }
606 };
607 } // anonymous namespace
608 
609 /// Get the GPU warp size.
610 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
611  return CGF.EmitRuntimeCall(
612  llvm::Intrinsic::getDeclaration(
613  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
614  "nvptx_warp_size");
615 }
616 
617 /// Get the id of the current thread on the GPU.
618 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
619  return CGF.EmitRuntimeCall(
620  llvm::Intrinsic::getDeclaration(
621  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
622  "nvptx_tid");
623 }
624 
625 /// Get the id of the warp in the block.
626 /// We assume that the warp size is 32, which is always the case
627 /// on the NVPTX device, to generate more efficient code.
628 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
629  CGBuilderTy &Bld = CGF.Builder;
630  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
631 }
632 
633 /// Get the id of the current lane in the Warp.
634 /// We assume that the warp size is 32, which is always the case
635 /// on the NVPTX device, to generate more efficient code.
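/// For example, with a warp size of 32, thread id 37 has lane id 5 (37 & 31)
/// and warp id 1 (37 >> 5).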
636 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
637  CGBuilderTy &Bld = CGF.Builder;
638  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
639  "nvptx_lane_id");
640 }
641 
642 /// Get the maximum number of threads in a block of the GPU.
643 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
644  return CGF.EmitRuntimeCall(
645  llvm::Intrinsic::getDeclaration(
646  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
647  "nvptx_num_threads");
648 }
649 
650 /// Get barrier to synchronize all threads in a block.
651 static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
652  llvm::Function *F = llvm::Intrinsic::getDeclaration(
653  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
654  F->addFnAttr(llvm::Attribute::Convergent);
655  CGF.EmitRuntimeCall(F);
656 }
657 
658 /// Get barrier #ID to synchronize selected (multiple of warp size) threads in
659 /// a CTA.
660 static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
661  llvm::Value *NumThreads) {
662  CGBuilderTy &Bld = CGF.Builder;
663  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
664  llvm::Function *F = llvm::Intrinsic::getDeclaration(
665  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier);
666  F->addFnAttr(llvm::Attribute::Convergent);
667  CGF.EmitRuntimeCall(F, Args);
668 }
669 
670 /// Synchronize all GPU threads in a block.
671 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
672 
673 /// Synchronize worker threads in a parallel region.
674 static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
675  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
676 }
677 
678 /// Get the value of the thread_limit clause in the teams directive.
679 /// For the 'generic' execution mode, the runtime encodes thread_limit in
680 /// the launch parameters, always starting thread_limit+warpSize threads per
681 /// CTA. The threads in the last warp are reserved for master execution.
682 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
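/// In generic mode this evaluates to ntid.x - warpsize; in SPMD mode it is
/// simply ntid.x.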
683 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
684                                    bool IsInSPMDExecutionMode = false) {
685  CGBuilderTy &Bld = CGF.Builder;
686  return IsInSPMDExecutionMode
687  ? getNVPTXNumThreads(CGF)
688  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
689  "thread_limit");
690 }
691 
692 /// Get the thread id of the OMP master thread.
693 /// The master thread id is the first thread (lane) of the last warp in the
694 /// GPU block. Warp size is assumed to be some power of 2.
695 /// Thread id is 0 indexed.
696 /// E.g: If NumThreads is 33, master id is 32.
697 /// If NumThreads is 64, master id is 32.
698 /// If NumThreads is 1024, master id is 992.
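/// In effect, master_tid = (NumThreads - 1) & ~(WarpSize - 1).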
699 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
700  CGBuilderTy &Bld = CGF.Builder;
701  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
702 
703  // We assume that the warp size is a power of 2.
704  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
705 
706  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
707  Bld.CreateNot(Mask), "master_tid");
708 }
709 
710 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
711  CodeGenModule &CGM, SourceLocation Loc)
712  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
713  Loc(Loc) {
714  createWorkerFunction(CGM);
715 }
716 
717 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
718  CodeGenModule &CGM) {
719  // Create a worker function with no arguments.
720 
721  WorkerFn = llvm::Function::Create(
722      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
723  /*placeholder=*/"_worker", &CGM.getModule());
724  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
725  WorkerFn->setDoesNotRecurse();
726 }
727 
728 CGOpenMPRuntimeNVPTX::ExecutionMode
729 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
730  return CurrentExecutionMode;
731 }
732 
733 static CGOpenMPRuntimeNVPTX::DataSharingMode
734 getDataSharingMode(CodeGenModule &CGM) {
735  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
736                                          : CGOpenMPRuntimeNVPTX::Generic;
737 }
738 
739 // Checks if the expression is constant or does not have non-trivial function
740 // calls.
741 static bool isTrivial(ASTContext &Ctx, const Expr * E) {
742  // We can skip constant expressions.
743  // We can skip expressions with trivial calls or simple expressions.
744  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
745          !E->hasNonTrivialCall(Ctx)) &&
746  !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
747 }
748 
749 /// Checks if the \p Body is the \a CompoundStmt and returns its child statement
750 /// iff there is only one that is not evaluatable at compile time.
751 static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
752  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
753  const Stmt *Child = nullptr;
754  for (const Stmt *S : C->body()) {
755  if (const auto *E = dyn_cast<Expr>(S)) {
756  if (isTrivial(Ctx, E))
757  continue;
758  }
759  // Some of the statements can be ignored.
760  if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
761  isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
762  continue;
763  // Analyze declarations.
764  if (const auto *DS = dyn_cast<DeclStmt>(S)) {
765  if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
766  if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
767  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
768  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
769  isa<UsingDirectiveDecl>(D) ||
770  isa<OMPDeclareReductionDecl>(D) ||
771  isa<OMPThreadPrivateDecl>(D))
772  return true;
773  const auto *VD = dyn_cast<VarDecl>(D);
774  if (!VD)
775  return false;
776  return VD->isConstexpr() ||
777  ((VD->getType().isTrivialType(Ctx) ||
778  VD->getType()->isReferenceType()) &&
779  (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
780  }))
781  continue;
782  }
783  // Found multiple children - cannot get the one child only.
784  if (Child)
785  return Body;
786  Child = S;
787  }
788  if (Child)
789  return Child;
790  }
791  return Body;
792 }
793 
794 /// Check if the parallel directive has an 'if' clause with non-constant or
795 /// false condition. Also, check if the number of threads is strictly specified
796 /// and run those directives in non-SPMD mode.
797 static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
798                                           const OMPExecutableDirective &D) {
799  if (D.hasClausesOfKind<OMPNumThreadsClause>())
800  return true;
801  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
802  OpenMPDirectiveKind NameModifier = C->getNameModifier();
803  if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
804  continue;
805  const Expr *Cond = C->getCondition();
806  bool Result;
807  if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
808  return true;
809  }
810  return false;
811 }
812 
813 /// Check for inner (nested) SPMD construct, if any
814 static bool hasNestedSPMDDirective(ASTContext &Ctx,
815                                    const OMPExecutableDirective &D) {
816  const auto *CS = D.getInnermostCapturedStmt();
817  const auto *Body =
818  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
819  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
820 
821  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
822  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
823  switch (D.getDirectiveKind()) {
824  case OMPD_target:
825  if (isOpenMPParallelDirective(DKind) &&
826  !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
827  return true;
828  if (DKind == OMPD_teams) {
829  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
830  /*IgnoreCaptured=*/true);
831  if (!Body)
832  return false;
833  ChildStmt = getSingleCompoundChild(Ctx, Body);
834  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
835  DKind = NND->getDirectiveKind();
836  if (isOpenMPParallelDirective(DKind) &&
837  !hasParallelIfNumThreadsClause(Ctx, *NND))
838  return true;
839  }
840  }
841  return false;
842  case OMPD_target_teams:
843  return isOpenMPParallelDirective(DKind) &&
844  !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
845  case OMPD_target_simd:
846  case OMPD_target_parallel:
847  case OMPD_target_parallel_for:
848  case OMPD_target_parallel_for_simd:
849  case OMPD_target_teams_distribute:
850  case OMPD_target_teams_distribute_simd:
851  case OMPD_target_teams_distribute_parallel_for:
852  case OMPD_target_teams_distribute_parallel_for_simd:
853  case OMPD_parallel:
854  case OMPD_for:
855  case OMPD_parallel_for:
856  case OMPD_parallel_sections:
857  case OMPD_for_simd:
858  case OMPD_parallel_for_simd:
859  case OMPD_cancel:
860  case OMPD_cancellation_point:
861  case OMPD_ordered:
862  case OMPD_threadprivate:
863  case OMPD_task:
864  case OMPD_simd:
865  case OMPD_sections:
866  case OMPD_section:
867  case OMPD_single:
868  case OMPD_master:
869  case OMPD_critical:
870  case OMPD_taskyield:
871  case OMPD_barrier:
872  case OMPD_taskwait:
873  case OMPD_taskgroup:
874  case OMPD_atomic:
875  case OMPD_flush:
876  case OMPD_teams:
877  case OMPD_target_data:
878  case OMPD_target_exit_data:
879  case OMPD_target_enter_data:
880  case OMPD_distribute:
881  case OMPD_distribute_simd:
882  case OMPD_distribute_parallel_for:
883  case OMPD_distribute_parallel_for_simd:
884  case OMPD_teams_distribute:
885  case OMPD_teams_distribute_simd:
886  case OMPD_teams_distribute_parallel_for:
887  case OMPD_teams_distribute_parallel_for_simd:
888  case OMPD_target_update:
889  case OMPD_declare_simd:
890  case OMPD_declare_target:
891  case OMPD_end_declare_target:
892  case OMPD_declare_reduction:
893  case OMPD_taskloop:
894  case OMPD_taskloop_simd:
895  case OMPD_requires:
896  case OMPD_unknown:
897  llvm_unreachable("Unexpected directive.");
898  }
899  }
900 
901  return false;
902 }
903 
904 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
905                                       const OMPExecutableDirective &D) {
906  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
907  switch (DirectiveKind) {
908  case OMPD_target:
909  case OMPD_target_teams:
910  return hasNestedSPMDDirective(Ctx, D);
911  case OMPD_target_parallel:
912  case OMPD_target_parallel_for:
913  case OMPD_target_parallel_for_simd:
914  case OMPD_target_teams_distribute_parallel_for:
915  case OMPD_target_teams_distribute_parallel_for_simd:
916  return !hasParallelIfNumThreadsClause(Ctx, D);
917  case OMPD_target_simd:
918  case OMPD_target_teams_distribute:
919  case OMPD_target_teams_distribute_simd:
920  return false;
921  case OMPD_parallel:
922  case OMPD_for:
923  case OMPD_parallel_for:
924  case OMPD_parallel_sections:
925  case OMPD_for_simd:
926  case OMPD_parallel_for_simd:
927  case OMPD_cancel:
928  case OMPD_cancellation_point:
929  case OMPD_ordered:
930  case OMPD_threadprivate:
931  case OMPD_task:
932  case OMPD_simd:
933  case OMPD_sections:
934  case OMPD_section:
935  case OMPD_single:
936  case OMPD_master:
937  case OMPD_critical:
938  case OMPD_taskyield:
939  case OMPD_barrier:
940  case OMPD_taskwait:
941  case OMPD_taskgroup:
942  case OMPD_atomic:
943  case OMPD_flush:
944  case OMPD_teams:
945  case OMPD_target_data:
946  case OMPD_target_exit_data:
947  case OMPD_target_enter_data:
948  case OMPD_distribute:
949  case OMPD_distribute_simd:
950  case OMPD_distribute_parallel_for:
951  case OMPD_distribute_parallel_for_simd:
952  case OMPD_teams_distribute:
953  case OMPD_teams_distribute_simd:
954  case OMPD_teams_distribute_parallel_for:
955  case OMPD_teams_distribute_parallel_for_simd:
956  case OMPD_target_update:
957  case OMPD_declare_simd:
958  case OMPD_declare_target:
959  case OMPD_end_declare_target:
960  case OMPD_declare_reduction:
961  case OMPD_taskloop:
962  case OMPD_taskloop_simd:
963  case OMPD_requires:
964  case OMPD_unknown:
965  break;
966  }
967  llvm_unreachable(
968  "Unknown programming model for OpenMP directive on NVPTX target.");
969 }
970 
971 /// Check if the directive is a loop-based directive with no schedule clause
972 /// or with static scheduling.
973 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
974  assert((isOpenMPWorksharingDirective(D.getDirectiveKind()) ||
975          isOpenMPLoopDirective(D.getDirectiveKind())) &&
976         "Expected loop-based directive.");
977  return !D.hasClausesOfKind<OMPOrderedClause>() &&
978         (!D.hasClausesOfKind<OMPScheduleClause>() ||
979  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
980  [](const OMPScheduleClause *C) {
981  return C->getScheduleKind() == OMPC_SCHEDULE_static;
982  }));
983 }
984 
985 /// Check for inner (nested) lightweight runtime construct, if any
986 static bool hasNestedLightweightDirective(ASTContext &Ctx,
987                                           const OMPExecutableDirective &D) {
988  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
989  const auto *CS = D.getInnermostCapturedStmt();
990  const auto *Body =
991  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
992  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
993 
994  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
995  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
996  switch (D.getDirectiveKind()) {
997  case OMPD_target:
998  if (isOpenMPParallelDirective(DKind) &&
999      isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
1000  hasStaticScheduling(*NestedDir))
1001  return true;
1002  if (DKind == OMPD_parallel) {
1003  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
1004  /*IgnoreCaptured=*/true);
1005  if (!Body)
1006  return false;
1007  ChildStmt = getSingleCompoundChild(Ctx, Body);
1008  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1009  DKind = NND->getDirectiveKind();
1010  if (isOpenMPWorksharingDirective(DKind) &&
1011      isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1012  return true;
1013  }
1014  } else if (DKind == OMPD_teams) {
1015  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
1016  /*IgnoreCaptured=*/true);
1017  if (!Body)
1018  return false;
1019  ChildStmt = getSingleCompoundChild(Ctx, Body);
1020  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1021  DKind = NND->getDirectiveKind();
1022  if (isOpenMPParallelDirective(DKind) &&
1023      isOpenMPWorksharingDirective(DKind) &&
1024      isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1025  return true;
1026  if (DKind == OMPD_parallel) {
1027  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
1028  /*IgnoreCaptured=*/true);
1029  if (!Body)
1030  return false;
1031  ChildStmt = getSingleCompoundChild(Ctx, Body);
1032  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1033  DKind = NND->getDirectiveKind();
1034  if (isOpenMPWorksharingDirective(DKind) &&
1035      isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1036  return true;
1037  }
1038  }
1039  }
1040  }
1041  return false;
1042  case OMPD_target_teams:
1043  if (isOpenMPParallelDirective(DKind) &&
1044      isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
1045  hasStaticScheduling(*NestedDir))
1046  return true;
1047  if (DKind == OMPD_parallel) {
1048  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
1049  /*IgnoreCaptured=*/true);
1050  if (!Body)
1051  return false;
1052  ChildStmt = getSingleCompoundChild(Ctx, Body);
1053  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1054  DKind = NND->getDirectiveKind();
1055  if (isOpenMPWorksharingDirective(DKind) &&
1056      isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1057  return true;
1058  }
1059  }
1060  return false;
1061  case OMPD_target_parallel:
1062  return isOpenMPWorksharingDirective(DKind) &&
1063  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
1064  case OMPD_target_teams_distribute:
1065  case OMPD_target_simd:
1066  case OMPD_target_parallel_for:
1067  case OMPD_target_parallel_for_simd:
1068  case OMPD_target_teams_distribute_simd:
1069  case OMPD_target_teams_distribute_parallel_for:
1070  case OMPD_target_teams_distribute_parallel_for_simd:
1071  case OMPD_parallel:
1072  case OMPD_for:
1073  case OMPD_parallel_for:
1074  case OMPD_parallel_sections:
1075  case OMPD_for_simd:
1076  case OMPD_parallel_for_simd:
1077  case OMPD_cancel:
1078  case OMPD_cancellation_point:
1079  case OMPD_ordered:
1080  case OMPD_threadprivate:
1081  case OMPD_task:
1082  case OMPD_simd:
1083  case OMPD_sections:
1084  case OMPD_section:
1085  case OMPD_single:
1086  case OMPD_master:
1087  case OMPD_critical:
1088  case OMPD_taskyield:
1089  case OMPD_barrier:
1090  case OMPD_taskwait:
1091  case OMPD_taskgroup:
1092  case OMPD_atomic:
1093  case OMPD_flush:
1094  case OMPD_teams:
1095  case OMPD_target_data:
1096  case OMPD_target_exit_data:
1097  case OMPD_target_enter_data:
1098  case OMPD_distribute:
1099  case OMPD_distribute_simd:
1100  case OMPD_distribute_parallel_for:
1101  case OMPD_distribute_parallel_for_simd:
1102  case OMPD_teams_distribute:
1103  case OMPD_teams_distribute_simd:
1104  case OMPD_teams_distribute_parallel_for:
1105  case OMPD_teams_distribute_parallel_for_simd:
1106  case OMPD_target_update:
1107  case OMPD_declare_simd:
1108  case OMPD_declare_target:
1109  case OMPD_end_declare_target:
1110  case OMPD_declare_reduction:
1111  case OMPD_taskloop:
1112  case OMPD_taskloop_simd:
1113  case OMPD_requires:
1114  case OMPD_unknown:
1115  llvm_unreachable("Unexpected directive.");
1116  }
1117  }
1118 
1119  return false;
1120 }
1121 
1122 /// Checks if the construct supports lightweight runtime. It must be SPMD
1123 /// construct + inner loop-based construct with static scheduling.
1124 static bool supportsLightweightRuntime(ASTContext &Ctx,
1125                                        const OMPExecutableDirective &D) {
1126  if (!supportsSPMDExecutionMode(Ctx, D))
1127  return false;
1128  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1129  switch (DirectiveKind) {
1130  case OMPD_target:
1131  case OMPD_target_teams:
1132  case OMPD_target_parallel:
1133  return hasNestedLightweightDirective(Ctx, D);
1134  case OMPD_target_parallel_for:
1135  case OMPD_target_parallel_for_simd:
1136  case OMPD_target_teams_distribute_parallel_for:
1137  case OMPD_target_teams_distribute_parallel_for_simd:
1138  // (Last|First)-privates must be shared in parallel region.
1139  return hasStaticScheduling(D);
1140  case OMPD_target_simd:
1141  case OMPD_target_teams_distribute:
1142  case OMPD_target_teams_distribute_simd:
1143  return false;
1144  case OMPD_parallel:
1145  case OMPD_for:
1146  case OMPD_parallel_for:
1147  case OMPD_parallel_sections:
1148  case OMPD_for_simd:
1149  case OMPD_parallel_for_simd:
1150  case OMPD_cancel:
1151  case OMPD_cancellation_point:
1152  case OMPD_ordered:
1153  case OMPD_threadprivate:
1154  case OMPD_task:
1155  case OMPD_simd:
1156  case OMPD_sections:
1157  case OMPD_section:
1158  case OMPD_single:
1159  case OMPD_master:
1160  case OMPD_critical:
1161  case OMPD_taskyield:
1162  case OMPD_barrier:
1163  case OMPD_taskwait:
1164  case OMPD_taskgroup:
1165  case OMPD_atomic:
1166  case OMPD_flush:
1167  case OMPD_teams:
1168  case OMPD_target_data:
1169  case OMPD_target_exit_data:
1170  case OMPD_target_enter_data:
1171  case OMPD_distribute:
1172  case OMPD_distribute_simd:
1173  case OMPD_distribute_parallel_for:
1174  case OMPD_distribute_parallel_for_simd:
1175  case OMPD_teams_distribute:
1176  case OMPD_teams_distribute_simd:
1177  case OMPD_teams_distribute_parallel_for:
1178  case OMPD_teams_distribute_parallel_for_simd:
1179  case OMPD_target_update:
1180  case OMPD_declare_simd:
1181  case OMPD_declare_target:
1182  case OMPD_end_declare_target:
1183  case OMPD_declare_reduction:
1184  case OMPD_taskloop:
1185  case OMPD_taskloop_simd:
1186  case OMPD_requires:
1187  case OMPD_unknown:
1188  break;
1189  }
1190  llvm_unreachable(
1191  "Unknown programming model for OpenMP directive on NVPTX target.");
1192 }
1193 
1194 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1195  StringRef ParentName,
1196  llvm::Function *&OutlinedFn,
1197  llvm::Constant *&OutlinedFnID,
1198  bool IsOffloadEntry,
1199  const RegionCodeGenTy &CodeGen) {
1200  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1201  EntryFunctionState EST;
1202  WorkerFunctionState WST(CGM, D.getBeginLoc());
1203  Work.clear();
1204  WrapperFunctionsMap.clear();
1205 
1206  // Emit target region as a standalone region.
1207  class NVPTXPrePostActionTy : public PrePostActionTy {
1208  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1209  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1210 
1211  public:
1212  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1213  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1214  : EST(EST), WST(WST) {}
1215  void Enter(CodeGenFunction &CGF) override {
1216  auto &RT =
1217  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1218  RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1219  // Skip target region initialization.
1220  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1221  }
1222  void Exit(CodeGenFunction &CGF) override {
1223  auto &RT =
1224  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1225  RT.clearLocThreadIdInsertPt(CGF);
1226  RT.emitNonSPMDEntryFooter(CGF, EST);
1227  }
1228  } Action(EST, WST);
1229  CodeGen.setAction(Action);
1230  IsInTTDRegion = true;
1231  // Reserve place for the globalized memory.
1232  GlobalizedRecords.emplace_back();
1233  if (!KernelStaticGlobalized) {
1234  KernelStaticGlobalized = new llvm::GlobalVariable(
1235  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1236  llvm::GlobalValue::InternalLinkage,
1237  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1238  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1239  llvm::GlobalValue::NotThreadLocal,
1240  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1241  }
1242  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1243  IsOffloadEntry, CodeGen);
1244  IsInTTDRegion = false;
1245 
1246  // Now change the name of the worker function to correspond to this target
1247  // region's entry function.
1248  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1249 
1250  // Create the worker function
1251  emitWorkerFunction(WST);
1252 }
1253 
1254 // Setup NVPTX threads for master-worker OpenMP scheme.
1255 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1256  EntryFunctionState &EST,
1257  WorkerFunctionState &WST) {
1258  CGBuilderTy &Bld = CGF.Builder;
1259 
1260  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1261  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1262  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1263  EST.ExitBB = CGF.createBasicBlock(".exit");
1264 
1265  llvm::Value *IsWorker =
1266  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1267  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1268 
1269  CGF.EmitBlock(WorkerBB);
1270  emitCall(CGF, WST.Loc, WST.WorkerFn);
1271  CGF.EmitBranch(EST.ExitBB);
1272 
1273  CGF.EmitBlock(MasterCheckBB);
1274  llvm::Value *IsMaster =
1275  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1276  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1277 
1278  CGF.EmitBlock(MasterBB);
1279  IsInTargetMasterThreadRegion = true;
1280  // SEQUENTIAL (MASTER) REGION START
1281  // First action in sequential region:
1282  // Initialize the state of the OpenMP runtime library on the GPU.
1283  // TODO: Optimize runtime initialization and pass in correct value.
1284  llvm::Value *Args[] = {getThreadLimit(CGF),
1285  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1286  CGF.EmitRuntimeCall(
1287  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1288 
1289  // For data sharing, we need to initialize the stack.
1290  CGF.EmitRuntimeCall(
1291  createNVPTXRuntimeFunction(
1292  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1293 
1294  emitGenericVarsProlog(CGF, WST.Loc);
1295 }
1296 
1297 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1298  EntryFunctionState &EST) {
1299  IsInTargetMasterThreadRegion = false;
1300  if (!CGF.HaveInsertPoint())
1301  return;
1302 
1303  emitGenericVarsEpilog(CGF);
1304 
1305  if (!EST.ExitBB)
1306  EST.ExitBB = CGF.createBasicBlock(".exit");
1307 
1308  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1309  CGF.EmitBranch(TerminateBB);
1310 
1311  CGF.EmitBlock(TerminateBB);
1312  // Signal termination condition.
1313  // TODO: Optimize runtime initialization and pass in correct value.
1314  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1315  CGF.EmitRuntimeCall(
1316  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1317  // Barrier to terminate worker threads.
1318  syncCTAThreads(CGF);
1319  // Master thread jumps to exit point.
1320  CGF.EmitBranch(EST.ExitBB);
1321 
1322  CGF.EmitBlock(EST.ExitBB);
1323  EST.ExitBB = nullptr;
1324 }
1325 
1326 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1327  StringRef ParentName,
1328  llvm::Function *&OutlinedFn,
1329  llvm::Constant *&OutlinedFnID,
1330  bool IsOffloadEntry,
1331  const RegionCodeGenTy &CodeGen) {
1332  ExecutionRuntimeModesRAII ModeRAII(
1333  CurrentExecutionMode, RequiresFullRuntime,
1334  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1335      !supportsLightweightRuntime(CGM.getContext(), D));
1336  EntryFunctionState EST;
1337 
1338  // Emit target region as a standalone region.
1339  class NVPTXPrePostActionTy : public PrePostActionTy {
1340  CGOpenMPRuntimeNVPTX &RT;
1341  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1342  const OMPExecutableDirective &D;
1343 
1344  public:
1345  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1346  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1347  const OMPExecutableDirective &D)
1348  : RT(RT), EST(EST), D(D) {}
1349  void Enter(CodeGenFunction &CGF) override {
1350  RT.emitSPMDEntryHeader(CGF, EST, D);
1351  // Skip target region initialization.
1352  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1353  }
1354  void Exit(CodeGenFunction &CGF) override {
1355  RT.clearLocThreadIdInsertPt(CGF);
1356  RT.emitSPMDEntryFooter(CGF, EST);
1357  }
1358  } Action(*this, EST, D);
1359  CodeGen.setAction(Action);
1360  IsInTTDRegion = true;
1361  // Reserve place for the globalized memory.
1362  GlobalizedRecords.emplace_back();
1363  if (!KernelStaticGlobalized) {
1364  KernelStaticGlobalized = new llvm::GlobalVariable(
1365  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1366  llvm::GlobalValue::InternalLinkage,
1367  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1368  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1369  llvm::GlobalValue::NotThreadLocal,
1370  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1371  }
1372  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1373  IsOffloadEntry, CodeGen);
1374  IsInTTDRegion = false;
1375 }
1376 
1377 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1378  CodeGenFunction &CGF, EntryFunctionState &EST,
1379  const OMPExecutableDirective &D) {
1380  CGBuilderTy &Bld = CGF.Builder;
1381 
1382  // Setup BBs in entry function.
1383  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1384  EST.ExitBB = CGF.createBasicBlock(".exit");
1385 
1386  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1387  /*RequiresOMPRuntime=*/
1388  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1389  /*RequiresDataSharing=*/Bld.getInt16(0)};
1390  CGF.EmitRuntimeCall(
1391  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1392 
1393  if (RequiresFullRuntime) {
1394  // For data sharing, we need to initialize the stack.
1395  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1396  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1397  }
1398 
1399  CGF.EmitBranch(ExecuteBB);
1400 
1401  CGF.EmitBlock(ExecuteBB);
1402 
1403  IsInTargetMasterThreadRegion = true;
1404 }
1405 
1406 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1407  EntryFunctionState &EST) {
1408  IsInTargetMasterThreadRegion = false;
1409  if (!CGF.HaveInsertPoint())
1410  return;
1411 
1412  if (!EST.ExitBB)
1413  EST.ExitBB = CGF.createBasicBlock(".exit");
1414 
1415  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1416  CGF.EmitBranch(OMPDeInitBB);
1417 
1418  CGF.EmitBlock(OMPDeInitBB);
1419  // DeInitialize the OMP state in the runtime; called by all active threads.
1420  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1421  CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1422  CGF.EmitRuntimeCall(
1423  createNVPTXRuntimeFunction(
1424  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1425  CGF.EmitBranch(EST.ExitBB);
1426 
1427  CGF.EmitBlock(EST.ExitBB);
1428  EST.ExitBB = nullptr;
1429 }
1430 
1431 // Create a unique global variable to indicate the execution mode of this target
1432 // region. The execution mode is either 'generic', or 'spmd' depending on the
1433 // target directive. This variable is picked up by the offload library to setup
1434 // the device appropriately before kernel launch. If the execution mode is
1435 // 'generic', the runtime reserves one warp for the master, otherwise, all
1436 // warps participate in parallel work.
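// In the encoding written below, a value of 0 denotes 'spmd' execution and a
// value of 1 denotes 'generic' execution.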
1437 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1438  bool Mode) {
1439  auto *GVMode =
1440  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1441  llvm::GlobalValue::WeakAnyLinkage,
1442  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1443  Twine(Name, "_exec_mode"));
1444  CGM.addCompilerUsedGlobal(GVMode);
1445 }
1446 
1447 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1448  ASTContext &Ctx = CGM.getContext();
1449 
1450  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1451  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1452  WST.Loc, WST.Loc);
1453  emitWorkerLoop(CGF, WST);
1454  CGF.FinishFunction();
1455 }
1456 
1457 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1458  WorkerFunctionState &WST) {
1459  //
1460  // The workers enter this loop and wait for parallel work from the master.
1461  // When the master encounters a parallel region it sets up the work + variable
1462  // arguments, and wakes up the workers. The workers first check to see if
1463  // they are required for the parallel region, i.e., within the # of requested
1464  // parallel threads. The activated workers load the variable arguments and
1465  // execute the parallel work.
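 // A null work function returned through __kmpc_kernel_parallel signals
 // termination, at which point the workers exit the loop.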
1466  //
1467 
1468  CGBuilderTy &Bld = CGF.Builder;
1469 
1470  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1471  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1472  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1473  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1474  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1475  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1476 
1477  CGF.EmitBranch(AwaitBB);
1478 
1479  // Workers wait for work from master.
1480  CGF.EmitBlock(AwaitBB);
1481  // Wait for parallel work
1482  syncCTAThreads(CGF);
1483 
1484  Address WorkFn =
1485  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1486  Address ExecStatus =
1487  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1488  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1489  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1490 
1491  // TODO: Optimize runtime initialization and pass in correct value.
1492  llvm::Value *Args[] = {WorkFn.getPointer(),
1493  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1494  llvm::Value *Ret = CGF.EmitRuntimeCall(
1495  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1496  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1497 
1498  // On termination condition (workid == 0), exit loop.
1499  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1500  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1501  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1502 
1503  // Activate requested workers.
1504  CGF.EmitBlock(SelectWorkersBB);
1505  llvm::Value *IsActive =
1506  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1507  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1508 
1509  // Signal start of parallel region.
1510  CGF.EmitBlock(ExecuteBB);
1511  // Skip initialization.
1512  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1513 
1514  // Process work items: outlined parallel functions.
1515  for (llvm::Function *W : Work) {
1516  // Try to match this outlined function.
1517  llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1518 
1519  llvm::Value *WorkFnMatch =
1520  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1521 
1522  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1523  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1524  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1525 
1526  // Execute this outlined function.
1527  CGF.EmitBlock(ExecuteFNBB);
1528 
1529  // Insert call to work function via shared wrapper. The shared
1530  // wrapper takes two arguments:
1531  // - the parallelism level;
1532  // - the thread ID;
1533  emitCall(CGF, WST.Loc, W,
1534  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1535 
1536  // Go to end of parallel region.
1537  CGF.EmitBranch(TerminateBB);
1538 
1539  CGF.EmitBlock(CheckNextBB);
1540  }
1541  // Default case: call to outlined function through pointer if the target
1542  // region makes a declare target call that may contain an orphaned parallel
1543  // directive.
1544  auto *ParallelFnTy =
1545  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1546  /*isVarArg=*/false)
1547  ->getPointerTo();
1548  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
1549  // Insert call to work function via shared wrapper. The shared
1550  // wrapper takes two arguments:
1551  // - the parallelism level;
1552  // - the thread ID;
1553  emitCall(CGF, WST.Loc, WorkFnCast,
1554  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1555  // Go to end of parallel region.
1556  CGF.EmitBranch(TerminateBB);
1557 
1558  // Signal end of parallel region.
1559  CGF.EmitBlock(TerminateBB);
1560  CGF.EmitRuntimeCall(
1561  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1562  llvm::None);
1563  CGF.EmitBranch(BarrierBB);
1564 
1565  // All active and inactive workers wait at a barrier after parallel region.
1566  CGF.EmitBlock(BarrierBB);
1567  // Barrier after parallel region.
1568  syncCTAThreads(CGF);
1569  CGF.EmitBranch(AwaitBB);
1570 
1571  // Exit target region.
1572  CGF.EmitBlock(ExitBB);
1573  // Skip initialization.
1574  clearLocThreadIdInsertPt(CGF);
1575 }
1576 
1577 /// Returns specified OpenMP runtime function for the current OpenMP
1578 /// implementation. Specialized for the NVPTX device.
1579 /// \param Function OpenMP runtime function.
1580 /// \return Specified function.
1581 llvm::Constant *
1582 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1583  llvm::Constant *RTLFn = nullptr;
1584  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1585  case OMPRTL_NVPTX__kmpc_kernel_init: {
1586  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1587  // RequiresOMPRuntime);
1588  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1589  auto *FnTy =
1590  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1591  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1592  break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1595  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1596  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1597  auto *FnTy =
1598  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1599  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1600  break;
1601  }
1602  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1603  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1604  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1605  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1606  auto *FnTy =
1607  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1608  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1609  break;
1610  }
1611  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1612  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1613  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1614  auto *FnTy =
1615  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1616  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1617  break;
1618  }
1619  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1620  /// Build void __kmpc_kernel_prepare_parallel(
1621  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1622  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1623  auto *FnTy =
1624  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1625  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1626  break;
1627  }
1628  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1629  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1630  /// int16_t IsOMPRuntimeInitialized);
1631  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1632  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1633  auto *FnTy =
1634  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1635  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1636  break;
1637  }
1638  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1639  /// Build void __kmpc_kernel_end_parallel();
1640  auto *FnTy =
1641  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1642  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1643  break;
1644  }
1645  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1646  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1647  // global_tid);
1648  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1649  auto *FnTy =
1650  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1651  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1652  break;
1653  }
1654  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1655  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1656  // global_tid);
1657  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1658  auto *FnTy =
1659  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1660  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1661  break;
1662  }
1663  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1664  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1665  // int16_t lane_offset, int16_t warp_size);
1666  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1667  auto *FnTy =
1668  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1669  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1670  break;
1671  }
1672  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1673  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1674  // int16_t lane_offset, int16_t warp_size);
1675  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1676  auto *FnTy =
1677  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1678  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1679  break;
1680  }
1681  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
1682  // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
1683  // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1684  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1685  // lane_offset, int16_t shortCircuit),
1686  // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1687  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1688  CGM.Int16Ty, CGM.Int16Ty};
1689  auto *ShuffleReduceFnTy =
1690  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1691  /*isVarArg=*/false);
1692  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1693  auto *InterWarpCopyFnTy =
1694  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1695  /*isVarArg=*/false);
1696  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1697  CGM.Int32Ty,
1698  CGM.SizeTy,
1699  CGM.VoidPtrTy,
1700  ShuffleReduceFnTy->getPointerTo(),
1701  InterWarpCopyFnTy->getPointerTo()};
1702  auto *FnTy =
1703  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1704  RTLFn = CGM.CreateRuntimeFunction(
1705  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
1706  break;
1707  }
1708  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1709  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1710  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1711  auto *FnTy =
1712  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1713  RTLFn = CGM.CreateRuntimeFunction(
1714  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1715  break;
1716  }
1717  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple: {
1718  // Build __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
1719  // global_tid, kmp_critical_name *lck)
1720  llvm::Type *TypeParams[] = {
1721  getIdentTyPointerTy(), CGM.Int32Ty,
1722  llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
1723  auto *FnTy =
1724  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1725  RTLFn = CGM.CreateRuntimeFunction(
1726  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_simple");
1727  break;
1728  }
1729  case OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple: {
1730  // Build __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, kmp_int32
1731  // global_tid, kmp_critical_name *lck)
1732  llvm::Type *TypeParams[] = {
1733  getIdentTyPointerTy(), CGM.Int32Ty,
1734  llvm::PointerType::getUnqual(getKmpCriticalNameTy())};
1735  auto *FnTy =
1736  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1737  RTLFn = CGM.CreateRuntimeFunction(
1738  FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple");
1739  break;
1740  }
1741  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1742  /// Build void __kmpc_data_sharing_init_stack();
1743  auto *FnTy =
1744  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1745  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1746  break;
1747  }
1748  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1749  /// Build void __kmpc_data_sharing_init_stack_spmd();
1750  auto *FnTy =
1751  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1752  RTLFn =
1753  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1754  break;
1755  }
1756  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1757  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1758  // int16_t UseSharedMemory);
1759  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1760  auto *FnTy =
1761  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1762  RTLFn = CGM.CreateRuntimeFunction(
1763  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1764  break;
1765  }
1766  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1767  // Build void __kmpc_data_sharing_pop_stack(void *a);
1768  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1769  auto *FnTy =
1770  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1771  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1772  /*Name=*/"__kmpc_data_sharing_pop_stack");
1773  break;
1774  }
1775  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1776  /// Build void __kmpc_begin_sharing_variables(void ***args,
1777  /// size_t n_args);
1778  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1779  auto *FnTy =
1780  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1781  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1782  break;
1783  }
1784  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1785  /// Build void __kmpc_end_sharing_variables();
1786  auto *FnTy =
1787  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1788  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1789  break;
1790  }
1791  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1792  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1793  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1794  auto *FnTy =
1795  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1796  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1797  break;
1798  }
1799  case OMPRTL_NVPTX__kmpc_parallel_level: {
1800  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1801  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1802  auto *FnTy =
1803  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1804  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1805  break;
1806  }
1807  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1808  // Build int8_t __kmpc_is_spmd_exec_mode();
1809  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1810  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1811  break;
1812  }
1813  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1814  // Build void __kmpc_get_team_static_memory(const void *buf, size_t size,
1815  // int16_t is_shared, const void **res);
1816  llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty,
1817  CGM.VoidPtrPtrTy};
1818  auto *FnTy =
1819  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1820  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1821  break;
1822  }
1823  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1824  // Build void __kmpc_restore_team_static_memory(int16_t is_shared);
1825  auto *FnTy =
1826  llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false);
1827  RTLFn =
1828  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1829  break;
1830  }
1831  case OMPRTL__kmpc_barrier: {
1832  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1833  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1834  auto *FnTy =
1835  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1836  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1837  cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
1838  break;
1839  }
1840  }
1841  return RTLFn;
1842 }
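// The constants returned above are meant to be fed straight into
// CodeGenFunction::EmitRuntimeCall; for example (an illustrative sketch of
// the pattern used for serialized parallel regions later in this file):
//
//   llvm::Value *Args[] = {RTLoc, ThreadID};
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
//       Args);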
1843 
1844 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1845  llvm::Constant *Addr,
1846  uint64_t Size, int32_t,
1847  llvm::GlobalValue::LinkageTypes) {
1848  // TODO: Add support for global variables on the device after declare target
1849  // support.
1850  if (!isa<llvm::Function>(Addr))
1851  return;
1852  llvm::Module &M = CGM.getModule();
1853  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1854 
1855  // Get "nvvm.annotations" metadata node
1856  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1857 
1858  llvm::Metadata *MDVals[] = {
1859  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1860  llvm::ConstantAsMetadata::get(
1861  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1862  // Append metadata to nvvm.annotations
1863  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1864 }
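// For a target entry point @foo, the annotation added above renders roughly
// as the following LLVM IR (illustrative only; @foo is a placeholder name):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @foo, !"kernel", i32 1}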
1865 
1866 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1867  const OMPExecutableDirective &D, StringRef ParentName,
1868  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1869  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1870  if (!IsOffloadEntry) // Nothing to do.
1871  return;
1872 
1873  assert(!ParentName.empty() && "Invalid target region parent name!");
1874 
1875  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1876  if (Mode)
1877  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1878  CodeGen);
1879  else
1880  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1881  CodeGen);
1882 
1883  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1884 }
1885 
1886 namespace {
1887 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1888 /// Enum for accessing the reserved_2 field of the ident_t struct.
1889 enum ModeFlagsTy : unsigned {
1890  /// Bit set to 1 when in SPMD mode.
1891  KMP_IDENT_SPMD_MODE = 0x01,
1892  /// Bit set to 1 when a simplified runtime is used.
1893  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1894  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1895 };
1896 
1897 /// Special Undefined mode: the combination of Non-SPMD mode + SimpleRuntime.
1898 static const ModeFlagsTy UndefinedMode =
1899  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1900 } // anonymous namespace
1901 
1903  switch (getExecutionMode()) {
1904  case EM_SPMD:
1905  if (requiresFullRuntime())
1906  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1907  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1908  case EM_NonSPMD:
1909  assert(requiresFullRuntime() && "Expected full runtime.");
1910  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1911  case EM_Unknown:
1912  return UndefinedMode;
1913  }
1914  llvm_unreachable("Unknown flags are requested.");
1915 }
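// For reference, the reserved_2 values produced by the switch above, per the
// ModeFlagsTy bits (a summary, not additional behavior):
//   SPMD + full runtime     -> KMP_IDENT_SPMD_MODE                            (0x1)
//   SPMD + simple runtime   -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE (0x3)
//   non-SPMD + full runtime -> 0                                              (0x0)
//   unknown                 -> UndefinedMode                                  (0x2)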
1916 
1917 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1918  : CGOpenMPRuntime(CGM, "_", "$") {
1919  if (!CGM.getLangOpts().OpenMPIsDevice)
1920  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1921 }
1922 
1923 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1924  OpenMPProcBindClauseKind ProcBind,
1925  SourceLocation Loc) {
1926  // Do nothing in case of SPMD mode and L0 parallel.
1927  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1928  return;
1929 
1930  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1931 }
1932 
1933 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1934  llvm::Value *NumThreads,
1935  SourceLocation Loc) {
1936  // Do nothing in case of SPMD mode and L0 parallel.
1937  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1938  return;
1939 
1940  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1941 }
1942 
1943 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1944  const Expr *NumTeams,
1945  const Expr *ThreadLimit,
1946  SourceLocation Loc) {}
1947 
1948 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1949  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1950  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1951  // Emit target region as a standalone region.
1952  class NVPTXPrePostActionTy : public PrePostActionTy {
1953  bool &IsInParallelRegion;
1954  bool PrevIsInParallelRegion;
1955 
1956  public:
1957  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1958  : IsInParallelRegion(IsInParallelRegion) {}
1959  void Enter(CodeGenFunction &CGF) override {
1960  PrevIsInParallelRegion = IsInParallelRegion;
1961  IsInParallelRegion = true;
1962  }
1963  void Exit(CodeGenFunction &CGF) override {
1964  IsInParallelRegion = PrevIsInParallelRegion;
1965  }
1966  } Action(IsInParallelRegion);
1967  CodeGen.setAction(Action);
1968  bool PrevIsInTTDRegion = IsInTTDRegion;
1969  IsInTTDRegion = false;
1970  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1971  IsInTargetMasterThreadRegion = false;
1972  auto *OutlinedFun =
1973  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1974  D, ThreadIDVar, InnermostKind, CodeGen));
1975  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1976  IsInTTDRegion = PrevIsInTTDRegion;
1977  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1978  !IsInParallelRegion) {
1979  llvm::Function *WrapperFun =
1980  createParallelDataSharingWrapper(OutlinedFun, D);
1981  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1982  }
1983 
1984  return OutlinedFun;
1985 }
1986 
1987 /// Get list of lastprivate variables from the teams distribute ... or
1988 /// teams {distribute ...} directives.
1989 static void
1990 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1991                              llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1992  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1993  "expected teams directive.");
1994  const OMPExecutableDirective *Dir = &D;
1995  if (isOpenMPTeamsDirective(D.getDirectiveKind())) {
1996  if (const Stmt *S = getSingleCompoundChild(
1997  Ctx,
1998  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1999  /*IgnoreCaptured=*/true))) {
2000  Dir = dyn_cast<OMPExecutableDirective>(S);
2001  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2002  Dir = nullptr;
2003  }
2004  }
2005  if (!Dir)
2006  return;
2007  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2008  for (const Expr *E : C->getVarRefs())
2009  Vars.push_back(getPrivateItem(E));
2010  }
2011 }
2012 
2013 /// Get list of reduction variables from the teams ... directives.
2014 static void
2015 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2016                       llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2017  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2018  "expected teams directive.");
2019  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2020  for (const Expr *E : C->privates())
2021  Vars.push_back(getPrivateItem(E));
2022  }
2023 }
2024 
2025 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2026  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2027  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2028  SourceLocation Loc = D.getBeginLoc();
2029 
2030  const RecordDecl *GlobalizedRD = nullptr;
2031  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2032  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2033  // Globalize team reductions variable unconditionally in all modes.
2034  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2035  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2036  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2037  if (!LastPrivatesReductions.empty()) {
2038  GlobalizedRD = ::buildRecordForGlobalizedVars(
2039  CGM.getContext(), llvm::None, LastPrivatesReductions,
2040  MappedDeclsFields);
2041  }
2042  } else if (!LastPrivatesReductions.empty()) {
2043  assert(!TeamAndReductions.first &&
2044  "Previous team declaration is not expected.");
2045  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2046  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2047  }
2048 
2049  // Emit target region as a standalone region.
2050  class NVPTXPrePostActionTy : public PrePostActionTy {
2051  SourceLocation &Loc;
2052  const RecordDecl *GlobalizedRD;
2053  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2054  &MappedDeclsFields;
2055 
2056  public:
2057  NVPTXPrePostActionTy(
2058  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2059  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2060  &MappedDeclsFields)
2061  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2062  MappedDeclsFields(MappedDeclsFields) {}
2063  void Enter(CodeGenFunction &CGF) override {
2064  auto &Rt =
2065  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2066  if (GlobalizedRD) {
2067  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2068  I->getSecond().GlobalRecord = GlobalizedRD;
2069  I->getSecond().MappedParams =
2070  llvm::make_unique<CodeGenFunction::OMPMapVars>();
2071  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2072  for (const auto &Pair : MappedDeclsFields) {
2073  assert(Pair.getFirst()->isCanonicalDecl() &&
2074  "Expected canonical declaration");
2075  Data.insert(std::make_pair(Pair.getFirst(),
2076  MappedVarData(Pair.getSecond(),
2077  /*IsOnePerTeam=*/true)));
2078  }
2079  }
2080  Rt.emitGenericVarsProlog(CGF, Loc);
2081  }
2082  void Exit(CodeGenFunction &CGF) override {
2083  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2084  .emitGenericVarsEpilog(CGF);
2085  }
2086  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2087  CodeGen.setAction(Action);
2088  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2089  D, ThreadIDVar, InnermostKind, CodeGen);
2090  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
2091  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2092  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2093  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2094 
2095  return OutlinedFun;
2096 }
2097 
2098 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2099  SourceLocation Loc,
2100  bool WithSPMDCheck) {
2101  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2102  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2103  return;
2104 
2105  CGBuilderTy &Bld = CGF.Builder;
2106 
2107  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2108  if (I == FunctionGlobalizedDecls.end())
2109  return;
2110  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2111  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2112  QualType SecGlobalRecTy;
2113 
2114  // Recover pointer to this function's global record. The runtime will
2115  // handle the specifics of the allocation of the memory.
2116  // Use actual memory size of the record including the padding
2117  // for alignment purposes.
2118  unsigned Alignment =
2119  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2120  unsigned GlobalRecordSize =
2121  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2122  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2123 
2124  llvm::PointerType *GlobalRecPtrTy =
2125  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2126  llvm::Value *GlobalRecCastAddr;
2127  llvm::Value *IsTTD = nullptr;
2128  if (!IsInTTDRegion &&
2129  (WithSPMDCheck ||
2130  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2131  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2132  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2133  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2134  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2135  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2136  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2137  llvm::Value *PL = CGF.EmitRuntimeCall(
2138  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2139  {RTLoc, ThreadID});
2140  IsTTD = Bld.CreateIsNull(PL);
2141  }
2142  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2143  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2144  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2145  // There is no need to emit line number for unconditional branch.
2146  (void)ApplyDebugLocation::CreateEmpty(CGF);
2147  CGF.EmitBlock(SPMDBB);
2148  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2149  CharUnits::fromQuantity(Alignment));
2150  CGF.EmitBranch(ExitBB);
2151  // There is no need to emit line number for unconditional branch.
2152  (void)ApplyDebugLocation::CreateEmpty(CGF);
2153  CGF.EmitBlock(NonSPMDBB);
2154  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2155  if (const RecordDecl *SecGlobalizedVarsRecord =
2156  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2157  SecGlobalRecTy =
2158  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2159 
2160  // Recover pointer to this function's global record. The runtime will
2161  // handle the specifics of the allocation of the memory.
2162  // Use actual memory size of the record including the padding
2163  // for alignment purposes.
2164  unsigned Alignment =
2165  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2166  unsigned GlobalRecordSize =
2167  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2168  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2169  Size = Bld.CreateSelect(
2170  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2171  }
2172  // TODO: allow the usage of shared memory to be controlled by
2173  // the user, for now, default to global.
2174  llvm::Value *GlobalRecordSizeArg[] = {
2175  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2176  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2177  createNVPTXRuntimeFunction(
2178  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2179  GlobalRecordSizeArg);
2180  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2181  GlobalRecValue, GlobalRecPtrTy);
2182  CGF.EmitBlock(ExitBB);
2183  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2184  /*NumReservedValues=*/2, "_select_stack");
2185  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2186  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2187  GlobalRecCastAddr = Phi;
2188  I->getSecond().GlobalRecordAddr = Phi;
2189  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2190  } else if (IsInTTDRegion) {
2191  assert(GlobalizedRecords.back().Records.size() < 2 &&
2192  "Expected less than 2 globalized records: one for target and one "
2193  "for teams.");
2194  unsigned Offset = 0;
2195  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2196  QualType RDTy = CGM.getContext().getRecordType(RD);
2197  unsigned Alignment =
2198  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2199  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2200  Offset =
2201  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2202  }
2203  unsigned Alignment =
2204  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2205  Offset = llvm::alignTo(Offset, Alignment);
2206  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2207  ++GlobalizedRecords.back().RegionCounter;
2208  if (GlobalizedRecords.back().Records.size() == 1) {
2209  assert(KernelStaticGlobalized &&
2210  "Kernel static pointer must be initialized already.");
2211  auto *UseSharedMemory = new llvm::GlobalVariable(
2212  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2213  llvm::GlobalValue::InternalLinkage, nullptr,
2214  "_openmp_static_kernel$is_shared");
2215  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2216  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2217  /*DestWidth=*/16, /*Signed=*/0);
2218  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2219  Address(UseSharedMemory,
2220  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2221  /*Volatile=*/false, Int16Ty, Loc);
2222  auto *StaticGlobalized = new llvm::GlobalVariable(
2223  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2224  llvm::GlobalValue::CommonLinkage, nullptr);
2225  auto *RecSize = new llvm::GlobalVariable(
2226  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2227  llvm::GlobalValue::InternalLinkage, nullptr,
2228  "_openmp_static_kernel$size");
2229  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2230  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2231  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2232  CGM.getContext().getSizeType(), Loc);
2233  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2234  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2235  llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
2236  IsInSharedMemory, ResAddr};
2237  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2238  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2239  GlobalRecordSizeArg);
2240  GlobalizedRecords.back().Buffer = StaticGlobalized;
2241  GlobalizedRecords.back().RecSize = RecSize;
2242  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2243  GlobalizedRecords.back().Loc = Loc;
2244  }
2245  assert(KernelStaticGlobalized && "Global address must be set already.");
2246  Address FrameAddr = CGF.EmitLoadOfPointer(
2247  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2248  CGM.getContext()
2249  .getPointerType(CGM.getContext().VoidPtrTy)
2250  .castAs<PointerType>());
2251  llvm::Value *GlobalRecValue =
2252  Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
2253  .getPointer();
2254  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2255  I->getSecond().IsInSPMDModeFlag = nullptr;
2256  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2257  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2258  } else {
2259  // TODO: allow the usage of shared memory to be controlled by
2260  // the user, for now, default to global.
2261  llvm::Value *GlobalRecordSizeArg[] = {
2262  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2263  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2264  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2265  createNVPTXRuntimeFunction(
2266  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2267  GlobalRecordSizeArg);
2268  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2269  GlobalRecValue, GlobalRecPtrTy);
2270  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2271  I->getSecond().IsInSPMDModeFlag = nullptr;
2272  }
2273  LValue Base =
2274  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2275 
2276  // Emit the "global alloca" which is a GEP from the global declaration
2277  // record using the pointer returned by the runtime.
2278  LValue SecBase;
2279  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2280  if (IsTTD) {
2281  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2282  llvm::PointerType *SecGlobalRecPtrTy =
2283  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2284  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2285  Bld.CreatePointerBitCastOrAddrSpaceCast(
2286  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2287  SecGlobalRecTy);
2288  }
2289  for (auto &Rec : I->getSecond().LocalVarData) {
2290  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2291  llvm::Value *ParValue;
2292  if (EscapedParam) {
2293  const auto *VD = cast<VarDecl>(Rec.first);
2294  LValue ParLVal =
2295  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2296  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2297  }
2298  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2299  // Emit VarAddr based on the lane id if required.
2300  QualType VarTy;
2301  if (Rec.second.IsOnePerTeam) {
2302  VarTy = Rec.second.FD->getType();
2303  } else {
2304  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2305  VarAddr.getAddress().getPointer(),
2306  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2307  VarTy =
2308  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2309  VarAddr = CGF.MakeAddrLValue(
2310  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2311  AlignmentSource::Decl);
2312  }
2313  Rec.second.PrivateAddr = VarAddr.getAddress();
2314  if (!IsInTTDRegion &&
2315  (WithSPMDCheck ||
2316  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2317  assert(I->getSecond().IsInSPMDModeFlag &&
2318  "Expected unknown execution mode or required SPMD check.");
2319  if (IsTTD) {
2320  assert(SecIt->second.IsOnePerTeam &&
2321  "Secondary glob data must be one per team.");
2322  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2323  VarAddr.setAddress(
2324  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2325  VarAddr.getPointer()),
2326  VarAddr.getAlignment()));
2327  Rec.second.PrivateAddr = VarAddr.getAddress();
2328  }
2329  Address GlobalPtr = Rec.second.PrivateAddr;
2330  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2331  Rec.second.PrivateAddr = Address(
2332  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2333  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2334  LocalAddr.getAlignment());
2335  }
2336  if (EscapedParam) {
2337  const auto *VD = cast<VarDecl>(Rec.first);
2338  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2339  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2340  }
2341  if (IsTTD)
2342  ++SecIt;
2343  }
2344  }
2345  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2346  // Recover pointer to this function's global record. The runtime will
2347  // handle the specifics of the allocation of the memory.
2348  // Use actual memory size of the record including the padding
2349  // for alignment purposes.
2350  CGBuilderTy &Bld = CGF.Builder;
2351  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2352  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2353  Size = Bld.CreateNUWAdd(
2354  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2355  llvm::Value *AlignVal =
2356  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2357  Size = Bld.CreateUDiv(Size, AlignVal);
2358  Size = Bld.CreateNUWMul(Size, AlignVal);
2359  // TODO: allow the usage of shared memory to be controlled by
2360  // the user, for now, default to global.
2361  llvm::Value *GlobalRecordSizeArg[] = {
2362  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2363  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2364  createNVPTXRuntimeFunction(
2365  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2366  GlobalRecordSizeArg);
2367  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2368  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2369  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2370  CGM.getContext().getDeclAlign(VD),
2371  AlignmentSource::Decl);
2372  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2373  Base.getAddress());
2374  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2375  }
2376  I->getSecond().MappedParams->apply(CGF);
2377 }
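// A condensed view of the globalization protocol emitted above (illustrative
// sketch): every __kmpc_data_sharing_coalesced_push_stack issued in this
// prolog is paired with a __kmpc_data_sharing_pop_stack in
// emitGenericVarsEpilog below, popped in reverse order:
//
//   prolog: rec   = push_stack(aligned sizeof(global record), /*UseSharedMemory=*/0);
//           vla_i = push_stack(aligned size of escaped VLA i, 0);
//   epilog: pop_stack(vla_i);   // reverse order of the pushes
//           pop_stack(rec);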
2378 
2379 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2380  bool WithSPMDCheck) {
2381  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2382  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2383  return;
2384 
2385  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2386  if (I != FunctionGlobalizedDecls.end()) {
2387  I->getSecond().MappedParams->restore(CGF);
2388  if (!CGF.HaveInsertPoint())
2389  return;
2390  for (llvm::Value *Addr :
2391  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2392  CGF.EmitRuntimeCall(
2393  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2394  Addr);
2395  }
2396  if (I->getSecond().GlobalRecordAddr) {
2397  if (!IsInTTDRegion &&
2398  (WithSPMDCheck ||
2399  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2400  CGBuilderTy &Bld = CGF.Builder;
2401  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2402  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2403  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2404  // There is no need to emit line number for unconditional branch.
2405  (void)ApplyDebugLocation::CreateEmpty(CGF);
2406  CGF.EmitBlock(NonSPMDBB);
2407  CGF.EmitRuntimeCall(
2408  createNVPTXRuntimeFunction(
2409  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2410  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2411  CGF.EmitBlock(ExitBB);
2412  } else if (IsInTTDRegion) {
2413  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2414  "region counter must be > 0.");
2415  --GlobalizedRecords.back().RegionCounter;
2416  // Emit the restore function only in the target region.
2417  if (GlobalizedRecords.back().RegionCounter == 0) {
2418  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2419  /*DestWidth=*/16, /*Signed=*/0);
2420  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2421  Address(GlobalizedRecords.back().UseSharedMemory,
2422  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2423  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2424  CGF.EmitRuntimeCall(
2425  createNVPTXRuntimeFunction(
2426  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2427  IsInSharedMemory);
2428  }
2429  } else {
2430  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2431  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2432  I->getSecond().GlobalRecordAddr);
2433  }
2434  }
2435  }
2436 }
2437 
2438 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2439  const OMPExecutableDirective &D,
2440  SourceLocation Loc,
2441  llvm::Value *OutlinedFn,
2442  ArrayRef<llvm::Value *> CapturedVars) {
2443  if (!CGF.HaveInsertPoint())
2444  return;
2445 
2446  Address ZeroAddr = CGF.CreateMemTemp(
2447  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2448  /*Name*/ ".zero.addr");
2449  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2450  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2451  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2452  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2453  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2454  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2455 }
2456 
2457 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2458  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2459  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2460  if (!CGF.HaveInsertPoint())
2461  return;
2462 
2463  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2464  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2465  else
2466  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2467 }
2468 
2469 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2470  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2471  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2472  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2473 
2474  // Force inline this outlined function at its call site.
2475  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2476 
2477  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2478  /*DestWidth=*/32, /*Signed=*/1),
2479  ".zero.addr");
2480  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2481  // ThreadId for serialized parallels is 0.
2482  Address ThreadIDAddr = ZeroAddr;
2483  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2484  CodeGenFunction &CGF, PrePostActionTy &Action) {
2485  Action.Enter(CGF);
2486 
2487  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2488  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2489  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2490  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2491  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2492  };
2493  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2494  PrePostActionTy &) {
2495 
2496  RegionCodeGenTy RCG(CodeGen);
2497  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2498  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2499  llvm::Value *Args[] = {RTLoc, ThreadID};
2500 
2501  NVPTXActionTy Action(
2502  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2503  Args,
2504  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2505  Args);
2506  RCG.setAction(Action);
2507  RCG(CGF);
2508  };
2509 
2510  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2511  PrePostActionTy &Action) {
2512  CGBuilderTy &Bld = CGF.Builder;
2513  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2514  assert(WFn && "Wrapper function does not exist!");
2515  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2516 
2517  // Prepare for parallel region. Indicate the outlined function.
2518  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2519  CGF.EmitRuntimeCall(
2520  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2521  Args);
2522 
2523  // Create a private scope that will globalize the arguments
2524  // passed from the outside of the target region.
2525  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2526 
2527  // There's something to share.
2528  if (!CapturedVars.empty()) {
2529  // Prepare for parallel region. Indicate the outlined function.
2530  Address SharedArgs =
2531  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2532  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2533 
2534  llvm::Value *DataSharingArgs[] = {
2535  SharedArgsPtr,
2536  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2537  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2538  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2539  DataSharingArgs);
2540 
2541  // Store variable address in a list of references to pass to workers.
2542  unsigned Idx = 0;
2543  ASTContext &Ctx = CGF.getContext();
2544  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2545  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2546  .castAs<PointerType>());
2547  for (llvm::Value *V : CapturedVars) {
2548  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2549  CGF.getPointerSize());
2550  llvm::Value *PtrV;
2551  if (V->getType()->isIntegerTy())
2552  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2553  else
2554  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2555  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2556  Ctx.getPointerType(Ctx.VoidPtrTy));
2557  ++Idx;
2558  }
2559  }
2560 
2561  // Activate workers. This barrier is used by the master to signal
2562  // work for the workers.
2563  syncCTAThreads(CGF);
2564 
2565  // OpenMP [2.5, Parallel Construct, p.49]
2566  // There is an implied barrier at the end of a parallel region. After the
2567  // end of a parallel region, only the master thread of the team resumes
2568  // execution of the enclosing task region.
2569  //
2570  // The master waits at this barrier until all workers are done.
2571  syncCTAThreads(CGF);
2572 
2573  if (!CapturedVars.empty())
2574  CGF.EmitRuntimeCall(
2575  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2576 
2577  // Remember for post-processing in worker loop.
2578  Work.emplace_back(WFn);
2579  };
2580 
2581  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2582  CodeGenFunction &CGF, PrePostActionTy &Action) {
2583  if (IsInParallelRegion) {
2584  SeqGen(CGF, Action);
2585  } else if (IsInTargetMasterThreadRegion) {
2586  L0ParallelGen(CGF, Action);
2587  } else {
2588  // Check for master and then parallelism:
2589  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2590  // Serialized execution.
2591  // } else {
2592  // Worker call.
2593  // }
2594  CGBuilderTy &Bld = CGF.Builder;
2595  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2596  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2597  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2598  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2599  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2600  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2601  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2602  // There is no need to emit line number for unconditional branch.
2603  (void)ApplyDebugLocation::CreateEmpty(CGF);
2604  CGF.EmitBlock(ParallelCheckBB);
2605  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2606  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2607  llvm::Value *PL = CGF.EmitRuntimeCall(
2608  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2609  {RTLoc, ThreadID});
2610  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2611  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2612  CGF.EmitBlock(SeqBB);
2613  SeqGen(CGF, Action);
2614  CGF.EmitBranch(ExitBB);
2615  // There is no need to emit line number for unconditional branch.
2616  (void)ApplyDebugLocation::CreateEmpty(CGF);
2617  CGF.EmitBlock(MasterBB);
2618  L0ParallelGen(CGF, Action);
2619  CGF.EmitBranch(ExitBB);
2620  // There is no need to emit line number for unconditional branch.
2621  (void)ApplyDebugLocation::CreateEmpty(CGF);
2622  // Emit the continuation block for code after the if.
2623  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2624  }
2625  };
2626 
2627  if (IfCond) {
2628  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2629  } else {
2630  CodeGenFunction::RunCleanupsScope Scope(CGF);
2631  RegionCodeGenTy ThenRCG(LNParallelGen);
2632  ThenRCG(CGF);
2633  }
2634 }
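// A condensed view of the master-thread sequence produced by L0ParallelGen
// above (illustrative sketch, in runtime-call order):
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&arg_list, n);  // only if there are captures
//   syncCTAThreads();                              // release the workers
//   syncCTAThreads();                              // wait until the workers finish
//   __kmpc_end_sharing_variables();                // only if there are captures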
2635 
2636 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2637  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2638  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2639  // Just call the outlined function to execute the parallel region.
2640  // OutlinedFn(&GTid, &zero, CapturedStruct);
2641  //
2642  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2643 
2644  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2645  /*DestWidth=*/32, /*Signed=*/1),
2646  ".zero.addr");
2647  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2648  // ThreadId for serialized parallels is 0.
2649  Address ThreadIDAddr = ZeroAddr;
2650  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2651  &ThreadIDAddr](CodeGenFunction &CGF,
2652  PrePostActionTy &Action) {
2653  Action.Enter(CGF);
2654 
2655  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2656  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2657  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2658  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2659  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2660  };
2661  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2662  PrePostActionTy &) {
2663 
2664  RegionCodeGenTy RCG(CodeGen);
2665  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2666  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2667  llvm::Value *Args[] = {RTLoc, ThreadID};
2668 
2669  NVPTXActionTy Action(
2670  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2671  Args,
2672  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2673  Args);
2674  RCG.setAction(Action);
2675  RCG(CGF);
2676  };
2677 
2678  if (IsInTargetMasterThreadRegion) {
2679  // In the worker we need to use the real thread id.
2680  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2681  RegionCodeGenTy RCG(CodeGen);
2682  RCG(CGF);
2683  } else {
2684  // If we are not in the target region, it is definitely L2 parallelism or
2685  // more, because for SPMD mode we always have an L1 parallel level, so we
2686  // don't need to check for orphaned directives.
2687  RegionCodeGenTy RCG(SeqGen);
2688  RCG(CGF);
2689  }
2690 }
2691 
2692 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2693  SourceLocation Loc,
2694  OpenMPDirectiveKind Kind, bool,
2695  bool) {
2696  // Always emit simple barriers!
2697  if (!CGF.HaveInsertPoint())
2698  return;
2699  // Build call __kmpc_barrier(loc, thread_id);
2700  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2701  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2702  getThreadID(CGF, Loc)};
2703  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2704 }
2705 
2706 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2707  CodeGenFunction &CGF, StringRef CriticalName,
2708  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2709  const Expr *Hint) {
2710  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2711  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2712  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2713  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2714  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2715 
2716  // Fetch team-local id of the thread.
2717  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2718 
2719  // Get the width of the team.
2720  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2721 
2722  // Initialize the counter variable for the loop.
2723  QualType Int32Ty =
2724  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2725  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2726  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2727  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2728  /*isInit=*/true);
2729 
2730  // Block checks if loop counter exceeds upper bound.
2731  CGF.EmitBlock(LoopBB);
2732  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2733  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2734  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2735 
2736  // Block tests which single thread should execute the region, and which
2737  // threads should go straight to the synchronisation point.
2738  CGF.EmitBlock(TestBB);
2739  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2740  llvm::Value *CmpThreadToCounter =
2741  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2742  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2743 
2744  // Block emits the body of the critical region.
2745  CGF.EmitBlock(BodyBB);
2746 
2747  // Output the critical statement.
2748  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2749  Hint);
2750 
2751  // After the body surrounded by the critical region, the single executing
2752  // thread will jump to the synchronisation point.
2753  // Block waits for all threads in current team to finish then increments the
2754  // counter variable and returns to the loop.
2755  CGF.EmitBlock(SyncBB);
2756  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2757  /*ForceSimpleCall=*/true);
2758 
2759  llvm::Value *IncCounterVal =
2760  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2761  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2762  CGF.EmitBranch(LoopBB);
2763 
2764  // Block that is reached when all threads in the team complete the region.
2765  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2766 }
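// Pseudo-code for the serialization loop emitted above (illustrative sketch):
//
//   counter = 0;
//   while (counter < team_width) {   // omp.critical.loop
//     if (thread_id == counter)      // omp.critical.test
//       <critical body>;             // omp.critical.body
//     barrier();                     // omp.critical.sync
//     ++counter;
//   }
//   // omp.critical.exit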
2767 
2768 /// Cast value to the specified type.
2769 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2770  QualType ValTy, QualType CastTy,
2771  SourceLocation Loc) {
2772  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2773  "Cast type must be sized.");
2774  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2775  "Val type must be sized.");
2776  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2777  if (ValTy == CastTy)
2778  return Val;
2779  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2780  CGF.getContext().getTypeSizeInChars(CastTy))
2781  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2782  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2783  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2784  CastTy->hasSignedIntegerRepresentation());
2785  Address CastItem = CGF.CreateMemTemp(CastTy);
2786  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2787  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2788  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2789  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2790 }
2791 
2792 /// This function creates calls to one of two shuffle functions to copy
2793 /// variables between lanes in a warp.
2794 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2795  llvm::Value *Elem,
2796  QualType ElemType,
2797  llvm::Value *Offset,
2798  SourceLocation Loc) {
2799  CodeGenModule &CGM = CGF.CGM;
2800  CGBuilderTy &Bld = CGF.Builder;
2801  CGOpenMPRuntimeNVPTX &RT =
2802  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2803 
2804  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2805  assert(Size.getQuantity() <= 8 &&
2806  "Unsupported bitwidth in shuffle instruction.");
2807 
2808  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2809  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2810  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2811 
2812  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2813  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2814  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2815  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2816  llvm::Value *WarpSize =
2817  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2818 
2819  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2820  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2821 
2822  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2823 }
2824 
2825 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2826  Address DestAddr, QualType ElemType,
2827  llvm::Value *Offset, SourceLocation Loc) {
2828  CGBuilderTy &Bld = CGF.Builder;
2829 
2830  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2831  // Create the loop over the big sized data.
2832  // ptr = (void*)Elem;
2833  // ptrEnd = (void*) Elem + 1;
2834  // Step = 8;
2835  // while (ptr + Step < ptrEnd)
2836  // shuffle((int64_t)*ptr);
2837  // Step = 4;
2838  // while (ptr + Step < ptrEnd)
2839  // shuffle((int32_t)*ptr);
2840  // ...
2841  Address ElemPtr = DestAddr;
2842  Address Ptr = SrcAddr;
2843  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2844  Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2845  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2846  if (Size < CharUnits::fromQuantity(IntSize))
2847  continue;
2848  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2849  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2850  /*Signed=*/1);
2851  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2852  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2853  ElemPtr =
2854  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2855  if (Size.getQuantity() / IntSize > 1) {
2856  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2857  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2858  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2859  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2860  CGF.EmitBlock(PreCondBB);
2861  llvm::PHINode *PhiSrc =
2862  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2863  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2864  llvm::PHINode *PhiDest =
2865  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2866  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2867  Ptr = Address(PhiSrc, Ptr.getAlignment());
2868  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2869  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2870  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2871  Ptr.getPointer(), CGF.VoidPtrTy));
2872  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2873  ThenBB, ExitBB);
2874  CGF.EmitBlock(ThenBB);
2875  llvm::Value *Res = createRuntimeShuffleFunction(
2876  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2877  IntType, Offset, Loc);
2878  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2879  Address LocalPtr =
2880  Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2881  Address LocalElemPtr =
2882  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2883  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2884  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2885  CGF.EmitBranch(PreCondBB);
2886  CGF.EmitBlock(ExitBB);
2887  } else {
2888  llvm::Value *Res = createRuntimeShuffleFunction(
2889  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2890  IntType, Offset, Loc);
2891  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2892  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2893  ElemPtr =
2894  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2895  }
2896  Size = Size % IntSize;
2897  }
2898 }
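// A worked example of the decomposition above: for a 7-byte element the loop
// emits one 4-byte shuffle (3 bytes remain), then one 2-byte shuffle (1 byte
// remains), then one 1-byte shuffle; the pre_cond/then/exit sub-loop is only
// generated when more than one chunk of the current width fits.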
2899 
2900 namespace {
2901 enum CopyAction : unsigned {
2902  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2903  // the warp using shuffle instructions.
2904  RemoteLaneToThread,
2905  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2906  ThreadCopy,
2907  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2908  ThreadToScratchpad,
2909  // ScratchpadToThread: Copy from a scratchpad array in global memory
2910  // containing team-reduced data to a thread's stack.
2911  ScratchpadToThread,
2912 };
2913 } // namespace
2914 
2915 struct CopyOptionsTy {
2916   llvm::Value *RemoteLaneOffset;
2917   llvm::Value *ScratchpadIndex;
2918   llvm::Value *ScratchpadWidth;
2919 };
2920 
2921 /// Emit instructions to copy a Reduce list, which contains partially
2922 /// aggregated values, in the specified direction.
2923 static void emitReductionListCopy(
2924  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2925  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2926  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2927 
2928  CodeGenModule &CGM = CGF.CGM;
2929  ASTContext &C = CGM.getContext();
2930  CGBuilderTy &Bld = CGF.Builder;
2931 
2932  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2933  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2934  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2935 
2936  // Iterate, element by element, through the source Reduce list and
2937  // make a copy.
2938  unsigned Idx = 0;
2939  unsigned Size = Privates.size();
2940  for (const Expr *Private : Privates) {
2941  Address SrcElementAddr = Address::invalid();
2942  Address DestElementAddr = Address::invalid();
2943  Address DestElementPtrAddr = Address::invalid();
2944  // Should we shuffle in an element from a remote lane?
2945  bool ShuffleInElement = false;
2946  // Set to true to update the pointer in the dest Reduce list to a
2947  // newly created element.
2948  bool UpdateDestListPtr = false;
2949  // Increment the src or dest pointer to the scratchpad, for each
2950  // new element.
2951  bool IncrScratchpadSrc = false;
2952  bool IncrScratchpadDest = false;
2953 
2954  switch (Action) {
2955  case RemoteLaneToThread: {
2956  // Step 1.1: Get the address for the src element in the Reduce list.
2957  Address SrcElementPtrAddr =
2958  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2959  SrcElementAddr = CGF.EmitLoadOfPointer(
2960  SrcElementPtrAddr,
2961  C.getPointerType(Private->getType())->castAs<PointerType>());
2962 
2963  // Step 1.2: Create a temporary to store the element in the destination
2964  // Reduce list.
2965  DestElementPtrAddr =
2966  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2967  DestElementAddr =
2968  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2969  ShuffleInElement = true;
2970  UpdateDestListPtr = true;
2971  break;
2972  }
2973  case ThreadCopy: {
2974  // Step 1.1: Get the address for the src element in the Reduce list.
2975  Address SrcElementPtrAddr =
2976  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2977  SrcElementAddr = CGF.EmitLoadOfPointer(
2978  SrcElementPtrAddr,
2979  C.getPointerType(Private->getType())->castAs<PointerType>());
2980 
2981  // Step 1.2: Get the address for dest element. The destination
2982  // element has already been created on the thread's stack.
2983  DestElementPtrAddr =
2984  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2985  DestElementAddr = CGF.EmitLoadOfPointer(
2986  DestElementPtrAddr,
2987  C.getPointerType(Private->getType())->castAs<PointerType>());
2988  break;
2989  }
2990  case ThreadToScratchpad: {
2991  // Step 1.1: Get the address for the src element in the Reduce list.
2992  Address SrcElementPtrAddr =
2993  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2994  SrcElementAddr = CGF.EmitLoadOfPointer(
2995  SrcElementPtrAddr,
2996  C.getPointerType(Private->getType())->castAs<PointerType>());
2997 
2998  // Step 1.2: Get the address for dest element:
2999  // address = base + index * ElementSizeInChars.
3000  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3001  llvm::Value *CurrentOffset =
3002  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3003  llvm::Value *ScratchPadElemAbsolutePtrVal =
3004  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3005  ScratchPadElemAbsolutePtrVal =
3006  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3007  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3008  C.getTypeAlignInChars(Private->getType()));
3009  IncrScratchpadDest = true;
3010  break;
3011  }
3012  case ScratchpadToThread: {
3013  // Step 1.1: Get the address for the src element in the scratchpad.
3014  // address = base + index * ElementSizeInChars.
3015  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3016  llvm::Value *CurrentOffset =
3017  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3018  llvm::Value *ScratchPadElemAbsolutePtrVal =
3019  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3020  ScratchPadElemAbsolutePtrVal =
3021  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3022  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3023  C.getTypeAlignInChars(Private->getType()));
3024  IncrScratchpadSrc = true;
3025 
3026  // Step 1.2: Create a temporary to store the element in the destination
3027  // Reduce list.
3028  DestElementPtrAddr =
3029  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
3030  DestElementAddr =
3031  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3032  UpdateDestListPtr = true;
3033  break;
3034  }
3035  }
3036 
3037  // Regardless of src and dest of copy, we emit the load of src
3038  // element as this is required in all directions
3039  SrcElementAddr = Bld.CreateElementBitCast(
3040  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3041  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3042  SrcElementAddr.getElementType());
3043 
3044  // Now that all active lanes have read the element in the
3045  // Reduce list, shuffle over the value from the remote lane.
3046  if (ShuffleInElement) {
3047  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3048  RemoteLaneOffset, Private->getExprLoc());
3049  } else {
3050  if (Private->getType()->isScalarType()) {
3051  llvm::Value *Elem =
3052  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3053  Private->getType(), Private->getExprLoc());
3054  // Store the source element value to the dest element address.
3055  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3056  Private->getType());
3057  } else {
3058  CGF.EmitAggregateCopy(
3059  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3060  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3061  Private->getType(), AggValueSlot::DoesNotOverlap);
3062  }
3063  }
3064 
3065  // Step 3.1: Modify reference in dest Reduce list as needed.
3066  // Modifying the reference in Reduce list to point to the newly
3067  // created element. The element is live in the current function
3068  // scope and that of functions it invokes (i.e., reduce_function).
3069  // RemoteReduceData[i] = (void*)&RemoteElem
3070  if (UpdateDestListPtr) {
3071  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3072  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3073  DestElementPtrAddr, /*Volatile=*/false,
3074  C.VoidPtrTy);
3075  }
3076 
3077  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3078  // address of the next element in scratchpad memory, unless we're currently
3079  // processing the last one. Memory alignment is also taken care of here.
3080  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3081  llvm::Value *ScratchpadBasePtr =
3082  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3083  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3084  ScratchpadBasePtr = Bld.CreateNUWAdd(
3085  ScratchpadBasePtr,
3086  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3087 
3088  // Take care of global memory alignment for performance
3089  ScratchpadBasePtr = Bld.CreateNUWSub(
3090  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3091  ScratchpadBasePtr = Bld.CreateUDiv(
3092  ScratchpadBasePtr,
3093  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3094  ScratchpadBasePtr = Bld.CreateNUWAdd(
3095  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3096  ScratchpadBasePtr = Bld.CreateNUWMul(
3097  ScratchpadBasePtr,
3098  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
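      // Illustrative note (not part of the original source): the sub/udiv/add/
      // mul sequence above rounds ScratchpadBasePtr up to the next multiple of
      // GlobalMemoryAlignment, i.e. ((x - 1) / Align + 1) * Align. A hedged
      // worked example with Align = 256: x = 1000 -> 999 / 256 = 3,
      // (3 + 1) * 256 = 1024; an already aligned x = 1024 stays at 1024.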
3099 
3100  if (IncrScratchpadDest)
3101  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3102  else /* IncrScratchpadSrc = true */
3103  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3104  }
3105 
3106  ++Idx;
3107  }
3108 }
3109 
3110 /// This function emits a helper that gathers Reduce lists from the first
3111 /// lane of every active warp to lanes in the first warp.
3112 ///
3113 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3114 /// shared smem[warp_size];
3115 /// For all data entries D in reduce_data:
3116 /// If (I am the first lane in each warp)
3117 /// Copy my local D to smem[warp_id]
3118 /// sync
3119 /// if (I am the first warp)
3120 /// Copy smem[thread_id] to my local D
3121 /// sync
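///
/// Illustrative note (not part of the original source): a hedged trace for
/// one 8-byte reduce element with 4 active warps, given the 4-byte transfer
/// medium used below:
///   pass 0: lane 0 of warps 0..3 writes the first 4 bytes of its D to
///           smem[warp_id]; threads 0..3 of warp 0 then read smem[thread_id]
///           into their local D;
///   pass 1: the same steps repeat for the remaining 4 bytes.
/// Each pass is bracketed by named-barrier synchronization so no thread reads
/// a stale slot.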
3122 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3123  ArrayRef<const Expr *> Privates,
3124  QualType ReductionArrayTy,
3125  SourceLocation Loc) {
3126  ASTContext &C = CGM.getContext();
3127  llvm::Module &M = CGM.getModule();
3128 
3129  // ReduceList: thread local Reduce list.
3130  // At the stage of the computation when this function is called, partially
3131  // aggregated values reside in the first lane of every active warp.
3132  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3133  C.VoidPtrTy, ImplicitParamDecl::Other);
3134  // NumWarps: number of warps active in the parallel region. This could
3135  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3136  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3137  C.getIntTypeForBitwidth(32, /* Signed */ true),
3138  ImplicitParamDecl::Other);
3139  FunctionArgList Args;
3140  Args.push_back(&ReduceListArg);
3141  Args.push_back(&NumWarpsArg);
3142 
3143  const CGFunctionInfo &CGFI =
3144  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3145  auto *Fn = llvm::Function::Create(
3146  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3147  "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
3148  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3149  Fn->setDoesNotRecurse();
3150  CodeGenFunction CGF(CGM);
3151  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3152 
3153  CGBuilderTy &Bld = CGF.Builder;
3154 
3155  // This array is used as a medium to transfer, one reduce element at a time,
3156  // the data from the first lane of every warp to lanes in the first warp
3157  // in order to perform the final step of a reduction in a parallel region
3158  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3159  // for reduced latency, as well as to have a distinct copy for concurrently
3160  // executing target regions. The array is declared with common linkage so
3161  // as to be shared across compilation units.
3162  StringRef TransferMediumName =
3163  "__openmp_nvptx_data_transfer_temporary_storage";
3164  llvm::GlobalVariable *TransferMedium =
3165  M.getGlobalVariable(TransferMediumName);
3166  if (!TransferMedium) {
3167  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3168  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3169  TransferMedium = new llvm::GlobalVariable(
3170  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3171  llvm::Constant::getNullValue(Ty), TransferMediumName,
3172  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3173  SharedAddressSpace);
3174  CGM.addCompilerUsedGlobal(TransferMedium);
3175  }
3176 
3177  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3178  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3179  // nvptx_lane_id = nvptx_id % warpsize
3180  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3181  // nvptx_warp_id = nvptx_id / warpsize
3182  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3183 
3184  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3185  Address LocalReduceList(
3186  Bld.CreatePointerBitCastOrAddrSpaceCast(
3187  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3188  C.VoidPtrTy, Loc),
3189  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3190  CGF.getPointerAlign());
3191 
3192  unsigned Idx = 0;
3193  for (const Expr *Private : Privates) {
3194  //
3195  // Warp master copies reduce element to transfer medium in __shared__
3196  // memory.
3197  //
3198  unsigned RealTySize =
3199  C.getTypeSizeInChars(Private->getType())
3200  .alignTo(C.getTypeAlignInChars(Private->getType()))
3201  .getQuantity();
3202  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3203  unsigned NumIters = RealTySize / TySize;
3204  if (NumIters == 0)
3205  continue;
3206  QualType CType = C.getIntTypeForBitwidth(
3207  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3208  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3209  CharUnits Align = CharUnits::fromQuantity(TySize);
3210  llvm::Value *Cnt = nullptr;
3211  Address CntAddr = Address::invalid();
3212  llvm::BasicBlock *PrecondBB = nullptr;
3213  llvm::BasicBlock *ExitBB = nullptr;
3214  if (NumIters > 1) {
3215  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3216  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3217  /*Volatile=*/false, C.IntTy);
3218  PrecondBB = CGF.createBasicBlock("precond");
3219  ExitBB = CGF.createBasicBlock("exit");
3220  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3221  // There is no need to emit line number for unconditional branch.
3222  (void)ApplyDebugLocation::CreateEmpty(CGF);
3223  CGF.EmitBlock(PrecondBB);
3224  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3225  llvm::Value *Cmp =
3226  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3227  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3228  CGF.EmitBlock(BodyBB);
3229  }
3230  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3231  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3232  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3233 
3234  // if (lane_id == 0)
3235  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3236  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3237  CGF.EmitBlock(ThenBB);
3238 
3239  // Reduce element = LocalReduceList[i]
3240  Address ElemPtrPtrAddr =
3241  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3242  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3243  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3244  // elemptr = ((CopyType*)(elemptrptr)) + I
3245  Address ElemPtr = Address(ElemPtrPtr, Align);
3246  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3247  if (NumIters > 1) {
3248  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3249  ElemPtr.getAlignment());
3250  }
3251 
3252  // Get pointer to location in transfer medium.
3253  // MediumPtr = &medium[warp_id]
3254  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3255  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3256  Address MediumPtr(MediumPtrVal, Align);
3257  // Casting to actual data type.
3258  // MediumPtr = (CopyType*)MediumPtrAddr;
3259  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3260 
3261  // elem = *elemptr
3262  //*MediumPtr = elem
3263  llvm::Value *Elem =
3264  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3265  // Store the source element value to the dest element address.
3266  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3267 
3268  Bld.CreateBr(MergeBB);
3269 
3270  CGF.EmitBlock(ElseBB);
3271  Bld.CreateBr(MergeBB);
3272 
3273  CGF.EmitBlock(MergeBB);
3274 
3275  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3276  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3277  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3278 
3279  llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
3280  NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
3281  // named_barrier_sync(ParallelBarrierID, num_active_threads)
3282  syncParallelThreads(CGF, NumActiveThreads);
3283 
3284  //
3285  // Warp 0 copies reduce element from transfer medium.
3286  //
3287  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3288  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3289  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3290 
3291  // Up to 32 threads in warp 0 are active.
3292  llvm::Value *IsActiveThread =
3293  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3294  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3295 
3296  CGF.EmitBlock(W0ThenBB);
3297 
3298  // SrcMediumPtr = &medium[tid]
3299  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3300  TransferMedium,
3301  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3302  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3303  // SrcMediumVal = *SrcMediumPtr;
3304  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3305 
3306  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3307  Address TargetElemPtrPtr =
3308  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3309  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3310  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3311  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3312  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3313  if (NumIters > 1) {
3314  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3315  TargetElemPtr.getAlignment());
3316  }
3317 
3318  // *TargetElemPtr = SrcMediumVal;
3319  llvm::Value *SrcMediumValue =
3320  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3321  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3322  CType);
3323  Bld.CreateBr(W0MergeBB);
3324 
3325  CGF.EmitBlock(W0ElseBB);
3326  Bld.CreateBr(W0MergeBB);
3327 
3328  CGF.EmitBlock(W0MergeBB);
3329 
3330  // While warp 0 copies values from transfer medium, all other warps must
3331  // wait.
3332  syncParallelThreads(CGF, NumActiveThreads);
3333  if (NumIters > 1) {
3334  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3335  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3336  CGF.EmitBranch(PrecondBB);
3337  (void)ApplyDebugLocation::CreateEmpty(CGF);
3338  CGF.EmitBlock(ExitBB);
3339  }
3340  RealTySize %= TySize;
3341  }
3342  ++Idx;
3343  }
3344 
3345  CGF.FinishFunction();
3346  return Fn;
3347 }
3348 
3349 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3350 /// in the same warp. It uses shuffle instructions to copy over data from
3351 /// a remote lane's stack. The reduction algorithm performed is specified
3352 /// by the fourth parameter.
3353 ///
3354 /// Algorithm Versions.
3355 /// Full Warp Reduce (argument value 0):
3356 /// This algorithm assumes that all 32 lanes are active and gathers
3357 /// data from these 32 lanes, producing a single resultant value.
3358 /// Contiguous Partial Warp Reduce (argument value 1):
3359 /// This algorithm assumes that only a *contiguous* subset of lanes
3360 /// are active. This happens for the last warp in a parallel region
3361 /// when the user specified num_threads is not an integer multiple of
3362 /// 32. This contiguous subset always starts with the zeroth lane.
3363 /// Partial Warp Reduce (argument value 2):
3364 /// This algorithm gathers data from any number of lanes at any position.
3365 /// All reduced values are stored in the lowest possible lane. The set
3366 /// of problems every algorithm addresses is a superset of those
3367 /// addressable by algorithms with a lower version number. Overhead
3368 /// increases as algorithm version increases.
3369 ///
3370 /// Terminology
3371 /// Reduce element:
3372 /// Reduce element refers to the individual data field with primitive
3373 /// data types to be combined and reduced across threads.
3374 /// Reduce list:
3375 /// Reduce list refers to a collection of local, thread-private
3376 /// reduce elements.
3377 /// Remote Reduce list:
3378 /// Remote Reduce list refers to a collection of remote (relative to
3379 /// the current thread) reduce elements.
3380 ///
3381 /// We distinguish between three states of threads that are important to
3382 /// the implementation of this function.
3383 /// Alive threads:
3384 /// Threads in a warp executing the SIMT instruction, as distinguished from
3385 /// threads that are inactive due to divergent control flow.
3386 /// Active threads:
3387 /// The minimal set of threads that has to be alive upon entry to this
3388 /// function. The computation is correct iff active threads are alive.
3389 /// Some threads are alive but they are not active because they do not
3390 /// contribute to the computation in any useful manner. Turning them off
3391 /// may introduce control flow overheads without any tangible benefits.
3392 /// Effective threads:
3393 /// In order to comply with the argument requirements of the shuffle
3394 /// function, we must keep all lanes holding data alive. But at most
3395 /// half of them perform value aggregation; we refer to this half of
3396 /// threads as effective. The other half is simply handing off their
3397 /// data.
3398 ///
3399 /// Procedure
3400 /// Value shuffle:
3401 /// In this step active threads transfer data from higher lane positions
3402 /// in the warp to lower lane positions, creating Remote Reduce list.
3403 /// Value aggregation:
3404 /// In this step, effective threads combine their thread local Reduce list
3405 /// with Remote Reduce list and store the result in the thread local
3406 /// Reduce list.
3407 /// Value copy:
3408 /// In this step, we deal with the assumption made by algorithm 2
3409 /// (i.e. contiguity assumption). When we have an odd number of lanes
3410 /// active, say 2k+1, only k threads will be effective and therefore k
3411 /// new values will be produced. However, the Reduce list owned by the
3412 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3413 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3414 /// that the contiguity assumption still holds.
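///
/// Illustrative note (not part of the original source): a hedged trace of
/// the value-copy step for algorithm 1 with 5 active lanes (2k+1 = 5, k = 2)
/// and an offset of 2:
///   value aggregation: lanes 0 and 1 (LaneId < Offset) reduce with the
///   Reduce lists shuffled in from lanes 2 and 3;
///   value copy: lane 2 (LaneId >= Offset) overwrites its local Reduce list
///   with the list shuffled in from lane 4 (lanes 3 and 4 also copy, but
///   their results are never read again), so lanes 0..2 once more hold a
///   contiguous block of (5+1)/2 = 3 partial values.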
3415 static llvm::Value *emitShuffleAndReduceFunction(
3416  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3417  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
3418  ASTContext &C = CGM.getContext();
3419 
3420  // Thread local Reduce list used to host the values of data to be reduced.
3421  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3422  C.VoidPtrTy, ImplicitParamDecl::Other);
3423  // Current lane id; could be logical.
3424  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3425  ImplicitParamDecl::Other);
3426  // Offset of the remote source lane relative to the current lane.
3427  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3428  C.ShortTy, ImplicitParamDecl::Other);
3429  // Algorithm version. This is expected to be known at compile time.
3430  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3431  C.ShortTy, ImplicitParamDecl::Other);
3432  FunctionArgList Args;
3433  Args.push_back(&ReduceListArg);
3434  Args.push_back(&LaneIDArg);
3435  Args.push_back(&RemoteLaneOffsetArg);
3436  Args.push_back(&AlgoVerArg);
3437 
3438  const CGFunctionInfo &CGFI =
3439  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3440  auto *Fn = llvm::Function::Create(
3441  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3442  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3443  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3444  Fn->setDoesNotRecurse();
3445  CodeGenFunction CGF(CGM);
3446  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3447 
3448  CGBuilderTy &Bld = CGF.Builder;
3449 
3450  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3451  Address LocalReduceList(
3452  Bld.CreatePointerBitCastOrAddrSpaceCast(
3453  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3454  C.VoidPtrTy, SourceLocation()),
3455  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3456  CGF.getPointerAlign());
3457 
3458  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3459  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3460  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3461 
3462  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3463  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3464  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3465 
3466  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3467  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3468  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3469 
3470  // Create a local thread-private variable to host the Reduce list
3471  // from a remote lane.
3472  Address RemoteReduceList =
3473  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3474 
3475  // This loop iterates through the list of reduce elements and copies,
3476  // element by element, from a remote lane in the warp to RemoteReduceList,
3477  // hosted on the thread's stack.
3478  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3479  LocalReduceList, RemoteReduceList,
3480  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3481  /*ScratchpadIndex=*/nullptr,
3482  /*ScratchpadWidth=*/nullptr});
3483 
3484  // The actions to be performed on the Remote Reduce list depend
3485  // on the algorithm version.
3486  //
3487  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3488  // LaneId % 2 == 0 && Offset > 0):
3489  // do the reduction value aggregation
3490  //
3491  // The thread local variable Reduce list is mutated in place to host the
3492  // reduced data, which is the aggregated value produced from local and
3493  // remote lanes.
3494  //
3495  // Note that AlgoVer is expected to be a constant integer known at compile
3496  // time.
3497  // When AlgoVer==0, the first conjunction evaluates to true, making
3498  // the entire predicate true during compile time.
3499  // When AlgoVer==1, the second conjunction has only the second part to be
3500  // evaluated during runtime. Other conjunctions evaluate to false
3501  // during compile time.
3502  // When AlgoVer==2, the third conjunction has only the second part to be
3503  // evaluated during runtime. Other conjunctions evaluate to false
3504  // during compile time.
3505  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3506 
3507  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3508  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3509  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3510 
3511  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3512  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3513  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3514  CondAlgo2 = Bld.CreateAnd(
3515  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3516 
3517  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3518  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3519 
3520  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3521  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3522  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3523  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3524 
3525  CGF.EmitBlock(ThenBB);
3526  // reduce_function(LocalReduceList, RemoteReduceList)
3527  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3528  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3529  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3530  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3531  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3532  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3533  Bld.CreateBr(MergeBB);
3534 
3535  CGF.EmitBlock(ElseBB);
3536  Bld.CreateBr(MergeBB);
3537 
3538  CGF.EmitBlock(MergeBB);
3539 
3540  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3541  // Reduce list.
3542  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3543  llvm::Value *CondCopy = Bld.CreateAnd(
3544  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3545 
3546  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3547  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3548  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3549  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3550 
3551  CGF.EmitBlock(CpyThenBB);
3552  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3553  RemoteReduceList, LocalReduceList);
3554  Bld.CreateBr(CpyMergeBB);
3555 
3556  CGF.EmitBlock(CpyElseBB);
3557  Bld.CreateBr(CpyMergeBB);
3558 
3559  CGF.EmitBlock(CpyMergeBB);
3560 
3561  CGF.FinishFunction();
3562  return Fn;
3563 }
3564 
3565 ///
3566 /// Design of OpenMP reductions on the GPU
3567 ///
3568 /// Consider a typical OpenMP program with one or more reduction
3569 /// clauses:
3570 ///
3571 /// float foo;
3572 /// double bar;
3573 /// #pragma omp target teams distribute parallel for \
3574 /// reduction(+:foo) reduction(*:bar)
3575 /// for (int i = 0; i < N; i++) {
3576 /// foo += A[i]; bar *= B[i];
3577 /// }
3578 ///
3579 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3580 /// all teams. In our OpenMP implementation on the NVPTX device an
3581 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3582 /// within a team are mapped to CUDA threads within a threadblock.
3583 /// Our goal is to efficiently aggregate values across all OpenMP
3584 /// threads such that:
3585 ///
3586 /// - the compiler and runtime are logically concise, and
3587 /// - the reduction is performed efficiently in a hierarchical
3588 /// manner as follows: within OpenMP threads in the same warp,
3589 /// across warps in a threadblock, and finally across teams on
3590 /// the NVPTX device.
3591 ///
3592 /// Introduction to Decoupling
3593 ///
3594 /// We would like to decouple the compiler and the runtime so that the
3595 /// latter is ignorant of the reduction variables (number, data types)
3596 /// and the reduction operators. This allows a simpler interface
3597 /// and implementation while still attaining good performance.
3598 ///
3599 /// Pseudocode for the aforementioned OpenMP program generated by the
3600 /// compiler is as follows:
3601 ///
3602 /// 1. Create private copies of reduction variables on each OpenMP
3603 /// thread: 'foo_private', 'bar_private'
3604 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3605 /// to it and writes the result in 'foo_private' and 'bar_private'
3606 /// respectively.
3607 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3608 /// and store the result on the team master:
3609 ///
3610 /// __kmpc_nvptx_parallel_reduce_nowait(...,
3611 /// reduceData, shuffleReduceFn, interWarpCpyFn)
3612 ///
3613 /// where:
3614 /// struct ReduceData {
3615 /// float *foo;
3616 /// double *bar;
3617 /// } reduceData
3618 /// reduceData.foo = &foo_private
3619 /// reduceData.bar = &bar_private
3620 ///
3621 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3622 /// auxiliary functions generated by the compiler that operate on
3623 /// variables of type 'ReduceData'. They help the runtime perform
3624 /// algorithmic steps in a data-agnostic manner.
3625 ///
3626 /// 'shuffleReduceFn' is a pointer to a function that reduces data
3627 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3628 /// same warp. It takes the following arguments as input:
3629 ///
3630 /// a. variable of type 'ReduceData' on the calling lane,
3631 /// b. its lane_id,
3632 /// c. an offset relative to the current lane_id to generate a
3633 /// remote_lane_id. The remote lane contains the second
3634 /// variable of type 'ReduceData' that is to be reduced.
3635 /// d. an algorithm version parameter determining which reduction
3636 /// algorithm to use.
3637 ///
3638 /// 'shuffleReduceFn' retrieves data from the remote lane using
3639 /// efficient GPU shuffle intrinsics and reduces, using the
3640 /// algorithm specified by the 4th parameter, the two operands
3641 /// element-wise. The result is written to the first operand.
3642 ///
3643 /// Different reduction algorithms are implemented in different
3644 /// runtime functions, all calling 'shuffleReduceFn' to perform
3645 /// the essential reduction step. Therefore, based on the 4th
3646 /// parameter, this function behaves slightly differently to
3647 /// cooperate with the runtime to ensure correctness under
3648 /// different circumstances.
3649 ///
3650 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3651 /// reduced variables across warps. It tunnels, through CUDA
3652 /// shared memory, the thread-private data of type 'ReduceData'
3653 /// from lane 0 of each warp to a lane in the first warp.
3654 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3655 /// The last team writes the global reduced value to memory.
3656 ///
3657 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3658 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3659 /// scratchpadCopyFn, loadAndReduceFn)
3660 ///
3661 /// 'scratchpadCopyFn' is a helper that stores reduced
3662 /// data from the team master to a scratchpad array in
3663 /// global memory.
3664 ///
3665 /// 'loadAndReduceFn' is a helper that loads data from
3666 /// the scratchpad array and reduces it with the input
3667 /// operand.
3668 ///
3669 /// These compiler generated functions hide address
3670 /// calculation and alignment information from the runtime.
3671 /// 5. if ret == 1:
3672 /// The team master of the last team stores the reduced
3673 /// result to the globals in memory.
3674 /// foo += reduceData.foo; bar *= reduceData.bar
3675 ///
3676 ///
3677 /// Warp Reduction Algorithms
3678 ///
3679 /// On the warp level, we have three algorithms implemented in the
3680 /// OpenMP runtime depending on the number of active lanes:
3681 ///
3682 /// Full Warp Reduction
3683 ///
3684 /// The reduce algorithm within a warp where all lanes are active
3685 /// is implemented in the runtime as follows:
3686 ///
3687 /// full_warp_reduce(void *reduce_data,
3688 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3689 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3690 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3691 /// }
3692 ///
3693 /// The algorithm completes in log(2, WARPSIZE) steps.
3694 ///
3695 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3696 /// not used; therefore we save instructions by not retrieving lane_id
3697 /// from the corresponding special registers. The 4th parameter, which
3698 /// represents the version of the algorithm being used, is set to 0 to
3699 /// signify full warp reduction.
3700 ///
3701 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3702 ///
3703 /// #reduce_elem refers to an element in the local lane's data structure
3704 /// #remote_elem is retrieved from a remote lane
3705 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3706 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3707 ///
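/// Illustrative note (not part of the original source): a hedged trace for
/// WARPSIZE = 32. The loop runs with offset = 16, 8, 4, 2, 1, i.e. five
/// (log2(32)) shuffle/reduce rounds, and after the offset = 1 round lane 0
/// holds the value reduced across all 32 lanes.
///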
3708 /// Contiguous Partial Warp Reduction
3709 ///
3710 /// This reduce algorithm is used within a warp where only the first
3711 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3712 /// number of OpenMP threads in a parallel region is not a multiple of
3713 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3714 ///
3715 /// void
3716 /// contiguous_partial_reduce(void *reduce_data,
3717 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3718 /// int size, int lane_id) {
3719 /// int curr_size;
3720 /// int offset;
3721 /// curr_size = size;
3722 /// offset = curr_size/2;
3723 /// while (offset>0) {
3724 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3725 /// curr_size = (curr_size+1)/2;
3726 /// offset = curr_size/2;
3727 /// }
3728 /// }
3729 ///
3730 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3731 ///
3732 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3733 /// if (lane_id < offset)
3734 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3735 /// else
3736 /// reduce_elem = remote_elem
3737 ///
3738 /// This algorithm assumes that the data to be reduced are located in a
3739 /// contiguous subset of lanes starting from the first. When there is
3740 /// an odd number of active lanes, the data in the last lane is not
3741 /// aggregated with any other lane's data but is instead copied over.
3742 ///
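/// Illustrative note (not part of the original source): a hedged trace for
/// size = 5 active lanes:
///   offset = 2: lanes 0,1 aggregate lanes 2,3; lane 2 copies lane 4;
///               curr_size becomes 3;
///   offset = 1: lane 0 aggregates lane 1; lane 1 copies lane 2;
///               curr_size becomes 2;
///   offset = 1: lane 0 aggregates lane 1; curr_size becomes 1, offset = 0.
/// The fully reduced value ends up in lane 0.
///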
3743 /// Dispersed Partial Warp Reduction
3744 ///
3745 /// This algorithm is used within a warp when any discontiguous subset of
3746 /// lanes are active. It is used to implement the reduction operation
3747 /// across lanes in an OpenMP simd region or in a nested parallel region.
3748 ///
3749 /// void
3750 /// dispersed_partial_reduce(void *reduce_data,
3751 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3752 /// int size, remote_id;
3753 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3754 /// do {
3755 /// remote_id = next_active_lane_id_right_after_me();
3756 /// # the above function returns 0 if no active lane
3757 /// # is present right after the current lane.
3758 /// size = number_of_active_lanes_in_this_warp();
3759 /// logical_lane_id /= 2;
3760 /// ShuffleReduceFn(reduce_data, logical_lane_id,
3761 /// remote_id-1-threadIdx.x, 2);
3762 /// } while (logical_lane_id % 2 == 0 && size > 1);
3763 /// }
3764 ///
3765 /// There is no assumption made about the initial state of the reduction.
3766 /// Any number of lanes (>=1) could be active at any position. The reduction
3767 /// result is returned in the first active lane.
3768 ///
3769 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3770 ///
3771 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3772 /// if (lane_id % 2 == 0 && offset > 0)
3773 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3774 /// else
3775 /// reduce_elem = remote_elem
3776 ///
3777 ///
3778 /// Intra-Team Reduction
3779 ///
3780 /// This function, as implemented in the runtime call
3781 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3782 /// threads in a team. It first reduces within a warp using the
3783 /// aforementioned algorithms. We then proceed to gather all such
3784 /// reduced values at the first warp.
3785 ///
3786 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3787 /// data from each of the "warp master" (zeroth lane of each warp, where
3788 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3789 /// a mathematical sense) the problem of reduction across warp masters in
3790 /// a block to the problem of warp reduction.
3791 ///
3792 ///
3793 /// Inter-Team Reduction
3794 ///
3795 /// Once a team has reduced its data to a single value, it is stored in
3796 /// a global scratchpad array. Since each team has a distinct slot, this
3797 /// can be done without locking.
3798 ///
3799 /// The last team to write to the scratchpad array proceeds to reduce the
3800 /// scratchpad array. One or more workers in the last team use the helper
3801 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3802 /// the k'th worker reduces every k'th element.
3803 ///
3804 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3805 /// reduce across workers and compute a globally reduced value.
3806 ///
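/// Illustrative note (not part of the original source): a hedged example of
/// the inter-team step, assuming 100 teams, 32 workers in the last team, and
/// a round-robin assignment: team t stores its reduced ReduceData in
/// scratchpad slot t; worker k of the last team then loads and reduces slots
/// k, k+32, k+64, ..., and the 32 resulting partial values are combined with
/// '__kmpc_nvptx_parallel_reduce_nowait' as described above.
///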
3807 void CGOpenMPRuntimeNVPTX::emitReduction(
3808  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3809  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3810  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3811  if (!CGF.HaveInsertPoint())
3812  return;
3813 
3814  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3815 #ifndef NDEBUG
3816  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3817 #endif
3818 
3819  if (Options.SimpleReduction) {
3820  assert(!TeamsReduction && !ParallelReduction &&
3821  "Invalid reduction selection in emitReduction.");
3822  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3823  ReductionOps, Options);
3824  return;
3825  }
3826 
3827  assert((TeamsReduction || ParallelReduction) &&
3828  "Invalid reduction selection in emitReduction.");
3829 
3830  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3831  // RedList, shuffle_reduce_func, interwarp_copy_func);
3832  // or
3833  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3834  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3835 
3836  llvm::Value *Res;
3837  if (ParallelReduction) {
3838  ASTContext &C = CGM.getContext();
3839  // 1. Build a list of reduction variables.
3840  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3841  auto Size = RHSExprs.size();
3842  for (const Expr *E : Privates) {
3843  if (E->getType()->isVariablyModifiedType())
3844  // Reserve place for array size.
3845  ++Size;
3846  }
3847  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3848  QualType ReductionArrayTy =
3849  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3850  /*IndexTypeQuals=*/0);
3851  Address ReductionList =
3852  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3853  auto IPriv = Privates.begin();
3854  unsigned Idx = 0;
3855  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3856  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3857  CGF.getPointerSize());
3858  CGF.Builder.CreateStore(
3859  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3860  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3861  Elem);
3862  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3863  // Store array size.
3864  ++Idx;
3865  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3866  CGF.getPointerSize());
3867  llvm::Value *Size = CGF.Builder.CreateIntCast(
3868  CGF.getVLASize(
3869  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3870  .NumElts,
3871  CGF.SizeTy, /*isSigned=*/false);
3872  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3873  Elem);
3874  }
3875  }
3876 
3877  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3878  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3879  ReductionList.getPointer(), CGF.VoidPtrTy);
3880  llvm::Value *ReductionFn = emitReductionFunction(
3881  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3882  Privates, LHSExprs, RHSExprs, ReductionOps);
3883  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3884  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3885  llvm::Value *InterWarpCopyFn =
3886  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3887 
3888  llvm::Value *Args[] = {ThreadId,
3889  CGF.Builder.getInt32(RHSExprs.size()),
3890  ReductionArrayTySize,
3891  RL,
3892  ShuffleAndReduceFn,
3893  InterWarpCopyFn};
3894 
3895  Res = CGF.EmitRuntimeCall(
3896  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
3897  Args);
3898  } else {
3899  assert(TeamsReduction && "expected teams reduction.");
3900  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3901  std::string Name = getName({"reduction"});
3902  llvm::Value *Lock = getCriticalRegionLock(Name);
3903  llvm::Value *Args[] = {RTLoc, ThreadId, Lock};
3904  Res = CGF.EmitRuntimeCall(
3905  createNVPTXRuntimeFunction(
3906  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple),
3907  Args);
3908  }
3909 
3910  // 5. Build if (res == 1)
3911  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3912  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3913  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3914  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3915  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3916 
3917  // 6. Build then branch: where we have reduced values in the master
3918  // thread in each team.
3919  // __kmpc_end_reduce{_nowait}(<gtid>);
3920  // break;
3921  CGF.EmitBlock(ThenBB);
3922 
3923  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3924  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3925  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3926  auto IPriv = Privates.begin();
3927  auto ILHS = LHSExprs.begin();
3928  auto IRHS = RHSExprs.begin();
3929  for (const Expr *E : ReductionOps) {
3930  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3931  cast<DeclRefExpr>(*IRHS));
3932  ++IPriv;
3933  ++ILHS;
3934  ++IRHS;
3935  }
3936  };
3937  if (ParallelReduction) {
3938  llvm::Value *EndArgs[] = {ThreadId};
3939  RegionCodeGenTy RCG(CodeGen);
3940  NVPTXActionTy Action(
3941  nullptr, llvm::None,
3942  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
3943  EndArgs);
3944  RCG.setAction(Action);
3945  RCG(CGF);
3946  } else {
3947  assert(TeamsReduction && "expected teams reduction.");
3948  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3949  std::string Name = getName({"reduction"});
3950  llvm::Value *Lock = getCriticalRegionLock(Name);
3951  llvm::Value *EndArgs[] = {RTLoc, ThreadId, Lock};
3952  RegionCodeGenTy RCG(CodeGen);
3953  NVPTXActionTy Action(
3954  nullptr, llvm::None,
3955  createNVPTXRuntimeFunction(
3956  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple),
3957  EndArgs);
3958  RCG.setAction(Action);
3959  RCG(CGF);
3960  }
3961  // There is no need to emit line number for unconditional branch.
3962  (void)ApplyDebugLocation::CreateEmpty(CGF);
3963  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3964 }
3965 
3966 const VarDecl *
3967 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
3968  const VarDecl *NativeParam) const {
3969  if (!NativeParam->getType()->isReferenceType())
3970  return NativeParam;
3971  QualType ArgType = NativeParam->getType();
3972  QualifierCollector QC;
3973  const Type *NonQualTy = QC.strip(ArgType);
3974  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3975  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3976  if (Attr->getCaptureKind() == OMPC_map) {
3977  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3978  LangAS::opencl_global);
3979  }
3980  }
3981  ArgType = CGM.getContext().getPointerType(PointeeTy);
3982  QC.addRestrict();
3983  enum { NVPTX_local_addr = 5 };
3984  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3985  ArgType = QC.apply(CGM.getContext(), ArgType);
3986  if (isa<ImplicitParamDecl>(NativeParam))
3987  return ImplicitParamDecl::Create(
3988  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3989  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3990  return ParmVarDecl::Create(
3991  CGM.getContext(),
3992  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3993  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3994  NativeParam->getIdentifier(), ArgType,
3995  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3996 }
3997 
3998 Address
3999 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4000  const VarDecl *NativeParam,
4001  const VarDecl *TargetParam) const {
4002  assert(NativeParam != TargetParam &&
4003  NativeParam->getType()->isReferenceType() &&
4004  "Native arg must not be the same as target arg.");
4005  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4006  QualType NativeParamType = NativeParam->getType();
4007  QualifierCollector QC;
4008  const Type *NonQualTy = QC.strip(NativeParamType);
4009  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4010  unsigned NativePointeeAddrSpace =
4011  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4012  QualType TargetTy = TargetParam->getType();
4013  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4014  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4015  // First cast to generic.
4016  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4017  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4018  /*AddrSpace=*/0));
4019  // Cast from generic to native address space.
4020  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4021  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4022  NativePointeeAddrSpace));
4023  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4024  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4025  NativeParamType);
4026  return NativeParamAddr;
4027 }
4028 
4029 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4030  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
4031  ArrayRef<llvm::Value *> Args) const {
4032  SmallVector<llvm::Value *, 4> TargetArgs;
4033  TargetArgs.reserve(Args.size());
4034  auto *FnType =
4035  cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
4036  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4037  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4038  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4039  break;
4040  }
4041  llvm::Type *TargetType = FnType->getParamType(I);
4042  llvm::Value *NativeArg = Args[I];
4043  if (!TargetType->isPointerTy()) {
4044  TargetArgs.emplace_back(NativeArg);
4045  continue;
4046  }
4047  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4048  NativeArg,
4049  NativeArg->getType()->getPointerElementType()->getPointerTo());
4050  TargetArgs.emplace_back(
4051  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4052  }
4053  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4054 }
4055 
4056 /// Emit a function that wraps the outlined parallel region
4057 /// and controls the arguments that are passed to this function.
4058 /// The wrapper ensures that the outlined function is called
4059 /// with the correct arguments when data is shared.
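/// Illustrative note (not part of the original source): a hedged sketch of
/// the generated wrapper for an outlined region with one shared variable 'n'
/// ('outlined' and 'outlined_wrapper' are placeholder names):
///
///   void outlined_wrapper(uint16_t parallel_level, uint32_t tid) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     // forward the thread id, a zeroed bound-tid slot, and the shared 'n'
///     outlined(&tid, &zero, (int *)shared_args[0]);
///   }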
4060 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4061  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4062  ASTContext &Ctx = CGM.getContext();
4063  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4064 
4065  // Create a function that takes as argument the source thread.
4066  FunctionArgList WrapperArgs;
4067  QualType Int16QTy =
4068  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4069  QualType Int32QTy =
4070  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4071  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4072  /*Id=*/nullptr, Int16QTy,
4073  ImplicitParamDecl::Other);
4074  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4075  /*Id=*/nullptr, Int32QTy,
4076  ImplicitParamDecl::Other);
4077  WrapperArgs.emplace_back(&ParallelLevelArg);
4078  WrapperArgs.emplace_back(&WrapperArg);
4079 
4080  const CGFunctionInfo &CGFI =
4081  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4082 
4083  auto *Fn = llvm::Function::Create(
4084  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4085  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4086  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4087  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4088  Fn->setDoesNotRecurse();
4089 
4090  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4091  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4092  D.getBeginLoc(), D.getBeginLoc());
4093 
4094  const auto *RD = CS.getCapturedRecordDecl();
4095  auto CurField = RD->field_begin();
4096 
4097  Address ZeroAddr = CGF.CreateMemTemp(
4098  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4099  /*Name*/ ".zero.addr");
4100  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4101  // Get the array of arguments.
4102  SmallVector<llvm::Value *, 8> Args;
4103 
4104  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4105  Args.emplace_back(ZeroAddr.getPointer());
4106 
4107  CGBuilderTy &Bld = CGF.Builder;
4108  auto CI = CS.capture_begin();
4109 
4110  // Use global memory for data sharing.
4111  // Handle passing of global args to workers.
4112  Address GlobalArgs =
4113  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4114  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4115  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4116  CGF.EmitRuntimeCall(
4117  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4118  DataSharingArgs);
4119 
4120  // Retrieve the shared variables from the list of references returned
4121  // by the runtime. Pass the variables to the outlined function.
4122  Address SharedArgListAddress = Address::invalid();
4123  if (CS.capture_size() > 0 ||
4124  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4125  SharedArgListAddress = CGF.EmitLoadOfPointer(
4126  GlobalArgs, CGF.getContext()
4127  .getPointerType(CGF.getContext().getPointerType(
4128  CGF.getContext().VoidPtrTy))
4129  .castAs<PointerType>());
4130  }
4131  unsigned Idx = 0;
4132  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4133  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4134  CGF.getPointerSize());
4135  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4136  Src, CGF.SizeTy->getPointerTo());
4137  llvm::Value *LB = CGF.EmitLoadOfScalar(
4138  TypedAddress,
4139  /*Volatile=*/false,
4141  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4142  Args.emplace_back(LB);
4143  ++Idx;
4144  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4145  CGF.getPointerSize());
4146  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4147  Src, CGF.SizeTy->getPointerTo());
4148  llvm::Value *UB = CGF.EmitLoadOfScalar(
4149  TypedAddress,
4150  /*Volatile=*/false,
4152  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4153  Args.emplace_back(UB);
4154  ++Idx;
4155  }
4156  if (CS.capture_size() > 0) {
4157  ASTContext &CGFContext = CGF.getContext();
4158  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4159  QualType ElemTy = CurField->getType();
4160  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
4161  CGF.getPointerSize());
4162  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4163  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4164  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4165  /*Volatile=*/false,
4166  CGFContext.getPointerType(ElemTy),
4167  CI->getLocation());
4168  if (CI->capturesVariableByCopy() &&
4169  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4170  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4171  CI->getLocation());
4172  }
4173  Args.emplace_back(Arg);
4174  }
4175  }
4176 
4177  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4178  CGF.FinishFunction();
4179  return Fn;
4180 }
4181 
4182 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4183  const Decl *D) {
4184  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4185  return;
4186 
4187  assert(D && "Expected function or captured|block decl.");
4188  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4189  "Function is registered already.");
4190  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4191  "Team is set but not processed.");
4192  const Stmt *Body = nullptr;
4193  bool NeedToDelayGlobalization = false;
4194  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4195  Body = FD->getBody();
4196  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4197  Body = BD->getBody();
4198  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4199  Body = CD->getBody();
4200  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4201  if (NeedToDelayGlobalization &&
4202  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4203  return;
4204  }
4205  if (!Body)
4206  return;
4207  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4208  VarChecker.Visit(Body);
4209  const RecordDecl *GlobalizedVarsRecord =
4210  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4211  TeamAndReductions.first = nullptr;
4212  TeamAndReductions.second.clear();
4213  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4214  VarChecker.getEscapedVariableLengthDecls();
4215  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4216  return;
4217  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4218  I->getSecond().MappedParams =
4219  llvm::make_unique<CodeGenFunction::OMPMapVars>();
4220  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4221  I->getSecond().EscapedParameters.insert(
4222  VarChecker.getEscapedParameters().begin(),
4223  VarChecker.getEscapedParameters().end());
4224  I->getSecond().EscapedVariableLengthDecls.append(
4225  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4226  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4227  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4228  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4229  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4230  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4231  }
4232  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4233  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4234  VarChecker.Visit(Body);
4235  I->getSecond().SecondaryGlobalRecord =
4236  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4237  I->getSecond().SecondaryLocalVarData.emplace();
4238  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4239  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4240  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4241  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4242  Data.insert(
4243  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4244  }
4245  }
4246  if (!NeedToDelayGlobalization) {
4247  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4248  struct GlobalizationScope final : EHScopeStack::Cleanup {
4249  GlobalizationScope() = default;
4250 
4251  void Emit(CodeGenFunction &CGF, Flags flags) override {
4252  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4253  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4254  }
4255  };
4256  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4257  }
4258 }
4259 
4260 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4261  const VarDecl *VD) {
4262  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4263  return Address::invalid();
4264 
4265  VD = VD->getCanonicalDecl();
4266  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4267  if (I == FunctionGlobalizedDecls.end())
4268  return Address::invalid();
4269  auto VDI = I->getSecond().LocalVarData.find(VD);
4270  if (VDI != I->getSecond().LocalVarData.end())
4271  return VDI->second.PrivateAddr;
4272  if (VD->hasAttrs()) {
4273  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4274  E(VD->attr_end());
4275  IT != E; ++IT) {
4276  auto VDI = I->getSecond().LocalVarData.find(
4277  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4278  ->getCanonicalDecl());
4279  if (VDI != I->getSecond().LocalVarData.end())
4280  return VDI->second.PrivateAddr;
4281  }
4282  }
4283  return Address::invalid();
4284 }
4285 
4286 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4287  FunctionGlobalizedDecls.erase(CGF.CurFn);
4288  CGOpenMPRuntime::functionFinished(CGF);
4289 }
4290 
4291 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4292  CodeGenFunction &CGF, const OMPLoopDirective &S,
4293  OpenMPDistScheduleClauseKind &ScheduleKind,
4294  llvm::Value *&Chunk) const {
4295  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4296  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4297  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4298  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4299  S.getIterationVariable()->getType(), S.getBeginLoc());
4300  return;
4301  }
4302  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4303  CGF, S, ScheduleKind, Chunk);
4304 }
4305 
4306 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4307  CodeGenFunction &CGF, const OMPLoopDirective &S,
4308  OpenMPScheduleClauseKind &ScheduleKind,
4309  const Expr *&ChunkExpr) const {
4310  ScheduleKind = OMPC_SCHEDULE_static;
4311  // Chunk size is 1 in this case.
4312  llvm::APInt ChunkSize(32, 1);
4313  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4314  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4315  SourceLocation());
4316 }
4317 
4318 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4319  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4320  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4321  " Expected target-based directive.");
4322  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4323  for (const CapturedStmt::Capture &C : CS->captures()) {
4324  // Capture variables captured by reference in lambdas for target-based
4325  // directives.
4326  if (!C.capturesVariable())
4327  continue;
4328  const VarDecl *VD = C.getCapturedVar();
4329  const auto *RD = VD->getType()
4330  .getCanonicalType()
4331  .getNonReferenceType()
4332  ->getAsCXXRecordDecl();
4333  if (!RD || !RD->isLambda())
4334  continue;
4335  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4336  LValue VDLVal;
4337  if (VD->getType().getCanonicalType()->isReferenceType())
4338  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4339  else
4340  VDLVal = CGF.MakeAddrLValue(
4341  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4342  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4343  FieldDecl *ThisCapture = nullptr;
4344  RD->getCaptureFields(Captures, ThisCapture);
4345  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4346  LValue ThisLVal =
4347  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4348  llvm::Value *CXXThis = CGF.LoadCXXThis();
4349  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4350  }
4351  for (const LambdaCapture &LC : RD->captures()) {
4352  if (LC.getCaptureKind() != LCK_ByRef)
4353  continue;
4354  const VarDecl *VD = LC.getCapturedVar();
4355  if (!CS->capturesVariable(VD))
4356  continue;
4357  auto It = Captures.find(VD);
4358  assert(It != Captures.end() && "Found lambda capture without field.");
4359  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4360  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4361  if (VD->getType().getCanonicalType()->isReferenceType())
4362  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4363  VD->getType().getCanonicalType())
4364  .getAddress();
4365  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4366  }
4367  }
4368 }
4369 
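adjustTargetSpecificDataForLambdas above rewrites the by-reference captures (and the captured this, when present) of lambdas that reach a target region, so that the device-side copy of the closure holds device addresses rather than stale host pointers. A minimal sketch of the source pattern it handles (not part of this file; the names are assumed):

int x = 0;
auto add = [&x](int v) { x += v; };   // by-reference capture of x
#pragma omp target map(tofrom: x)
add(5);                               // the device copy of 'add' must reference the mapped x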
4370 // Get current CudaArch and ignore any unknown values
4371 static CudaArch getCudaArch(CodeGenModule &CGM) {
4372  if (!CGM.getTarget().hasFeature("ptx"))
4373  return CudaArch::UNKNOWN;
4374  llvm::StringMap<bool> Features;
4375  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4376  CGM.getTarget().getTargetOpts().CPU,
4377  CGM.getTarget().getTargetOpts().Features);
4378  for (const auto &Feature : Features) {
4379  if (Feature.getValue()) {
4380  CudaArch Arch = StringToCudaArch(Feature.getKey());
4381  if (Arch != CudaArch::UNKNOWN)
4382  return Arch;
4383  }
4384  }
4385  return CudaArch::UNKNOWN;
4386 }
4387 
4388 /// Check to see if target architecture supports unified addressing which is
4389 /// a restriction for OpenMP requires clause "unified_shared_memory".
4390 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4391  CodeGenModule &CGM, const OMPRequiresDecl *D) const {
4392  for (const OMPClause *Clause : D->clauselists()) {
4393  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4394  switch (getCudaArch(CGM)) {
4395  case CudaArch::SM_20:
4396  case CudaArch::SM_21:
4397  case CudaArch::SM_30:
4398  case CudaArch::SM_32:
4399  case CudaArch::SM_35:
4400  case CudaArch::SM_37:
4401  case CudaArch::SM_50:
4402  case CudaArch::SM_52:
4403  case CudaArch::SM_53:
4404  case CudaArch::SM_60:
4405  case CudaArch::SM_61:
4406  case CudaArch::SM_62:
4407  CGM.Error(Clause->getBeginLoc(),
4408  "Target architecture does not support unified addressing");
4409  return;
4410  case CudaArch::SM_70:
4411  case CudaArch::SM_72:
4412  case CudaArch::SM_75:
4413  case CudaArch::GFX600:
4414  case CudaArch::GFX601:
4415  case CudaArch::GFX700:
4416  case CudaArch::GFX701:
4417  case CudaArch::GFX702:
4418  case CudaArch::GFX703:
4419  case CudaArch::GFX704:
4420  case CudaArch::GFX801:
4421  case CudaArch::GFX802:
4422  case CudaArch::GFX803:
4423  case CudaArch::GFX810:
4424  case CudaArch::GFX900:
4425  case CudaArch::GFX902:
4426  case CudaArch::GFX904:
4427  case CudaArch::GFX906:
4428  case CudaArch::GFX909:
4429  case CudaArch::UNKNOWN:
4430  break;
4431  case CudaArch::LAST:
4432  llvm_unreachable("Unexpected Cuda arch.");
4433  }
4434  }
4435  }
4436 }
4437 
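The switch above enforces the restriction named in the comment: the requires clause unified_shared_memory is accepted only when compiling for sm_70 or newer NVPTX targets, older SM architectures get the diagnostic emitted via CGM.Error, and the listed GFX and UNKNOWN architectures are simply not diagnosed here. The directive being validated is:

// Accepted for sm_70/sm_72/sm_75; rejected with
// "Target architecture does not support unified addressing" for sm_62 and older.
#pragma omp requires unified_shared_memory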
4438 /// Get number of SMs and number of blocks per SM.
4439 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4440  std::pair<unsigned, unsigned> Data;
4441  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4442  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4443  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4444  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4445  if (Data.first && Data.second)
4446  return Data;
4447  switch (getCudaArch(CGM)) {
4448  case CudaArch::SM_20:
4449  case CudaArch::SM_21:
4450  case CudaArch::SM_30:
4451  case CudaArch::SM_32:
4452  case CudaArch::SM_35:
4453  case CudaArch::SM_37:
4454  case CudaArch::SM_50:
4455  case CudaArch::SM_52:
4456  case CudaArch::SM_53:
4457  return {16, 16};
4458  case CudaArch::SM_60:
4459  case CudaArch::SM_61:
4460  case CudaArch::SM_62:
4461  return {56, 32};
4462  case CudaArch::SM_70:
4463  case CudaArch::SM_72:
4464  case CudaArch::SM_75:
4465  return {84, 32};
4466  case CudaArch::GFX600:
4467  case CudaArch::GFX601:
4468  case CudaArch::GFX700:
4469  case CudaArch::GFX701:
4470  case CudaArch::GFX702:
4471  case CudaArch::GFX703:
4472  case CudaArch::GFX704:
4473  case CudaArch::GFX801:
4474  case CudaArch::GFX802:
4475  case CudaArch::GFX803:
4476  case CudaArch::GFX810:
4477  case CudaArch::GFX900:
4478  case CudaArch::GFX902:
4479  case CudaArch::GFX904:
4480  case CudaArch::GFX906:
4481  case CudaArch::GFX909:
4482  case CudaArch::UNKNOWN:
4483  break;
4484  case CudaArch::LAST:
4485  llvm_unreachable("Unexpected Cuda arch.");
4486  }
4487  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4488 }
4489 
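The {number of SMs, blocks per SM} pair returned above (overridable via the OpenMPCUDANumSMs and OpenMPCUDABlocksPerSM language options) sizes the statically allocated globalization buffer emitted in clear() further below: one element of the record union is reserved per resident block. A rough sketch of how the two values combine (the variable names are illustrative only):

std::pair<unsigned, unsigned> Cfg = getSMsBlocksPerSM(CGM); // {SMs, blocks per SM}
unsigned BufferElements = Cfg.first * Cfg.second;           // e.g. 84 * 32 = 2688 for sm_70
// _openmp_static_glob_rd_$_ is a [SMs][blocks per SM] array of the union type.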
4490 void CGOpenMPRuntimeNVPTX::clear() {
4491  if (!GlobalizedRecords.empty()) {
4492  ASTContext &C = CGM.getContext();
4493  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4494  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4495  RecordDecl *StaticRD = C.buildImplicitRecord(
4496  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4497  StaticRD->startDefinition();
4498  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4499  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4500  SharedStaticRD->startDefinition();
4501  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4502  if (Records.Records.empty())
4503  continue;
4504  unsigned Size = 0;
4505  unsigned RecAlignment = 0;
4506  for (const RecordDecl *RD : Records.Records) {
4507  QualType RDTy = C.getRecordType(RD);
4508  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4509  RecAlignment = std::max(RecAlignment, Alignment);
4510  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4511  Size =
4512  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4513  }
4514  Size = llvm::alignTo(Size, RecAlignment);
4515  llvm::APInt ArySize(/*numBits=*/64, Size);
4516  QualType SubTy = C.getConstantArrayType(
4517  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
4518  const bool UseSharedMemory = Size <= SharedMemorySize;
4519  auto *Field =
4520  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4521  SourceLocation(), SourceLocation(), nullptr, SubTy,
4522  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4523  /*BW=*/nullptr, /*Mutable=*/false,
4524  /*InitStyle=*/ICIS_NoInit);
4525  Field->setAccess(AS_public);
4526  if (UseSharedMemory) {
4527  SharedStaticRD->addDecl(Field);
4528  SharedRecs.push_back(&Records);
4529  } else {
4530  StaticRD->addDecl(Field);
4531  GlobalRecs.push_back(&Records);
4532  }
4533  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
4534  Records.UseSharedMemory->setInitializer(
4535  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
4536  }
4537  SharedStaticRD->completeDefinition();
4538  if (!SharedStaticRD->field_empty()) {
4539  QualType StaticTy = C.getRecordType(SharedStaticRD);
4540  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
4541  auto *GV = new llvm::GlobalVariable(
4542  CGM.getModule(), LLVMStaticTy,
4543  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
4544  llvm::Constant::getNullValue(LLVMStaticTy),
4545  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
4546  llvm::GlobalValue::NotThreadLocal,
4547  C.getTargetAddressSpace(LangAS::cuda_shared));
4548  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4549  GV, CGM.VoidPtrTy);
4550  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
4551  Rec->Buffer->replaceAllUsesWith(Replacement);
4552  Rec->Buffer->eraseFromParent();
4553  }
4554  }
4555  StaticRD->completeDefinition();
4556  if (!StaticRD->field_empty()) {
4557  QualType StaticTy = C.getRecordType(StaticRD);
4558  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
4559  llvm::APInt Size1(32, SMsBlockPerSM.second);
4560  QualType Arr1Ty =
4561  C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
4562  /*IndexTypeQuals=*/0);
4563  llvm::APInt Size2(32, SMsBlockPerSM.first);
4564  QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
4565  /*IndexTypeQuals=*/0);
4566  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
4567  auto *GV = new llvm::GlobalVariable(
4568  CGM.getModule(), LLVMArr2Ty,
4569  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
4570  llvm::Constant::getNullValue(LLVMArr2Ty),
4571  "_openmp_static_glob_rd_$_");
4572  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
4573  GV, CGM.VoidPtrTy);
4574  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
4575  Rec->Buffer->replaceAllUsesWith(Replacement);
4576  Rec->Buffer->eraseFromParent();
4577  }
4578  }
4579  }
4580  CGOpenMPRuntime::clear();
4581 }