1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for OpenMP runtime code generation specialized to NVPTX
10 // targets.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeNVPTX.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "clang/AST/StmtOpenMP.h"
18 #include "clang/AST/StmtVisitor.h"
19 #include "clang/Basic/Cuda.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
64  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
65  /// global_tid, void *global_buffer, int32_t num_of_records, void*
66  /// reduce_data,
67  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
68  /// lane_offset, int16_t shortCircuit),
69  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
70  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
71  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
72  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
73  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
74  /// *buffer, int idx, void *reduce_data));
75  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
76  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
77  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
78  /// Call to void __kmpc_data_sharing_init_stack();
79  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
80  /// Call to void __kmpc_data_sharing_init_stack_spmd();
81  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
82  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
83  /// int16_t UseSharedMemory);
84  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
85  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
86  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
87  /// Call to void __kmpc_begin_sharing_variables(void ***args,
88  /// size_t n_args);
89  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
90  /// Call to void __kmpc_end_sharing_variables();
91  OMPRTL_NVPTX__kmpc_end_sharing_variables,
92  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
93  OMPRTL_NVPTX__kmpc_get_shared_variables,
94  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
95  /// global_tid);
96  OMPRTL_NVPTX__kmpc_parallel_level,
97  /// Call to int8_t __kmpc_is_spmd_exec_mode();
98  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
99  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
100  /// const void *buf, size_t size, int16_t is_shared, const void **res);
101  OMPRTL_NVPTX__kmpc_get_team_static_memory,
102  /// Call to void __kmpc_restore_team_static_memory(int16_t
103  /// isSPMDExecutionMode, int16_t is_shared);
104  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
105  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
106  OMPRTL__kmpc_barrier,
107  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
108  /// global_tid);
109  OMPRTL__kmpc_barrier_simple_spmd,
110  /// Call to int32_t __kmpc_warp_active_thread_mask(void);
111  OMPRTL_NVPTX__kmpc_warp_active_thread_mask,
112  /// Call to void __kmpc_syncwarp(int32_t Mask);
113  OMPRTL_NVPTX__kmpc_syncwarp,
114 };
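
// Each enumerator above is mapped to a concrete llvm::FunctionCallee by
// createNVPTXRuntimeFunction() below and then emitted with
// CGF.EmitRuntimeCall(); for example, the kernel entry header does:
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);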
115 
116 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
117 class NVPTXActionTy final : public PrePostActionTy {
118  llvm::FunctionCallee EnterCallee = nullptr;
119  ArrayRef<llvm::Value *> EnterArgs;
120  llvm::FunctionCallee ExitCallee = nullptr;
121  ArrayRef<llvm::Value *> ExitArgs;
122  bool Conditional = false;
123  llvm::BasicBlock *ContBlock = nullptr;
124 
125 public:
126  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
127  ArrayRef<llvm::Value *> EnterArgs,
128  llvm::FunctionCallee ExitCallee,
129  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
130  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
131  ExitArgs(ExitArgs), Conditional(Conditional) {}
132  void Enter(CodeGenFunction &CGF) override {
133  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
134  if (Conditional) {
135  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
136  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
137  ContBlock = CGF.createBasicBlock("omp_if.end");
138  // Generate the branch (If-stmt)
139  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
140  CGF.EmitBlock(ThenBlock);
141  }
142  }
143  void Done(CodeGenFunction &CGF) {
144  // Emit the rest of blocks/branches
145  CGF.EmitBranch(ContBlock);
146  CGF.EmitBlock(ContBlock, true);
147  }
148  void Exit(CodeGenFunction &CGF) override {
149  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
150  }
151 };
152 
153 /// A class to track the execution mode when codegening directives within
154 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
155 /// to the target region and used by containing directives such as 'parallel'
156 /// to emit optimized code.
157 class ExecutionRuntimeModesRAII {
158 private:
159  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
160  CGOpenMPRuntimeNVPTX::EM_Unknown;
161  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
162  bool SavedRuntimeMode = false;
163  bool *RuntimeMode = nullptr;
164 
165 public:
166  /// Constructor for Non-SPMD mode.
167  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
168  : ExecMode(ExecMode) {
169  SavedExecMode = ExecMode;
170  ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
171  }
172  /// Constructor for SPMD mode.
173  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
174  bool &RuntimeMode, bool FullRuntimeMode)
175  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
176  SavedExecMode = ExecMode;
177  SavedRuntimeMode = RuntimeMode;
178  ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
179  RuntimeMode = FullRuntimeMode;
180  }
181  ~ExecutionRuntimeModesRAII() {
182  ExecMode = SavedExecMode;
183  if (RuntimeMode)
184  *RuntimeMode = SavedRuntimeMode;
185  }
186 };
187 
188 /// GPU Configuration: This information can be derived from CUDA registers;
189 /// however, providing compile-time constants helps generate more efficient
190 /// code. For all practical purposes this is fine because the configuration
191 /// is the same for all known NVPTX architectures.
192 enum MachineConfiguration : unsigned {
193  WarpSize = 32,
194  /// Number of bits required to represent a lane identifier, which is
195  /// computed as log_2(WarpSize).
196  LaneIDBits = 5,
197  LaneIDMask = WarpSize - 1,
198 
199  /// Global memory alignment for performance.
200  GlobalMemoryAlignment = 128,
201 
202  /// Maximal size of the shared memory buffer.
203  SharedMemorySize = 128,
204 };
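
// With WarpSize == 32, a thread id decomposes into warp and lane components
// exactly as getNVPTXWarpID()/getNVPTXLaneID() below compute them:
//   warp id = tid >> LaneIDBits;  // tid / 32
//   lane id = tid & LaneIDMask;   // tid % 32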
205 
206 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
207  RefExpr = RefExpr->IgnoreParens();
208  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
209  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
210  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
211  Base = TempASE->getBase()->IgnoreParenImpCasts();
212  RefExpr = Base;
213  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
214  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
215  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
216  Base = TempOASE->getBase()->IgnoreParenImpCasts();
217  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
218  Base = TempASE->getBase()->IgnoreParenImpCasts();
219  RefExpr = Base;
220  }
221  RefExpr = RefExpr->IgnoreParenImpCasts();
222  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
223  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
224  const auto *ME = cast<MemberExpr>(RefExpr);
225  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
226 }
227 
228 
229 static RecordDecl *buildRecordForGlobalizedVars(
230  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
231  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
232  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
233  &MappedDeclsFields, int BufSize) {
234  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
235  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
236  return nullptr;
237  SmallVector<VarsDataTy, 4> GlobalizedVars;
238  for (const ValueDecl *D : EscapedDecls)
239  GlobalizedVars.emplace_back(
240  CharUnits::fromQuantity(std::max(
241  C.getDeclAlign(D).getQuantity(),
242  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
243  D);
244  for (const ValueDecl *D : EscapedDeclsForTeams)
245  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
246  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
247  return L.first > R.first;
248  });
249 
250  // Build struct _globalized_locals_ty {
251  // /* globalized vars */[WarpSize] align (max(decl_align,
252  // GlobalMemoryAlignment))
253  // /* globalized vars */ for EscapedDeclsForTeams
254  // };
255  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
256  GlobalizedRD->startDefinition();
257  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
258  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
259  for (const auto &Pair : GlobalizedVars) {
260  const ValueDecl *VD = Pair.second;
261  QualType Type = VD->getType();
262  if (Type->isLValueReferenceType())
263  Type = C.getPointerType(Type.getNonReferenceType());
264  else
265  Type = Type.getNonReferenceType();
266  SourceLocation Loc = VD->getLocation();
267  FieldDecl *Field;
268  if (SingleEscaped.count(VD)) {
269  Field = FieldDecl::Create(
270  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
271  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
272  /*BW=*/nullptr, /*Mutable=*/false,
273  /*InitStyle=*/ICIS_NoInit);
274  Field->setAccess(AS_public);
275  if (VD->hasAttrs()) {
276  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
277  E(VD->getAttrs().end());
278  I != E; ++I)
279  Field->addAttr(*I);
280  }
281  } else {
282  llvm::APInt ArraySize(32, BufSize);
283  Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
284  0);
285  Field = FieldDecl::Create(
286  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
287  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
288  /*BW=*/nullptr, /*Mutable=*/false,
289  /*InitStyle=*/ICIS_NoInit);
290  Field->setAccess(AS_public);
291  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
292  static_cast<CharUnits::QuantityType>(
293  GlobalMemoryAlignment)));
294  Field->addAttr(AlignedAttr::CreateImplicit(
295  C, /*IsAlignmentExpr=*/true,
296  IntegerLiteral::Create(C, Align,
297  C.getIntTypeForBitwidth(32, /*Signed=*/0),
298  SourceLocation()),
299  {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
300  }
301  GlobalizedRD->addDecl(Field);
302  MappedDeclsFields.try_emplace(VD, Field);
303  }
304  GlobalizedRD->completeDefinition();
305  return GlobalizedRD;
306 }
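
// For illustration only: globalizing a single escaped local 'int x' from a
// parallel region (BufSize == WarpSize) produces a record roughly equivalent
// to
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128))); // max(decl align, GlobalMemoryAlignment)
//   };
// whereas a variable listed in EscapedDeclsForTeams keeps its scalar type and
// only inherits the alignment attributes already present on the declaration.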
307 
308 /// Get the list of variables that can escape their declaration context.
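/// A variable "escapes" when it may be accessed outside the declaring thread's
/// private stack, e.g. (illustrative) because its address is taken and handed
/// to other threads of a parallel region:
///   int x = 0;
///   use(&x); // UO_AddrOf marks 'x' as escaped, so it must be globalized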
309 class CheckVarsEscapingDeclContext final
310  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
311  CodeGenFunction &CGF;
312  llvm::SetVector<const ValueDecl *> EscapedDecls;
313  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
314  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
315  RecordDecl *GlobalizedRD = nullptr;
316  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
317  bool AllEscaped = false;
318  bool IsForCombinedParallelRegion = false;
319 
320  void markAsEscaped(const ValueDecl *VD) {
321  // Do not globalize declare target variables.
322  if (!isa<VarDecl>(VD) ||
323  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
324  return;
325  VD = cast<ValueDecl>(VD->getCanonicalDecl());
326  // Use user-specified allocation.
327  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
328  return;
329  // Variables captured by value must be globalized.
330  if (auto *CSI = CGF.CapturedStmtInfo) {
331  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
332  // Check if we need to capture the variable that was already captured
333  // by value in the outer region.
334  if (!IsForCombinedParallelRegion) {
335  if (!FD->hasAttrs())
336  return;
337  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
338  if (!Attr)
339  return;
340  if (((Attr->getCaptureKind() != OMPC_map) &&
341  !isOpenMPPrivate(
342  static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
343  ((Attr->getCaptureKind() == OMPC_map) &&
344  !FD->getType()->isAnyPointerType()))
345  return;
346  }
347  if (!FD->getType()->isReferenceType()) {
348  assert(!VD->getType()->isVariablyModifiedType() &&
349  "Parameter captured by value with variably modified type");
350  EscapedParameters.insert(VD);
351  } else if (!IsForCombinedParallelRegion) {
352  return;
353  }
354  }
355  }
356  if ((!CGF.CapturedStmtInfo ||
357  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
358  VD->getType()->isReferenceType())
359  // Do not globalize variables with reference type.
360  return;
361  if (VD->getType()->isVariablyModifiedType())
362  EscapedVariableLengthDecls.insert(VD);
363  else
364  EscapedDecls.insert(VD);
365  }
366 
367  void VisitValueDecl(const ValueDecl *VD) {
368  if (VD->getType()->isLValueReferenceType())
369  markAsEscaped(VD);
370  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
371  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
372  const bool SavedAllEscaped = AllEscaped;
373  AllEscaped = VD->getType()->isLValueReferenceType();
374  Visit(VarD->getInit());
375  AllEscaped = SavedAllEscaped;
376  }
377  }
378  }
379  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
380  ArrayRef<OMPClause *> Clauses,
381  bool IsCombinedParallelRegion) {
382  if (!S)
383  return;
384  for (const CapturedStmt::Capture &C : S->captures()) {
385  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
386  const ValueDecl *VD = C.getCapturedVar();
387  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
388  if (IsCombinedParallelRegion) {
389  // Check if the variable is privatized in the combined construct and
390  // those private copies must be shared in the inner parallel
391  // directive.
392  IsForCombinedParallelRegion = false;
393  for (const OMPClause *C : Clauses) {
394  if (!isOpenMPPrivate(C->getClauseKind()) ||
395  C->getClauseKind() == OMPC_reduction ||
396  C->getClauseKind() == OMPC_linear ||
397  C->getClauseKind() == OMPC_private)
398  continue;
399  ArrayRef<const Expr *> Vars;
400  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
401  Vars = PC->getVarRefs();
402  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
403  Vars = PC->getVarRefs();
404  else
405  llvm_unreachable("Unexpected clause.");
406  for (const auto *E : Vars) {
407  const Decl *D =
408  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
409  if (D == VD->getCanonicalDecl()) {
410  IsForCombinedParallelRegion = true;
411  break;
412  }
413  }
414  if (IsForCombinedParallelRegion)
415  break;
416  }
417  }
418  markAsEscaped(VD);
419  if (isa<OMPCapturedExprDecl>(VD))
420  VisitValueDecl(VD);
421  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
422  }
423  }
424  }
425 
426  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
427  assert(!GlobalizedRD &&
428  "Record for globalized variables is built already.");
429  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
430  if (IsInTTDRegion)
431  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
432  else
433  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
434  GlobalizedRD = ::buildRecordForGlobalizedVars(
435  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
436  MappedDeclsFields, WarpSize);
437  }
438 
439 public:
440  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
441  ArrayRef<const ValueDecl *> TeamsReductions)
442  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
443  }
444  virtual ~CheckVarsEscapingDeclContext() = default;
445  void VisitDeclStmt(const DeclStmt *S) {
446  if (!S)
447  return;
448  for (const Decl *D : S->decls())
449  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
450  VisitValueDecl(VD);
451  }
452  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
453  if (!D)
454  return;
455  if (!D->hasAssociatedStmt())
456  return;
457  if (const auto *S =
458  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
459  // Do not analyze directives that do not actually require capturing,
460  // like `omp for` or `omp simd` directives.
461  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
462  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
463  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
464  VisitStmt(S->getCapturedStmt());
465  return;
466  }
467  VisitOpenMPCapturedStmt(
468  S, D->clauses(),
469  CaptureRegions.back() == OMPD_parallel &&
470  isOpenMPDistributeDirective(D->getDirectiveKind()));
471  }
472  }
473  void VisitCapturedStmt(const CapturedStmt *S) {
474  if (!S)
475  return;
476  for (const CapturedStmt::Capture &C : S->captures()) {
477  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
478  const ValueDecl *VD = C.getCapturedVar();
479  markAsEscaped(VD);
480  if (isa<OMPCapturedExprDecl>(VD))
481  VisitValueDecl(VD);
482  }
483  }
484  }
485  void VisitLambdaExpr(const LambdaExpr *E) {
486  if (!E)
487  return;
488  for (const LambdaCapture &C : E->captures()) {
489  if (C.capturesVariable()) {
490  if (C.getCaptureKind() == LCK_ByRef) {
491  const ValueDecl *VD = C.getCapturedVar();
492  markAsEscaped(VD);
493  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
494  VisitValueDecl(VD);
495  }
496  }
497  }
498  }
499  void VisitBlockExpr(const BlockExpr *E) {
500  if (!E)
501  return;
502  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
503  if (C.isByRef()) {
504  const VarDecl *VD = C.getVariable();
505  markAsEscaped(VD);
506  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
507  VisitValueDecl(VD);
508  }
509  }
510  }
511  void VisitCallExpr(const CallExpr *E) {
512  if (!E)
513  return;
514  for (const Expr *Arg : E->arguments()) {
515  if (!Arg)
516  continue;
517  if (Arg->isLValue()) {
518  const bool SavedAllEscaped = AllEscaped;
519  AllEscaped = true;
520  Visit(Arg);
521  AllEscaped = SavedAllEscaped;
522  } else {
523  Visit(Arg);
524  }
525  }
526  Visit(E->getCallee());
527  }
528  void VisitDeclRefExpr(const DeclRefExpr *E) {
529  if (!E)
530  return;
531  const ValueDecl *VD = E->getDecl();
532  if (AllEscaped)
533  markAsEscaped(VD);
534  if (isa<OMPCapturedExprDecl>(VD))
535  VisitValueDecl(VD);
536  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
537  if (VarD->isInitCapture())
538  VisitValueDecl(VD);
539  }
540  void VisitUnaryOperator(const UnaryOperator *E) {
541  if (!E)
542  return;
543  if (E->getOpcode() == UO_AddrOf) {
544  const bool SavedAllEscaped = AllEscaped;
545  AllEscaped = true;
546  Visit(E->getSubExpr());
547  AllEscaped = SavedAllEscaped;
548  } else {
549  Visit(E->getSubExpr());
550  }
551  }
552  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
553  if (!E)
554  return;
555  if (E->getCastKind() == CK_ArrayToPointerDecay) {
556  const bool SavedAllEscaped = AllEscaped;
557  AllEscaped = true;
558  Visit(E->getSubExpr());
559  AllEscaped = SavedAllEscaped;
560  } else {
561  Visit(E->getSubExpr());
562  }
563  }
564  void VisitExpr(const Expr *E) {
565  if (!E)
566  return;
567  bool SavedAllEscaped = AllEscaped;
568  if (!E->isLValue())
569  AllEscaped = false;
570  for (const Stmt *Child : E->children())
571  if (Child)
572  Visit(Child);
573  AllEscaped = SavedAllEscaped;
574  }
575  void VisitStmt(const Stmt *S) {
576  if (!S)
577  return;
578  for (const Stmt *Child : S->children())
579  if (Child)
580  Visit(Child);
581  }
582 
583  /// Returns the record that handles all the escaped local variables and is
584  /// used instead of their original storage.
585  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
586  if (!GlobalizedRD)
587  buildRecordForGlobalizedVars(IsInTTDRegion);
588  return GlobalizedRD;
589  }
590 
591  /// Returns the field in the globalized record for the escaped variable.
592  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
593  assert(GlobalizedRD &&
594  "Record for globalized variables must be generated already.");
595  auto I = MappedDeclsFields.find(VD);
596  if (I == MappedDeclsFields.end())
597  return nullptr;
598  return I->getSecond();
599  }
600 
601  /// Returns the list of the escaped local variables/parameters.
602  ArrayRef<const ValueDecl *> getEscapedDecls() const {
603  return EscapedDecls.getArrayRef();
604  }
605 
606  /// Checks if the escaped local variable is actually a parameter passed by
607  /// value.
608  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
609  return EscapedParameters;
610  }
611 
612  /// Returns the list of the escaped variables with the variably modified
613  /// types.
614  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
615  return EscapedVariableLengthDecls.getArrayRef();
616  }
617 };
618 } // anonymous namespace
619 
620 /// Get the GPU warp size.
621 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
622  return CGF.EmitRuntimeCall(
623  llvm::Intrinsic::getDeclaration(
624  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
625  "nvptx_warp_size");
626 }
627 
628 /// Get the id of the current thread on the GPU.
629 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
630  return CGF.EmitRuntimeCall(
631  llvm::Intrinsic::getDeclaration(
632  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
633  "nvptx_tid");
634 }
635 
636 /// Get the id of the warp in the block.
637 /// We assume that the warp size is 32, which is always the case
638 /// on the NVPTX device, to generate more efficient code.
639 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
640  CGBuilderTy &Bld = CGF.Builder;
641  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
642 }
643 
644 /// Get the id of the current lane in the Warp.
645 /// We assume that the warp size is 32, which is always the case
646 /// on the NVPTX device, to generate more efficient code.
647 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
648  CGBuilderTy &Bld = CGF.Builder;
649  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
650  "nvptx_lane_id");
651 }
652 
653 /// Get the maximum number of threads in a block of the GPU.
654 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
655  return CGF.EmitRuntimeCall(
656  llvm::Intrinsic::getDeclaration(
657  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
658  "nvptx_num_threads");
659 }
660 
661 /// Get the value of the thread_limit clause in the teams directive.
662 /// For the 'generic' execution mode, the runtime encodes thread_limit in
663 /// the launch parameters, always starting thread_limit+warpSize threads per
664 /// CTA. The threads in the last warp are reserved for master execution.
665 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
666 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
667  bool IsInSPMDExecutionMode = false) {
668  CGBuilderTy &Bld = CGF.Builder;
669  return IsInSPMDExecutionMode
670  ? getNVPTXNumThreads(CGF)
671  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
672  "thread_limit");
673 }
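
// E.g. in generic mode a CTA launched with 128 threads yields
// thread_limit = 128 - 32 = 96: threads 0..95 are workers and the last warp
// (threads 96..127) is reserved for the master; in SPMD mode thread_limit is
// simply 128.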
674 
675 /// Get the thread id of the OMP master thread.
676 /// The master thread id is the first thread (lane) of the last warp in the
677 /// GPU block. Warp size is assumed to be some power of 2.
678 /// Thread id is 0 indexed.
679 /// E.g: If NumThreads is 33, master id is 32.
680 /// If NumThreads is 64, master id is 32.
681 /// If NumThreads is 1024, master id is 992.
682 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
683  CGBuilderTy &Bld = CGF.Builder;
684  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
685 
686  // We assume that the warp size is a power of 2.
687  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
688 
689  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
690  Bld.CreateNot(Mask), "master_tid");
691 }
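
// The computation above is master_tid = (NumThreads - 1) & ~(WarpSize - 1),
// i.e. NumThreads - 1 rounded down to a multiple of the warp size; with
// NumThreads == 33 or 64 this gives 32, and with 1024 it gives 992.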
692 
693 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
694  CodeGenModule &CGM, SourceLocation Loc)
695  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
696  Loc(Loc) {
697  createWorkerFunction(CGM);
698 }
699 
700 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
701  CodeGenModule &CGM) {
702  // Create a worker function with no arguments.
703 
704  WorkerFn = llvm::Function::Create(
705  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
706  /*placeholder=*/"_worker", &CGM.getModule());
707  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
708  WorkerFn->setDoesNotRecurse();
709 }
710 
711 CGOpenMPRuntimeNVPTX::ExecutionMode
712 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
713  return CurrentExecutionMode;
714 }
715 
716 static CGOpenMPRuntimeNVPTX::DataSharingMode
717 getDataSharingMode(CodeGenModule &CGM) {
718  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
719  : CGOpenMPRuntimeNVPTX::Generic;
720 }
721 
722 /// Check for inner (nested) SPMD construct, if any
723 static bool hasNestedSPMDDirective(ASTContext &Ctx,
724  const OMPExecutableDirective &D) {
725  const auto *CS = D.getInnermostCapturedStmt();
726  const auto *Body =
727  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
728  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
729 
730  if (const auto *NestedDir =
731  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
732  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
733  switch (D.getDirectiveKind()) {
734  case OMPD_target:
735  if (isOpenMPParallelDirective(DKind))
736  return true;
737  if (DKind == OMPD_teams) {
738  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
739  /*IgnoreCaptured=*/true);
740  if (!Body)
741  return false;
742  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
743  if (const auto *NND =
744  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
745  DKind = NND->getDirectiveKind();
746  if (isOpenMPParallelDirective(DKind))
747  return true;
748  }
749  }
750  return false;
751  case OMPD_target_teams:
752  return isOpenMPParallelDirective(DKind);
753  case OMPD_target_simd:
754  case OMPD_target_parallel:
755  case OMPD_target_parallel_for:
756  case OMPD_target_parallel_for_simd:
757  case OMPD_target_teams_distribute:
758  case OMPD_target_teams_distribute_simd:
759  case OMPD_target_teams_distribute_parallel_for:
760  case OMPD_target_teams_distribute_parallel_for_simd:
761  case OMPD_parallel:
762  case OMPD_for:
763  case OMPD_parallel_for:
764  case OMPD_parallel_sections:
765  case OMPD_for_simd:
766  case OMPD_parallel_for_simd:
767  case OMPD_cancel:
768  case OMPD_cancellation_point:
769  case OMPD_ordered:
770  case OMPD_threadprivate:
771  case OMPD_allocate:
772  case OMPD_task:
773  case OMPD_simd:
774  case OMPD_sections:
775  case OMPD_section:
776  case OMPD_single:
777  case OMPD_master:
778  case OMPD_critical:
779  case OMPD_taskyield:
780  case OMPD_barrier:
781  case OMPD_taskwait:
782  case OMPD_taskgroup:
783  case OMPD_atomic:
784  case OMPD_flush:
785  case OMPD_teams:
786  case OMPD_target_data:
787  case OMPD_target_exit_data:
788  case OMPD_target_enter_data:
789  case OMPD_distribute:
790  case OMPD_distribute_simd:
791  case OMPD_distribute_parallel_for:
792  case OMPD_distribute_parallel_for_simd:
793  case OMPD_teams_distribute:
794  case OMPD_teams_distribute_simd:
795  case OMPD_teams_distribute_parallel_for:
796  case OMPD_teams_distribute_parallel_for_simd:
797  case OMPD_target_update:
798  case OMPD_declare_simd:
799  case OMPD_declare_variant:
800  case OMPD_declare_target:
801  case OMPD_end_declare_target:
802  case OMPD_declare_reduction:
803  case OMPD_declare_mapper:
804  case OMPD_taskloop:
805  case OMPD_taskloop_simd:
806  case OMPD_master_taskloop:
807  case OMPD_master_taskloop_simd:
808  case OMPD_parallel_master_taskloop:
809  case OMPD_requires:
810  case OMPD_unknown:
811  llvm_unreachable("Unexpected directive.");
812  }
813  }
814 
815  return false;
816 }
817 
818 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
819  const OMPExecutableDirective &D) {
820  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
821  switch (DirectiveKind) {
822  case OMPD_target:
823  case OMPD_target_teams:
824  return hasNestedSPMDDirective(Ctx, D);
825  case OMPD_target_parallel:
826  case OMPD_target_parallel_for:
827  case OMPD_target_parallel_for_simd:
828  case OMPD_target_teams_distribute_parallel_for:
829  case OMPD_target_teams_distribute_parallel_for_simd:
830  case OMPD_target_simd:
831  case OMPD_target_teams_distribute_simd:
832  return true;
833  case OMPD_target_teams_distribute:
834  return false;
835  case OMPD_parallel:
836  case OMPD_for:
837  case OMPD_parallel_for:
838  case OMPD_parallel_sections:
839  case OMPD_for_simd:
840  case OMPD_parallel_for_simd:
841  case OMPD_cancel:
842  case OMPD_cancellation_point:
843  case OMPD_ordered:
844  case OMPD_threadprivate:
845  case OMPD_allocate:
846  case OMPD_task:
847  case OMPD_simd:
848  case OMPD_sections:
849  case OMPD_section:
850  case OMPD_single:
851  case OMPD_master:
852  case OMPD_critical:
853  case OMPD_taskyield:
854  case OMPD_barrier:
855  case OMPD_taskwait:
856  case OMPD_taskgroup:
857  case OMPD_atomic:
858  case OMPD_flush:
859  case OMPD_teams:
860  case OMPD_target_data:
861  case OMPD_target_exit_data:
862  case OMPD_target_enter_data:
863  case OMPD_distribute:
864  case OMPD_distribute_simd:
865  case OMPD_distribute_parallel_for:
866  case OMPD_distribute_parallel_for_simd:
867  case OMPD_teams_distribute:
868  case OMPD_teams_distribute_simd:
869  case OMPD_teams_distribute_parallel_for:
870  case OMPD_teams_distribute_parallel_for_simd:
871  case OMPD_target_update:
872  case OMPD_declare_simd:
873  case OMPD_declare_variant:
874  case OMPD_declare_target:
875  case OMPD_end_declare_target:
876  case OMPD_declare_reduction:
877  case OMPD_declare_mapper:
878  case OMPD_taskloop:
879  case OMPD_taskloop_simd:
880  case OMPD_master_taskloop:
881  case OMPD_master_taskloop_simd:
882  case OMPD_parallel_master_taskloop:
883  case OMPD_requires:
884  case OMPD_unknown:
885  break;
886  }
887  llvm_unreachable(
888  "Unknown programming model for OpenMP directive on NVPTX target.");
889 }
890 
891 /// Check if the directive is loop-based and either has no schedule clause or
892 /// has static scheduling.
893 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
894  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
895  isOpenMPLoopDirective(D.getDirectiveKind()) &&
896  "Expected loop-based directive.");
897  return !D.hasClausesOfKind<OMPOrderedClause>() &&
898  (D.getSingleClause<OMPScheduleClause>() == nullptr ||
899  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
900  [](const OMPScheduleClause *C) {
901  return C->getScheduleKind() == OMPC_SCHEDULE_static;
902  }));
903 }
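
// Illustrative examples of what this accepts: a nested worksharing loop with
// 'schedule(static)' or with no schedule clause qualifies, while one with
// 'schedule(dynamic)' or an 'ordered' clause does not:
//   #pragma omp target teams distribute parallel for schedule(static)  // ok
//   #pragma omp target teams distribute parallel for schedule(dynamic) // not ok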
904 
905 /// Check for inner (nested) lightweight runtime construct, if any
906 static bool hasNestedLightweightDirective(ASTContext &Ctx,
907  const OMPExecutableDirective &D) {
908  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
909  const auto *CS = D.getInnermostCapturedStmt();
910  const auto *Body =
911  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
912  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
913 
914  if (const auto *NestedDir =
915  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
916  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
917  switch (D.getDirectiveKind()) {
918  case OMPD_target:
919  if (isOpenMPParallelDirective(DKind) &&
920  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
921  hasStaticScheduling(*NestedDir))
922  return true;
923  if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
924  return true;
925  if (DKind == OMPD_parallel) {
926  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
927  /*IgnoreCaptured=*/true);
928  if (!Body)
929  return false;
930  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
931  if (const auto *NND =
932  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
933  DKind = NND->getDirectiveKind();
934  if (isOpenMPWorksharingDirective(DKind) &&
935  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
936  return true;
937  }
938  } else if (DKind == OMPD_teams) {
939  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
940  /*IgnoreCaptured=*/true);
941  if (!Body)
942  return false;
943  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
944  if (const auto *NND =
945  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
946  DKind = NND->getDirectiveKind();
947  if (isOpenMPParallelDirective(DKind) &&
948  isOpenMPWorksharingDirective(DKind) &&
949  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
950  return true;
951  if (DKind == OMPD_parallel) {
952  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
953  /*IgnoreCaptured=*/true);
954  if (!Body)
955  return false;
956  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
957  if (const auto *NND =
958  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
959  DKind = NND->getDirectiveKind();
960  if (isOpenMPWorksharingDirective(DKind) &&
961  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
962  return true;
963  }
964  }
965  }
966  }
967  return false;
968  case OMPD_target_teams:
969  if (isOpenMPParallelDirective(DKind) &&
970  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
971  hasStaticScheduling(*NestedDir))
972  return true;
973  if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
974  return true;
975  if (DKind == OMPD_parallel) {
976  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
977  /*IgnoreCaptured=*/true);
978  if (!Body)
979  return false;
980  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
981  if (const auto *NND =
982  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
983  DKind = NND->getDirectiveKind();
984  if (isOpenMPWorksharingDirective(DKind) &&
985  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
986  return true;
987  }
988  }
989  return false;
990  case OMPD_target_parallel:
991  if (DKind == OMPD_simd)
992  return true;
993  return isOpenMPWorksharingDirective(DKind) &&
994  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
995  case OMPD_target_teams_distribute:
996  case OMPD_target_simd:
997  case OMPD_target_parallel_for:
998  case OMPD_target_parallel_for_simd:
999  case OMPD_target_teams_distribute_simd:
1000  case OMPD_target_teams_distribute_parallel_for:
1001  case OMPD_target_teams_distribute_parallel_for_simd:
1002  case OMPD_parallel:
1003  case OMPD_for:
1004  case OMPD_parallel_for:
1005  case OMPD_parallel_sections:
1006  case OMPD_for_simd:
1007  case OMPD_parallel_for_simd:
1008  case OMPD_cancel:
1009  case OMPD_cancellation_point:
1010  case OMPD_ordered:
1011  case OMPD_threadprivate:
1012  case OMPD_allocate:
1013  case OMPD_task:
1014  case OMPD_simd:
1015  case OMPD_sections:
1016  case OMPD_section:
1017  case OMPD_single:
1018  case OMPD_master:
1019  case OMPD_critical:
1020  case OMPD_taskyield:
1021  case OMPD_barrier:
1022  case OMPD_taskwait:
1023  case OMPD_taskgroup:
1024  case OMPD_atomic:
1025  case OMPD_flush:
1026  case OMPD_teams:
1027  case OMPD_target_data:
1028  case OMPD_target_exit_data:
1029  case OMPD_target_enter_data:
1030  case OMPD_distribute:
1031  case OMPD_distribute_simd:
1032  case OMPD_distribute_parallel_for:
1033  case OMPD_distribute_parallel_for_simd:
1034  case OMPD_teams_distribute:
1035  case OMPD_teams_distribute_simd:
1036  case OMPD_teams_distribute_parallel_for:
1037  case OMPD_teams_distribute_parallel_for_simd:
1038  case OMPD_target_update:
1039  case OMPD_declare_simd:
1040  case OMPD_declare_variant:
1041  case OMPD_declare_target:
1042  case OMPD_end_declare_target:
1043  case OMPD_declare_reduction:
1044  case OMPD_declare_mapper:
1045  case OMPD_taskloop:
1046  case OMPD_taskloop_simd:
1047  case OMPD_master_taskloop:
1048  case OMPD_master_taskloop_simd:
1049  case OMPD_parallel_master_taskloop:
1050  case OMPD_requires:
1051  case OMPD_unknown:
1052  llvm_unreachable("Unexpected directive.");
1053  }
1054  }
1055 
1056  return false;
1057 }
1058 
1059 /// Checks if the construct supports lightweight runtime. It must be SPMD
1060 /// construct + inner loop-based construct with static scheduling.
1061 static bool supportsLightweightRuntime(ASTContext &Ctx,
1062  const OMPExecutableDirective &D) {
1063  if (!supportsSPMDExecutionMode(Ctx, D))
1064  return false;
1065  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1066  switch (DirectiveKind) {
1067  case OMPD_target:
1068  case OMPD_target_teams:
1069  case OMPD_target_parallel:
1070  return hasNestedLightweightDirective(Ctx, D);
1071  case OMPD_target_parallel_for:
1072  case OMPD_target_parallel_for_simd:
1073  case OMPD_target_teams_distribute_parallel_for:
1074  case OMPD_target_teams_distribute_parallel_for_simd:
1075  // (Last|First)-privates must be shared in parallel region.
1076  return hasStaticScheduling(D);
1077  case OMPD_target_simd:
1078  case OMPD_target_teams_distribute_simd:
1079  return true;
1080  case OMPD_target_teams_distribute:
1081  return false;
1082  case OMPD_parallel:
1083  case OMPD_for:
1084  case OMPD_parallel_for:
1085  case OMPD_parallel_sections:
1086  case OMPD_for_simd:
1087  case OMPD_parallel_for_simd:
1088  case OMPD_cancel:
1089  case OMPD_cancellation_point:
1090  case OMPD_ordered:
1091  case OMPD_threadprivate:
1092  case OMPD_allocate:
1093  case OMPD_task:
1094  case OMPD_simd:
1095  case OMPD_sections:
1096  case OMPD_section:
1097  case OMPD_single:
1098  case OMPD_master:
1099  case OMPD_critical:
1100  case OMPD_taskyield:
1101  case OMPD_barrier:
1102  case OMPD_taskwait:
1103  case OMPD_taskgroup:
1104  case OMPD_atomic:
1105  case OMPD_flush:
1106  case OMPD_teams:
1107  case OMPD_target_data:
1108  case OMPD_target_exit_data:
1109  case OMPD_target_enter_data:
1110  case OMPD_distribute:
1111  case OMPD_distribute_simd:
1112  case OMPD_distribute_parallel_for:
1113  case OMPD_distribute_parallel_for_simd:
1114  case OMPD_teams_distribute:
1115  case OMPD_teams_distribute_simd:
1116  case OMPD_teams_distribute_parallel_for:
1117  case OMPD_teams_distribute_parallel_for_simd:
1118  case OMPD_target_update:
1119  case OMPD_declare_simd:
1120  case OMPD_declare_variant:
1121  case OMPD_declare_target:
1122  case OMPD_end_declare_target:
1123  case OMPD_declare_reduction:
1124  case OMPD_declare_mapper:
1125  case OMPD_taskloop:
1126  case OMPD_taskloop_simd:
1127  case OMPD_master_taskloop:
1128  case OMPD_master_taskloop_simd:
1129  case OMPD_parallel_master_taskloop:
1130  case OMPD_requires:
1131  case OMPD_unknown:
1132  break;
1133  }
1134  llvm_unreachable(
1135  "Unknown programming model for OpenMP directive on NVPTX target.");
1136 }
1137 
1138 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1139  StringRef ParentName,
1140  llvm::Function *&OutlinedFn,
1141  llvm::Constant *&OutlinedFnID,
1142  bool IsOffloadEntry,
1143  const RegionCodeGenTy &CodeGen) {
1144  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1145  EntryFunctionState EST;
1146  WorkerFunctionState WST(CGM, D.getBeginLoc());
1147  Work.clear();
1148  WrapperFunctionsMap.clear();
1149 
1150  // Emit target region as a standalone region.
1151  class NVPTXPrePostActionTy : public PrePostActionTy {
1152  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1153  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1154 
1155  public:
1156  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1157  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1158  : EST(EST), WST(WST) {}
1159  void Enter(CodeGenFunction &CGF) override {
1160  auto &RT =
1161  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1162  RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1163  // Skip target region initialization.
1164  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1165  }
1166  void Exit(CodeGenFunction &CGF) override {
1167  auto &RT =
1168  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1169  RT.clearLocThreadIdInsertPt(CGF);
1170  RT.emitNonSPMDEntryFooter(CGF, EST);
1171  }
1172  } Action(EST, WST);
1173  CodeGen.setAction(Action);
1174  IsInTTDRegion = true;
1175  // Reserve place for the globalized memory.
1176  GlobalizedRecords.emplace_back();
1177  if (!KernelStaticGlobalized) {
1178  KernelStaticGlobalized = new llvm::GlobalVariable(
1179  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1180  llvm::GlobalValue::InternalLinkage,
1181  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1182  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1183  llvm::GlobalValue::NotThreadLocal,
1184  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1185  }
1186  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1187  IsOffloadEntry, CodeGen);
1188  IsInTTDRegion = false;
1189 
1190  // Now change the name of the worker function to correspond to this target
1191  // region's entry function.
1192  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1193 
1194  // Create the worker function
1195  emitWorkerFunction(WST);
1196 }
1197 
1198 // Setup NVPTX threads for master-worker OpenMP scheme.
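// The emitted entry-point prologue routes each hardware thread roughly as:
//   if (tid < thread_limit)      -> .worker  (worker warps)
//   else if (tid == master_tid)  -> .master  (first lane of the last warp)
//   else                         -> .exit    (surplus threads of the last warp)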
1199 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1200  EntryFunctionState &EST,
1201  WorkerFunctionState &WST) {
1202  CGBuilderTy &Bld = CGF.Builder;
1203 
1204  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1205  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1206  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1207  EST.ExitBB = CGF.createBasicBlock(".exit");
1208 
1209  llvm::Value *IsWorker =
1210  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1211  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1212 
1213  CGF.EmitBlock(WorkerBB);
1214  emitCall(CGF, WST.Loc, WST.WorkerFn);
1215  CGF.EmitBranch(EST.ExitBB);
1216 
1217  CGF.EmitBlock(MasterCheckBB);
1218  llvm::Value *IsMaster =
1219  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1220  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1221 
1222  CGF.EmitBlock(MasterBB);
1223  IsInTargetMasterThreadRegion = true;
1224  // SEQUENTIAL (MASTER) REGION START
1225  // First action in sequential region:
1226  // Initialize the state of the OpenMP runtime library on the GPU.
1227  // TODO: Optimize runtime initialization and pass in correct value.
1228  llvm::Value *Args[] = {getThreadLimit(CGF),
1229  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1230  CGF.EmitRuntimeCall(
1231  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1232 
1233  // For data sharing, we need to initialize the stack.
1234  CGF.EmitRuntimeCall(
1235  createNVPTXRuntimeFunction(
1236  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1237 
1238  emitGenericVarsProlog(CGF, WST.Loc);
1239 }
1240 
1241 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1242  EntryFunctionState &EST) {
1243  IsInTargetMasterThreadRegion = false;
1244  if (!CGF.HaveInsertPoint())
1245  return;
1246 
1247  emitGenericVarsEpilog(CGF);
1248 
1249  if (!EST.ExitBB)
1250  EST.ExitBB = CGF.createBasicBlock(".exit");
1251 
1252  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1253  CGF.EmitBranch(TerminateBB);
1254 
1255  CGF.EmitBlock(TerminateBB);
1256  // Signal termination condition.
1257  // TODO: Optimize runtime initialization and pass in correct value.
1258  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1259  CGF.EmitRuntimeCall(
1260  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1261  // Barrier to terminate worker threads.
1262  syncCTAThreads(CGF);
1263  // Master thread jumps to exit point.
1264  CGF.EmitBranch(EST.ExitBB);
1265 
1266  CGF.EmitBlock(EST.ExitBB);
1267  EST.ExitBB = nullptr;
1268 }
1269 
1270 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1271  StringRef ParentName,
1272  llvm::Function *&OutlinedFn,
1273  llvm::Constant *&OutlinedFnID,
1274  bool IsOffloadEntry,
1275  const RegionCodeGenTy &CodeGen) {
1276  ExecutionRuntimeModesRAII ModeRAII(
1277  CurrentExecutionMode, RequiresFullRuntime,
1278  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1279  !supportsLightweightRuntime(CGM.getContext(), D));
1280  EntryFunctionState EST;
1281 
1282  // Emit target region as a standalone region.
1283  class NVPTXPrePostActionTy : public PrePostActionTy {
1284  CGOpenMPRuntimeNVPTX &RT;
1285  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1286  const OMPExecutableDirective &D;
1287 
1288  public:
1289  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1290  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1291  const OMPExecutableDirective &D)
1292  : RT(RT), EST(EST), D(D) {}
1293  void Enter(CodeGenFunction &CGF) override {
1294  RT.emitSPMDEntryHeader(CGF, EST, D);
1295  // Skip target region initialization.
1296  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1297  }
1298  void Exit(CodeGenFunction &CGF) override {
1299  RT.clearLocThreadIdInsertPt(CGF);
1300  RT.emitSPMDEntryFooter(CGF, EST);
1301  }
1302  } Action(*this, EST, D);
1303  CodeGen.setAction(Action);
1304  IsInTTDRegion = true;
1305  // Reserve place for the globalized memory.
1306  GlobalizedRecords.emplace_back();
1307  if (!KernelStaticGlobalized) {
1308  KernelStaticGlobalized = new llvm::GlobalVariable(
1309  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1310  llvm::GlobalValue::InternalLinkage,
1311  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1312  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1313  llvm::GlobalValue::NotThreadLocal,
1314  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1315  }
1316  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1317  IsOffloadEntry, CodeGen);
1318  IsInTTDRegion = false;
1319 }
1320 
1321 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1322  CodeGenFunction &CGF, EntryFunctionState &EST,
1323  const OMPExecutableDirective &D) {
1324  CGBuilderTy &Bld = CGF.Builder;
1325 
1326  // Setup BBs in entry function.
1327  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1328  EST.ExitBB = CGF.createBasicBlock(".exit");
1329 
1330  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1331  /*RequiresOMPRuntime=*/
1332  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1333  /*RequiresDataSharing=*/Bld.getInt16(0)};
1334  CGF.EmitRuntimeCall(
1335  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1336 
1337  if (RequiresFullRuntime) {
1338  // For data sharing, we need to initialize the stack.
1339  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1340  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1341  }
1342 
1343  CGF.EmitBranch(ExecuteBB);
1344 
1345  CGF.EmitBlock(ExecuteBB);
1346 
1347  IsInTargetMasterThreadRegion = true;
1348 }
1349 
1350 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1351  EntryFunctionState &EST) {
1352  IsInTargetMasterThreadRegion = false;
1353  if (!CGF.HaveInsertPoint())
1354  return;
1355 
1356  if (!EST.ExitBB)
1357  EST.ExitBB = CGF.createBasicBlock(".exit");
1358 
1359  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1360  CGF.EmitBranch(OMPDeInitBB);
1361 
1362  CGF.EmitBlock(OMPDeInitBB);
1363  // DeInitialize the OMP state in the runtime; called by all active threads.
1364  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1365  CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1366  CGF.EmitRuntimeCall(
1367  createNVPTXRuntimeFunction(
1368  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1369  CGF.EmitBranch(EST.ExitBB);
1370 
1371  CGF.EmitBlock(EST.ExitBB);
1372  EST.ExitBB = nullptr;
1373 }
1374 
1375 // Create a unique global variable to indicate the execution mode of this target
1376 // region. The execution mode is either 'generic' or 'spmd', depending on the
1377 // target directive. This variable is picked up by the offload library to setup
1378 // the device appropriately before kernel launch. If the execution mode is
1379 // 'generic', the runtime reserves one warp for the master, otherwise, all
1380 // warps participate in parallel work.
1381 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1382  bool Mode) {
1383  auto *GVMode =
1384  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1385  llvm::GlobalValue::WeakAnyLinkage,
1386  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1387  Twine(Name, "_exec_mode"));
1388  CGM.addCompilerUsedGlobal(GVMode);
1389 }
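
// For a kernel entry named <Name>, this emits IR roughly of the form
//   @<Name>_exec_mode = weak constant i8 <mode>
// where <mode> is 0 for SPMD and 1 for generic execution, and adds it to
// llvm.compiler.used so the symbol survives into the device image.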
1390 
1391 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1392  ASTContext &Ctx = CGM.getContext();
1393 
1394  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1395  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1396  WST.Loc, WST.Loc);
1397  emitWorkerLoop(CGF, WST);
1398  CGF.FinishFunction();
1399 }
1400 
1401 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1402  WorkerFunctionState &WST) {
1403  //
1404  // The workers enter this loop and wait for parallel work from the master.
1405  // When the master encounters a parallel region it sets up the work + variable
1406  // arguments, and wakes up the workers. The workers first check to see if
1407  // they are required for the parallel region, i.e., within the # of requested
1408  // parallel threads. The activated workers load the variable arguments and
1409  // execute the parallel work.
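  //
  // In pseudocode, each worker thread executes:
  //   while (true) {
  //     syncthreads();                              // wait for work from master
  //     active = __kmpc_kernel_parallel(&work_fn, ...);
  //     if (work_fn == nullptr) break;              // master signalled termination
  //     if (active) work_fn(/*ParallelLevel=*/0, thread_id);
  //     syncthreads();                              // barrier after parallel region
  //   }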
1410  //
1411 
1412  CGBuilderTy &Bld = CGF.Builder;
1413 
1414  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1415  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1416  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1417  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1418  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1419  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1420 
1421  CGF.EmitBranch(AwaitBB);
1422 
1423  // Workers wait for work from master.
1424  CGF.EmitBlock(AwaitBB);
1425  // Wait for parallel work
1426  syncCTAThreads(CGF);
1427 
1428  Address WorkFn =
1429  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1430  Address ExecStatus =
1431  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1432  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1433  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1434 
1435  // TODO: Optimize runtime initialization and pass in correct value.
1436  llvm::Value *Args[] = {WorkFn.getPointer(),
1437  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1438  llvm::Value *Ret = CGF.EmitRuntimeCall(
1439  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1440  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1441 
1442  // On termination condition (workid == 0), exit loop.
1443  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1444  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1445  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1446 
1447  // Activate requested workers.
1448  CGF.EmitBlock(SelectWorkersBB);
1449  llvm::Value *IsActive =
1450  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1451  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1452 
1453  // Signal start of parallel region.
1454  CGF.EmitBlock(ExecuteBB);
1455  // Skip initialization.
1456  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1457 
1458  // Process work items: outlined parallel functions.
1459  for (llvm::Function *W : Work) {
1460  // Try to match this outlined function.
1461  llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);
1462 
1463  llvm::Value *WorkFnMatch =
1464  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1465 
1466  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1467  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1468  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1469 
1470  // Execute this outlined function.
1471  CGF.EmitBlock(ExecuteFNBB);
1472 
1473  // Insert call to work function via shared wrapper. The shared
1474  // wrapper takes two arguments:
1475  // - the parallelism level;
1476  // - the thread ID;
1477  emitCall(CGF, WST.Loc, W,
1478  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1479 
1480  // Go to end of parallel region.
1481  CGF.EmitBranch(TerminateBB);
1482 
1483  CGF.EmitBlock(CheckNextBB);
1484  }
1485  // Default case: call to outlined function through pointer if the target
1486  // region makes a declare target call that may contain an orphaned parallel
1487  // directive.
1488  auto *ParallelFnTy =
1489  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1490  /*isVarArg=*/false);
1491  llvm::Value *WorkFnCast =
1492  Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1493  // Insert call to work function via shared wrapper. The shared
1494  // wrapper takes two arguments:
1495  // - the parallelism level;
1496  // - the thread ID;
1497  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1498  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1499  // Go to end of parallel region.
1500  CGF.EmitBranch(TerminateBB);
1501 
1502  // Signal end of parallel region.
1503  CGF.EmitBlock(TerminateBB);
1504  CGF.EmitRuntimeCall(
1505  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1506  llvm::None);
1507  CGF.EmitBranch(BarrierBB);
1508 
1509  // All active and inactive workers wait at a barrier after parallel region.
1510  CGF.EmitBlock(BarrierBB);
1511  // Barrier after parallel region.
1512  syncCTAThreads(CGF);
1513  CGF.EmitBranch(AwaitBB);
1514 
1515  // Exit target region.
1516  CGF.EmitBlock(ExitBB);
1517  // Skip initialization.
1518  clearLocThreadIdInsertPt(CGF);
1519 }
1520 
1521 /// Returns specified OpenMP runtime function for the current OpenMP
1522 /// implementation. Specialized for the NVPTX device.
1523 /// \param Function OpenMP runtime function.
1524 /// \return Specified function.
1525 llvm::FunctionCallee
1526 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1527  llvm::FunctionCallee RTLFn = nullptr;
1528  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1529  case OMPRTL_NVPTX__kmpc_kernel_init: {
1530  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1531  // RequiresOMPRuntime);
1532  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1533  auto *FnTy =
1534  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1535  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1536  break;
1537  }
1538  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1539  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1540  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1541  auto *FnTy =
1542  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1543  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1544  break;
1545  }
1546  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1547  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1548  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1549  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1550  auto *FnTy =
1551  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1552  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1553  break;
1554  }
1555  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1556  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1557  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1558  auto *FnTy =
1559  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1560  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1561  break;
1562  }
1563  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1564  /// Build void __kmpc_kernel_prepare_parallel(
1565  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1566  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1567  auto *FnTy =
1568  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1569  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1570  break;
1571  }
1572  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1573  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1574  /// int16_t IsOMPRuntimeInitialized);
1575  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1576  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1577  auto *FnTy =
1578  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1579  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1580  break;
1581  }
1582  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1583  /// Build void __kmpc_kernel_end_parallel();
1584  auto *FnTy =
1585  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1586  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1587  break;
1588  }
1589  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1590  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1591  // global_tid);
1592  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1593  auto *FnTy =
1594  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1595  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1596  break;
1597  }
1598  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1599  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1600  // global_tid);
1601  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1602  auto *FnTy =
1603  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1604  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1605  break;
1606  }
1607  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1608  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1609  // int16_t lane_offset, int16_t warp_size);
1610  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1611  auto *FnTy =
1612  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1613  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1614  break;
1615  }
1616  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1617  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1618  // int16_t lane_offset, int16_t warp_size);
1619  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1620  auto *FnTy =
1621  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1622  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1623  break;
1624  }
1625  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1626  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1627  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1628  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1629  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1630  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1631  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1632  CGM.Int16Ty, CGM.Int16Ty};
1633  auto *ShuffleReduceFnTy =
1634  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1635  /*isVarArg=*/false);
1636  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1637  auto *InterWarpCopyFnTy =
1638  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1639  /*isVarArg=*/false);
1640  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1641  CGM.Int32Ty,
1642  CGM.Int32Ty,
1643  CGM.SizeTy,
1644  CGM.VoidPtrTy,
1645  ShuffleReduceFnTy->getPointerTo(),
1646  InterWarpCopyFnTy->getPointerTo()};
1647  auto *FnTy =
1648  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1649  RTLFn = CGM.CreateRuntimeFunction(
1650  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1651  break;
1652  }
1653  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1654  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1655  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1656  auto *FnTy =
1657  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1658  RTLFn = CGM.CreateRuntimeFunction(
1659  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1660  break;
1661  }
1662  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1663  // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1664  // global_tid, void *global_buffer, int32_t num_of_records, void*
1665  // reduce_data,
1666  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1667  // lane_offset, int16_t shortCircuit),
1668  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1669  // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1670  // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1671  // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1672  // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1673  // *buffer, int idx, void *reduce_data));
1674  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1675  CGM.Int16Ty, CGM.Int16Ty};
1676  auto *ShuffleReduceFnTy =
1677  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1678  /*isVarArg=*/false);
1679  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1680  auto *InterWarpCopyFnTy =
1681  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1682  /*isVarArg=*/false);
1683  llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1684  CGM.VoidPtrTy};
1685  auto *GlobalListFnTy =
1686  llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1687  /*isVarArg=*/false);
1688  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1689  CGM.Int32Ty,
1690  CGM.VoidPtrTy,
1691  CGM.Int32Ty,
1692  CGM.VoidPtrTy,
1693  ShuffleReduceFnTy->getPointerTo(),
1694  InterWarpCopyFnTy->getPointerTo(),
1695  GlobalListFnTy->getPointerTo(),
1696  GlobalListFnTy->getPointerTo(),
1697  GlobalListFnTy->getPointerTo(),
1698  GlobalListFnTy->getPointerTo()};
1699  auto *FnTy =
1700  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1701  RTLFn = CGM.CreateRuntimeFunction(
1702  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1703  break;
1704  }
1705  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1706  /// Build void __kmpc_data_sharing_init_stack();
1707  auto *FnTy =
1708  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1709  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1710  break;
1711  }
1712  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1713  /// Build void __kmpc_data_sharing_init_stack_spmd();
1714  auto *FnTy =
1715  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1716  RTLFn =
1717  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1718  break;
1719  }
1720  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1721  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1722  // int16_t UseSharedMemory);
1723  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1724  auto *FnTy =
1725  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1726  RTLFn = CGM.CreateRuntimeFunction(
1727  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1728  break;
1729  }
1730  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1731  // Build void __kmpc_data_sharing_pop_stack(void *a);
1732  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1733  auto *FnTy =
1734  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1735  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1736  /*Name=*/"__kmpc_data_sharing_pop_stack");
1737  break;
1738  }
1739  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1740  /// Build void __kmpc_begin_sharing_variables(void ***args,
1741  /// size_t n_args);
1742  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1743  auto *FnTy =
1744  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1745  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1746  break;
1747  }
1748  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1749  /// Build void __kmpc_end_sharing_variables();
1750  auto *FnTy =
1751  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1752  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1753  break;
1754  }
1755  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1756  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1757  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1758  auto *FnTy =
1759  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1760  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1761  break;
1762  }
1763  case OMPRTL_NVPTX__kmpc_parallel_level: {
1764  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1765  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1766  auto *FnTy =
1767  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1768  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1769  break;
1770  }
1771  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1772  // Build int8_t __kmpc_is_spmd_exec_mode();
1773  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1774  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1775  break;
1776  }
1777  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1778  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1779  // const void *buf, size_t size, int16_t is_shared, const void **res);
1780  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1781  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1782  auto *FnTy =
1783  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1784  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1785  break;
1786  }
1787  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1788  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1789  // int16_t is_shared);
1790  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1791  auto *FnTy =
1792  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1793  RTLFn =
1794  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1795  break;
1796  }
1797  case OMPRTL__kmpc_barrier: {
1798  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1799  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1800  auto *FnTy =
1801  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1802  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1803  cast<llvm::Function>(RTLFn.getCallee())
1804  ->addFnAttr(llvm::Attribute::Convergent);
1805  break;
1806  }
1807  case OMPRTL__kmpc_barrier_simple_spmd: {
1808  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1809  // global_tid);
1810  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1811  auto *FnTy =
1812  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1813  RTLFn =
1814  CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1815  cast<llvm::Function>(RTLFn.getCallee())
1816  ->addFnAttr(llvm::Attribute::Convergent);
1817  break;
1818  }
1819  case OMPRTL_NVPTX__kmpc_warp_active_thread_mask: {
1820  // Build int32_t __kmpc_warp_active_thread_mask(void);
1821  auto *FnTy =
1822  llvm::FunctionType::get(CGM.Int32Ty, llvm::None, /*isVarArg=*/false);
1823  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_warp_active_thread_mask");
1824  break;
1825  }
1826  case OMPRTL_NVPTX__kmpc_syncwarp: {
1827  // Build void __kmpc_syncwarp(kmp_int32 Mask);
1828  auto *FnTy =
1829  llvm::FunctionType::get(CGM.VoidTy, CGM.Int32Ty, /*isVarArg=*/false);
1830  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_syncwarp");
1831  break;
1832  }
1833  }
1834  return RTLFn;
1835 }
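Callers never cache these callees; they request one by enum value and pass it straight to EmitRuntimeCall, as the emitters below do. A minimal sketch of the pattern (`WorkFnVoidPtr` is a placeholder for an i8* value already in hand; CGF and Bld are the usual CodeGenFunction and builder):

    llvm::Value *Args[] = {WorkFnVoidPtr, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);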
1836 
1837 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1838  llvm::Constant *Addr,
1839  uint64_t Size, int32_t,
1840  llvm::GlobalValue::LinkageTypes) {
1841  // TODO: Add support for global variables on the device after declare target
1842  // support.
1843  if (!isa<llvm::Function>(Addr))
1844  return;
1845  llvm::Module &M = CGM.getModule();
1846  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1847 
1848  // Get "nvvm.annotations" metadata node
1849  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1850 
1851  llvm::Metadata *MDVals[] = {
1852  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1853  llvm::ConstantAsMetadata::get(
1854  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1855  // Append metadata to nvvm.annotations
1856  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1857 }
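The only effect of this hook for functions is the `nvvm.annotations` entry, which is what the NVPTX backend and CUDA driver use to recognize the outlined target function as a kernel entry point. A standalone sketch of the same pattern, assuming an arbitrary `llvm::Function *F` in a module `M`:

    llvm::LLVMContext &Ctx = M.getContext();
    llvm::Metadata *Ops[] = {
        llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
        llvm::ConstantAsMetadata::get(
            llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
    M.getOrInsertNamedMetadata("nvvm.annotations")
        ->addOperand(llvm::MDNode::get(Ctx, Ops));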
1858 
1859 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1860  const OMPExecutableDirective &D, StringRef ParentName,
1861  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1862  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1863  if (!IsOffloadEntry) // Nothing to do.
1864  return;
1865 
1866  assert(!ParentName.empty() && "Invalid target region parent name!");
1867 
1868  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1869  if (Mode)
1870  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1871  CodeGen);
1872  else
1873  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1874  CodeGen);
1875 
1876  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1877 }
1878 
1879 namespace {
1881 /// Enum for accessing the reserved_2 field of the ident_t struct.
1882 enum ModeFlagsTy : unsigned {
1883  /// Bit set to 1 when in SPMD mode.
1884  KMP_IDENT_SPMD_MODE = 0x01,
1885  /// Bit set to 1 when a simplified runtime is used.
1886  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1887  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1888 };
1889 
1890 /// Special mode Undefined: the combination of Non-SPMD mode + SimpleRuntime.
1891 static const ModeFlagsTy UndefinedMode =
1892  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1893 } // anonymous namespace
1894 
1895 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1896  switch (getExecutionMode()) {
1897  case EM_SPMD:
1898  if (requiresFullRuntime())
1899  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1900  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1901  case EM_NonSPMD:
1902  assert(requiresFullRuntime() && "Expected full runtime.");
1903  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1904  case EM_Unknown:
1905  return UndefinedMode;
1906  }
1907  llvm_unreachable("Unknown flags are requested.");
1908 }
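Since LLVM_MARK_AS_BITMASK_ENUM clamps the complement operator to the declared bits, the switch above can only produce four encodings of reserved_2. A small illustration with plain unsigned values (not the enum itself):

    constexpr unsigned SPMD = 0x01, SIMPLE_RT = 0x02, MASK = 0x03;
    constexpr unsigned SPMDFullRT    = SPMD & (~SIMPLE_RT & MASK);           // 0x1
    constexpr unsigned SPMDSimpleRT  = SPMD | SIMPLE_RT;                     // 0x3
    constexpr unsigned NonSPMDFullRT = (~SPMD & MASK) & (~SIMPLE_RT & MASK); // 0x0
    constexpr unsigned Undefined     = (~SPMD & MASK) & SIMPLE_RT;           // 0x2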
1909 
1911  const GlobalDecl &OldGD,
1912  llvm::GlobalValue *OrigAddr,
1913  bool IsForDefinition) {
1914  // Emit the function in OldGD with the body from NewGD, if NewGD is defined.
1915  auto *NewFD = cast<FunctionDecl>(NewGD.getDecl());
1916  if (NewFD->isDefined()) {
1917  CGM.emitOpenMPDeviceFunctionRedefinition(OldGD, NewGD, OrigAddr);
1918  return true;
1919  }
1920  return false;
1921 }
1922 
1923 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1924  : CGOpenMPRuntime(CGM, "_", "$") {
1925  if (!CGM.getLangOpts().OpenMPIsDevice)
1926  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1927 }
1928 
1929 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1930  OpenMPProcBindClauseKind ProcBind,
1931  SourceLocation Loc) {
1932  // Do nothing in case of SPMD mode and L0 parallel.
1933  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1934  return;
1935 
1936  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1937 }
1938 
1939 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1940  llvm::Value *NumThreads,
1941  SourceLocation Loc) {
1942  // Do nothing in case of SPMD mode and L0 parallel.
1943  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1944  return;
1945 
1946  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1947 }
1948 
1949 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1950  const Expr *NumTeams,
1951  const Expr *ThreadLimit,
1952  SourceLocation Loc) {}
1953 
1954 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1955  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1956  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1957  // Emit target region as a standalone region.
1958  class NVPTXPrePostActionTy : public PrePostActionTy {
1959  bool &IsInParallelRegion;
1960  bool PrevIsInParallelRegion;
1961 
1962  public:
1963  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1964  : IsInParallelRegion(IsInParallelRegion) {}
1965  void Enter(CodeGenFunction &CGF) override {
1966  PrevIsInParallelRegion = IsInParallelRegion;
1967  IsInParallelRegion = true;
1968  }
1969  void Exit(CodeGenFunction &CGF) override {
1970  IsInParallelRegion = PrevIsInParallelRegion;
1971  }
1972  } Action(IsInParallelRegion);
1973  CodeGen.setAction(Action);
1974  bool PrevIsInTTDRegion = IsInTTDRegion;
1975  IsInTTDRegion = false;
1976  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1977  IsInTargetMasterThreadRegion = false;
1978  auto *OutlinedFun =
1979  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1980  D, ThreadIDVar, InnermostKind, CodeGen));
1981  if (CGM.getLangOpts().Optimize) {
1982  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1983  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1984  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1985  }
1986  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1987  IsInTTDRegion = PrevIsInTTDRegion;
1988  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1989  !IsInParallelRegion) {
1990  llvm::Function *WrapperFun =
1991  createParallelDataSharingWrapper(OutlinedFun, D);
1992  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1993  }
1994 
1995  return OutlinedFun;
1996 }
1997 
1998 /// Get list of lastprivate variables from the teams distribute ... or
1999 /// teams {distribute ...} directives.
2000 static void
2004  "expected teams directive.");
2005  const OMPExecutableDirective *Dir = &D;
2008  Ctx,
2010  /*IgnoreCaptured=*/true))) {
2011  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
2012  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2013  Dir = nullptr;
2014  }
2015  }
2016  if (!Dir)
2017  return;
2018  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2019  for (const Expr *E : C->getVarRefs())
2020  Vars.push_back(getPrivateItem(E));
2021  }
2022 }
2023 
2024 /// Get list of reduction variables from the teams ... directives.
2025 static void
2029  "expected teams directive.");
2030  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2031  for (const Expr *E : C->privates())
2032  Vars.push_back(getPrivateItem(E));
2033  }
2034 }
2035 
2036 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2037  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2038  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2039  SourceLocation Loc = D.getBeginLoc();
2040 
2041  const RecordDecl *GlobalizedRD = nullptr;
2042  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2043  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2044  // Globalize team reductions variable unconditionally in all modes.
2045  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2046  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2047  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2048  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2049  if (!LastPrivatesReductions.empty()) {
2050  GlobalizedRD = ::buildRecordForGlobalizedVars(
2051  CGM.getContext(), llvm::None, LastPrivatesReductions,
2052  MappedDeclsFields, WarpSize);
2053  }
2054  } else if (!LastPrivatesReductions.empty()) {
2055  assert(!TeamAndReductions.first &&
2056  "Previous team declaration is not expected.");
2057  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2058  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2059  }
2060 
2061  // Emit target region as a standalone region.
2062  class NVPTXPrePostActionTy : public PrePostActionTy {
2063  SourceLocation &Loc;
2064  const RecordDecl *GlobalizedRD;
2065  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2066  &MappedDeclsFields;
2067 
2068  public:
2069  NVPTXPrePostActionTy(
2070  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2071  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2072  &MappedDeclsFields)
2073  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2074  MappedDeclsFields(MappedDeclsFields) {}
2075  void Enter(CodeGenFunction &CGF) override {
2076  auto &Rt =
2077  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2078  if (GlobalizedRD) {
2079  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2080  I->getSecond().GlobalRecord = GlobalizedRD;
2081  I->getSecond().MappedParams =
2082  std::make_unique<CodeGenFunction::OMPMapVars>();
2083  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2084  for (const auto &Pair : MappedDeclsFields) {
2085  assert(Pair.getFirst()->isCanonicalDecl() &&
2086  "Expected canonical declaration");
2087  Data.insert(std::make_pair(Pair.getFirst(),
2088  MappedVarData(Pair.getSecond(),
2089  /*IsOnePerTeam=*/true)));
2090  }
2091  }
2092  Rt.emitGenericVarsProlog(CGF, Loc);
2093  }
2094  void Exit(CodeGenFunction &CGF) override {
2095  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2096  .emitGenericVarsEpilog(CGF);
2097  }
2098  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2099  CodeGen.setAction(Action);
2100  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2101  D, ThreadIDVar, InnermostKind, CodeGen);
2102  if (CGM.getLangOpts().Optimize) {
2103  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2104  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2105  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2106  }
2107 
2108  return OutlinedFun;
2109 }
2110 
2111 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2112  SourceLocation Loc,
2113  bool WithSPMDCheck) {
2114  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2115  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2116  return;
2117 
2118  CGBuilderTy &Bld = CGF.Builder;
2119 
2120  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2121  if (I == FunctionGlobalizedDecls.end())
2122  return;
2123  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2124  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2125  QualType SecGlobalRecTy;
2126 
2127  // Recover pointer to this function's global record. The runtime will
2128  // handle the specifics of the allocation of the memory.
2129  // Use actual memory size of the record including the padding
2130  // for alignment purposes.
2131  unsigned Alignment =
2132  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2133  unsigned GlobalRecordSize =
2134  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2135  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2136 
2137  llvm::PointerType *GlobalRecPtrTy =
2138  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2139  llvm::Value *GlobalRecCastAddr;
2140  llvm::Value *IsTTD = nullptr;
2141  if (!IsInTTDRegion &&
2142  (WithSPMDCheck ||
2143  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2144  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2145  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2146  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2147  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2148  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2149  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2150  llvm::Value *PL = CGF.EmitRuntimeCall(
2151  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2152  {RTLoc, ThreadID});
2153  IsTTD = Bld.CreateIsNull(PL);
2154  }
2155  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2156  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2157  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2158  // There is no need to emit line number for unconditional branch.
2159  (void)ApplyDebugLocation::CreateEmpty(CGF);
2160  CGF.EmitBlock(SPMDBB);
2161  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2162  CharUnits::fromQuantity(Alignment));
2163  CGF.EmitBranch(ExitBB);
2164  // There is no need to emit line number for unconditional branch.
2165  (void)ApplyDebugLocation::CreateEmpty(CGF);
2166  CGF.EmitBlock(NonSPMDBB);
2167  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2168  if (const RecordDecl *SecGlobalizedVarsRecord =
2169  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2170  SecGlobalRecTy =
2171  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2172 
2173  // Recover pointer to this function's global record. The runtime will
2174  // handle the specifics of the allocation of the memory.
2175  // Use actual memory size of the record including the padding
2176  // for alignment purposes.
2177  unsigned Alignment =
2178  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2179  unsigned GlobalRecordSize =
2180  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2181  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2182  Size = Bld.CreateSelect(
2183  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2184  }
2185  // TODO: allow the usage of shared memory to be controlled by
2186  // the user, for now, default to global.
2187  llvm::Value *GlobalRecordSizeArg[] = {
2188  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2189  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2190  createNVPTXRuntimeFunction(
2191  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2192  GlobalRecordSizeArg);
2193  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2194  GlobalRecValue, GlobalRecPtrTy);
2195  CGF.EmitBlock(ExitBB);
2196  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2197  /*NumReservedValues=*/2, "_select_stack");
2198  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2199  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2200  GlobalRecCastAddr = Phi;
2201  I->getSecond().GlobalRecordAddr = Phi;
2202  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2203  } else if (IsInTTDRegion) {
2204  assert(GlobalizedRecords.back().Records.size() < 2 &&
2205  "Expected less than 2 globalized records: one for target and one "
2206  "for teams.");
2207  unsigned Offset = 0;
2208  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2209  QualType RDTy = CGM.getContext().getRecordType(RD);
2210  unsigned Alignment =
2211  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2212  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2213  Offset =
2214  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2215  }
2216  unsigned Alignment =
2217  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2218  Offset = llvm::alignTo(Offset, Alignment);
2219  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2220  ++GlobalizedRecords.back().RegionCounter;
2221  if (GlobalizedRecords.back().Records.size() == 1) {
2222  assert(KernelStaticGlobalized &&
2223  "Kernel static pointer must be initialized already.");
2224  auto *UseSharedMemory = new llvm::GlobalVariable(
2225  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2227  "_openmp_static_kernel$is_shared");
2228  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2229  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2230  /*DestWidth=*/16, /*Signed=*/0);
2231  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2232  Address(UseSharedMemory,
2233  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2234  /*Volatile=*/false, Int16Ty, Loc);
2235  auto *StaticGlobalized = new llvm::GlobalVariable(
2236  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2237  llvm::GlobalValue::CommonLinkage, nullptr);
2238  auto *RecSize = new llvm::GlobalVariable(
2239  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2241  "_openmp_static_kernel$size");
2242  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2243  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2244  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2245  CGM.getContext().getSizeType(), Loc);
2246  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2247  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2248  llvm::Value *GlobalRecordSizeArg[] = {
2249  llvm::ConstantInt::get(
2250  CGM.Int16Ty,
2251  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2252  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2253  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2254  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2255  GlobalRecordSizeArg);
2256  GlobalizedRecords.back().Buffer = StaticGlobalized;
2257  GlobalizedRecords.back().RecSize = RecSize;
2258  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2259  GlobalizedRecords.back().Loc = Loc;
2260  }
2261  assert(KernelStaticGlobalized && "Global address must be set already.");
2262  Address FrameAddr = CGF.EmitLoadOfPointer(
2263  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2264  CGM.getContext()
2265  .getPointerType(CGM.getContext().VoidPtrTy)
2266  .castAs<PointerType>());
2267  llvm::Value *GlobalRecValue =
2268  Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2269  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2270  I->getSecond().IsInSPMDModeFlag = nullptr;
2271  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2272  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2273  } else {
2274  // TODO: allow the usage of shared memory to be controlled by
2275  // the user, for now, default to global.
2276  llvm::Value *GlobalRecordSizeArg[] = {
2277  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2278  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2279  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2280  createNVPTXRuntimeFunction(
2281  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2282  GlobalRecordSizeArg);
2283  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2284  GlobalRecValue, GlobalRecPtrTy);
2285  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2286  I->getSecond().IsInSPMDModeFlag = nullptr;
2287  }
2288  LValue Base =
2289  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2290 
2291  // Emit the "global alloca" which is a GEP from the global declaration
2292  // record using the pointer returned by the runtime.
2293  LValue SecBase;
2294  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2295  if (IsTTD) {
2296  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2297  llvm::PointerType *SecGlobalRecPtrTy =
2298  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2299  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2300  Bld.CreatePointerBitCastOrAddrSpaceCast(
2301  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2302  SecGlobalRecTy);
2303  }
2304  for (auto &Rec : I->getSecond().LocalVarData) {
2305  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2306  llvm::Value *ParValue;
2307  if (EscapedParam) {
2308  const auto *VD = cast<VarDecl>(Rec.first);
2309  LValue ParLVal =
2310  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2311  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2312  }
2313  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2314  // Emit VarAddr based on lane-id if required.
2315  QualType VarTy;
2316  if (Rec.second.IsOnePerTeam) {
2317  VarTy = Rec.second.FD->getType();
2318  } else {
2319  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2320  VarAddr.getAddress().getPointer(),
2321  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2322  VarTy =
2323  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2324  VarAddr = CGF.MakeAddrLValue(
2325  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2326  AlignmentSource::Decl);
2327  }
2328  Rec.second.PrivateAddr = VarAddr.getAddress();
2329  if (!IsInTTDRegion &&
2330  (WithSPMDCheck ||
2331  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2332  assert(I->getSecond().IsInSPMDModeFlag &&
2333  "Expected unknown execution mode or required SPMD check.");
2334  if (IsTTD) {
2335  assert(SecIt->second.IsOnePerTeam &&
2336  "Secondary glob data must be one per team.");
2337  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2338  VarAddr.setAddress(
2339  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2340  VarAddr.getPointer()),
2341  VarAddr.getAlignment()));
2342  Rec.second.PrivateAddr = VarAddr.getAddress();
2343  }
2344  Address GlobalPtr = Rec.second.PrivateAddr;
2345  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2346  Rec.second.PrivateAddr = Address(
2347  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2348  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2349  LocalAddr.getAlignment());
2350  }
2351  if (EscapedParam) {
2352  const auto *VD = cast<VarDecl>(Rec.first);
2353  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2354  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2355  }
2356  if (IsTTD)
2357  ++SecIt;
2358  }
2359  }
2360  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2361  // Recover pointer to this function's global record. The runtime will
2362  // handle the specifics of the allocation of the memory.
2363  // Use actual memory size of the record including the padding
2364  // for alignment purposes.
2365  CGBuilderTy &Bld = CGF.Builder;
2366  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2367  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2368  Size = Bld.CreateNUWAdd(
2369  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2370  llvm::Value *AlignVal =
2371  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2372  Size = Bld.CreateUDiv(Size, AlignVal);
2373  Size = Bld.CreateNUWMul(Size, AlignVal);
2374  // TODO: allow the usage of shared memory to be controlled by
2375  // the user, for now, default to global.
2376  llvm::Value *GlobalRecordSizeArg[] = {
2377  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2378  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2379  createNVPTXRuntimeFunction(
2380  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2381  GlobalRecordSizeArg);
2382  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2383  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2384  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2385  CGM.getContext().getDeclAlign(VD),
2386  AlignmentSource::Decl);
2387  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2388  Base.getAddress());
2389  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2390  }
2391  I->getSecond().MappedParams->apply(CGF);
2392 }
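When the execution mode is not known statically, the prolog above guards the allocation with a runtime check instead of committing to either scheme. A rough, C-like sketch of the emitted shape (illustration only; `padded_record_size` stands for the aligned size computed above):

    void *frame;
    if (__kmpc_is_spmd_exec_mode()) {
      frame = nullptr;   // .spmd: nothing is globalized, locals stay private
    } else {             // .non-spmd: reserve the globalized record
      frame = __kmpc_data_sharing_coalesced_push_stack(padded_record_size,
                                                       /*UseSharedMemory=*/0);
    }
    // "_select_stack" PHI: escaped locals are then addressed through 'frame'.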
2393 
2394 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2395  bool WithSPMDCheck) {
2396  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2397  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2398  return;
2399 
2400  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2401  if (I != FunctionGlobalizedDecls.end()) {
2402  I->getSecond().MappedParams->restore(CGF);
2403  if (!CGF.HaveInsertPoint())
2404  return;
2405  for (llvm::Value *Addr :
2406  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2407  CGF.EmitRuntimeCall(
2408  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2409  Addr);
2410  }
2411  if (I->getSecond().GlobalRecordAddr) {
2412  if (!IsInTTDRegion &&
2413  (WithSPMDCheck ||
2414  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2415  CGBuilderTy &Bld = CGF.Builder;
2416  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2417  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2418  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2419  // There is no need to emit line number for unconditional branch.
2420  (void)ApplyDebugLocation::CreateEmpty(CGF);
2421  CGF.EmitBlock(NonSPMDBB);
2422  CGF.EmitRuntimeCall(
2423  createNVPTXRuntimeFunction(
2424  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2425  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2426  CGF.EmitBlock(ExitBB);
2427  } else if (IsInTTDRegion) {
2428  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2429  "region counter must be > 0.");
2430  --GlobalizedRecords.back().RegionCounter;
2431  // Emit the restore function only in the target region.
2432  if (GlobalizedRecords.back().RegionCounter == 0) {
2433  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2434  /*DestWidth=*/16, /*Signed=*/0);
2435  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2436  Address(GlobalizedRecords.back().UseSharedMemory,
2437  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2438  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2439  llvm::Value *Args[] = {
2440  llvm::ConstantInt::get(
2441  CGM.Int16Ty,
2442  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2443  IsInSharedMemory};
2444  CGF.EmitRuntimeCall(
2445  createNVPTXRuntimeFunction(
2446  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2447  Args);
2448  }
2449  } else {
2450  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2451  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2452  I->getSecond().GlobalRecordAddr);
2453  }
2454  }
2455  }
2456 }
2457 
2458 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2459  const OMPExecutableDirective &D,
2460  SourceLocation Loc,
2461  llvm::Function *OutlinedFn,
2462  ArrayRef<llvm::Value *> CapturedVars) {
2463  if (!CGF.HaveInsertPoint())
2464  return;
2465 
2466  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2467  /*Name=*/".zero.addr");
2468  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2469  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2470  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2471  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2472  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2473  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2474 }
2475 
2476 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2477  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2478  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2479  if (!CGF.HaveInsertPoint())
2480  return;
2481 
2482  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2483  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2484  else
2485  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2486 }
2487 
2488 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2489  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2490  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2491  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2492 
2493  // Force inline this outlined function at its call site.
2494  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2495 
2496  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2497  /*Name=*/".zero.addr");
2498  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2499  // ThreadId for serialized parallels is 0.
2500  Address ThreadIDAddr = ZeroAddr;
2501  auto &&CodeGen = [this, Fn, CapturedVars, Loc, &ThreadIDAddr](
2502  CodeGenFunction &CGF, PrePostActionTy &Action) {
2503  Action.Enter(CGF);
2504 
2505  Address ZeroAddr =
2506  CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2507  /*Name=*/".bound.zero.addr");
2508  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2509  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2510  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2511  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2512  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2513  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2514  };
2515  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2516  PrePostActionTy &) {
2517 
2518  RegionCodeGenTy RCG(CodeGen);
2519  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2520  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2521  llvm::Value *Args[] = {RTLoc, ThreadID};
2522 
2523  NVPTXActionTy Action(
2524  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2525  Args,
2526  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2527  Args);
2528  RCG.setAction(Action);
2529  RCG(CGF);
2530  };
2531 
2532  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2533  PrePostActionTy &Action) {
2534  CGBuilderTy &Bld = CGF.Builder;
2535  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2536  assert(WFn && "Wrapper function does not exist!");
2537  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2538 
2539  // Prepare for parallel region. Indicate the outlined function.
2540  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2541  CGF.EmitRuntimeCall(
2542  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2543  Args);
2544 
2545  // Create a private scope that will globalize the arguments
2546  // passed from the outside of the target region.
2547  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2548 
2549  // There's something to share.
2550  if (!CapturedVars.empty()) {
2551  // Prepare for parallel region. Indicate the outlined function.
2552  Address SharedArgs =
2553  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2554  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2555 
2556  llvm::Value *DataSharingArgs[] = {
2557  SharedArgsPtr,
2558  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2559  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2560  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2561  DataSharingArgs);
2562 
2563  // Store variable address in a list of references to pass to workers.
2564  unsigned Idx = 0;
2565  ASTContext &Ctx = CGF.getContext();
2566  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2567  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2568  .castAs<PointerType>());
2569  for (llvm::Value *V : CapturedVars) {
2570  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2571  llvm::Value *PtrV;
2572  if (V->getType()->isIntegerTy())
2573  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2574  else
2575  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2576  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2577  Ctx.getPointerType(Ctx.VoidPtrTy));
2578  ++Idx;
2579  }
2580  }
2581 
2582  // Activate workers. This barrier is used by the master to signal
2583  // work for the workers.
2584  syncCTAThreads(CGF);
2585 
2586  // OpenMP [2.5, Parallel Construct, p.49]
2587  // There is an implied barrier at the end of a parallel region. After the
2588  // end of a parallel region, only the master thread of the team resumes
2589  // execution of the enclosing task region.
2590  //
2591  // The master waits at this barrier until all workers are done.
2592  syncCTAThreads(CGF);
2593 
2594  if (!CapturedVars.empty())
2595  CGF.EmitRuntimeCall(
2596  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2597 
2598  // Remember for post-processing in worker loop.
2599  Work.emplace_back(WFn);
2600  };
2601 
2602  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2603  CodeGenFunction &CGF, PrePostActionTy &Action) {
2604  if (IsInParallelRegion) {
2605  SeqGen(CGF, Action);
2606  } else if (IsInTargetMasterThreadRegion) {
2607  L0ParallelGen(CGF, Action);
2608  } else {
2609  // Check for master and then parallelism:
2610  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2611  // Serialized execution.
2612  // } else {
2613  // Worker call.
2614  // }
2615  CGBuilderTy &Bld = CGF.Builder;
2616  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2617  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2618  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2619  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2620  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2621  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2622  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2623  // There is no need to emit line number for unconditional branch.
2624  (void)ApplyDebugLocation::CreateEmpty(CGF);
2625  CGF.EmitBlock(ParallelCheckBB);
2626  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2627  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2628  llvm::Value *PL = CGF.EmitRuntimeCall(
2629  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2630  {RTLoc, ThreadID});
2631  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2632  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2633  CGF.EmitBlock(SeqBB);
2634  SeqGen(CGF, Action);
2635  CGF.EmitBranch(ExitBB);
2636  // There is no need to emit line number for unconditional branch.
2637  (void)ApplyDebugLocation::CreateEmpty(CGF);
2638  CGF.EmitBlock(MasterBB);
2639  L0ParallelGen(CGF, Action);
2640  CGF.EmitBranch(ExitBB);
2641  // There is no need to emit line number for unconditional branch.
2642  (void)ApplyDebugLocation::CreateEmpty(CGF);
2643  // Emit the continuation block for code after the if.
2644  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2645  }
2646  };
2647 
2648  if (IfCond) {
2649  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2650  } else {
2651  CodeGenFunction::RunCleanupsScope Scope(CGF);
2652  RegionCodeGenTy ThenRCG(LNParallelGen);
2653  ThenRCG(CGF);
2654  }
2655 }
2656 
2657 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2658  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2659  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2660  // Just call the outlined function to execute the parallel region.
2661  // OutlinedFn(&GTid, &zero, CapturedStruct);
2662  //
2663  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2664 
2665  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2666  /*Name=*/".zero.addr");
2667  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2668  // ThreadId for serialized parallels is 0.
2669  Address ThreadIDAddr = ZeroAddr;
2670  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, &ThreadIDAddr](
2671  CodeGenFunction &CGF, PrePostActionTy &Action) {
2672  Action.Enter(CGF);
2673 
2674  Address ZeroAddr =
2675  CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
2676  /*Name=*/".bound.zero.addr");
2677  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2678  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2679  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2680  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2681  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2682  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2683  };
2684  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2685  PrePostActionTy &) {
2686 
2687  RegionCodeGenTy RCG(CodeGen);
2688  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2689  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2690  llvm::Value *Args[] = {RTLoc, ThreadID};
2691 
2692  NVPTXActionTy Action(
2693  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2694  Args,
2695  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2696  Args);
2697  RCG.setAction(Action);
2698  RCG(CGF);
2699  };
2700 
2701  if (IsInTargetMasterThreadRegion) {
2702  // In the worker we need to use the real thread id.
2703  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2704  RegionCodeGenTy RCG(CodeGen);
2705  RCG(CGF);
2706  } else {
2707  // If we are not in the target region, it is definitely L2 parallelism or
2708  // more, because for SPMD mode we always have an L1 parallel level, so we don't
2709  // need to check for orphaned directives.
2710  RegionCodeGenTy RCG(SeqGen);
2711  RCG(CGF);
2712  }
2713 }
2714 
2715 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2716  // Always emit simple barriers!
2717  if (!CGF.HaveInsertPoint())
2718  return;
2719  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2720  // This function does not use parameters, so we can emit just default values.
2721  llvm::Value *Args[] = {
2722  llvm::ConstantPointerNull::get(
2723  cast<llvm::PointerType>(getIdentTyPointerTy())),
2724  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2725  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2726  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2727  Call->setConvergent();
2728 }
2729 
2730 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2731  SourceLocation Loc,
2732  OpenMPDirectiveKind Kind, bool,
2733  bool) {
2734  // Always emit simple barriers!
2735  if (!CGF.HaveInsertPoint())
2736  return;
2737  // Build call __kmpc_cancel_barrier(loc, thread_id);
2738  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2739  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2740  getThreadID(CGF, Loc)};
2741  llvm::CallInst *Call = CGF.EmitRuntimeCall(
2742  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2743  Call->setConvergent();
2744 }
2745 
2746 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2747  CodeGenFunction &CGF, StringRef CriticalName,
2748  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2749  const Expr *Hint) {
2750  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2751  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2752  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2753  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2754  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2755 
2756  // Get the mask of active threads in the warp.
2757  llvm::Value *Mask = CGF.EmitRuntimeCall(
2758  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
2759  // Fetch team-local id of the thread.
2760  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2761 
2762  // Get the width of the team.
2763  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2764 
2765  // Initialize the counter variable for the loop.
2766  QualType Int32Ty =
2767  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2768  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2769  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2770  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2771  /*isInit=*/true);
2772 
2773  // Block checks if loop counter exceeds upper bound.
2774  CGF.EmitBlock(LoopBB);
2775  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2776  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2777  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2778 
2779  // Block tests which single thread should execute region, and which threads
2780  // should go straight to synchronisation point.
2781  CGF.EmitBlock(TestBB);
2782  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2783  llvm::Value *CmpThreadToCounter =
2784  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2785  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2786 
2787  // Block emits the body of the critical region.
2788  CGF.EmitBlock(BodyBB);
2789 
2790  // Output the critical statement.
2791  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2792  Hint);
2793 
2794  // After the body surrounded by the critical region, the single executing
2795  // thread will jump to the synchronisation point.
2796  // Block waits for all threads in current team to finish then increments the
2797  // counter variable and returns to the loop.
2798  CGF.EmitBlock(SyncBB);
2799  // Reconverge active threads in the warp.
2800  (void)CGF.EmitRuntimeCall(
2801  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_syncwarp), Mask);
2802 
2803  llvm::Value *IncCounterVal =
2804  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2805  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2806  CGF.EmitBranch(LoopBB);
2807 
2808  // Block that is reached when all threads in the team complete the region.
2809  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2810 }
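The net effect is to serialize the critical region across the team: each thread takes exactly one iteration of a counter loop, executes the body when the counter matches its id, and reconverges with its warp before the next iteration. A C-like sketch of the emitted loop (illustration only; `team_width` and `thread_id` stand for the values fetched above):

    unsigned mask = __kmpc_warp_active_thread_mask();
    for (unsigned counter = 0; counter < team_width; ++counter) { // loop/test blocks
      if (thread_id == counter) {
        /* body of the critical region */                         // body block
      }
      __kmpc_syncwarp(mask);                                      // sync block
    }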
2811 
2812 /// Cast value to the specified type.
2813 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2814  QualType ValTy, QualType CastTy,
2815  SourceLocation Loc) {
2816  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2817  "Cast type must sized.");
2818  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2819  "Val type must sized.");
2820  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2821  if (ValTy == CastTy)
2822  return Val;
2823  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2824  CGF.getContext().getTypeSizeInChars(CastTy))
2825  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2826  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2827  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2828  CastTy->hasSignedIntegerRepresentation());
2829  Address CastItem = CGF.CreateMemTemp(CastTy);
2830  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2831  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2832  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2833  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2834 }
2835 
2836 /// This function creates calls to one of two shuffle functions to copy
2837 /// variables between lanes in a warp.
2838 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2839  llvm::Value *Elem,
2840  QualType ElemType,
2841  llvm::Value *Offset,
2842  SourceLocation Loc) {
2843  CodeGenModule &CGM = CGF.CGM;
2844  CGBuilderTy &Bld = CGF.Builder;
2845  CGOpenMPRuntimeNVPTX &RT =
2846  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2847 
2848  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2849  assert(Size.getQuantity() <= 8 &&
2850  "Unsupported bitwidth in shuffle instruction.");
2851 
2852  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2853  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2854  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2855 
2856  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2857  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2858  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2859  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2860  llvm::Value *WarpSize =
2861  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2862 
2863  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2864  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2865 
2866  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2867 }
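For example, an 8-byte element such as a double is routed through the 64-bit entry point: it is bit-cast to an integer, shuffled, and cast back. A host-style sketch of that round trip (illustration only; assumes the usual device-runtime declaration of __kmpc_shuffle_int64):

    #include <cstdint>
    #include <cstring>

    extern "C" int64_t __kmpc_shuffle_int64(int64_t, int16_t, int16_t);

    double shuffleDoubleDown(double D, int16_t LaneOffset, int16_t WarpSize) {
      int64_t Bits;
      std::memcpy(&Bits, &D, sizeof(Bits));      // castValueToType: double -> i64
      int64_t Remote = __kmpc_shuffle_int64(Bits, LaneOffset, WarpSize);
      double Res;
      std::memcpy(&Res, &Remote, sizeof(Res));   // cast back to the element type
      return Res;
    }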
2868 
2869 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2870  Address DestAddr, QualType ElemType,
2871  llvm::Value *Offset, SourceLocation Loc) {
2872  CGBuilderTy &Bld = CGF.Builder;
2873 
2874  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2875  // Create the loop over the big sized data.
2876  // ptr = (void*)Elem;
2877  // ptrEnd = (void*) Elem + 1;
2878  // Step = 8;
2879  // while (ptr + Step < ptrEnd)
2880  // shuffle((int64_t)*ptr);
2881  // Step = 4;
2882  // while (ptr + Step < ptrEnd)
2883  // shuffle((int32_t)*ptr);
2884  // ...
2885  Address ElemPtr = DestAddr;
2886  Address Ptr = SrcAddr;
2887  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2888  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2889  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2890  if (Size < CharUnits::fromQuantity(IntSize))
2891  continue;
2892  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2893  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2894  /*Signed=*/1);
2895  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2896  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2897  ElemPtr =
2898  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2899  if (Size.getQuantity() / IntSize > 1) {
2900  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2901  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2902  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2903  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2904  CGF.EmitBlock(PreCondBB);
2905  llvm::PHINode *PhiSrc =
2906  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2907  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2908  llvm::PHINode *PhiDest =
2909  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2910  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2911  Ptr = Address(PhiSrc, Ptr.getAlignment());
2912  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2913  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2914  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2915  Ptr.getPointer(), CGF.VoidPtrTy));
2916  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2917  ThenBB, ExitBB);
2918  CGF.EmitBlock(ThenBB);
2919  llvm::Value *Res = createRuntimeShuffleFunction(
2920  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2921  IntType, Offset, Loc);
2922  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2923  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2924  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2925  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2926  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2927  CGF.EmitBranch(PreCondBB);
2928  CGF.EmitBlock(ExitBB);
2929  } else {
2930  llvm::Value *Res = createRuntimeShuffleFunction(
2931  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2932  IntType, Offset, Loc);
2933  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2934  Ptr = Bld.CreateConstGEP(Ptr, 1);
2935  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2936  }
2937  Size = Size % IntSize;
2938  }
2939 }
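The emitted IR above decomposes an arbitrarily sized reduce element into 8-, 4-, 2- and 1-byte chunks and shuffles each chunk separately. A host-side C++ analogue of that chunking logic, purely for illustration (shuffleChunk stands in for the 32/64-bit runtime shuffle and is an assumed helper; lane offset and width arguments are omitted):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative sketch of the chunking loop performed by the emitted code.
void shuffleAndStoreSketch(const char *Src, char *Dest, std::size_t Size,
                           std::uint64_t (*shuffleChunk)(std::uint64_t,
                                                         std::size_t)) {
  for (std::size_t ChunkSize = 8; ChunkSize >= 1; ChunkSize /= 2) {
    while (Size >= ChunkSize) {
      std::uint64_t Chunk = 0;
      std::memcpy(&Chunk, Src, ChunkSize);     // load the next chunk
      Chunk = shuffleChunk(Chunk, ChunkSize);  // exchange it across lanes
      std::memcpy(Dest, &Chunk, ChunkSize);    // store the shuffled chunk
      Src += ChunkSize;
      Dest += ChunkSize;
      Size -= ChunkSize;
    }
  }
}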
2940 
2941 namespace {
2942 enum CopyAction : unsigned {
2943  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2944  // the warp using shuffle instructions.
2945  RemoteLaneToThread,
2946  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2947  ThreadCopy,
2948  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2949  ThreadToScratchpad,
2950  // ScratchpadToThread: Copy from a scratchpad array in global memory
2951  // containing team-reduced data to a thread's stack.
2952  ScratchpadToThread,
2953 };
2954 } // namespace
2955 
2956 struct CopyOptionsTy {
2957  llvm::Value *RemoteLaneOffset;
2958  llvm::Value *ScratchpadIndex;
2959  llvm::Value *ScratchpadWidth;
2960 };
2961 
2962 /// Emit instructions to copy a Reduce list, which contains partially
2963 /// aggregated values, in the specified direction.
2964 static void emitReductionListCopy(
2965  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2966  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2967  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2968 
2969  CodeGenModule &CGM = CGF.CGM;
2970  ASTContext &C = CGM.getContext();
2971  CGBuilderTy &Bld = CGF.Builder;
2972 
2973  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2974  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2975  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2976 
2977  // Iterates, element-by-element, through the source Reduce list and
2978  // makes a copy.
2979  unsigned Idx = 0;
2980  unsigned Size = Privates.size();
2981  for (const Expr *Private : Privates) {
2982  Address SrcElementAddr = Address::invalid();
2983  Address DestElementAddr = Address::invalid();
2984  Address DestElementPtrAddr = Address::invalid();
2985  // Should we shuffle in an element from a remote lane?
2986  bool ShuffleInElement = false;
2987  // Set to true to update the pointer in the dest Reduce list to a
2988  // newly created element.
2989  bool UpdateDestListPtr = false;
2990  // Increment the src or dest pointer to the scratchpad, for each
2991  // new element.
2992  bool IncrScratchpadSrc = false;
2993  bool IncrScratchpadDest = false;
2994 
2995  switch (Action) {
2996  case RemoteLaneToThread: {
2997  // Step 1.1: Get the address for the src element in the Reduce list.
2998  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2999  SrcElementAddr = CGF.EmitLoadOfPointer(
3000  SrcElementPtrAddr,
3001  C.getPointerType(Private->getType())->castAs<PointerType>());
3002 
3003  // Step 1.2: Create a temporary to store the element in the destination
3004  // Reduce list.
3005  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3006  DestElementAddr =
3007  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3008  ShuffleInElement = true;
3009  UpdateDestListPtr = true;
3010  break;
3011  }
3012  case ThreadCopy: {
3013  // Step 1.1: Get the address for the src element in the Reduce list.
3014  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3015  SrcElementAddr = CGF.EmitLoadOfPointer(
3016  SrcElementPtrAddr,
3017  C.getPointerType(Private->getType())->castAs<PointerType>());
3018 
3019  // Step 1.2: Get the address for dest element. The destination
3020  // element has already been created on the thread's stack.
3021  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3022  DestElementAddr = CGF.EmitLoadOfPointer(
3023  DestElementPtrAddr,
3024  C.getPointerType(Private->getType())->castAs<PointerType>());
3025  break;
3026  }
3027  case ThreadToScratchpad: {
3028  // Step 1.1: Get the address for the src element in the Reduce list.
3029  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3030  SrcElementAddr = CGF.EmitLoadOfPointer(
3031  SrcElementPtrAddr,
3032  C.getPointerType(Private->getType())->castAs<PointerType>());
3033 
3034  // Step 1.2: Get the address for dest element:
3035  // address = base + index * ElementSizeInChars.
3036  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3037  llvm::Value *CurrentOffset =
3038  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3039  llvm::Value *ScratchPadElemAbsolutePtrVal =
3040  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3041  ScratchPadElemAbsolutePtrVal =
3042  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3043  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3044  C.getTypeAlignInChars(Private->getType()));
3045  IncrScratchpadDest = true;
3046  break;
3047  }
3048  case ScratchpadToThread: {
3049  // Step 1.1: Get the address for the src element in the scratchpad.
3050  // address = base + index * ElementSizeInChars.
3051  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3052  llvm::Value *CurrentOffset =
3053  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3054  llvm::Value *ScratchPadElemAbsolutePtrVal =
3055  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3056  ScratchPadElemAbsolutePtrVal =
3057  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3058  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3059  C.getTypeAlignInChars(Private->getType()));
3060  IncrScratchpadSrc = true;
3061 
3062  // Step 1.2: Create a temporary to store the element in the destination
3063  // Reduce list.
3064  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3065  DestElementAddr =
3066  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3067  UpdateDestListPtr = true;
3068  break;
3069  }
3070  }
3071 
3072  // Regardless of src and dest of copy, we emit the load of src
3073  // element as this is required in all directions
3074  SrcElementAddr = Bld.CreateElementBitCast(
3075  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3076  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3077  SrcElementAddr.getElementType());
3078 
3079  // Now that all active lanes have read the element in the
3080  // Reduce list, shuffle over the value from the remote lane.
3081  if (ShuffleInElement) {
3082  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3083  RemoteLaneOffset, Private->getExprLoc());
3084  } else {
3085  switch (CGF.getEvaluationKind(Private->getType())) {
3086  case TEK_Scalar: {
3087  llvm::Value *Elem =
3088  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3089  Private->getType(), Private->getExprLoc());
3090  // Store the source element value to the dest element address.
3091  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3092  Private->getType());
3093  break;
3094  }
3095  case TEK_Complex: {
3096  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3097  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3098  Private->getExprLoc());
3099  CGF.EmitStoreOfComplex(
3100  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3101  /*isInit=*/false);
3102  break;
3103  }
3104  case TEK_Aggregate:
3105  CGF.EmitAggregateCopy(
3106  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3107  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3108  Private->getType(), AggValueSlot::DoesNotOverlap);
3109  break;
3110  }
3111  }
3112 
3113  // Step 3.1: Modify reference in dest Reduce list as needed.
3114  // Modifying the reference in Reduce list to point to the newly
3115  // created element. The element is live in the current function
3116  // scope and that of functions it invokes (i.e., reduce_function).
3117  // RemoteReduceData[i] = (void*)&RemoteElem
3118  if (UpdateDestListPtr) {
3119  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3120  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3121  DestElementPtrAddr, /*Volatile=*/false,
3122  C.VoidPtrTy);
3123  }
3124 
3125  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3126  // address of the next element in scratchpad memory, unless we're currently
3127  // processing the last one. Memory alignment is also taken care of here.
3128  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3129  llvm::Value *ScratchpadBasePtr =
3130  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3131  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3132  ScratchpadBasePtr = Bld.CreateNUWAdd(
3133  ScratchpadBasePtr,
3134  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3135 
3136  // Take care of global memory alignment for performance
3137  ScratchpadBasePtr = Bld.CreateNUWSub(
3138  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3139  ScratchpadBasePtr = Bld.CreateUDiv(
3140  ScratchpadBasePtr,
3141  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3142  ScratchpadBasePtr = Bld.CreateNUWAdd(
3143  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3144  ScratchpadBasePtr = Bld.CreateNUWMul(
3145  ScratchpadBasePtr,
3146  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3147 
3148  if (IncrScratchpadDest)
3149  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3150  else /* IncrScratchpadSrc = true */
3151  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3152  }
3153 
3154  ++Idx;
3155  }
3156 }
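In the generated code a Reduce list is simply an array of void* pointers, one per reduction variable, so the ThreadCopy direction above amounts to dereferencing both lists element by element. A sketch for two scalar variables (types and names are assumptions for illustration):

// Illustrative sketch of the ThreadCopy direction for float/double elements.
void reduce_list_thread_copy_sketch(void **SrcList, void **DestList) {
  *static_cast<float *>(DestList[0]) = *static_cast<float *>(SrcList[0]);
  *static_cast<double *>(DestList[1]) = *static_cast<double *>(SrcList[1]);
}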
3157 
3158 /// This function emits a helper that gathers Reduce lists from the first
3159 /// lane of every active warp to lanes in the first warp.
3160 ///
3161 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3162 /// shared smem[warp_size];
3163 /// For all data entries D in reduce_data:
3164 /// sync
3165 /// If (I am the first lane in each warp)
3166 /// Copy my local D to smem[warp_id]
3167 /// sync
3168 /// if (I am the first warp)
3169 /// Copy smem[thread_id] to my local D
3170 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3171  ArrayRef<const Expr *> Privates,
3172  QualType ReductionArrayTy,
3173  SourceLocation Loc) {
3174  ASTContext &C = CGM.getContext();
3175  llvm::Module &M = CGM.getModule();
3176 
3177  // ReduceList: thread local Reduce list.
3178  // At the stage of the computation when this function is called, partially
3179  // aggregated values reside in the first lane of every active warp.
3180  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3181  C.VoidPtrTy, ImplicitParamDecl::Other);
3182  // NumWarps: number of warps active in the parallel region. This could
3183  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3184  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3185  C.getIntTypeForBitwidth(32, /* Signed */ true),
3186  ImplicitParamDecl::Other);
3187  FunctionArgList Args;
3188  Args.push_back(&ReduceListArg);
3189  Args.push_back(&NumWarpsArg);
3190 
3191  const CGFunctionInfo &CGFI =
3192  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3193  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3194  llvm::GlobalValue::InternalLinkage,
3195  "_omp_reduction_inter_warp_copy_func", &M);
3196  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3197  Fn->setDoesNotRecurse();
3198  CodeGenFunction CGF(CGM);
3199  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3200 
3201  CGBuilderTy &Bld = CGF.Builder;
3202 
3203  // This array is used as a medium to transfer, one reduce element at a time,
3204  // the data from the first lane of every warp to lanes in the first warp
3205  // in order to perform the final step of a reduction in a parallel region
3206  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3207  // for reduced latency, as well as to have a distinct copy for concurrently
3208  // executing target regions. The array is declared with common linkage so
3209  // as to be shared across compilation units.
3210  StringRef TransferMediumName =
3211  "__openmp_nvptx_data_transfer_temporary_storage";
3212  llvm::GlobalVariable *TransferMedium =
3213  M.getGlobalVariable(TransferMediumName);
3214  if (!TransferMedium) {
3215  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3216  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3217  TransferMedium = new llvm::GlobalVariable(
3218  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3219  llvm::Constant::getNullValue(Ty), TransferMediumName,
3220  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3221  SharedAddressSpace);
3222  CGM.addCompilerUsedGlobal(TransferMedium);
3223  }
3224 
3225  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3226  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3227  // nvptx_lane_id = nvptx_id % warpsize
3228  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3229  // nvptx_warp_id = nvptx_id / warpsize
3230  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3231 
3232  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3233  Address LocalReduceList(
3234  Bld.CreatePointerBitCastOrAddrSpaceCast(
3235  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3236  C.VoidPtrTy, Loc),
3237  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3238  CGF.getPointerAlign());
3239 
3240  unsigned Idx = 0;
3241  for (const Expr *Private : Privates) {
3242  //
3243  // Warp master copies reduce element to transfer medium in __shared__
3244  // memory.
3245  //
3246  unsigned RealTySize =
3247  C.getTypeSizeInChars(Private->getType())
3248  .alignTo(C.getTypeAlignInChars(Private->getType()))
3249  .getQuantity();
3250  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
3251  unsigned NumIters = RealTySize / TySize;
3252  if (NumIters == 0)
3253  continue;
3254  QualType CType = C.getIntTypeForBitwidth(
3255  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3256  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3257  CharUnits Align = CharUnits::fromQuantity(TySize);
3258  llvm::Value *Cnt = nullptr;
3259  Address CntAddr = Address::invalid();
3260  llvm::BasicBlock *PrecondBB = nullptr;
3261  llvm::BasicBlock *ExitBB = nullptr;
3262  if (NumIters > 1) {
3263  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3264  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3265  /*Volatile=*/false, C.IntTy);
3266  PrecondBB = CGF.createBasicBlock("precond");
3267  ExitBB = CGF.createBasicBlock("exit");
3268  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3269  // There is no need to emit line number for unconditional branch.
3270  (void)ApplyDebugLocation::CreateEmpty(CGF);
3271  CGF.EmitBlock(PrecondBB);
3272  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3273  llvm::Value *Cmp =
3274  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3275  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3276  CGF.EmitBlock(BodyBB);
3277  }
3278  // kmpc_barrier.
3279  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3280  /*EmitChecks=*/false,
3281  /*ForceSimpleCall=*/true);
3282  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3283  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3284  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3285 
3286  // if (lane_id == 0)
3287  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3288  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3289  CGF.EmitBlock(ThenBB);
3290 
3291  // Reduce element = LocalReduceList[i]
3292  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3293  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3294  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3295  // elemptr = ((CopyType*)(elemptrptr)) + I
3296  Address ElemPtr = Address(ElemPtrPtr, Align);
3297  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3298  if (NumIters > 1) {
3299  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3300  ElemPtr.getAlignment());
3301  }
3302 
3303  // Get pointer to location in transfer medium.
3304  // MediumPtr = &medium[warp_id]
3305  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3306  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3307  Address MediumPtr(MediumPtrVal, Align);
3308  // Casting to actual data type.
3309  // MediumPtr = (CopyType*)MediumPtrAddr;
3310  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3311 
3312  // elem = *elemptr
3313  //*MediumPtr = elem
3314  llvm::Value *Elem =
3315  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3316  // Store the source element value to the dest element address.
3317  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3318 
3319  Bld.CreateBr(MergeBB);
3320 
3321  CGF.EmitBlock(ElseBB);
3322  Bld.CreateBr(MergeBB);
3323 
3324  CGF.EmitBlock(MergeBB);
3325 
3326  // kmpc_barrier.
3327  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3328  /*EmitChecks=*/false,
3329  /*ForceSimpleCall=*/true);
3330 
3331  //
3332  // Warp 0 copies reduce element from transfer medium.
3333  //
3334  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3335  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3336  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3337 
3338  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3339  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3340  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3341 
3342  // Up to 32 threads in warp 0 are active.
3343  llvm::Value *IsActiveThread =
3344  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3345  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3346 
3347  CGF.EmitBlock(W0ThenBB);
3348 
3349  // SrcMediumPtr = &medium[tid]
3350  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3351  TransferMedium,
3352  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3353  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3354  // SrcMediumVal = *SrcMediumPtr;
3355  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3356 
3357  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3358  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3359  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3360  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3361  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3362  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3363  if (NumIters > 1) {
3364  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3365  TargetElemPtr.getAlignment());
3366  }
3367 
3368  // *TargetElemPtr = SrcMediumVal;
3369  llvm::Value *SrcMediumValue =
3370  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3371  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3372  CType);
3373  Bld.CreateBr(W0MergeBB);
3374 
3375  CGF.EmitBlock(W0ElseBB);
3376  Bld.CreateBr(W0MergeBB);
3377 
3378  CGF.EmitBlock(W0MergeBB);
3379 
3380  if (NumIters > 1) {
3381  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3382  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3383  CGF.EmitBranch(PrecondBB);
3384  (void)ApplyDebugLocation::CreateEmpty(CGF);
3385  CGF.EmitBlock(ExitBB);
3386  }
3387  RealTySize %= TySize;
3388  }
3389  ++Idx;
3390  }
3391 
3392  CGF.FinishFunction();
3393  return Fn;
3394 }
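Conceptually, the emitted _omp_reduction_inter_warp_copy_func behaves like the following CUDA device function for a single 32-bit reduce element (a sketch assuming a fixed 32-lane warp; the structure and names are illustrative, not the emitted IR):

// Illustrative sketch only -- mirrors the emitted control flow, not the IR.
__device__ void inter_warp_copy_sketch(int32_t *elem, int num_warps) {
  __shared__ int32_t medium[32];       // one slot per warp, in shared memory
  unsigned tid  = threadIdx.x;
  unsigned lane = tid % 32;            // nvptx_lane_id
  unsigned warp = tid / 32;            // nvptx_warp_id
  __syncthreads();                     // kmpc_barrier
  if (lane == 0)                       // each warp master publishes its value
    medium[warp] = *elem;
  __syncthreads();                     // kmpc_barrier
  if (tid < (unsigned)num_warps)       // lanes of warp 0 gather the values
    *elem = medium[tid];
}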
3395 
3396 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3397 /// in the same warp. It uses shuffle instructions to copy over data from
3398 /// a remote lane's stack. The reduction algorithm performed is specified
3399 /// by the fourth parameter.
3400 ///
3401 /// Algorithm Versions.
3402 /// Full Warp Reduce (argument value 0):
3403 /// This algorithm assumes that all 32 lanes are active and gathers
3404 /// data from these 32 lanes, producing a single resultant value.
3405 /// Contiguous Partial Warp Reduce (argument value 1):
3406 /// This algorithm assumes that only a *contiguous* subset of lanes
3407 /// are active. This happens for the last warp in a parallel region
3408 /// when the user specified num_threads is not an integer multiple of
3409 /// 32. This contiguous subset always starts with the zeroth lane.
3410 /// Partial Warp Reduce (argument value 2):
3411 /// This algorithm gathers data from any number of lanes at any position.
3412 /// All reduced values are stored in the lowest possible lane. The set
3413 /// of problems every algorithm addresses is a superset of those
3414 /// addressable by algorithms with a lower version number. Overhead
3415 /// increases as algorithm version increases.
3416 ///
3417 /// Terminology
3418 /// Reduce element:
3419 /// Reduce element refers to the individual data field with primitive
3420 /// data types to be combined and reduced across threads.
3421 /// Reduce list:
3422 /// Reduce list refers to a collection of local, thread-private
3423 /// reduce elements.
3424 /// Remote Reduce list:
3425 /// Remote Reduce list refers to a collection of remote (relative to
3426 /// the current thread) reduce elements.
3427 ///
3428 /// We distinguish between three states of threads that are important to
3429 /// the implementation of this function.
3430 /// Alive threads:
3431 /// Threads in a warp executing the SIMT instruction, as distinguished from
3432 /// threads that are inactive due to divergent control flow.
3433 /// Active threads:
3434 /// The minimal set of threads that has to be alive upon entry to this
3435 /// function. The computation is correct iff active threads are alive.
3436 /// Some threads are alive but they are not active because they do not
3437 /// contribute to the computation in any useful manner. Turning them off
3438 /// may introduce control flow overheads without any tangible benefits.
3439 /// Effective threads:
3440 /// In order to comply with the argument requirements of the shuffle
3441 /// function, we must keep all lanes holding data alive. But at most
3442 /// half of them perform value aggregation; we refer to this half of
3443 /// threads as effective. The other half is simply handing off their
3444 /// data.
3445 ///
3446 /// Procedure
3447 /// Value shuffle:
3448 /// In this step active threads transfer data from higher lane positions
3449 /// in the warp to lower lane positions, creating Remote Reduce list.
3450 /// Value aggregation:
3451 /// In this step, effective threads combine their thread local Reduce list
3452 /// with Remote Reduce list and store the result in the thread local
3453 /// Reduce list.
3454 /// Value copy:
3455 /// In this step, we deal with the assumption made by algorithm 2
3456 /// (i.e. contiguity assumption). When we have an odd number of lanes
3457 /// active, say 2k+1, only k threads will be effective and therefore k
3458 /// new values will be produced. However, the Reduce list owned by the
3459 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3460 /// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
3461 /// that the contiguity assumption still holds.
3462 static llvm::Function *emitShuffleAndReduceFunction(
3463  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3464  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3465  ASTContext &C = CGM.getContext();
3466 
3467  // Thread local Reduce list used to host the values of data to be reduced.
3468  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3469  C.VoidPtrTy, ImplicitParamDecl::Other);
3470  // Current lane id; could be logical.
3471  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3472  ImplicitParamDecl::Other);
3473  // Offset of the remote source lane relative to the current lane.
3474  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3475  C.ShortTy, ImplicitParamDecl::Other);
3476  // Algorithm version. This is expected to be known at compile time.
3477  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3478  C.ShortTy, ImplicitParamDecl::Other);
3479  FunctionArgList Args;
3480  Args.push_back(&ReduceListArg);
3481  Args.push_back(&LaneIDArg);
3482  Args.push_back(&RemoteLaneOffsetArg);
3483  Args.push_back(&AlgoVerArg);
3484 
3485  const CGFunctionInfo &CGFI =
3486  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3487  auto *Fn = llvm::Function::Create(
3488  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3489  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3490  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3491  Fn->setDoesNotRecurse();
3492  if (CGM.getLangOpts().Optimize) {
3493  Fn->removeFnAttr(llvm::Attribute::NoInline);
3494  Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
3495  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
3496  }
3497 
3498  CodeGenFunction CGF(CGM);
3499  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3500 
3501  CGBuilderTy &Bld = CGF.Builder;
3502 
3503  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3504  Address LocalReduceList(
3505  Bld.CreatePointerBitCastOrAddrSpaceCast(
3506  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3507  C.VoidPtrTy, SourceLocation()),
3508  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3509  CGF.getPointerAlign());
3510 
3511  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3512  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3513  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3514 
3515  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3516  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3517  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3518 
3519  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3520  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3521  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3522 
3523  // Create a local thread-private variable to host the Reduce list
3524  // from a remote lane.
3525  Address RemoteReduceList =
3526  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3527 
3528  // This loop iterates through the list of reduce elements and copies,
3529  // element by element, from a remote lane in the warp to RemoteReduceList,
3530  // hosted on the thread's stack.
3531  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3532  LocalReduceList, RemoteReduceList,
3533  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3534  /*ScratchpadIndex=*/nullptr,
3535  /*ScratchpadWidth=*/nullptr});
3536 
3537  // The actions to be performed on the Remote Reduce list are dependent
3538  // on the algorithm version.
3539  //
3540  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3541  // LaneId % 2 == 0 && Offset > 0):
3542  // do the reduction value aggregation
3543  //
3544  // The thread local variable Reduce list is mutated in place to host the
3545  // reduced data, which is the aggregated value produced from local and
3546  // remote lanes.
3547  //
3548  // Note that AlgoVer is expected to be a constant integer known at compile
3549  // time.
3550  // When AlgoVer==0, the first conjunction evaluates to true, making
3551  // the entire predicate true during compile time.
3552  // When AlgoVer==1, the second conjunction has only the second part to be
3553  // evaluated during runtime. Other conjunctions evaluate to false
3554  // during compile time.
3555  // When AlgoVer==2, the third conjunction has only the second part to be
3556  // evaluated during runtime. Other conjunctions evaluate to false
3557  // during compile time.
3558  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3559 
3560  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3561  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3562  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3563 
3564  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3565  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3566  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3567  CondAlgo2 = Bld.CreateAnd(
3568  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3569 
3570  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3571  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3572 
3573  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3574  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3575  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3576  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3577 
3578  CGF.EmitBlock(ThenBB);
3579  // reduce_function(LocalReduceList, RemoteReduceList)
3580  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3581  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3582  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3583  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3584  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3585  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3586  Bld.CreateBr(MergeBB);
3587 
3588  CGF.EmitBlock(ElseBB);
3589  Bld.CreateBr(MergeBB);
3590 
3591  CGF.EmitBlock(MergeBB);
3592 
3593  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3594  // Reduce list.
3595  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3596  llvm::Value *CondCopy = Bld.CreateAnd(
3597  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3598 
3599  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3600  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3601  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3602  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3603 
3604  CGF.EmitBlock(CpyThenBB);
3605  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3606  RemoteReduceList, LocalReduceList);
3607  Bld.CreateBr(CpyMergeBB);
3608 
3609  CGF.EmitBlock(CpyElseBB);
3610  Bld.CreateBr(CpyMergeBB);
3611 
3612  CGF.EmitBlock(CpyMergeBB);
3613 
3614  CGF.FinishFunction();
3615  return Fn;
3616 }
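The AlgoVer predicates built above are easier to read in plain form. A C++ sketch of the decisions the generated helper encodes (illustrative only):

// Illustrative restatement of the emitted predicates.
static bool shouldReduce(int AlgoVer, int LaneId, int RemoteLaneOffset) {
  return AlgoVer == 0 ||                                   // full warp
         (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||    // contiguous subset
         (AlgoVer == 2 && LaneId % 2 == 0 &&
          RemoteLaneOffset > 0);                           // dispersed lanes
}

static bool shouldCopyRemoteList(int AlgoVer, int LaneId, int RemoteLaneOffset) {
  // Preserve the contiguity assumption of algorithm 1: lanes at or beyond the
  // offset simply adopt the remote lane's Reduce list.
  return AlgoVer == 1 && LaneId >= RemoteLaneOffset;
}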
3617 
3618 /// This function emits a helper that copies all the reduction variables from
3619 /// the team into the provided global buffer for the reduction variables.
3620 ///
3621 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3622 /// For all data entries D in reduce_data:
3623 /// Copy local D to buffer.D[Idx]
3624 static llvm::Value *emitListToGlobalCopyFunction(
3625  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3626  QualType ReductionArrayTy, SourceLocation Loc,
3627  const RecordDecl *TeamReductionRec,
3628  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3629  &VarFieldMap) {
3630  ASTContext &C = CGM.getContext();
3631 
3632  // Buffer: global reduction buffer.
3633  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3634  C.VoidPtrTy, ImplicitParamDecl::Other);
3635  // Idx: index of the buffer.
3636  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3637  ImplicitParamDecl::Other);
3638  // ReduceList: thread local Reduce list.
3639  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3640  C.VoidPtrTy, ImplicitParamDecl::Other);
3641  FunctionArgList Args;
3642  Args.push_back(&BufferArg);
3643  Args.push_back(&IdxArg);
3644  Args.push_back(&ReduceListArg);
3645 
3646  const CGFunctionInfo &CGFI =
3647  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3648  auto *Fn = llvm::Function::Create(
3649  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3650  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3651  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3652  Fn->setDoesNotRecurse();
3653  CodeGenFunction CGF(CGM);
3654  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3655 
3656  CGBuilderTy &Bld = CGF.Builder;
3657 
3658  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3659  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3660  Address LocalReduceList(
3661  Bld.CreatePointerBitCastOrAddrSpaceCast(
3662  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3663  C.VoidPtrTy, Loc),
3664  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3665  CGF.getPointerAlign());
3666  QualType StaticTy = C.getRecordType(TeamReductionRec);
3667  llvm::Type *LLVMReductionsBufferTy =
3668  CGM.getTypes().ConvertTypeForMem(StaticTy);
3669  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3670  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3671  LLVMReductionsBufferTy->getPointerTo());
3672  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3673  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3674  /*Volatile=*/false, C.IntTy,
3675  Loc)};
3676  unsigned Idx = 0;
3677  for (const Expr *Private : Privates) {
3678  // Reduce element = LocalReduceList[i]
3679  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3680  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3681  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3682  // elemptr = ((CopyType*)(elemptrptr)) + I
3683  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3684  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3685  Address ElemPtr =
3686  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3687  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3688  // Global = Buffer.VD[Idx];
3689  const FieldDecl *FD = VarFieldMap.lookup(VD);
3690  LValue GlobLVal = CGF.EmitLValueForField(
3691  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3692  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3693  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3694  switch (CGF.getEvaluationKind(Private->getType())) {
3695  case TEK_Scalar: {
3696  llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3697  Private->getType(), Loc);
3698  CGF.EmitStoreOfScalar(V, GlobLVal);
3699  break;
3700  }
3701  case TEK_Complex: {
3702  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3703  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3704  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3705  break;
3706  }
3707  case TEK_Aggregate:
3708  CGF.EmitAggregateCopy(GlobLVal,
3709  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3710  Private->getType(), AggValueSlot::DoesNotOverlap);
3711  break;
3712  }
3713  ++Idx;
3714  }
3715 
3716  CGF.FinishFunction();
3717  return Fn;
3718 }
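For a concrete feel, with two reduction variables (float foo, double bar) the helper emitted here is roughly equivalent to the following C++; the buffer layout, the record name, and the num_of_records value are assumptions based on the scheme described in this file:

// Illustrative sketch only; reduction_buffer_t is a made-up name.
struct reduction_buffer_t {
  float foo[1024];   // one slot per buffer record (num_of_records assumed 1024)
  double bar[1024];
};

void list_to_global_copy_sketch(void *buffer, int idx, void *reduce_data) {
  auto *buf = static_cast<reduction_buffer_t *>(buffer);
  void **list = static_cast<void **>(reduce_data);
  buf->foo[idx] = *static_cast<float *>(list[0]);  // copy local D to buffer.D[Idx]
  buf->bar[idx] = *static_cast<double *>(list[1]);
}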
3719 
3720 /// This function emits a helper that reduces all the reduction variables from
3721 /// the team into the provided global buffer for the reduction variables.
3722 ///
3723 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3724 /// void *GlobPtrs[];
3725 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3726 /// ...
3727 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3728 /// reduce_function(GlobPtrs, reduce_data);
3729 static llvm::Value *emitListToGlobalReduceFunction(
3730  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3731  QualType ReductionArrayTy, SourceLocation Loc,
3732  const RecordDecl *TeamReductionRec,
3733  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3734  &VarFieldMap,
3735  llvm::Function *ReduceFn) {
3736  ASTContext &C = CGM.getContext();
3737 
3738  // Buffer: global reduction buffer.
3739  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3740  C.VoidPtrTy, ImplicitParamDecl::Other);
3741  // Idx: index of the buffer.
3742  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3743  ImplicitParamDecl::Other);
3744  // ReduceList: thread local Reduce list.
3745  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3746  C.VoidPtrTy, ImplicitParamDecl::Other);
3747  FunctionArgList Args;
3748  Args.push_back(&BufferArg);
3749  Args.push_back(&IdxArg);
3750  Args.push_back(&ReduceListArg);
3751 
3752  const CGFunctionInfo &CGFI =
3753  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3754  auto *Fn = llvm::Function::Create(
3755  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3756  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3757  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3758  Fn->setDoesNotRecurse();
3759  CodeGenFunction CGF(CGM);
3760  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3761 
3762  CGBuilderTy &Bld = CGF.Builder;
3763 
3764  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3765  QualType StaticTy = C.getRecordType(TeamReductionRec);
3766  llvm::Type *LLVMReductionsBufferTy =
3767  CGM.getTypes().ConvertTypeForMem(StaticTy);
3768  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3769  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3770  LLVMReductionsBufferTy->getPointerTo());
3771 
3772  // 1. Build a list of reduction variables.
3773  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3774  Address ReductionList =
3775  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3776  auto IPriv = Privates.begin();
3777  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3778  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3779  /*Volatile=*/false, C.IntTy,
3780  Loc)};
3781  unsigned Idx = 0;
3782  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3783  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3784  // Global = Buffer.VD[Idx];
3785  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3786  const FieldDecl *FD = VarFieldMap.lookup(VD);
3787  LValue GlobLVal = CGF.EmitLValueForField(
3788  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3789  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3790  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3791  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3792  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3793  // Store array size.
3794  ++Idx;
3795  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3796  llvm::Value *Size = CGF.Builder.CreateIntCast(
3797  CGF.getVLASize(
3798  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3799  .NumElts,
3800  CGF.SizeTy, /*isSigned=*/false);
3801  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3802  Elem);
3803  }
3804  }
3805 
3806  // Call reduce_function(GlobalReduceList, ReduceList)
3807  llvm::Value *GlobalReduceList =
3808  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3809  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3810  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3811  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3812  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3813  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3814  CGF.FinishFunction();
3815  return Fn;
3816 }
3817 
3818 /// This function emits a helper that copies all the reduction variables from
3819 /// the provided global buffer back into the team's reduction variables.
3820 ///
3821 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3822 /// For all data entries D in reduce_data:
3823 /// Copy buffer.D[Idx] to local D;
3824 static llvm::Value *emitGlobalToListCopyFunction(
3825  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3826  QualType ReductionArrayTy, SourceLocation Loc,
3827  const RecordDecl *TeamReductionRec,
3828  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3829  &VarFieldMap) {
3830  ASTContext &C = CGM.getContext();
3831 
3832  // Buffer: global reduction buffer.
3833  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3834  C.VoidPtrTy, ImplicitParamDecl::Other);
3835  // Idx: index of the buffer.
3836  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3837  ImplicitParamDecl::Other);
3838  // ReduceList: thread local Reduce list.
3839  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3840  C.VoidPtrTy, ImplicitParamDecl::Other);
3841  FunctionArgList Args;
3842  Args.push_back(&BufferArg);
3843  Args.push_back(&IdxArg);
3844  Args.push_back(&ReduceListArg);
3845 
3846  const CGFunctionInfo &CGFI =
3847  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3848  auto *Fn = llvm::Function::Create(
3849  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3850  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3851  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3852  Fn->setDoesNotRecurse();
3853  CodeGenFunction CGF(CGM);
3854  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3855 
3856  CGBuilderTy &Bld = CGF.Builder;
3857 
3858  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3859  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3860  Address LocalReduceList(
3861  Bld.CreatePointerBitCastOrAddrSpaceCast(
3862  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3863  C.VoidPtrTy, Loc),
3864  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3865  CGF.getPointerAlign());
3866  QualType StaticTy = C.getRecordType(TeamReductionRec);
3867  llvm::Type *LLVMReductionsBufferTy =
3868  CGM.getTypes().ConvertTypeForMem(StaticTy);
3869  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3870  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3871  LLVMReductionsBufferTy->getPointerTo());
3872 
3873  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3874  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3875  /*Volatile=*/false, C.IntTy,
3876  Loc)};
3877  unsigned Idx = 0;
3878  for (const Expr *Private : Privates) {
3879  // Reduce element = LocalReduceList[i]
3880  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3881  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3882  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3883  // elemptr = ((CopyType*)(elemptrptr)) + I
3884  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3885  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3886  Address ElemPtr =
3887  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3888  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3889  // Global = Buffer.VD[Idx];
3890  const FieldDecl *FD = VarFieldMap.lookup(VD);
3891  LValue GlobLVal = CGF.EmitLValueForField(
3892  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3893  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3894  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3895  switch (CGF.getEvaluationKind(Private->getType())) {
3896  case TEK_Scalar: {
3897  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3898  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3899  break;
3900  }
3901  case TEK_Complex: {
3902  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3903  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3904  /*isInit=*/false);
3905  break;
3906  }
3907  case TEK_Aggregate:
3908  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3909  GlobLVal, Private->getType(),
3910  AggValueSlot::DoesNotOverlap);
3911  break;
3912  }
3913  ++Idx;
3914  }
3915 
3916  CGF.FinishFunction();
3917  return Fn;
3918 }
3919 
3920 /// This function emits a helper that reduces all the reduction variables in the
3921 /// provided global buffer into the team's local reduce list (reduce_data).
3922 ///
3923 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3924 /// void *GlobPtrs[];
3925 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3926 /// ...
3927 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3928 /// reduce_function(reduce_data, GlobPtrs);
3929 static llvm::Value *emitGlobalToListReduceFunction(
3930  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3931  QualType ReductionArrayTy, SourceLocation Loc,
3932  const RecordDecl *TeamReductionRec,
3933  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3934  &VarFieldMap,
3935  llvm::Function *ReduceFn) {
3936  ASTContext &C = CGM.getContext();
3937 
3938  // Buffer: global reduction buffer.
3939  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3940  C.VoidPtrTy, ImplicitParamDecl::Other);
3941  // Idx: index of the buffer.
3942  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3943  ImplicitParamDecl::Other);
3944  // ReduceList: thread local Reduce list.
3945  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3946  C.VoidPtrTy, ImplicitParamDecl::Other);
3947  FunctionArgList Args;
3948  Args.push_back(&BufferArg);
3949  Args.push_back(&IdxArg);
3950  Args.push_back(&ReduceListArg);
3951 
3952  const CGFunctionInfo &CGFI =
3953  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3954  auto *Fn = llvm::Function::Create(
3955  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3956  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3957  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3958  Fn->setDoesNotRecurse();
3959  CodeGenFunction CGF(CGM);
3960  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3961 
3962  CGBuilderTy &Bld = CGF.Builder;
3963 
3964  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3965  QualType StaticTy = C.getRecordType(TeamReductionRec);
3966  llvm::Type *LLVMReductionsBufferTy =
3967  CGM.getTypes().ConvertTypeForMem(StaticTy);
3968  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3969  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3970  LLVMReductionsBufferTy->getPointerTo());
3971 
3972  // 1. Build a list of reduction variables.
3973  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3974  Address ReductionList =
3975  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3976  auto IPriv = Privates.begin();
3977  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3978  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3979  /*Volatile=*/false, C.IntTy,
3980  Loc)};
3981  unsigned Idx = 0;
3982  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3983  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3984  // Global = Buffer.VD[Idx];
3985  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3986  const FieldDecl *FD = VarFieldMap.lookup(VD);
3987  LValue GlobLVal = CGF.EmitLValueForField(
3988  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3989  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3990  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3991  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3992  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3993  // Store array size.
3994  ++Idx;
3995  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3996  llvm::Value *Size = CGF.Builder.CreateIntCast(
3997  CGF.getVLASize(
3998  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3999  .NumElts,
4000  CGF.SizeTy, /*isSigned=*/false);
4001  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4002  Elem);
4003  }
4004  }
4005 
4006  // Call reduce_function(ReduceList, GlobalReduceList)
4007  llvm::Value *GlobalReduceList =
4008  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
4009  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
4010  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
4011  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
4012  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
4013  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
4014  CGF.FinishFunction();
4015  return Fn;
4016 }
4017 
4018 ///
4019 /// Design of OpenMP reductions on the GPU
4020 ///
4021 /// Consider a typical OpenMP program with one or more reduction
4022 /// clauses:
4023 ///
4024 /// float foo;
4025 /// double bar;
4026 /// #pragma omp target teams distribute parallel for \
4027 /// reduction(+:foo) reduction(*:bar)
4028 /// for (int i = 0; i < N; i++) {
4029 /// foo += A[i]; bar *= B[i];
4030 /// }
4031 ///
4032 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
4033 /// all teams. In our OpenMP implementation on the NVPTX device an
4034 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4035 /// within a team are mapped to CUDA threads within a threadblock.
4036 /// Our goal is to efficiently aggregate values across all OpenMP
4037 /// threads such that:
4038 ///
4039 /// - the compiler and runtime are logically concise, and
4040 /// - the reduction is performed efficiently in a hierarchical
4041 /// manner as follows: within OpenMP threads in the same warp,
4042 /// across warps in a threadblock, and finally across teams on
4043 /// the NVPTX device.
4044 ///
4045 /// Introduction to Decoupling
4046 ///
4047 /// We would like to decouple the compiler and the runtime so that the
4048 /// latter is ignorant of the reduction variables (number, data types)
4049 /// and the reduction operators. This allows a simpler interface
4050 /// and implementation while still attaining good performance.
4051 ///
4052 /// Pseudocode for the aforementioned OpenMP program generated by the
4053 /// compiler is as follows:
4054 ///
4055 /// 1. Create private copies of reduction variables on each OpenMP
4056 /// thread: 'foo_private', 'bar_private'
4057 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4058 /// to it and writes the result in 'foo_private' and 'bar_private'
4059 /// respectively.
4060 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4061 /// and store the result on the team master:
4062 ///
4063 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4064 /// reduceData, shuffleReduceFn, interWarpCpyFn)
4065 ///
4066 /// where:
4067 /// struct ReduceData {
4068 /// double *foo;
4069 /// double *bar;
4070 /// } reduceData
4071 /// reduceData.foo = &foo_private
4072 /// reduceData.bar = &bar_private
4073 ///
4074 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4075 /// auxiliary functions generated by the compiler that operate on
4076 /// variables of type 'ReduceData'. They help the runtime perform
4077 /// algorithmic steps in a data agnostic manner.
4078 ///
4079 /// 'shuffleReduceFn' is a pointer to a function that reduces data
4080 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
4081 /// same warp. It takes the following arguments as input:
4082 ///
4083 /// a. variable of type 'ReduceData' on the calling lane,
4084 /// b. its lane_id,
4085 /// c. an offset relative to the current lane_id to generate a
4086 /// remote_lane_id. The remote lane contains the second
4087 /// variable of type 'ReduceData' that is to be reduced.
4088 /// d. an algorithm version parameter determining which reduction
4089 /// algorithm to use.
4090 ///
4091 /// 'shuffleReduceFn' retrieves data from the remote lane using
4092 /// efficient GPU shuffle intrinsics and reduces, using the
4093 /// algorithm specified by the 4th parameter, the two operands
4094 /// element-wise. The result is written to the first operand.
4095 ///
4096 /// Different reduction algorithms are implemented in different
4097 /// runtime functions, all calling 'shuffleReduceFn' to perform
4098 /// the essential reduction step. Therefore, based on the 4th
4099 /// parameter, this function behaves slightly differently to
4100 /// cooperate with the runtime to ensure correctness under
4101 /// different circumstances.
4102 ///
4103 /// 'InterWarpCpyFn' is a pointer to a function that transfers
4104 /// reduced variables across warps. It tunnels, through CUDA
4105 /// shared memory, the thread-private data of type 'ReduceData'
4106 /// from lane 0 of each warp to a lane in the first warp.
4107 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4108 /// The last team writes the global reduced value to memory.
4109 ///
4110 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
4111 /// reduceData, shuffleReduceFn, interWarpCpyFn,
4112 /// scratchpadCopyFn, loadAndReduceFn)
4113 ///
4114 /// 'scratchpadCopyFn' is a helper that stores reduced
4115 /// data from the team master to a scratchpad array in
4116 /// global memory.
4117 ///
4118 /// 'loadAndReduceFn' is a helper that loads data from
4119 /// the scratchpad array and reduces it with the input
4120 /// operand.
4121 ///
4122 /// These compiler generated functions hide address
4123 /// calculation and alignment information from the runtime.
4124 /// 5. if ret == 1:
4125 /// The team master of the last team stores the reduced
4126 /// result to the globals in memory.
4127 /// foo += reduceData.foo; bar *= reduceData.bar
4128 ///
4129 ///
4130 /// Warp Reduction Algorithms
4131 ///
4132 /// On the warp level, we have three algorithms implemented in the
4133 /// OpenMP runtime depending on the number of active lanes:
4134 ///
4135 /// Full Warp Reduction
4136 ///
4137 /// The reduce algorithm within a warp where all lanes are active
4138 /// is implemented in the runtime as follows:
4139 ///
4140 /// full_warp_reduce(void *reduce_data,
4141 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4142 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4143 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
4144 /// }
4145 ///
4146 /// The algorithm completes in log(2, WARPSIZE) steps.
4147 ///
4148 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4149 /// not used; therefore, we save instructions by not retrieving lane_id
4150 /// from the corresponding special registers. The 4th parameter, which
4151 /// represents the version of the algorithm being used, is set to 0 to
4152 /// signify full warp reduction.
4153 ///
4154 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4155 ///
4156 /// #reduce_elem refers to an element in the local lane's data structure
4157 /// #remote_elem is retrieved from a remote lane
4158 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4159 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
4160 ///
4161 /// Contiguous Partial Warp Reduction
4162 ///
4163 /// This reduce algorithm is used within a warp where only the first
4164 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
4165 /// number of OpenMP threads in a parallel region is not a multiple of
4166 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
4167 ///
4168 /// void
4169 /// contiguous_partial_reduce(void *reduce_data,
4170 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
4171 /// int size, int lane_id) {
4172 /// int curr_size;
4173 /// int offset;
4174 /// curr_size = size;
4175 /// offset = curr_size/2;
4176 /// while (offset>0) {
4177 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4178 /// curr_size = (curr_size+1)/2;
4179 /// offset = curr_size/2;
4180 /// }
4181 /// }
4182 ///
4183 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4184 ///
4185 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4186 /// if (lane_id < offset)
4187 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4188 /// else
4189 /// reduce_elem = remote_elem
4190 ///
4191 /// This algorithm assumes that the data to be reduced are located in a
4192 /// contiguous subset of lanes starting from the first. When there is
4193 /// an odd number of active lanes, the data in the last lane is not
4194 /// aggregated with any other lane's data but is instead copied over.
4195 ///
4196 /// Dispersed Partial Warp Reduction
4197 ///
4198 /// This algorithm is used within a warp when any discontiguous subset of
4199 /// lanes are active. It is used to implement the reduction operation
4200 /// across lanes in an OpenMP simd region or in a nested parallel region.
4201 ///
4202 /// void
4203 /// dispersed_partial_reduce(void *reduce_data,
4204 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4205 /// int size, remote_id;
4206 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
4207 /// do {
4208 /// remote_id = next_active_lane_id_right_after_me();
4209 /// # the above function returns 0 if no active lane
4210 /// # is present right after the current lane.
4211 /// size = number_of_active_lanes_in_this_warp();
4212 /// logical_lane_id /= 2;
4213 /// ShuffleReduceFn(reduce_data, logical_lane_id,
4214 /// remote_id-1-threadIdx.x, 2);
4215 /// } while (logical_lane_id % 2 == 0 && size > 1);
4216 /// }
4217 ///
4218 /// There is no assumption made about the initial state of the reduction.
4219 /// Any number of lanes (>=1) could be active at any position. The reduction
4220 /// result is returned in the first active lane.
4221 ///
4222 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4223 ///
4224 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4225 /// if (lane_id % 2 == 0 && offset > 0)
4226 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4227 /// else
4228 /// reduce_elem = remote_elem
4229 ///
4230 ///
4231 /// Intra-Team Reduction
4232 ///
4233 /// This function, as implemented in the runtime call
4234 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4235 /// threads in a team. It first reduces within a warp using the
4236 /// aforementioned algorithms. We then proceed to gather all such
4237 /// reduced values at the first warp.
4238 ///
4239 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4240 /// data from each of the "warp master" (zeroth lane of each warp, where
4241 /// data from each "warp master" (zeroth lane of each warp, where
4242 /// a mathematical sense) the problem of reduction across warp masters in
4243 /// a block to the problem of warp reduction.
4244 ///
4245 ///
4246 /// Inter-Team Reduction
4247 ///
4248 /// Once a team has reduced its data to a single value, it is stored in
4249 /// a global scratchpad array. Since each team has a distinct slot, this
4250 /// can be done without locking.
4251 ///
4252 /// The last team to write to the scratchpad array proceeds to reduce the
4253 /// scratchpad array. One or more workers in the last team use the helper
4254 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4255 /// the k'th worker reduces every k'th element.
4256 ///
4257 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4258 /// reduce across workers and compute a globally reduced value.
4259 ///
4260 void CGOpenMPRuntimeNVPTX::emitReduction(
4261  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4262  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4263  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4264  if (!CGF.HaveInsertPoint())
4265  return;
4266 
4267  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4268 #ifndef NDEBUG
4269  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4270 #endif
4271 
4272  if (Options.SimpleReduction) {
4273  assert(!TeamsReduction && !ParallelReduction &&
4274  "Invalid reduction selection in emitReduction.");
4275  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4276  ReductionOps, Options);
4277  return;
4278  }
4279 
4280  assert((TeamsReduction || ParallelReduction) &&
4281  "Invalid reduction selection in emitReduction.");
4282 
4283  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4284  // RedList, shuffle_reduce_func, interwarp_copy_func);
4285  // or
4286  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4287  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4288  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4289 
4290  llvm::Value *Res;
4291  ASTContext &C = CGM.getContext();
4292  // 1. Build a list of reduction variables.
4293  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4294  auto Size = RHSExprs.size();
4295  for (const Expr *E : Privates) {
4296  if (E->getType()->isVariablyModifiedType())
4297  // Reserve place for array size.
4298  ++Size;
4299  }
4300  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4301  QualType ReductionArrayTy =
4302  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
4303  /*IndexTypeQuals=*/0);
4304  Address ReductionList =
4305  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4306  auto IPriv = Privates.begin();
4307  unsigned Idx = 0;
4308  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4309  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4310  CGF.Builder.CreateStore(
4311  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4312  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4313  Elem);
4314  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4315  // Store array size.
4316  ++Idx;
4317  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4318  llvm::Value *Size = CGF.Builder.CreateIntCast(
4319  CGF.getVLASize(
4320  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4321  .NumElts,
4322  CGF.SizeTy, /*isSigned=*/false);
4323  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4324  Elem);
4325  }
4326  }
4327 
4328  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4329  ReductionList.getPointer(), CGF.VoidPtrTy);
4330  llvm::Function *ReductionFn = emitReductionFunction(
4331  Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4332  LHSExprs, RHSExprs, ReductionOps);
4333  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4334  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4335  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4336  llvm::Value *InterWarpCopyFn =
4337  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4338 
4339  if (ParallelReduction) {
4340  llvm::Value *Args[] = {RTLoc,
4341  ThreadId,
4342  CGF.Builder.getInt32(RHSExprs.size()),
4343  ReductionArrayTySize,
4344  RL,
4345  ShuffleAndReduceFn,
4346  InterWarpCopyFn};
4347 
4348  Res = CGF.EmitRuntimeCall(
4349  createNVPTXRuntimeFunction(
4350  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4351  Args);
4352  } else {
4353  assert(TeamsReduction && "expected teams reduction.");
4354  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4355  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4356  int Cnt = 0;
4357  for (const Expr *DRE : Privates) {
4358  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4359  ++Cnt;
4360  }
4361  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4362  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4363  C.getLangOpts().OpenMPCUDAReductionBufNum);
4364  TeamsReductions.push_back(TeamReductionRec);
4365  if (!KernelTeamsReductionPtr) {
4366  KernelTeamsReductionPtr = new llvm::GlobalVariable(
4367  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4368  llvm::GlobalValue::InternalLinkage, nullptr,
4369  "_openmp_teams_reductions_buffer_$_$ptr");
4370  }
4371  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4372  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4373  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4374  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4375  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4376  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4377  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4378  ReductionFn);
4379  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4380  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4381  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4382  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4383  ReductionFn);
4384 
4385  llvm::Value *Args[] = {
4386  RTLoc,
4387  ThreadId,
4388  GlobalBufferPtr,
4389  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4390  RL,
4391  ShuffleAndReduceFn,
4392  InterWarpCopyFn,
4393  GlobalToBufferCpyFn,
4394  GlobalToBufferRedFn,
4395  BufferToGlobalCpyFn,
4396  BufferToGlobalRedFn};
4397 
4398  Res = CGF.EmitRuntimeCall(
4399  createNVPTXRuntimeFunction(
4400  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4401  Args);
4402  }
4403 
4404  // 5. Build if (res == 1)
4405  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4406  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4407  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4408  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4409  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4410 
4411  // 6. Build then branch: where we have reduced values in the master
4412  // thread in each team.
4413  // __kmpc_end_reduce{_nowait}(<gtid>);
4414  // break;
4415  CGF.EmitBlock(ThenBB);
4416 
4417  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4418  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4419  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4420  auto IPriv = Privates.begin();
4421  auto ILHS = LHSExprs.begin();
4422  auto IRHS = RHSExprs.begin();
4423  for (const Expr *E : ReductionOps) {
4424  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4425  cast<DeclRefExpr>(*IRHS));
4426  ++IPriv;
4427  ++ILHS;
4428  ++IRHS;
4429  }
4430  };
4431  llvm::Value *EndArgs[] = {ThreadId};
4432  RegionCodeGenTy RCG(CodeGen);
4433  NVPTXActionTy Action(
4434  nullptr, llvm::None,
4435  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4436  EndArgs);
4437  RCG.setAction(Action);
4438  RCG(CGF);
4439  // There is no need to emit a line number for the unconditional branch.
4441  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4442 }
4443 
4444 const VarDecl *
4445 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4446  const VarDecl *NativeParam) const {
4447  if (!NativeParam->getType()->isReferenceType())
4448  return NativeParam;
4449  QualType ArgType = NativeParam->getType();
4450  QualifierCollector QC;
4451  const Type *NonQualTy = QC.strip(ArgType);
4452  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4453  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4454  if (Attr->getCaptureKind() == OMPC_map) {
4455  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4457  } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4458  PointeeTy.isConstant(CGM.getContext())) {
4459  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4461  }
4462  }
4463  ArgType = CGM.getContext().getPointerType(PointeeTy);
4464  QC.addRestrict();
4465  enum { NVPTX_local_addr = 5 };
4466  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4467  ArgType = QC.apply(CGM.getContext(), ArgType);
4468  if (isa<ImplicitParamDecl>(NativeParam))
4469  return ImplicitParamDecl::Create(
4470  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4471  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4472  return ParmVarDecl::Create(
4473  CGM.getContext(),
4474  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4475  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4476  NativeParam->getIdentifier(), ArgType,
4477  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4478 }
4479 
4480 Address
4481 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4482  const VarDecl *NativeParam,
4483  const VarDecl *TargetParam) const {
4484  assert(NativeParam != TargetParam &&
4485  NativeParam->getType()->isReferenceType() &&
4486  "Native arg must not be the same as target arg.");
4487  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4488  QualType NativeParamType = NativeParam->getType();
4489  QualifierCollector QC;
4490  const Type *NonQualTy = QC.strip(NativeParamType);
4491  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4492  unsigned NativePointeeAddrSpace =
4493  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4494  QualType TargetTy = TargetParam->getType();
4495  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4496  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4497  // First cast to generic.
4498  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4499  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4500  /*AddrSpace=*/0));
4501  // Cast from generic to native address space.
4502  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4503  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4504  NativePointeeAddrSpace));
4505  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4506  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4507  NativeParamType);
4508  return NativeParamAddr;
4509 }
4510 
4511 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4512  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4513  ArrayRef<llvm::Value *> Args) const {
4514  SmallVector<llvm::Value *, 4> TargetArgs;
4515  TargetArgs.reserve(Args.size());
4516  auto *FnType = OutlinedFn.getFunctionType();
4517  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4518  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4519  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4520  break;
4521  }
4522  llvm::Type *TargetType = FnType->getParamType(I);
4523  llvm::Value *NativeArg = Args[I];
4524  if (!TargetType->isPointerTy()) {
4525  TargetArgs.emplace_back(NativeArg);
4526  continue;
4527  }
4528  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4529  NativeArg,
4530  NativeArg->getType()->getPointerElementType()->getPointerTo());
4531  TargetArgs.emplace_back(
4532  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4533  }
4534  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4535 }
4536 
4537 /// Emit function which wraps the outlined parallel region
4538 /// and controls the arguments which are passed to this function.
4539 /// The wrapper ensures that the outlined function is called
4540 /// with the correct arguments when data is shared.
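/// Conceptually, the emitted wrapper has roughly the following shape (pseudo
/// code; the names are illustrative and the shared-argument list is obtained
/// from __kmpc_get_shared_variables):
///
///   void outlined_parallel_fn_wrapper(int16_t parallel_level, int32_t tid) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_parallel_fn(&tid, &zero, shared_args[0], ..., shared_args[n-1]);
///   }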
4541 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4542  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4543  ASTContext &Ctx = CGM.getContext();
4544  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4545 
4546  // Create a function that takes as argument the source thread.
4547  FunctionArgList WrapperArgs;
4548  QualType Int16QTy =
4549  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4550  QualType Int32QTy =
4551  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4552  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4553  /*Id=*/nullptr, Int16QTy,
4555  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4556  /*Id=*/nullptr, Int32QTy,
4558  WrapperArgs.emplace_back(&ParallelLevelArg);
4559  WrapperArgs.emplace_back(&WrapperArg);
4560 
4561  const CGFunctionInfo &CGFI =
4562  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4563 
4564  auto *Fn = llvm::Function::Create(
4565  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4566  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4567  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4568  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4569  Fn->setDoesNotRecurse();
4570 
4571  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4572  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4573  D.getBeginLoc(), D.getBeginLoc());
4574 
4575  const auto *RD = CS.getCapturedRecordDecl();
4576  auto CurField = RD->field_begin();
4577 
4578  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
4579  /*Name=*/".zero.addr");
4580  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4581  // Get the array of arguments.
4583 
4584  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4585  Args.emplace_back(ZeroAddr.getPointer());
4586 
4587  CGBuilderTy &Bld = CGF.Builder;
4588  auto CI = CS.capture_begin();
4589 
4590  // Use global memory for data sharing.
4591  // Handle passing of global args to workers.
4592  Address GlobalArgs =
4593  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4594  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4595  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4596  CGF.EmitRuntimeCall(
4597  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4598  DataSharingArgs);
4599 
4600  // Retrieve the shared variables from the list of references returned
4601  // by the runtime. Pass the variables to the outlined function.
4602  Address SharedArgListAddress = Address::invalid();
4603  if (CS.capture_size() > 0 ||
4604  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4605  SharedArgListAddress = CGF.EmitLoadOfPointer(
4606  GlobalArgs, CGF.getContext()
4608  CGF.getContext().VoidPtrTy))
4609  .castAs<PointerType>());
4610  }
4611  unsigned Idx = 0;
4612  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4613  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4614  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4615  Src, CGF.SizeTy->getPointerTo());
4616  llvm::Value *LB = CGF.EmitLoadOfScalar(
4617  TypedAddress,
4618  /*Volatile=*/false,
4620  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4621  Args.emplace_back(LB);
4622  ++Idx;
4623  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4624  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4625  Src, CGF.SizeTy->getPointerTo());
4626  llvm::Value *UB = CGF.EmitLoadOfScalar(
4627  TypedAddress,
4628  /*Volatile=*/false,
4630  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4631  Args.emplace_back(UB);
4632  ++Idx;
4633  }
4634  if (CS.capture_size() > 0) {
4635  ASTContext &CGFContext = CGF.getContext();
4636  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4637  QualType ElemTy = CurField->getType();
4638  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4639  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4640  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4641  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4642  /*Volatile=*/false,
4643  CGFContext.getPointerType(ElemTy),
4644  CI->getLocation());
4645  if (CI->capturesVariableByCopy() &&
4646  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4647  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4648  CI->getLocation());
4649  }
4650  Args.emplace_back(Arg);
4651  }
4652  }
4653 
4654  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4655  CGF.FinishFunction();
4656  return Fn;
4657 }
4658 
4659 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4660  const Decl *D) {
4661  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4662  return;
4663 
4664  assert(D && "Expected function or captured|block decl.");
4665  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4666  "Function is registered already.");
4667  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4668  "Team is set but not processed.");
4669  const Stmt *Body = nullptr;
4670  bool NeedToDelayGlobalization = false;
4671  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4672  Body = FD->getBody();
4673  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4674  Body = BD->getBody();
4675  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4676  Body = CD->getBody();
4677  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4678  if (NeedToDelayGlobalization &&
4679  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4680  return;
4681  }
4682  if (!Body)
4683  return;
4684  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4685  VarChecker.Visit(Body);
4686  const RecordDecl *GlobalizedVarsRecord =
4687  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4688  TeamAndReductions.first = nullptr;
4689  TeamAndReductions.second.clear();
4690  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4691  VarChecker.getEscapedVariableLengthDecls();
4692  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4693  return;
4694  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4695  I->getSecond().MappedParams =
4696  std::make_unique<CodeGenFunction::OMPMapVars>();
4697  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4698  I->getSecond().EscapedParameters.insert(
4699  VarChecker.getEscapedParameters().begin(),
4700  VarChecker.getEscapedParameters().end());
4701  I->getSecond().EscapedVariableLengthDecls.append(
4702  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4703  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4704  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4705  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4706  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4707  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4708  }
4709  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4710  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4711  VarChecker.Visit(Body);
4712  I->getSecond().SecondaryGlobalRecord =
4713  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4714  I->getSecond().SecondaryLocalVarData.emplace();
4715  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4716  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4717  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4718  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4719  Data.insert(
4720  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4721  }
4722  }
4723  if (!NeedToDelayGlobalization) {
4724  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4725  struct GlobalizationScope final : EHScopeStack::Cleanup {
4726  GlobalizationScope() = default;
4727 
4728  void Emit(CodeGenFunction &CGF, Flags flags) override {
4729  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4730  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4731  }
4732  };
4733  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4734  }
4735 }
4736 
4737 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4738  const VarDecl *VD) {
4739  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4740  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4741  switch (A->getAllocatorType()) {
4742  // Use the default allocator here, as local variables are
4743  // thread-local by default.
4744  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4745  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4746  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4747  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4748  // Follow the user decision - use default allocation.
4749  return Address::invalid();
4750  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4751  // TODO: implement support for user-defined allocators.
4752  return Address::invalid();
4753  case OMPAllocateDeclAttr::OMPConstMemAlloc: {
4754  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4755  auto *GV = new llvm::GlobalVariable(
4756  CGM.getModule(), VarTy, /*isConstant=*/false,
4758  llvm::Constant::getNullValue(VarTy), VD->getName(),
4759  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4761  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4762  GV->setAlignment(Align.getAsAlign());
4763  return Address(GV, Align);
4764  }
4765  case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
4766  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4767  auto *GV = new llvm::GlobalVariable(
4768  CGM.getModule(), VarTy, /*isConstant=*/false,
4770  llvm::Constant::getNullValue(VarTy), VD->getName(),
4771  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4773  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4774  GV->setAlignment(Align.getAsAlign());
4775  return Address(GV, Align);
4776  }
4777  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4778  case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
4779  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4780  auto *GV = new llvm::GlobalVariable(
4781  CGM.getModule(), VarTy, /*isConstant=*/false,
4783  llvm::Constant::getNullValue(VarTy), VD->getName());
4784  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4785  GV->setAlignment(Align.getAsAlign());
4786  return Address(GV, Align);
4787  }
4788  }
4789  }
4790 
4791  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4792  return Address::invalid();
4793 
4794  VD = VD->getCanonicalDecl();
4795  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4796  if (I == FunctionGlobalizedDecls.end())
4797  return Address::invalid();
4798  auto VDI = I->getSecond().LocalVarData.find(VD);
4799  if (VDI != I->getSecond().LocalVarData.end())
4800  return VDI->second.PrivateAddr;
4801  if (VD->hasAttrs()) {
4803  E(VD->attr_end());
4804  IT != E; ++IT) {
4805  auto VDI = I->getSecond().LocalVarData.find(
4806  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4807  ->getCanonicalDecl());
4808  if (VDI != I->getSecond().LocalVarData.end())
4809  return VDI->second.PrivateAddr;
4810  }
4811  }
4812 
4813  return Address::invalid();
4814 }
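// A source-level illustration of the allocator mapping implemented above
// (illustrative only; the allocators are the OpenMP 5.0 predefined ones):
//
//   int x, y, z;
//   #pragma omp allocate(x) allocator(omp_cgroup_mem_alloc) // plain global
//   #pragma omp allocate(y) allocator(omp_pteam_mem_alloc)  // CUDA __shared__
//   #pragma omp allocate(z) allocator(omp_const_mem_alloc)  // CUDA __constant__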
4815 
4816 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4817  FunctionGlobalizedDecls.erase(CGF.CurFn);
4818  CGOpenMPRuntime::functionFinished(CGF);
4819 }
4820 
4821 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4822  CodeGenFunction &CGF, const OMPLoopDirective &S,
4823  OpenMPDistScheduleClauseKind &ScheduleKind,
4824  llvm::Value *&Chunk) const {
4825  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4826  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4827  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4828  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4829  S.getIterationVariable()->getType(), S.getBeginLoc());
4830  return;
4831  }
4832  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4833  CGF, S, ScheduleKind, Chunk);
4834 }
4835 
4836 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4837  CodeGenFunction &CGF, const OMPLoopDirective &S,
4838  OpenMPScheduleClauseKind &ScheduleKind,
4839  const Expr *&ChunkExpr) const {
4840  ScheduleKind = OMPC_SCHEDULE_static;
4841  // Chunk size is 1 in this case.
4842  llvm::APInt ChunkSize(32, 1);
4843  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4844  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4845  SourceLocation());
4846 }
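// In effect, a worksharing loop on this target with no explicit schedule
// clause behaves roughly as if the user had written (illustrative):
//
//   #pragma omp parallel for schedule(static, 1)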
4847 
4848 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4849  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4850  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4851  " Expected target-based directive.");
4852  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4853  for (const CapturedStmt::Capture &C : CS->captures()) {
4854  // Capture variables captured by reference in lambdas for target-based
4855  // directives.
4856  if (!C.capturesVariable())
4857  continue;
4858  const VarDecl *VD = C.getCapturedVar();
4859  const auto *RD = VD->getType()
4860  .getCanonicalType()
4862  ->getAsCXXRecordDecl();
4863  if (!RD || !RD->isLambda())
4864  continue;
4865  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4866  LValue VDLVal;
4867  if (VD->getType().getCanonicalType()->isReferenceType())
4868  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4869  else
4870  VDLVal = CGF.MakeAddrLValue(
4871  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4872  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4873  FieldDecl *ThisCapture = nullptr;
4874  RD->getCaptureFields(Captures, ThisCapture);
4875  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4876  LValue ThisLVal =
4877  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4878  llvm::Value *CXXThis = CGF.LoadCXXThis();
4879  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4880  }
4881  for (const LambdaCapture &LC : RD->captures()) {
4882  if (LC.getCaptureKind() != LCK_ByRef)
4883  continue;
4884  const VarDecl *VD = LC.getCapturedVar();
4885  if (!CS->capturesVariable(VD))
4886  continue;
4887  auto It = Captures.find(VD);
4888  assert(It != Captures.end() && "Found lambda capture without field.");
4889  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4890  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4891  if (VD->getType().getCanonicalType()->isReferenceType())
4892  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4893  VD->getType().getCanonicalType())
4894  .getAddress();
4895  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4896  }
4897  }
4898 }
4899 
4902 }
4903 
4904 bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4905  LangAS &AS) {
4906  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4907  return false;
4908  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4909  switch(A->getAllocatorType()) {
4910  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4911  // Not supported, fallback to the default mem space.
4912  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4913  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4914  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4915  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4916  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4917  AS = LangAS::Default;
4918  return true;
4919  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4920  AS = LangAS::cuda_constant;
4921  return true;
4922  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4923  AS = LangAS::cuda_shared;
4924  return true;
4925  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4926  llvm_unreachable("Expected predefined allocator for the variables with the "
4927  "static storage.");
4928  }
4929  return false;
4930 }
4931 
4932 // Get the current CudaArch and ignore any unknown values.
4933 static CudaArch getCudaArch(CodeGenModule &CGM) {
4934  if (!CGM.getTarget().hasFeature("ptx"))
4935  return CudaArch::UNKNOWN;
4936  llvm::StringMap<bool> Features;
4937  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4938  CGM.getTarget().getTargetOpts().CPU,
4939  CGM.getTarget().getTargetOpts().Features);
4940  for (const auto &Feature : Features) {
4941  if (Feature.getValue()) {
4942  CudaArch Arch = StringToCudaArch(Feature.getKey());
4943  if (Arch != CudaArch::UNKNOWN)
4944  return Arch;
4945  }
4946  }
4947  return CudaArch::UNKNOWN;
4948 }
4949 
4950 /// Check whether the target architecture supports unified addressing, which
4951 /// is a restriction for the OpenMP requires clause "unified_shared_memory".
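/// For example, when compiling for one of the pre-sm_70 architectures listed
/// below, a translation unit containing
///
///   #pragma omp requires unified_shared_memory
///
/// is diagnosed with an error here.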
4952 void CGOpenMPRuntimeNVPTX::processRequiresDirective(
4953  const OMPRequiresDecl *D) {
4954  for (const OMPClause *Clause : D->clauselists()) {
4955  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4956  switch (getCudaArch(CGM)) {
4957  case CudaArch::SM_20:
4958  case CudaArch::SM_21:
4959  case CudaArch::SM_30:
4960  case CudaArch::SM_32:
4961  case CudaArch::SM_35:
4962  case CudaArch::SM_37:
4963  case CudaArch::SM_50:
4964  case CudaArch::SM_52:
4965  case CudaArch::SM_53:
4966  case CudaArch::SM_60:
4967  case CudaArch::SM_61:
4968  case CudaArch::SM_62:
4969  CGM.Error(Clause->getBeginLoc(),
4970  "Target architecture does not support unified addressing");
4971  return;
4972  case CudaArch::SM_70:
4973  case CudaArch::SM_72:
4974  case CudaArch::SM_75:
4975  case CudaArch::GFX600:
4976  case CudaArch::GFX601:
4977  case CudaArch::GFX700:
4978  case CudaArch::GFX701:
4979  case CudaArch::GFX702:
4980  case CudaArch::GFX703:
4981  case CudaArch::GFX704:
4982  case CudaArch::GFX801:
4983  case CudaArch::GFX802:
4984  case CudaArch::GFX803:
4985  case CudaArch::GFX810:
4986  case CudaArch::GFX900:
4987  case CudaArch::GFX902:
4988  case CudaArch::GFX904:
4989  case CudaArch::GFX906:
4990  case CudaArch::GFX908:
4991  case CudaArch::GFX909:
4992  case CudaArch::GFX1010:
4993  case CudaArch::GFX1011:
4994  case CudaArch::GFX1012:
4995  case CudaArch::UNKNOWN:
4996  break;
4997  case CudaArch::LAST:
4998  llvm_unreachable("Unexpected Cuda arch.");
4999  }
5000  }
5001  }
5002  CGOpenMPRuntime::processRequiresDirective(D);
5003 }
5004 
5005 /// Get number of SMs and number of blocks per SM.
5006 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
5007  std::pair<unsigned, unsigned> Data;
5008  if (CGM.getLangOpts().OpenMPCUDANumSMs)
5009  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
5010  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
5011  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
5012  if (Data.first && Data.second)
5013  return Data;
5014  switch (getCudaArch(CGM)) {
5015  case CudaArch::SM_20:
5016  case CudaArch::SM_21:
5017  case CudaArch::SM_30:
5018  case CudaArch::SM_32:
5019  case CudaArch::SM_35:
5020  case CudaArch::SM_37:
5021  case CudaArch::SM_50:
5022  case CudaArch::SM_52:
5023  case CudaArch::SM_53:
5024  return {16, 16};
5025  case CudaArch::SM_60:
5026  case CudaArch::SM_61:
5027  case CudaArch::SM_62:
5028  return {56, 32};
5029  case CudaArch::SM_70:
5030  case CudaArch::SM_72:
5031  case CudaArch::SM_75:
5032  return {84, 32};
5033  case CudaArch::GFX600:
5034  case CudaArch::GFX601:
5035  case CudaArch::GFX700:
5036  case CudaArch::GFX701:
5037  case CudaArch::GFX702:
5038  case CudaArch::GFX703:
5039  case CudaArch::GFX704:
5040  case CudaArch::GFX801:
5041  case CudaArch::GFX802:
5042  case CudaArch::GFX803:
5043  case CudaArch::GFX810:
5044  case CudaArch::GFX900:
5045  case CudaArch::GFX902:
5046  case CudaArch::GFX904:
5047  case CudaArch::GFX906:
5048  case CudaArch::GFX908:
5049  case CudaArch::GFX909:
5050  case CudaArch::GFX1010:
5051  case CudaArch::GFX1011:
5052  case CudaArch::GFX1012:
5053  case CudaArch::UNKNOWN:
5054  break;
5055  case CudaArch::LAST:
5056  llvm_unreachable("Unexpected Cuda arch.");
5057  }
5058  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5059 }
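// For example, on sm_70 the pair {84, 32} returned above sizes the statically
// allocated globalization buffer ("_openmp_static_glob_rd_$_", emitted below)
// for up to 84 * 32 concurrently resident teams.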
5060 
5061 void CGOpenMPRuntimeNVPTX::clear() {
5062  if (!GlobalizedRecords.empty()) {
5063  ASTContext &C = CGM.getContext();
5066  RecordDecl *StaticRD = C.buildImplicitRecord(
5067  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5068  StaticRD->startDefinition();
5069  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5070  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5071  SharedStaticRD->startDefinition();
5072  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5073  if (Records.Records.empty())
5074  continue;
5075  unsigned Size = 0;
5076  unsigned RecAlignment = 0;
5077  for (const RecordDecl *RD : Records.Records) {
5078  QualType RDTy = C.getRecordType(RD);
5079  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5080  RecAlignment = std::max(RecAlignment, Alignment);
5081  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5082  Size =
5083  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5084  }
5085  Size = llvm::alignTo(Size, RecAlignment);
5086  llvm::APInt ArySize(/*numBits=*/64, Size);
5087  QualType SubTy = C.getConstantArrayType(
5088  C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
5089  const bool UseSharedMemory = Size <= SharedMemorySize;
5090  auto *Field =
5091  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5092  SourceLocation(), SourceLocation(), nullptr, SubTy,
5094  /*BW=*/nullptr, /*Mutable=*/false,
5095  /*InitStyle=*/ICIS_NoInit);
5096  Field->setAccess(AS_public);
5097  if (UseSharedMemory) {
5098  SharedStaticRD->addDecl(Field);
5099  SharedRecs.push_back(&Records);
5100  } else {
5101  StaticRD->addDecl(Field);
5102  GlobalRecs.push_back(&Records);
5103  }
5104  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5105  Records.UseSharedMemory->setInitializer(
5106  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5107  }
5108  // Allocate SharedMemorySize buffer for the shared memory.
5109  // FIXME: nvlink does not handle weak linkage correctly (objects with
5110  // different sizes are reported as erroneous).
5111  // Restore this code as soon as nvlink is fixed.
5112  if (!SharedStaticRD->field_empty()) {
5113  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5114  QualType SubTy = C.getConstantArrayType(
5115  C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
5116  auto *Field = FieldDecl::Create(
5117  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5119  /*BW=*/nullptr, /*Mutable=*/false,
5120  /*InitStyle=*/ICIS_NoInit);
5121  Field->setAccess(AS_public);
5122  SharedStaticRD->addDecl(Field);
5123  }
5124  SharedStaticRD->completeDefinition();
5125  if (!SharedStaticRD->field_empty()) {
5126  QualType StaticTy = C.getRecordType(SharedStaticRD);
5127  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5128  auto *GV = new llvm::GlobalVariable(
5129  CGM.getModule(), LLVMStaticTy,
5130  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5131  llvm::Constant::getNullValue(LLVMStaticTy),
5132  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5133  llvm::GlobalValue::NotThreadLocal,
5135  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5136  GV, CGM.VoidPtrTy);
5137  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5138  Rec->Buffer->replaceAllUsesWith(Replacement);
5139  Rec->Buffer->eraseFromParent();
5140  }
5141  }
5142  StaticRD->completeDefinition();
5143  if (!StaticRD->field_empty()) {
5144  QualType StaticTy = C.getRecordType(StaticRD);
5145  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5146  llvm::APInt Size1(32, SMsBlockPerSM.second);
5147  QualType Arr1Ty =
5148  C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
5149  /*IndexTypeQuals=*/0);
5150  llvm::APInt Size2(32, SMsBlockPerSM.first);
5151  QualType Arr2Ty =
5152  C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
5153  /*IndexTypeQuals=*/0);
5154  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5155  // FIXME: nvlink does not handle weak linkage correctly (objects with
5156  // different sizes are reported as erroneous).
5157  // Restore CommonLinkage as soon as nvlink is fixed.
5158  auto *GV = new llvm::GlobalVariable(
5159  CGM.getModule(), LLVMArr2Ty,
5160  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5161  llvm::Constant::getNullValue(LLVMArr2Ty),
5162  "_openmp_static_glob_rd_$_");
5163  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5164  GV, CGM.VoidPtrTy);
5165  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5166  Rec->Buffer->replaceAllUsesWith(Replacement);
5167  Rec->Buffer->eraseFromParent();
5168  }
5169  }
5170  }
5171  if (!TeamsReductions.empty()) {
5172  ASTContext &C = CGM.getContext();
5173  RecordDecl *StaticRD = C.buildImplicitRecord(
5174  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5175  StaticRD->startDefinition();
5176  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5177  QualType RecTy = C.getRecordType(TeamReductionRec);
5178  auto *Field = FieldDecl::Create(
5179  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5181  /*BW=*/nullptr, /*Mutable=*/false,
5182  /*InitStyle=*/ICIS_NoInit);
5183  Field->setAccess(AS_public);
5184  StaticRD->addDecl(Field);
5185  }
5186  StaticRD->completeDefinition();
5187  QualType StaticTy = C.getRecordType(StaticRD);
5188  llvm::Type *LLVMReductionsBufferTy =
5189  CGM.getTypes().ConvertTypeForMem(StaticTy);
5190  // FIXME: nvlink does not handle weak linkage correctly (objects with
5191  // different sizes are reported as erroneous).
5192  // Restore CommonLinkage as soon as nvlink is fixed.
5193  auto *GV = new llvm::GlobalVariable(
5194  CGM.getModule(), LLVMReductionsBufferTy,
5195  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5196  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5197  "_openmp_teams_reductions_buffer_$_");
5198  KernelTeamsReductionPtr->setInitializer(
5199  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5200  CGM.VoidPtrTy));
5201  }
5202  CGOpenMPRuntime::clear();
5203 }