CGOpenMPRuntimeNVPTX.cpp
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for OpenMP runtime code generation specialized to NVPTX
10 // targets.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeNVPTX.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "clang/AST/StmtOpenMP.h"
18 #include "clang/AST/StmtVisitor.h"
19 #include "clang/Basic/Cuda.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
64  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
65  /// global_tid, void *global_buffer, int32_t num_of_records, void*
66  /// reduce_data,
67  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
68  /// lane_offset, int16_t shortCircuit),
69  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
70  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
71  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
72  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
73  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
74  /// *buffer, int idx, void *reduce_data));
75  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
76  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
77  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
78  /// Call to void __kmpc_data_sharing_init_stack();
79  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
80  /// Call to void __kmpc_data_sharing_init_stack_spmd();
81  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
82  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
83  /// int16_t UseSharedMemory);
84  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
85  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
86  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
87  /// Call to void __kmpc_begin_sharing_variables(void ***args,
88  /// size_t n_args);
89  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
90  /// Call to void __kmpc_end_sharing_variables();
91  OMPRTL_NVPTX__kmpc_end_sharing_variables,
92  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
93  OMPRTL_NVPTX__kmpc_get_shared_variables,
94  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
95  /// global_tid);
96  OMPRTL_NVPTX__kmpc_parallel_level,
97  /// Call to int8_t __kmpc_is_spmd_exec_mode();
98  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
99  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
100  /// const void *buf, size_t size, int16_t is_shared, const void **res);
101  OMPRTL_NVPTX__kmpc_get_team_static_memory,
102  /// Call to void __kmpc_restore_team_static_memory(int16_t
103  /// isSPMDExecutionMode, int16_t is_shared);
104  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
105  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
106  OMPRTL__kmpc_barrier,
107  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
108  /// global_tid);
109  OMPRTL__kmpc_barrier_simple_spmd,
110 };
111 
112 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
113 class NVPTXActionTy final : public PrePostActionTy {
114  llvm::FunctionCallee EnterCallee = nullptr;
115  ArrayRef<llvm::Value *> EnterArgs;
116  llvm::FunctionCallee ExitCallee = nullptr;
117  ArrayRef<llvm::Value *> ExitArgs;
118  bool Conditional = false;
119  llvm::BasicBlock *ContBlock = nullptr;
120 
121 public:
122  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
123  ArrayRef<llvm::Value *> EnterArgs,
124  llvm::FunctionCallee ExitCallee,
125  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
126  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
127  ExitArgs(ExitArgs), Conditional(Conditional) {}
128  void Enter(CodeGenFunction &CGF) override {
129  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
130  if (Conditional) {
131  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
132  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
133  ContBlock = CGF.createBasicBlock("omp_if.end");
134  // Generate the branch (If-stmt)
135  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
136  CGF.EmitBlock(ThenBlock);
137  }
138  }
139  void Done(CodeGenFunction &CGF) {
140  // Emit the rest of blocks/branches
141  CGF.EmitBranch(ContBlock);
142  CGF.EmitBlock(ContBlock, true);
143  }
144  void Exit(CodeGenFunction &CGF) override {
145  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
146  }
147 };
148 
149 /// A class to track the execution mode when codegening directives within
150 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
151 /// to the target region and used by containing directives such as 'parallel'
152 /// to emit optimized code.
153 class ExecutionRuntimeModesRAII {
154 private:
155  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
156  CGOpenMPRuntimeNVPTX::EM_Unknown;
157  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
158  bool SavedRuntimeMode = false;
159  bool *RuntimeMode = nullptr;
160 
161 public:
162  /// Constructor for Non-SPMD mode.
163  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
164  : ExecMode(ExecMode) {
165  SavedExecMode = ExecMode;
166  ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
167  }
168  /// Constructor for SPMD mode.
169  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
170  bool &RuntimeMode, bool FullRuntimeMode)
171  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
172  SavedExecMode = ExecMode;
173  SavedRuntimeMode = RuntimeMode;
174  ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
175  RuntimeMode = FullRuntimeMode;
176  }
177  ~ExecutionRuntimeModesRAII() {
178  ExecMode = SavedExecMode;
179  if (RuntimeMode)
180  *RuntimeMode = SavedRuntimeMode;
181  }
182 };
183 
184 /// GPU Configuration: This information can be derived from cuda registers,
185 /// however, providing compile time constants helps generate more efficient
186 /// code. For all practical purposes this is fine because the configuration
187 /// is the same for all known NVPTX architectures.
188 enum MachineConfiguration : unsigned {
189  WarpSize = 32,
190  /// Number of bits required to represent a lane identifier, which is
191  /// computed as log_2(WarpSize).
192  LaneIDBits = 5,
193  LaneIDMask = WarpSize - 1,
194 
195  /// Global memory alignment for performance.
196  GlobalMemoryAlignment = 128,
197 
198  /// Maximal size of the shared memory buffer.
199  SharedMemorySize = 128,
200 };
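// In other words, with WarpSize = 32 a thread's warp and lane follow from two
// bit operations (see getNVPTXWarpID/getNVPTXLaneID below):
//   warp_id = thread_id >> LaneIDBits;  // thread_id / 32
//   lane_id = thread_id & LaneIDMask;   // thread_id % 32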
201 
202 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
203  RefExpr = RefExpr->IgnoreParens();
204  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
205  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
206  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
207  Base = TempASE->getBase()->IgnoreParenImpCasts();
208  RefExpr = Base;
209  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
210  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
211  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
212  Base = TempOASE->getBase()->IgnoreParenImpCasts();
213  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
214  Base = TempASE->getBase()->IgnoreParenImpCasts();
215  RefExpr = Base;
216  }
217  RefExpr = RefExpr->IgnoreParenImpCasts();
218  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
219  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
220  const auto *ME = cast<MemberExpr>(RefExpr);
221  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
222 }
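// For example, for a clause item written as 'a[2][i]' or an array section
// 'a[0:n]' this walks down to the underlying base expression and returns the
// canonical declaration of 'a'.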
223 
224 
225 static RecordDecl *buildRecordForGlobalizedVars(
226  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
227  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
228  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
229  &MappedDeclsFields, int BufSize) {
230  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
231  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
232  return nullptr;
233  SmallVector<VarsDataTy, 4> GlobalizedVars;
234  for (const ValueDecl *D : EscapedDecls)
235  GlobalizedVars.emplace_back(
236  CharUnits::fromQuantity(std::max(
237  C.getDeclAlign(D).getQuantity(),
238  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
239  D);
240  for (const ValueDecl *D : EscapedDeclsForTeams)
241  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
242  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
243  return L.first > R.first;
244  });
245 
246  // Build struct _globalized_locals_ty {
247  // /* globalized vars */[WarpSize] align (max(decl_align,
248  // GlobalMemoryAlignment))
249  // /* globalized vars */ for EscapedDeclsForTeams
250  // };
251  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
252  GlobalizedRD->startDefinition();
253  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
254  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
255  for (const auto &Pair : GlobalizedVars) {
256  const ValueDecl *VD = Pair.second;
257  QualType Type = VD->getType();
258  if (Type->isLValueReferenceType())
259  Type = C.getPointerType(Type.getNonReferenceType());
260  else
261  Type = Type.getNonReferenceType();
262  SourceLocation Loc = VD->getLocation();
263  FieldDecl *Field;
264  if (SingleEscaped.count(VD)) {
265  Field = FieldDecl::Create(
266  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
267  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
268  /*BW=*/nullptr, /*Mutable=*/false,
269  /*InitStyle=*/ICIS_NoInit);
270  Field->setAccess(AS_public);
271  if (VD->hasAttrs()) {
272  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
273  E(VD->getAttrs().end());
274  I != E; ++I)
275  Field->addAttr(*I);
276  }
277  } else {
278  llvm::APInt ArraySize(32, BufSize);
279  Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
280  Field = FieldDecl::Create(
281  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
282  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
283  /*BW=*/nullptr, /*Mutable=*/false,
284  /*InitStyle=*/ICIS_NoInit);
285  Field->setAccess(AS_public);
286  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
287  static_cast<CharUnits::QuantityType>(
288  GlobalMemoryAlignment)));
289  Field->addAttr(AlignedAttr::CreateImplicit(
290  C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
291  IntegerLiteral::Create(C, Align,
292  C.getIntTypeForBitwidth(32, /*Signed=*/0),
293  SourceLocation())));
294  }
295  GlobalizedRD->addDecl(Field);
296  MappedDeclsFields.try_emplace(VD, Field);
297  }
298  GlobalizedRD->completeDefinition();
299  return GlobalizedRD;
300 }
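// For illustration, assuming an escaped parallel-region variable 'int x' and
// a teams declaration 'double d', with BufSize == WarpSize, the record built
// above looks roughly like:
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128)));  // one slot per lane
//     double d;                                 // single copy for the team
//   };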
301 
302 /// Get the list of variables that can escape their declaration context.
303 class CheckVarsEscapingDeclContext final
304  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
305  CodeGenFunction &CGF;
306  llvm::SetVector<const ValueDecl *> EscapedDecls;
307  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
308  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
309  RecordDecl *GlobalizedRD = nullptr;
310  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
311  bool AllEscaped = false;
312  bool IsForCombinedParallelRegion = false;
313 
314  void markAsEscaped(const ValueDecl *VD) {
315  // Do not globalize declare target variables.
316  if (!isa<VarDecl>(VD) ||
317  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
318  return;
319  VD = cast<ValueDecl>(VD->getCanonicalDecl());
320  // Use user-specified allocation.
321  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
322  return;
323  // Variables captured by value must be globalized.
324  if (auto *CSI = CGF.CapturedStmtInfo) {
325  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
326  // Check if we need to capture the variable that was already captured by
327  // value in the outer region.
328  if (!IsForCombinedParallelRegion) {
329  if (!FD->hasAttrs())
330  return;
331  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
332  if (!Attr)
333  return;
334  if (((Attr->getCaptureKind() != OMPC_map) &&
335  !isOpenMPPrivate(
336  static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
337  ((Attr->getCaptureKind() == OMPC_map) &&
338  !FD->getType()->isAnyPointerType()))
339  return;
340  }
341  if (!FD->getType()->isReferenceType()) {
342  assert(!VD->getType()->isVariablyModifiedType() &&
343  "Parameter captured by value with variably modified type");
344  EscapedParameters.insert(VD);
345  } else if (!IsForCombinedParallelRegion) {
346  return;
347  }
348  }
349  }
350  if ((!CGF.CapturedStmtInfo ||
351  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
352  VD->getType()->isReferenceType())
353  // Do not globalize variables with reference type.
354  return;
355  if (VD->getType()->isVariablyModifiedType())
356  EscapedVariableLengthDecls.insert(VD);
357  else
358  EscapedDecls.insert(VD);
359  }
360 
361  void VisitValueDecl(const ValueDecl *VD) {
362  if (VD->getType()->isLValueReferenceType())
363  markAsEscaped(VD);
364  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
365  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
366  const bool SavedAllEscaped = AllEscaped;
367  AllEscaped = VD->getType()->isLValueReferenceType();
368  Visit(VarD->getInit());
369  AllEscaped = SavedAllEscaped;
370  }
371  }
372  }
373  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
374  ArrayRef<OMPClause *> Clauses,
375  bool IsCombinedParallelRegion) {
376  if (!S)
377  return;
378  for (const CapturedStmt::Capture &C : S->captures()) {
379  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
380  const ValueDecl *VD = C.getCapturedVar();
381  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
382  if (IsCombinedParallelRegion) {
383  // Check if the variable is privatized in the combined construct and
384  // those private copies must be shared in the inner parallel
385  // directive.
386  IsForCombinedParallelRegion = false;
387  for (const OMPClause *C : Clauses) {
388  if (!isOpenMPPrivate(C->getClauseKind()) ||
389  C->getClauseKind() == OMPC_reduction ||
390  C->getClauseKind() == OMPC_linear ||
391  C->getClauseKind() == OMPC_private)
392  continue;
393  ArrayRef<const Expr *> Vars;
394  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
395  Vars = PC->getVarRefs();
396  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
397  Vars = PC->getVarRefs();
398  else
399  llvm_unreachable("Unexpected clause.");
400  for (const auto *E : Vars) {
401  const Decl *D =
402  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
403  if (D == VD->getCanonicalDecl()) {
404  IsForCombinedParallelRegion = true;
405  break;
406  }
407  }
408  if (IsForCombinedParallelRegion)
409  break;
410  }
411  }
412  markAsEscaped(VD);
413  if (isa<OMPCapturedExprDecl>(VD))
414  VisitValueDecl(VD);
415  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
416  }
417  }
418  }
419 
420  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
421  assert(!GlobalizedRD &&
422  "Record for globalized variables is built already.");
423  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
424  if (IsInTTDRegion)
425  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
426  else
427  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
428  GlobalizedRD = ::buildRecordForGlobalizedVars(
429  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
430  MappedDeclsFields, WarpSize);
431  }
432 
433 public:
434  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
435  ArrayRef<const ValueDecl *> TeamsReductions)
436  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
437  }
438  virtual ~CheckVarsEscapingDeclContext() = default;
439  void VisitDeclStmt(const DeclStmt *S) {
440  if (!S)
441  return;
442  for (const Decl *D : S->decls())
443  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
444  VisitValueDecl(VD);
445  }
446  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
447  if (!D)
448  return;
449  if (!D->hasAssociatedStmt())
450  return;
451  if (const auto *S =
452  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
453  // Do not analyze directives that do not actually require capturing,
454  // like `omp for` or `omp simd` directives.
455  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
456  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
457  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
458  VisitStmt(S->getCapturedStmt());
459  return;
460  }
461  VisitOpenMPCapturedStmt(
462  S, D->clauses(),
463  CaptureRegions.back() == OMPD_parallel &&
464  isOpenMPDistributeDirective(D->getDirectiveKind()));
465  }
466  }
467  void VisitCapturedStmt(const CapturedStmt *S) {
468  if (!S)
469  return;
470  for (const CapturedStmt::Capture &C : S->captures()) {
471  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
472  const ValueDecl *VD = C.getCapturedVar();
473  markAsEscaped(VD);
474  if (isa<OMPCapturedExprDecl>(VD))
475  VisitValueDecl(VD);
476  }
477  }
478  }
479  void VisitLambdaExpr(const LambdaExpr *E) {
480  if (!E)
481  return;
482  for (const LambdaCapture &C : E->captures()) {
483  if (C.capturesVariable()) {
484  if (C.getCaptureKind() == LCK_ByRef) {
485  const ValueDecl *VD = C.getCapturedVar();
486  markAsEscaped(VD);
487  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
488  VisitValueDecl(VD);
489  }
490  }
491  }
492  }
493  void VisitBlockExpr(const BlockExpr *E) {
494  if (!E)
495  return;
496  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
497  if (C.isByRef()) {
498  const VarDecl *VD = C.getVariable();
499  markAsEscaped(VD);
500  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
501  VisitValueDecl(VD);
502  }
503  }
504  }
505  void VisitCallExpr(const CallExpr *E) {
506  if (!E)
507  return;
508  for (const Expr *Arg : E->arguments()) {
509  if (!Arg)
510  continue;
511  if (Arg->isLValue()) {
512  const bool SavedAllEscaped = AllEscaped;
513  AllEscaped = true;
514  Visit(Arg);
515  AllEscaped = SavedAllEscaped;
516  } else {
517  Visit(Arg);
518  }
519  }
520  Visit(E->getCallee());
521  }
522  void VisitDeclRefExpr(const DeclRefExpr *E) {
523  if (!E)
524  return;
525  const ValueDecl *VD = E->getDecl();
526  if (AllEscaped)
527  markAsEscaped(VD);
528  if (isa<OMPCapturedExprDecl>(VD))
529  VisitValueDecl(VD);
530  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
531  if (VarD->isInitCapture())
532  VisitValueDecl(VD);
533  }
534  void VisitUnaryOperator(const UnaryOperator *E) {
535  if (!E)
536  return;
537  if (E->getOpcode() == UO_AddrOf) {
538  const bool SavedAllEscaped = AllEscaped;
539  AllEscaped = true;
540  Visit(E->getSubExpr());
541  AllEscaped = SavedAllEscaped;
542  } else {
543  Visit(E->getSubExpr());
544  }
545  }
546  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
547  if (!E)
548  return;
549  if (E->getCastKind() == CK_ArrayToPointerDecay) {
550  const bool SavedAllEscaped = AllEscaped;
551  AllEscaped = true;
552  Visit(E->getSubExpr());
553  AllEscaped = SavedAllEscaped;
554  } else {
555  Visit(E->getSubExpr());
556  }
557  }
558  void VisitExpr(const Expr *E) {
559  if (!E)
560  return;
561  bool SavedAllEscaped = AllEscaped;
562  if (!E->isLValue())
563  AllEscaped = false;
564  for (const Stmt *Child : E->children())
565  if (Child)
566  Visit(Child);
567  AllEscaped = SavedAllEscaped;
568  }
569  void VisitStmt(const Stmt *S) {
570  if (!S)
571  return;
572  for (const Stmt *Child : S->children())
573  if (Child)
574  Visit(Child);
575  }
576 
577  /// Returns the record that handles all the escaped local variables and is
578  /// used instead of their original storage.
579  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
580  if (!GlobalizedRD)
581  buildRecordForGlobalizedVars(IsInTTDRegion);
582  return GlobalizedRD;
583  }
584 
585  /// Returns the field in the globalized record for the escaped variable.
586  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
587  assert(GlobalizedRD &&
588  "Record for globalized variables must be generated already.");
589  auto I = MappedDeclsFields.find(VD);
590  if (I == MappedDeclsFields.end())
591  return nullptr;
592  return I->getSecond();
593  }
594 
595  /// Returns the list of the escaped local variables/parameters.
596  ArrayRef<const ValueDecl *> getEscapedDecls() const {
597  return EscapedDecls.getArrayRef();
598  }
599 
600  /// Checks if the escaped local variable is actually a parameter passed by
601  /// value.
602  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
603  return EscapedParameters;
604  }
605 
606  /// Returns the list of the escaped variables with the variably modified
607  /// types.
608  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
609  return EscapedVariableLengthDecls.getArrayRef();
610  }
611 };
612 } // anonymous namespace
613 
614 /// Get the GPU warp size.
615 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
616  return CGF.EmitRuntimeCall(
617  llvm::Intrinsic::getDeclaration(
618  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
619  "nvptx_warp_size");
620 }
621 
622 /// Get the id of the current thread on the GPU.
623 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
624  return CGF.EmitRuntimeCall(
625  llvm::Intrinsic::getDeclaration(
626  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
627  "nvptx_tid");
628 }
629 
630 /// Get the id of the warp in the block.
631 /// We assume that the warp size is 32, which is always the case
632 /// on the NVPTX device, to generate more efficient code.
633 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
634  CGBuilderTy &Bld = CGF.Builder;
635  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
636 }
637 
638 /// Get the id of the current lane in the Warp.
639 /// We assume that the warp size is 32, which is always the case
640 /// on the NVPTX device, to generate more efficient code.
641 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
642  CGBuilderTy &Bld = CGF.Builder;
643  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
644  "nvptx_lane_id");
645 }
646 
647 /// Get the maximum number of threads in a block of the GPU.
648 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
649  return CGF.EmitRuntimeCall(
650  llvm::Intrinsic::getDeclaration(
651  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
652  "nvptx_num_threads");
653 }
654 
655 /// Get the value of the thread_limit clause in the teams directive.
656 /// For the 'generic' execution mode, the runtime encodes thread_limit in
657 /// the launch parameters, always starting thread_limit+warpSize threads per
658 /// CTA. The threads in the last warp are reserved for master execution.
659 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
660 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
661  bool IsInSPMDExecutionMode = false) {
662  CGBuilderTy &Bld = CGF.Builder;
663  return IsInSPMDExecutionMode
664  ? getNVPTXNumThreads(CGF)
665  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
666  "thread_limit");
667 }
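// For example, a generic-mode launch with 128 threads per CTA yields
// thread_limit = 128 - 32 = 96 worker threads, while in SPMD mode all 128
// threads count towards the team.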
668 
669 /// Get the thread id of the OMP master thread.
670 /// The master thread id is the first thread (lane) of the last warp in the
671 /// GPU block. Warp size is assumed to be some power of 2.
672 /// Thread id is 0 indexed.
673 /// E.g: If NumThreads is 33, master id is 32.
674 /// If NumThreads is 64, master id is 32.
675 /// If NumThreads is 1024, master id is 992.
676 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
677  CGBuilderTy &Bld = CGF.Builder;
678  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
679 
680  // We assume that the warp size is a power of 2.
681  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
682 
683  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
684  Bld.CreateNot(Mask), "master_tid");
685 }
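// The masking above computes (NumThreads - 1) & ~(WarpSize - 1), i.e. it
// rounds NumThreads - 1 down to a warp boundary: (33 - 1) & ~31 == 32 and
// (1024 - 1) & ~31 == 992, matching the examples in the comment above.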
686 
687 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
688  CodeGenModule &CGM, SourceLocation Loc)
689  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
690  Loc(Loc) {
691  createWorkerFunction(CGM);
692 }
693 
694 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
695  CodeGenModule &CGM) {
696  // Create a worker function with no arguments.
697 
698  WorkerFn = llvm::Function::Create(
699  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
700  /*placeholder=*/"_worker", &CGM.getModule());
701  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
702  WorkerFn->setDoesNotRecurse();
703 }
704 
705 CGOpenMPRuntimeNVPTX::ExecutionMode
706 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
707  return CurrentExecutionMode;
708 }
709 
710 static CGOpenMPRuntimeNVPTX::DataSharingMode
711 getDataSharingMode(CodeGenModule &CGM) {
712  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
713  : CGOpenMPRuntimeNVPTX::Generic;
714 }
715 
716 /// Check for inner (nested) SPMD construct, if any
717 static bool hasNestedSPMDDirective(ASTContext &Ctx,
718  const OMPExecutableDirective &D) {
719  const auto *CS = D.getInnermostCapturedStmt();
720  const auto *Body =
721  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
722  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
723 
724  if (const auto *NestedDir =
725  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
726  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
727  switch (D.getDirectiveKind()) {
728  case OMPD_target:
729  if (isOpenMPParallelDirective(DKind))
730  return true;
731  if (DKind == OMPD_teams) {
732  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
733  /*IgnoreCaptured=*/true);
734  if (!Body)
735  return false;
736  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
737  if (const auto *NND =
738  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
739  DKind = NND->getDirectiveKind();
740  if (isOpenMPParallelDirective(DKind))
741  return true;
742  }
743  }
744  return false;
745  case OMPD_target_teams:
746  return isOpenMPParallelDirective(DKind);
747  case OMPD_target_simd:
748  case OMPD_target_parallel:
749  case OMPD_target_parallel_for:
750  case OMPD_target_parallel_for_simd:
751  case OMPD_target_teams_distribute:
752  case OMPD_target_teams_distribute_simd:
753  case OMPD_target_teams_distribute_parallel_for:
754  case OMPD_target_teams_distribute_parallel_for_simd:
755  case OMPD_parallel:
756  case OMPD_for:
757  case OMPD_parallel_for:
758  case OMPD_parallel_sections:
759  case OMPD_for_simd:
760  case OMPD_parallel_for_simd:
761  case OMPD_cancel:
762  case OMPD_cancellation_point:
763  case OMPD_ordered:
764  case OMPD_threadprivate:
765  case OMPD_allocate:
766  case OMPD_task:
767  case OMPD_simd:
768  case OMPD_sections:
769  case OMPD_section:
770  case OMPD_single:
771  case OMPD_master:
772  case OMPD_critical:
773  case OMPD_taskyield:
774  case OMPD_barrier:
775  case OMPD_taskwait:
776  case OMPD_taskgroup:
777  case OMPD_atomic:
778  case OMPD_flush:
779  case OMPD_teams:
780  case OMPD_target_data:
781  case OMPD_target_exit_data:
782  case OMPD_target_enter_data:
783  case OMPD_distribute:
784  case OMPD_distribute_simd:
785  case OMPD_distribute_parallel_for:
786  case OMPD_distribute_parallel_for_simd:
787  case OMPD_teams_distribute:
788  case OMPD_teams_distribute_simd:
789  case OMPD_teams_distribute_parallel_for:
790  case OMPD_teams_distribute_parallel_for_simd:
791  case OMPD_target_update:
792  case OMPD_declare_simd:
793  case OMPD_declare_target:
794  case OMPD_end_declare_target:
795  case OMPD_declare_reduction:
796  case OMPD_declare_mapper:
797  case OMPD_taskloop:
798  case OMPD_taskloop_simd:
799  case OMPD_requires:
800  case OMPD_unknown:
801  llvm_unreachable("Unexpected directive.");
802  }
803  }
804 
805  return false;
806 }
807 
808 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
809  const OMPExecutableDirective &D) {
810  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
811  switch (DirectiveKind) {
812  case OMPD_target:
813  case OMPD_target_teams:
814  return hasNestedSPMDDirective(Ctx, D);
815  case OMPD_target_parallel:
816  case OMPD_target_parallel_for:
817  case OMPD_target_parallel_for_simd:
818  case OMPD_target_teams_distribute_parallel_for:
819  case OMPD_target_teams_distribute_parallel_for_simd:
820  case OMPD_target_simd:
821  case OMPD_target_teams_distribute_simd:
822  return true;
823  case OMPD_target_teams_distribute:
824  return false;
825  case OMPD_parallel:
826  case OMPD_for:
827  case OMPD_parallel_for:
828  case OMPD_parallel_sections:
829  case OMPD_for_simd:
830  case OMPD_parallel_for_simd:
831  case OMPD_cancel:
832  case OMPD_cancellation_point:
833  case OMPD_ordered:
834  case OMPD_threadprivate:
835  case OMPD_allocate:
836  case OMPD_task:
837  case OMPD_simd:
838  case OMPD_sections:
839  case OMPD_section:
840  case OMPD_single:
841  case OMPD_master:
842  case OMPD_critical:
843  case OMPD_taskyield:
844  case OMPD_barrier:
845  case OMPD_taskwait:
846  case OMPD_taskgroup:
847  case OMPD_atomic:
848  case OMPD_flush:
849  case OMPD_teams:
850  case OMPD_target_data:
851  case OMPD_target_exit_data:
852  case OMPD_target_enter_data:
853  case OMPD_distribute:
854  case OMPD_distribute_simd:
855  case OMPD_distribute_parallel_for:
856  case OMPD_distribute_parallel_for_simd:
857  case OMPD_teams_distribute:
858  case OMPD_teams_distribute_simd:
859  case OMPD_teams_distribute_parallel_for:
860  case OMPD_teams_distribute_parallel_for_simd:
861  case OMPD_target_update:
862  case OMPD_declare_simd:
863  case OMPD_declare_target:
864  case OMPD_end_declare_target:
865  case OMPD_declare_reduction:
866  case OMPD_declare_mapper:
867  case OMPD_taskloop:
868  case OMPD_taskloop_simd:
869  case OMPD_requires:
870  case OMPD_unknown:
871  break;
872  }
873  llvm_unreachable(
874  "Unknown programming model for OpenMP directive on NVPTX target.");
875 }
876 
877 /// Check if the directive is loop-based and has either no schedule clause
878 /// at all or static scheduling.
879 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
880  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
881  isOpenMPLoopDirective(D.getDirectiveKind()) &&
882  "Expected loop-based directive.");
883  return !D.hasClausesOfKind<OMPOrderedClause>() &&
884  (!D.hasClausesOfKind<OMPScheduleClause>() ||
885  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
886  [](const OMPScheduleClause *C) {
887  return C->getScheduleKind() == OMPC_SCHEDULE_static;
888  }));
889 }
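// For example, '#pragma omp target parallel for schedule(static)' and the
// same directive without any schedule clause both qualify, while
// 'schedule(dynamic)' or the presence of an 'ordered' clause does not.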
890 
891 /// Check for inner (nested) lightweight runtime construct, if any
892 static bool hasNestedLightweightDirective(ASTContext &Ctx,
893  const OMPExecutableDirective &D) {
894  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
895  const auto *CS = D.getInnermostCapturedStmt();
896  const auto *Body =
897  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
898  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
899 
900  if (const auto *NestedDir =
901  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
902  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
903  switch (D.getDirectiveKind()) {
904  case OMPD_target:
905  if (isOpenMPParallelDirective(DKind) &&
906  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
907  hasStaticScheduling(*NestedDir))
908  return true;
909  if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
910  return true;
911  if (DKind == OMPD_parallel) {
912  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
913  /*IgnoreCaptured=*/true);
914  if (!Body)
915  return false;
916  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
917  if (const auto *NND =
918  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
919  DKind = NND->getDirectiveKind();
920  if (isOpenMPWorksharingDirective(DKind) &&
921  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
922  return true;
923  }
924  } else if (DKind == OMPD_teams) {
925  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
926  /*IgnoreCaptured=*/true);
927  if (!Body)
928  return false;
929  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
930  if (const auto *NND =
931  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
932  DKind = NND->getDirectiveKind();
933  if (isOpenMPParallelDirective(DKind) &&
934  isOpenMPWorksharingDirective(DKind) &&
935  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
936  return true;
937  if (DKind == OMPD_parallel) {
938  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
939  /*IgnoreCaptured=*/true);
940  if (!Body)
941  return false;
942  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
943  if (const auto *NND =
944  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
945  DKind = NND->getDirectiveKind();
946  if (isOpenMPWorksharingDirective(DKind) &&
947  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
948  return true;
949  }
950  }
951  }
952  }
953  return false;
954  case OMPD_target_teams:
955  if (isOpenMPParallelDirective(DKind) &&
956  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
957  hasStaticScheduling(*NestedDir))
958  return true;
959  if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
960  return true;
961  if (DKind == OMPD_parallel) {
962  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
963  /*IgnoreCaptured=*/true);
964  if (!Body)
965  return false;
966  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
967  if (const auto *NND =
968  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
969  DKind = NND->getDirectiveKind();
970  if (isOpenMPWorksharingDirective(DKind) &&
971  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
972  return true;
973  }
974  }
975  return false;
976  case OMPD_target_parallel:
977  if (DKind == OMPD_simd)
978  return true;
979  return isOpenMPWorksharingDirective(DKind) &&
980  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
981  case OMPD_target_teams_distribute:
982  case OMPD_target_simd:
983  case OMPD_target_parallel_for:
984  case OMPD_target_parallel_for_simd:
985  case OMPD_target_teams_distribute_simd:
986  case OMPD_target_teams_distribute_parallel_for:
987  case OMPD_target_teams_distribute_parallel_for_simd:
988  case OMPD_parallel:
989  case OMPD_for:
990  case OMPD_parallel_for:
991  case OMPD_parallel_sections:
992  case OMPD_for_simd:
993  case OMPD_parallel_for_simd:
994  case OMPD_cancel:
995  case OMPD_cancellation_point:
996  case OMPD_ordered:
997  case OMPD_threadprivate:
998  case OMPD_allocate:
999  case OMPD_task:
1000  case OMPD_simd:
1001  case OMPD_sections:
1002  case OMPD_section:
1003  case OMPD_single:
1004  case OMPD_master:
1005  case OMPD_critical:
1006  case OMPD_taskyield:
1007  case OMPD_barrier:
1008  case OMPD_taskwait:
1009  case OMPD_taskgroup:
1010  case OMPD_atomic:
1011  case OMPD_flush:
1012  case OMPD_teams:
1013  case OMPD_target_data:
1014  case OMPD_target_exit_data:
1015  case OMPD_target_enter_data:
1016  case OMPD_distribute:
1017  case OMPD_distribute_simd:
1018  case OMPD_distribute_parallel_for:
1019  case OMPD_distribute_parallel_for_simd:
1020  case OMPD_teams_distribute:
1021  case OMPD_teams_distribute_simd:
1022  case OMPD_teams_distribute_parallel_for:
1023  case OMPD_teams_distribute_parallel_for_simd:
1024  case OMPD_target_update:
1025  case OMPD_declare_simd:
1026  case OMPD_declare_target:
1027  case OMPD_end_declare_target:
1028  case OMPD_declare_reduction:
1029  case OMPD_declare_mapper:
1030  case OMPD_taskloop:
1031  case OMPD_taskloop_simd:
1032  case OMPD_requires:
1033  case OMPD_unknown:
1034  llvm_unreachable("Unexpected directive.");
1035  }
1036  }
1037 
1038  return false;
1039 }
1040 
1041 /// Checks if the construct supports lightweight runtime. It must be SPMD
1042 /// construct + inner loop-based construct with static scheduling.
1043 static bool supportsLightweightRuntime(ASTContext &Ctx,
1044  const OMPExecutableDirective &D) {
1045  if (!supportsSPMDExecutionMode(Ctx, D))
1046  return false;
1047  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1048  switch (DirectiveKind) {
1049  case OMPD_target:
1050  case OMPD_target_teams:
1051  case OMPD_target_parallel:
1052  return hasNestedLightweightDirective(Ctx, D);
1053  case OMPD_target_parallel_for:
1054  case OMPD_target_parallel_for_simd:
1055  case OMPD_target_teams_distribute_parallel_for:
1056  case OMPD_target_teams_distribute_parallel_for_simd:
1057  // (Last|First)-privates must be shared in parallel region.
1058  return hasStaticScheduling(D);
1059  case OMPD_target_simd:
1060  case OMPD_target_teams_distribute_simd:
1061  return true;
1062  case OMPD_target_teams_distribute:
1063  return false;
1064  case OMPD_parallel:
1065  case OMPD_for:
1066  case OMPD_parallel_for:
1067  case OMPD_parallel_sections:
1068  case OMPD_for_simd:
1069  case OMPD_parallel_for_simd:
1070  case OMPD_cancel:
1071  case OMPD_cancellation_point:
1072  case OMPD_ordered:
1073  case OMPD_threadprivate:
1074  case OMPD_allocate:
1075  case OMPD_task:
1076  case OMPD_simd:
1077  case OMPD_sections:
1078  case OMPD_section:
1079  case OMPD_single:
1080  case OMPD_master:
1081  case OMPD_critical:
1082  case OMPD_taskyield:
1083  case OMPD_barrier:
1084  case OMPD_taskwait:
1085  case OMPD_taskgroup:
1086  case OMPD_atomic:
1087  case OMPD_flush:
1088  case OMPD_teams:
1089  case OMPD_target_data:
1090  case OMPD_target_exit_data:
1091  case OMPD_target_enter_data:
1092  case OMPD_distribute:
1093  case OMPD_distribute_simd:
1094  case OMPD_distribute_parallel_for:
1095  case OMPD_distribute_parallel_for_simd:
1096  case OMPD_teams_distribute:
1097  case OMPD_teams_distribute_simd:
1098  case OMPD_teams_distribute_parallel_for:
1099  case OMPD_teams_distribute_parallel_for_simd:
1100  case OMPD_target_update:
1101  case OMPD_declare_simd:
1102  case OMPD_declare_target:
1103  case OMPD_end_declare_target:
1104  case OMPD_declare_reduction:
1105  case OMPD_declare_mapper:
1106  case OMPD_taskloop:
1107  case OMPD_taskloop_simd:
1108  case OMPD_requires:
1109  case OMPD_unknown:
1110  break;
1111  }
1112  llvm_unreachable(
1113  "Unknown programming model for OpenMP directive on NVPTX target.");
1114 }
1115 
1116 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1117  StringRef ParentName,
1118  llvm::Function *&OutlinedFn,
1119  llvm::Constant *&OutlinedFnID,
1120  bool IsOffloadEntry,
1121  const RegionCodeGenTy &CodeGen) {
1122  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1123  EntryFunctionState EST;
1124  WorkerFunctionState WST(CGM, D.getBeginLoc());
1125  Work.clear();
1126  WrapperFunctionsMap.clear();
1127 
1128  // Emit target region as a standalone region.
1129  class NVPTXPrePostActionTy : public PrePostActionTy {
1130  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1131  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1132 
1133  public:
1134  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1135  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1136  : EST(EST), WST(WST) {}
1137  void Enter(CodeGenFunction &CGF) override {
1138  auto &RT =
1139  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1140  RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1141  // Skip target region initialization.
1142  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1143  }
1144  void Exit(CodeGenFunction &CGF) override {
1145  auto &RT =
1146  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1147  RT.clearLocThreadIdInsertPt(CGF);
1148  RT.emitNonSPMDEntryFooter(CGF, EST);
1149  }
1150  } Action(EST, WST);
1151  CodeGen.setAction(Action);
1152  IsInTTDRegion = true;
1153  // Reserve place for the globalized memory.
1154  GlobalizedRecords.emplace_back();
1155  if (!KernelStaticGlobalized) {
1156  KernelStaticGlobalized = new llvm::GlobalVariable(
1157  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1158  llvm::GlobalValue::InternalLinkage,
1159  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1160  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1161  llvm::GlobalValue::NotThreadLocal,
1162  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1163  }
1164  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1165  IsOffloadEntry, CodeGen);
1166  IsInTTDRegion = false;
1167 
1168  // Now change the name of the worker function to correspond to this target
1169  // region's entry function.
1170  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1171 
1172  // Create the worker function
1173  emitWorkerFunction(WST);
1174 }
1175 
1176 // Setup NVPTX threads for master-worker OpenMP scheme.
1177 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1178  EntryFunctionState &EST,
1179  WorkerFunctionState &WST) {
1180  CGBuilderTy &Bld = CGF.Builder;
1181 
1182  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1183  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1184  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1185  EST.ExitBB = CGF.createBasicBlock(".exit");
1186 
1187  llvm::Value *IsWorker =
1188  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1189  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1190 
1191  CGF.EmitBlock(WorkerBB);
1192  emitCall(CGF, WST.Loc, WST.WorkerFn);
1193  CGF.EmitBranch(EST.ExitBB);
1194 
1195  CGF.EmitBlock(MasterCheckBB);
1196  llvm::Value *IsMaster =
1197  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1198  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1199 
1200  CGF.EmitBlock(MasterBB);
1201  IsInTargetMasterThreadRegion = true;
1202  // SEQUENTIAL (MASTER) REGION START
1203  // First action in sequential region:
1204  // Initialize the state of the OpenMP runtime library on the GPU.
1205  // TODO: Optimize runtime initialization and pass in correct value.
1206  llvm::Value *Args[] = {getThreadLimit(CGF),
1207  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1208  CGF.EmitRuntimeCall(
1209  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1210 
1211  // For data sharing, we need to initialize the stack.
1212  CGF.EmitRuntimeCall(
1213  createNVPTXRuntimeFunction(
1214  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1215 
1216  emitGenericVarsProlog(CGF, WST.Loc);
1217 }
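// The entry emitted above thus routes threads with id < thread_limit into
// the worker loop (.worker); the remaining threads check in .mastercheck
// whether they are the master (the first lane of the last warp). The master
// initializes the runtime and the data-sharing stack in .master, while all
// other threads fall through to .exit.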
1218 
1219 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1220  EntryFunctionState &EST) {
1221  IsInTargetMasterThreadRegion = false;
1222  if (!CGF.HaveInsertPoint())
1223  return;
1224 
1225  emitGenericVarsEpilog(CGF);
1226 
1227  if (!EST.ExitBB)
1228  EST.ExitBB = CGF.createBasicBlock(".exit");
1229 
1230  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1231  CGF.EmitBranch(TerminateBB);
1232 
1233  CGF.EmitBlock(TerminateBB);
1234  // Signal termination condition.
1235  // TODO: Optimize runtime initialization and pass in correct value.
1236  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1237  CGF.EmitRuntimeCall(
1238  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1239  // Barrier to terminate worker threads.
1240  syncCTAThreads(CGF);
1241  // Master thread jumps to exit point.
1242  CGF.EmitBranch(EST.ExitBB);
1243 
1244  CGF.EmitBlock(EST.ExitBB);
1245  EST.ExitBB = nullptr;
1246 }
1247 
1248 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1249  StringRef ParentName,
1250  llvm::Function *&OutlinedFn,
1251  llvm::Constant *&OutlinedFnID,
1252  bool IsOffloadEntry,
1253  const RegionCodeGenTy &CodeGen) {
1254  ExecutionRuntimeModesRAII ModeRAII(
1255  CurrentExecutionMode, RequiresFullRuntime,
1256  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1257  !supportsLightweightRuntime(CGM.getContext(), D));
1258  EntryFunctionState EST;
1259 
1260  // Emit target region as a standalone region.
1261  class NVPTXPrePostActionTy : public PrePostActionTy {
1262  CGOpenMPRuntimeNVPTX &RT;
1263  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1264  const OMPExecutableDirective &D;
1265 
1266  public:
1267  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1268  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1269  const OMPExecutableDirective &D)
1270  : RT(RT), EST(EST), D(D) {}
1271  void Enter(CodeGenFunction &CGF) override {
1272  RT.emitSPMDEntryHeader(CGF, EST, D);
1273  // Skip target region initialization.
1274  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1275  }
1276  void Exit(CodeGenFunction &CGF) override {
1277  RT.clearLocThreadIdInsertPt(CGF);
1278  RT.emitSPMDEntryFooter(CGF, EST);
1279  }
1280  } Action(*this, EST, D);
1281  CodeGen.setAction(Action);
1282  IsInTTDRegion = true;
1283  // Reserve place for the globalized memory.
1284  GlobalizedRecords.emplace_back();
1285  if (!KernelStaticGlobalized) {
1286  KernelStaticGlobalized = new llvm::GlobalVariable(
1287  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1288  llvm::GlobalValue::InternalLinkage,
1289  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1290  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1291  llvm::GlobalValue::NotThreadLocal,
1292  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1293  }
1294  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1295  IsOffloadEntry, CodeGen);
1296  IsInTTDRegion = false;
1297 }
1298 
1299 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1300  CodeGenFunction &CGF, EntryFunctionState &EST,
1301  const OMPExecutableDirective &D) {
1302  CGBuilderTy &Bld = CGF.Builder;
1303 
1304  // Setup BBs in entry function.
1305  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1306  EST.ExitBB = CGF.createBasicBlock(".exit");
1307 
1308  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1309  /*RequiresOMPRuntime=*/
1310  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1311  /*RequiresDataSharing=*/Bld.getInt16(0)};
1312  CGF.EmitRuntimeCall(
1313  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1314 
1315  if (RequiresFullRuntime) {
1316  // For data sharing, we need to initialize the stack.
1317  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1318  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1319  }
1320 
1321  CGF.EmitBranch(ExecuteBB);
1322 
1323  CGF.EmitBlock(ExecuteBB);
1324 
1325  IsInTargetMasterThreadRegion = true;
1326 }
1327 
1328 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1329  EntryFunctionState &EST) {
1330  IsInTargetMasterThreadRegion = false;
1331  if (!CGF.HaveInsertPoint())
1332  return;
1333 
1334  if (!EST.ExitBB)
1335  EST.ExitBB = CGF.createBasicBlock(".exit");
1336 
1337  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1338  CGF.EmitBranch(OMPDeInitBB);
1339 
1340  CGF.EmitBlock(OMPDeInitBB);
1341  // DeInitialize the OMP state in the runtime; called by all active threads.
1342  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1343  CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1344  CGF.EmitRuntimeCall(
1345  createNVPTXRuntimeFunction(
1346  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1347  CGF.EmitBranch(EST.ExitBB);
1348 
1349  CGF.EmitBlock(EST.ExitBB);
1350  EST.ExitBB = nullptr;
1351 }
1352 
1353 // Create a unique global variable to indicate the execution mode of this target
1354 // region. The execution mode is either 'generic', or 'spmd' depending on the
1355 // target directive. This variable is picked up by the offload library to setup
1356 // the device appropriately before kernel launch. If the execution mode is
1357 // 'generic', the runtime reserves one warp for the master, otherwise, all
1358 // warps participate in parallel work.
1359 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1360  bool Mode) {
1361  auto *GVMode =
1362  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1363  llvm::GlobalValue::WeakAnyLinkage,
1364  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1365  Twine(Name, "_exec_mode"));
1366  CGM.addCompilerUsedGlobal(GVMode);
1367 }
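// For a kernel entry named, say, '__omp_offloading_..._foo', this emits
// roughly the following IR, which the offload runtime reads before launch:
//   @__omp_offloading_..._foo_exec_mode = weak constant i8 0  ; 0 = SPMD, 1 = generic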
1368 
1369 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1370  ASTContext &Ctx = CGM.getContext();
1371 
1372  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1373  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1374  WST.Loc, WST.Loc);
1375  emitWorkerLoop(CGF, WST);
1376  CGF.FinishFunction();
1377 }
1378 
1379 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1380  WorkerFunctionState &WST) {
1381  //
1382  // The workers enter this loop and wait for parallel work from the master.
1383  // When the master encounters a parallel region it sets up the work + variable
1384  // arguments, and wakes up the workers. The workers first check to see if
1385  // they are required for the parallel region, i.e., within the # of requested
1386  // parallel threads. The activated workers load the variable arguments and
1387  // execute the parallel work.
1388  //
1389 
1390  CGBuilderTy &Bld = CGF.Builder;
1391 
1392  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1393  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1394  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1395  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1396  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1397  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1398 
1399  CGF.EmitBranch(AwaitBB);
1400 
1401  // Workers wait for work from master.
1402  CGF.EmitBlock(AwaitBB);
1403  // Wait for parallel work
1404  syncCTAThreads(CGF);
1405 
1406  Address WorkFn =
1407  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1408  Address ExecStatus =
1409  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1410  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1411  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1412 
1413  // TODO: Optimize runtime initialization and pass in correct value.
1414  llvm::Value *Args[] = {WorkFn.getPointer(),
1415  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1416  llvm::Value *Ret = CGF.EmitRuntimeCall(
1417  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1418  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1419 
1420  // On termination condition (workid == 0), exit loop.
1421  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1422  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1423  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1424 
1425  // Activate requested workers.
1426  CGF.EmitBlock(SelectWorkersBB);
1427  llvm::Value *IsActive =
1428  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1429  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1430 
1431  // Signal start of parallel region.
1432  CGF.EmitBlock(ExecuteBB);
1433  // Skip initialization.
1434  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1435 
1436  // Process work items: outlined parallel functions.
1437  for (llvm::Function *W : Work) {
1438  // Try to match this outlined function.
1439  llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1440 
1441  llvm::Value *WorkFnMatch =
1442  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1443 
1444  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1445  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1446  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1447 
1448  // Execute this outlined function.
1449  CGF.EmitBlock(ExecuteFNBB);
1450 
1451  // Insert call to work function via shared wrapper. The shared
1452  // wrapper takes two arguments:
1453  // - the parallelism level;
1454  // - the thread ID;
1455  emitCall(CGF, WST.Loc, W,
1456  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1457 
1458  // Go to end of parallel region.
1459  CGF.EmitBranch(TerminateBB);
1460 
1461  CGF.EmitBlock(CheckNextBB);
1462  }
1463  // Default case: call to outlined function through pointer if the target
1464  // region makes a declare target call that may contain an orphaned parallel
1465  // directive.
1466  auto *ParallelFnTy =
1467  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1468  /*isVarArg=*/false);
1469  llvm::Value *WorkFnCast =
1470  Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1471  // Insert call to work function via shared wrapper. The shared
1472  // wrapper takes two arguments:
1473  // - the parallelism level;
1474  // - the thread ID;
1475  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1476  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1477  // Go to end of parallel region.
1478  CGF.EmitBranch(TerminateBB);
1479 
1480  // Signal end of parallel region.
1481  CGF.EmitBlock(TerminateBB);
1482  CGF.EmitRuntimeCall(
1483  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1484  llvm::None);
1485  CGF.EmitBranch(BarrierBB);
1486 
1487  // All active and inactive workers wait at a barrier after parallel region.
1488  CGF.EmitBlock(BarrierBB);
1489  // Barrier after parallel region.
1490  syncCTAThreads(CGF);
1491  CGF.EmitBranch(AwaitBB);
1492 
1493  // Exit target region.
1494  CGF.EmitBlock(ExitBB);
1495  // Skip initialization.
1496  clearLocThreadIdInsertPt(CGF);
1497 }
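// The emitted worker loop is effectively a small state machine: wait at the
// CTA-wide barrier in .await.work, fetch the current work function via
// __kmpc_kernel_parallel, exit once the master publishes a null work
// function, otherwise run the matching outlined wrapper (or fall back to an
// indirect call), announce completion with __kmpc_kernel_end_parallel, and
// rejoin the barrier in .barrier.parallel before looping back.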
1498 
1499 /// Returns specified OpenMP runtime function for the current OpenMP
1500 /// implementation. Specialized for the NVPTX device.
1501 /// \param Function OpenMP runtime function.
1502 /// \return Specified function.
1503 llvm::FunctionCallee
1505  llvm::FunctionCallee RTLFn = nullptr;
1506  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1507  case OMPRTL_NVPTX__kmpc_kernel_init: {
1508  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1509  // RequiresOMPRuntime);
1510  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1511  auto *FnTy =
1512  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1513  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1514  break;
1515  }
1516  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1517  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1518  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1519  auto *FnTy =
1520  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1521  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1522  break;
1523  }
1524  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1525  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1526  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1527  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1528  auto *FnTy =
1529  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1530  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1531  break;
1532  }
1533  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1534  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1535  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1536  auto *FnTy =
1537  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1538  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1539  break;
1540  }
1541  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1542  /// Build void __kmpc_kernel_prepare_parallel(
1543  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1544  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1545  auto *FnTy =
1546  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1547  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1548  break;
1549  }
1550  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1551  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1552  /// int16_t IsOMPRuntimeInitialized);
1553  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1554  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1555  auto *FnTy =
1556  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1557  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1558  break;
1559  }
1560  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1561  /// Build void __kmpc_kernel_end_parallel();
1562  auto *FnTy =
1563  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1564  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1565  break;
1566  }
1567  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1568  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1569  // global_tid);
1570  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1571  auto *FnTy =
1572  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1573  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1574  break;
1575  }
1576  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1577  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1578  // global_tid);
1579  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1580  auto *FnTy =
1581  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1583  break;
1584  }
1585  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1586  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1587  // int16_t lane_offset, int16_t warp_size);
1588  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1589  auto *FnTy =
1590  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1591  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1592  break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1595  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1596  // int16_t lane_offset, int16_t warp_size);
1597  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1598  auto *FnTy =
1599  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1600  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1601  break;
1602  }
1603  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1604  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1605  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1606  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1607  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1608  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1609  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1610  CGM.Int16Ty, CGM.Int16Ty};
1611  auto *ShuffleReduceFnTy =
1612  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1613  /*isVarArg=*/false);
1614  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1615  auto *InterWarpCopyFnTy =
1616  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1617  /*isVarArg=*/false);
1618  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1619  CGM.Int32Ty,
1620  CGM.Int32Ty,
1621  CGM.SizeTy,
1622  CGM.VoidPtrTy,
1623  ShuffleReduceFnTy->getPointerTo(),
1624  InterWarpCopyFnTy->getPointerTo()};
1625  auto *FnTy =
1626  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1627  RTLFn = CGM.CreateRuntimeFunction(
1628  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1629  break;
1630  }
1631  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1632  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1633  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1634  auto *FnTy =
1635  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1636  RTLFn = CGM.CreateRuntimeFunction(
1637  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1638  break;
1639  }
1640  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1641  // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1642  // global_tid, void *global_buffer, int32_t num_of_records, void*
1643  // reduce_data,
1644  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1645  // lane_offset, int16_t shortCircuit),
1646  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1647  // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1648  // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1649  // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1650  // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1651  // *buffer, int idx, void *reduce_data));
1652  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1653  CGM.Int16Ty, CGM.Int16Ty};
1654  auto *ShuffleReduceFnTy =
1655  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1656  /*isVarArg=*/false);
1657  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1658  auto *InterWarpCopyFnTy =
1659  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1660  /*isVarArg=*/false);
1661  llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1662  CGM.VoidPtrTy};
1663  auto *GlobalListFnTy =
1664  llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1665  /*isVarArg=*/false);
1666  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1667  CGM.Int32Ty,
1668  CGM.VoidPtrTy,
1669  CGM.Int32Ty,
1670  CGM.VoidPtrTy,
1671  ShuffleReduceFnTy->getPointerTo(),
1672  InterWarpCopyFnTy->getPointerTo(),
1673  GlobalListFnTy->getPointerTo(),
1674  GlobalListFnTy->getPointerTo(),
1675  GlobalListFnTy->getPointerTo(),
1676  GlobalListFnTy->getPointerTo()};
1677  auto *FnTy =
1678  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1679  RTLFn = CGM.CreateRuntimeFunction(
1680  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1681  break;
1682  }
1683  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1684  /// Build void __kmpc_data_sharing_init_stack();
1685  auto *FnTy =
1686  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1687  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1688  break;
1689  }
1690  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1691  /// Build void __kmpc_data_sharing_init_stack_spmd();
1692  auto *FnTy =
1693  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1694  RTLFn =
1695  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1696  break;
1697  }
1698  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1699  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1700  // int16_t UseSharedMemory);
1701  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1702  auto *FnTy =
1703  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1704  RTLFn = CGM.CreateRuntimeFunction(
1705  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1706  break;
1707  }
1708  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1709  // Build void __kmpc_data_sharing_pop_stack(void *a);
1710  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1711  auto *FnTy =
1712  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1713  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1714  /*Name=*/"__kmpc_data_sharing_pop_stack");
1715  break;
1716  }
1717  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1718  /// Build void __kmpc_begin_sharing_variables(void ***args,
1719  /// size_t n_args);
1720  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1721  auto *FnTy =
1722  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1723  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1724  break;
1725  }
1726  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1727  /// Build void __kmpc_end_sharing_variables();
1728  auto *FnTy =
1729  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1730  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1731  break;
1732  }
1733  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1734  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1735  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1736  auto *FnTy =
1737  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1738  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1739  break;
1740  }
1741  case OMPRTL_NVPTX__kmpc_parallel_level: {
1742  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1743  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1744  auto *FnTy =
1745  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1746  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1747  break;
1748  }
1749  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1750  // Build int8_t __kmpc_is_spmd_exec_mode();
1751  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1752  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1753  break;
1754  }
1755  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1756  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1757  // const void *buf, size_t size, int16_t is_shared, const void **res);
1758  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1759  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1760  auto *FnTy =
1761  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1762  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1763  break;
1764  }
1765  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1766  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1767  // int16_t is_shared);
1768  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1769  auto *FnTy =
1770  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1771  RTLFn =
1772  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1773  break;
1774  }
1775  case OMPRTL__kmpc_barrier: {
1776  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1777  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1778  auto *FnTy =
1779  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1780  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1781  cast<llvm::Function>(RTLFn.getCallee())
1782  ->addFnAttr(llvm::Attribute::Convergent);
1783  break;
1784  }
1785  case OMPRTL__kmpc_barrier_simple_spmd: {
1786  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1787  // global_tid);
1788  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1789  auto *FnTy =
1790  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1791  RTLFn =
1792  CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1793  cast<llvm::Function>(RTLFn.getCallee())
1794  ->addFnAttr(llvm::Attribute::Convergent);
1795  break;
1796  }
1797  }
1798  return RTLFn;
1799 }
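// A typical call site elsewhere in this file looks roughly like this
// (illustrative):
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
//       llvm::None);
// Callers simply pick an enumerator and pass the returned FunctionCallee to
// EmitRuntimeCall.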
1800 
1801 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1802  llvm::Constant *Addr,
1803  uint64_t Size, int32_t,
1804  llvm::GlobalValue::LinkageTypes) {
1805  // TODO: Add support for global variables on the device after declare target
1806  // support.
1807  if (!isa<llvm::Function>(Addr))
1808  return;
1809  llvm::Module &M = CGM.getModule();
1810  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1811 
1812  // Get "nvvm.annotations" metadata node
1813  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1814 
1815  llvm::Metadata *MDVals[] = {
1816  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1817  llvm::ConstantAsMetadata::get(
1818  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1819  // Append metadata to nvvm.annotations
1820  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1821 }
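// For a target kernel the annotation appended above prints roughly as
// (illustrative, the actual symbol name is mangled from the target region):
//   !nvvm.annotations = !{!0}
//   !0 = !{void (...)* @__omp_offloading_<...>, !"kernel", i32 1}
// which is what the NVPTX backend uses to recognize the function as a kernel
// entry point.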
1822 
1823 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1824  const OMPExecutableDirective &D, StringRef ParentName,
1825  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1826  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1827  if (!IsOffloadEntry) // Nothing to do.
1828  return;
1829 
1830  assert(!ParentName.empty() && "Invalid target region parent name!");
1831 
1832  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1833  if (Mode)
1834  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1835  CodeGen);
1836  else
1837  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1838  CodeGen);
1839 
1840  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1841 }
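// Which kernel flavour is used depends only on the directive. For example
// (illustrative), '#pragma omp target teams distribute parallel for' can
// normally be emitted as an SPMD kernel, while a generic '#pragma omp target'
// with serial code around its parallel regions falls back to the non-SPMD
// master/worker scheme.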
1842 
1843 namespace {
1844 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1845 /// Enum for accessing the reserved_2 field of the ident_t struct.
1846 enum ModeFlagsTy : unsigned {
1847  /// Bit set to 1 when in SPMD mode.
1848  KMP_IDENT_SPMD_MODE = 0x01,
1849  /// Bit set to 1 when a simplified runtime is used.
1850  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1851  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1852 };
1853 
1854 /// Special Undefined mode: the combination of Non-SPMD mode + SimpleRuntime.
1855 static const ModeFlagsTy UndefinedMode =
1856  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1857 } // anonymous namespace
1858 
1859 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1860  switch (getExecutionMode()) {
1861  case EM_SPMD:
1862  if (requiresFullRuntime())
1863  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1864  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1865  case EM_NonSPMD:
1866  assert(requiresFullRuntime() && "Expected full runtime.");
1867  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1868  case EM_Unknown:
1869  return UndefinedMode;
1870  }
1871  llvm_unreachable("Unknown flags are requested.");
1872 }
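// In summary, the reserved_2 encodings produced above are:
//   EM_SPMD    + full runtime       -> KMP_IDENT_SPMD_MODE
//   EM_SPMD    + simplified runtime -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE
//   EM_NonSPMD + full runtime       -> 0 (neither bit set)
//   EM_Unknown                      -> UndefinedMode (simplified-runtime bit only)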
1873 
1874 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1875  : CGOpenMPRuntime(CGM, "_", "$") {
1876  if (!CGM.getLangOpts().OpenMPIsDevice)
1877  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1878 }
1879 
1880 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1881  OpenMPProcBindClauseKind ProcBind,
1882  SourceLocation Loc) {
1883  // Do nothing in case of SPMD mode and L0 parallel.
1884  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1885  return;
1886 
1887  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1888 }
1889 
1890 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1891  llvm::Value *NumThreads,
1892  SourceLocation Loc) {
1893  // Do nothing in case of SPMD mode and L0 parallel.
1894  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1895  return;
1896 
1897  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1898 }
1899 
1900 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1901  const Expr *NumTeams,
1902  const Expr *ThreadLimit,
1903  SourceLocation Loc) {}
1904 
1905 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1906  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1907  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1908  // Emit target region as a standalone region.
1909  class NVPTXPrePostActionTy : public PrePostActionTy {
1910  bool &IsInParallelRegion;
1911  bool PrevIsInParallelRegion;
1912 
1913  public:
1914  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1915  : IsInParallelRegion(IsInParallelRegion) {}
1916  void Enter(CodeGenFunction &CGF) override {
1917  PrevIsInParallelRegion = IsInParallelRegion;
1918  IsInParallelRegion = true;
1919  }
1920  void Exit(CodeGenFunction &CGF) override {
1921  IsInParallelRegion = PrevIsInParallelRegion;
1922  }
1923  } Action(IsInParallelRegion);
1924  CodeGen.setAction(Action);
1925  bool PrevIsInTTDRegion = IsInTTDRegion;
1926  IsInTTDRegion = false;
1927  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1928  IsInTargetMasterThreadRegion = false;
1929  auto *OutlinedFun =
1930  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1931  D, ThreadIDVar, InnermostKind, CodeGen));
1932  if (CGM.getLangOpts().Optimize) {
1933  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1934  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1935  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1936  }
1937  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1938  IsInTTDRegion = PrevIsInTTDRegion;
1939  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1940  !IsInParallelRegion) {
1941  llvm::Function *WrapperFun =
1942  createParallelDataSharingWrapper(OutlinedFun, D);
1943  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1944  }
1945 
1946  return OutlinedFun;
1947 }
1948 
1949 /// Get list of lastprivate variables from the teams distribute ... or
1950 /// teams {distribute ...} directives.
1951 static void
1952 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1953  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1954  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1955  "expected teams directive.");
1956  const OMPExecutableDirective *Dir = &D;
1957  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1958  if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1959  Ctx,
1960  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1961  /*IgnoreCaptured=*/true))) {
1962  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1963  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1964  Dir = nullptr;
1965  }
1966  }
1967  if (!Dir)
1968  return;
1969  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1970  for (const Expr *E : C->getVarRefs())
1971  Vars.push_back(getPrivateItem(E));
1972  }
1973 }
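// For example (illustrative), given
//   #pragma omp target teams distribute parallel for lastprivate(x)
// the reference to 'x' from the lastprivate clause is collected here so that
// the SPMD code path can globalize it for the whole team.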
1974 
1975 /// Get list of reduction variables from the teams ... directives.
1976 static void
1980  "expected teams directive.");
1981  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1982  for (const Expr *E : C->privates())
1983  Vars.push_back(getPrivateItem(E));
1984  }
1985 }
1986 
1987 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
1988  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1989  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1990  SourceLocation Loc = D.getBeginLoc();
1991 
1992  const RecordDecl *GlobalizedRD = nullptr;
1993  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1994  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1995  // Globalize team reductions variable unconditionally in all modes.
1996  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
1997  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1998  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
1999  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2000  if (!LastPrivatesReductions.empty()) {
2001  GlobalizedRD = ::buildRecordForGlobalizedVars(
2002  CGM.getContext(), llvm::None, LastPrivatesReductions,
2003  MappedDeclsFields, WarpSize);
2004  }
2005  } else if (!LastPrivatesReductions.empty()) {
2006  assert(!TeamAndReductions.first &&
2007  "Previous team declaration is not expected.");
2008  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2009  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2010  }
2011 
2012  // Emit target region as a standalone region.
2013  class NVPTXPrePostActionTy : public PrePostActionTy {
2014  SourceLocation &Loc;
2015  const RecordDecl *GlobalizedRD;
2016  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2017  &MappedDeclsFields;
2018 
2019  public:
2020  NVPTXPrePostActionTy(
2021  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2022  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2023  &MappedDeclsFields)
2024  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2025  MappedDeclsFields(MappedDeclsFields) {}
2026  void Enter(CodeGenFunction &CGF) override {
2027  auto &Rt =
2028  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2029  if (GlobalizedRD) {
2030  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2031  I->getSecond().GlobalRecord = GlobalizedRD;
2032  I->getSecond().MappedParams =
2033  llvm::make_unique<CodeGenFunction::OMPMapVars>();
2034  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2035  for (const auto &Pair : MappedDeclsFields) {
2036  assert(Pair.getFirst()->isCanonicalDecl() &&
2037  "Expected canonical declaration");
2038  Data.insert(std::make_pair(Pair.getFirst(),
2039  MappedVarData(Pair.getSecond(),
2040  /*IsOnePerTeam=*/true)));
2041  }
2042  }
2043  Rt.emitGenericVarsProlog(CGF, Loc);
2044  }
2045  void Exit(CodeGenFunction &CGF) override {
2046  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2047  .emitGenericVarsEpilog(CGF);
2048  }
2049  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2050  CodeGen.setAction(Action);
2051  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2052  D, ThreadIDVar, InnermostKind, CodeGen);
2053  if (CGM.getLangOpts().Optimize) {
2054  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2055  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2056  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2057  }
2058 
2059  return OutlinedFun;
2060 }
2061 
2062 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2063  SourceLocation Loc,
2064  bool WithSPMDCheck) {
2066  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2067  return;
2068 
2069  CGBuilderTy &Bld = CGF.Builder;
2070 
2071  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2072  if (I == FunctionGlobalizedDecls.end())
2073  return;
2074  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2075  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2076  QualType SecGlobalRecTy;
2077 
2078  // Recover pointer to this function's global record. The runtime will
2079  // handle the specifics of the allocation of the memory.
2080  // Use actual memory size of the record including the padding
2081  // for alignment purposes.
2082  unsigned Alignment =
2083  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2084  unsigned GlobalRecordSize =
2085  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2086  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2087 
2088  llvm::PointerType *GlobalRecPtrTy =
2089  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2090  llvm::Value *GlobalRecCastAddr;
2091  llvm::Value *IsTTD = nullptr;
2092  if (!IsInTTDRegion &&
2093  (WithSPMDCheck ||
2094  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2095  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2096  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2097  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2098  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2099  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2100  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2101  llvm::Value *PL = CGF.EmitRuntimeCall(
2102  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2103  {RTLoc, ThreadID});
2104  IsTTD = Bld.CreateIsNull(PL);
2105  }
2106  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2107  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2108  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2109  // There is no need to emit line number for unconditional branch.
2110  (void)ApplyDebugLocation::CreateEmpty(CGF);
2111  CGF.EmitBlock(SPMDBB);
2112  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2113  CharUnits::fromQuantity(Alignment));
2114  CGF.EmitBranch(ExitBB);
2115  // There is no need to emit line number for unconditional branch.
2116  (void)ApplyDebugLocation::CreateEmpty(CGF);
2117  CGF.EmitBlock(NonSPMDBB);
2118  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2119  if (const RecordDecl *SecGlobalizedVarsRecord =
2120  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2121  SecGlobalRecTy =
2122  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2123 
2124  // Recover pointer to this function's global record. The runtime will
2125  // handle the specifics of the allocation of the memory.
2126  // Use actual memory size of the record including the padding
2127  // for alignment purposes.
2128  unsigned Alignment =
2129  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2130  unsigned GlobalRecordSize =
2131  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2132  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2133  Size = Bld.CreateSelect(
2134  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2135  }
2136  // TODO: allow the usage of shared memory to be controlled by
2137  // the user, for now, default to global.
2138  llvm::Value *GlobalRecordSizeArg[] = {
2139  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2140  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2141  createNVPTXRuntimeFunction(
2142  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2143  GlobalRecordSizeArg);
2144  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2145  GlobalRecValue, GlobalRecPtrTy);
2146  CGF.EmitBlock(ExitBB);
2147  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2148  /*NumReservedValues=*/2, "_select_stack");
2149  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2150  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2151  GlobalRecCastAddr = Phi;
2152  I->getSecond().GlobalRecordAddr = Phi;
2153  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2154  } else if (IsInTTDRegion) {
2155  assert(GlobalizedRecords.back().Records.size() < 2 &&
2156  "Expected less than 2 globalized records: one for target and one "
2157  "for teams.");
2158  unsigned Offset = 0;
2159  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2160  QualType RDTy = CGM.getContext().getRecordType(RD);
2161  unsigned Alignment =
2162  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2163  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2164  Offset =
2165  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2166  }
2167  unsigned Alignment =
2168  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2169  Offset = llvm::alignTo(Offset, Alignment);
2170  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2171  ++GlobalizedRecords.back().RegionCounter;
2172  if (GlobalizedRecords.back().Records.size() == 1) {
2173  assert(KernelStaticGlobalized &&
2174  "Kernel static pointer must be initialized already.");
2175  auto *UseSharedMemory = new llvm::GlobalVariable(
2176  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2178  "_openmp_static_kernel$is_shared");
2179  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2180  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2181  /*DestWidth=*/16, /*Signed=*/0);
2182  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2183  Address(UseSharedMemory,
2184  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2185  /*Volatile=*/false, Int16Ty, Loc);
2186  auto *StaticGlobalized = new llvm::GlobalVariable(
2187  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2188  llvm::GlobalValue::CommonLinkage, nullptr);
2189  auto *RecSize = new llvm::GlobalVariable(
2190  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2192  "_openmp_static_kernel$size");
2193  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2194  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2195  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2196  CGM.getContext().getSizeType(), Loc);
2197  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2198  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2199  llvm::Value *GlobalRecordSizeArg[] = {
2200  llvm::ConstantInt::get(
2201  CGM.Int16Ty,
2202  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2203  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2204  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2205  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2206  GlobalRecordSizeArg);
2207  GlobalizedRecords.back().Buffer = StaticGlobalized;
2208  GlobalizedRecords.back().RecSize = RecSize;
2209  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2210  GlobalizedRecords.back().Loc = Loc;
2211  }
2212  assert(KernelStaticGlobalized && "Global address must be set already.");
2213  Address FrameAddr = CGF.EmitLoadOfPointer(
2214  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2215  CGM.getContext()
2216  .getPointerType(CGM.getContext().VoidPtrTy)
2217  .castAs<PointerType>());
2218  llvm::Value *GlobalRecValue =
2219  Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2220  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2221  I->getSecond().IsInSPMDModeFlag = nullptr;
2222  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2223  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2224  } else {
2225  // TODO: allow the usage of shared memory to be controlled by
2226  // the user, for now, default to global.
2227  llvm::Value *GlobalRecordSizeArg[] = {
2228  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2229  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2230  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2231  createNVPTXRuntimeFunction(
2232  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2233  GlobalRecordSizeArg);
2234  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2235  GlobalRecValue, GlobalRecPtrTy);
2236  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2237  I->getSecond().IsInSPMDModeFlag = nullptr;
2238  }
2239  LValue Base =
2240  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2241 
2242  // Emit the "global alloca" which is a GEP from the global declaration
2243  // record using the pointer returned by the runtime.
2244  LValue SecBase;
2245  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2246  if (IsTTD) {
2247  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2248  llvm::PointerType *SecGlobalRecPtrTy =
2249  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2250  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2251  Bld.CreatePointerBitCastOrAddrSpaceCast(
2252  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2253  SecGlobalRecTy);
2254  }
2255  for (auto &Rec : I->getSecond().LocalVarData) {
2256  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2257  llvm::Value *ParValue;
2258  if (EscapedParam) {
2259  const auto *VD = cast<VarDecl>(Rec.first);
2260  LValue ParLVal =
2261  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2262  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2263  }
2264  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2265  // Emit VarAddr based on the lane-id if required.
2266  QualType VarTy;
2267  if (Rec.second.IsOnePerTeam) {
2268  VarTy = Rec.second.FD->getType();
2269  } else {
2270  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2271  VarAddr.getAddress().getPointer(),
2272  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2273  VarTy =
2274  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2275  VarAddr = CGF.MakeAddrLValue(
2276  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2277  AlignmentSource::Decl);
2278  }
2279  Rec.second.PrivateAddr = VarAddr.getAddress();
2280  if (!IsInTTDRegion &&
2281  (WithSPMDCheck ||
2282  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2283  assert(I->getSecond().IsInSPMDModeFlag &&
2284  "Expected unknown execution mode or required SPMD check.");
2285  if (IsTTD) {
2286  assert(SecIt->second.IsOnePerTeam &&
2287  "Secondary glob data must be one per team.");
2288  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2289  VarAddr.setAddress(
2290  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2291  VarAddr.getPointer()),
2292  VarAddr.getAlignment()));
2293  Rec.second.PrivateAddr = VarAddr.getAddress();
2294  }
2295  Address GlobalPtr = Rec.second.PrivateAddr;
2296  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2297  Rec.second.PrivateAddr = Address(
2298  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2299  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2300  LocalAddr.getAlignment());
2301  }
2302  if (EscapedParam) {
2303  const auto *VD = cast<VarDecl>(Rec.first);
2304  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2305  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2306  }
2307  if (IsTTD)
2308  ++SecIt;
2309  }
2310  }
2311  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2312  // Recover pointer to this function's global record. The runtime will
2313  // handle the specifics of the allocation of the memory.
2314  // Use actual memory size of the record including the padding
2315  // for alignment purposes.
2316  CGBuilderTy &Bld = CGF.Builder;
2317  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2318  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2319  Size = Bld.CreateNUWAdd(
2320  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2321  llvm::Value *AlignVal =
2322  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2323  Size = Bld.CreateUDiv(Size, AlignVal);
2324  Size = Bld.CreateNUWMul(Size, AlignVal);
2325  // TODO: allow the usage of shared memory to be controlled by
2326  // the user, for now, default to global.
2327  llvm::Value *GlobalRecordSizeArg[] = {
2328  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2329  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2330  createNVPTXRuntimeFunction(
2331  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2332  GlobalRecordSizeArg);
2333  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2334  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2335  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2336  CGM.getContext().getDeclAlign(VD),
2337  AlignmentSource::Decl);
2338  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2339  Base.getAddress());
2340  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2341  }
2342  I->getSecond().MappedParams->apply(CGF);
2343 }
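// In summary, the prolog above picks one of three globalization strategies:
// outside a target/teams distribute (TTD) region with an unknown or checked
// mode it emits a runtime SPMD check and conditionally calls
// __kmpc_data_sharing_coalesced_push_stack; inside a TTD region it carves the
// record out of the statically reserved team memory returned by
// __kmpc_get_team_static_memory; otherwise it unconditionally pushes the
// record onto the data-sharing stack. Escaped variable-length declarations
// are always pushed individually.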
2344 
2345 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2346  bool WithSPMDCheck) {
2348  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2349  return;
2350 
2351  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2352  if (I != FunctionGlobalizedDecls.end()) {
2353  I->getSecond().MappedParams->restore(CGF);
2354  if (!CGF.HaveInsertPoint())
2355  return;
2356  for (llvm::Value *Addr :
2357  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2358  CGF.EmitRuntimeCall(
2359  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2360  Addr);
2361  }
2362  if (I->getSecond().GlobalRecordAddr) {
2363  if (!IsInTTDRegion &&
2364  (WithSPMDCheck ||
2365  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2366  CGBuilderTy &Bld = CGF.Builder;
2367  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2368  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2369  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2370  // There is no need to emit line number for unconditional branch.
2371  (void)ApplyDebugLocation::CreateEmpty(CGF);
2372  CGF.EmitBlock(NonSPMDBB);
2373  CGF.EmitRuntimeCall(
2374  createNVPTXRuntimeFunction(
2375  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2376  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2377  CGF.EmitBlock(ExitBB);
2378  } else if (IsInTTDRegion) {
2379  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2380  "region counter must be > 0.");
2381  --GlobalizedRecords.back().RegionCounter;
2382  // Emit the restore function only in the target region.
2383  if (GlobalizedRecords.back().RegionCounter == 0) {
2384  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2385  /*DestWidth=*/16, /*Signed=*/0);
2386  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2387  Address(GlobalizedRecords.back().UseSharedMemory,
2388  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2389  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2390  llvm::Value *Args[] = {
2391  llvm::ConstantInt::get(
2392  CGM.Int16Ty,
2393  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2394  IsInSharedMemory};
2395  CGF.EmitRuntimeCall(
2396  createNVPTXRuntimeFunction(
2397  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2398  Args);
2399  }
2400  } else {
2401  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2402  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2403  I->getSecond().GlobalRecordAddr);
2404  }
2405  }
2406  }
2407 }
2408 
2409 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2410  const OMPExecutableDirective &D,
2411  SourceLocation Loc,
2412  llvm::Function *OutlinedFn,
2413  ArrayRef<llvm::Value *> CapturedVars) {
2414  if (!CGF.HaveInsertPoint())
2415  return;
2416 
2417  Address ZeroAddr = CGF.CreateMemTemp(
2418  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2419  /*Name*/ ".zero.addr");
2420  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2421  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2422  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2423  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2424  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2425  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2426 }
2427 
2428 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2429  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2430  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2431  if (!CGF.HaveInsertPoint())
2432  return;
2433 
2434  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2435  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2436  else
2437  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2438 }
2439 
2440 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2441  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2442  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2443  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2444 
2445  // Force inline this outlined function at its call site.
2446  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2447 
2448  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2449  /*DestWidth=*/32, /*Signed=*/1),
2450  ".zero.addr");
2451  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2452  // ThreadId for serialized parallels is 0.
2453  Address ThreadIDAddr = ZeroAddr;
2454  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2455  CodeGenFunction &CGF, PrePostActionTy &Action) {
2456  Action.Enter(CGF);
2457 
2458  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2459  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2460  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2461  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2462  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2463  };
2464  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2465  PrePostActionTy &) {
2466 
2467  RegionCodeGenTy RCG(CodeGen);
2468  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2469  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2470  llvm::Value *Args[] = {RTLoc, ThreadID};
2471 
2472  NVPTXActionTy Action(
2473  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2474  Args,
2475  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2476  Args);
2477  RCG.setAction(Action);
2478  RCG(CGF);
2479  };
2480 
2481  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2482  PrePostActionTy &Action) {
2483  CGBuilderTy &Bld = CGF.Builder;
2484  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2485  assert(WFn && "Wrapper function does not exist!");
2486  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2487 
2488  // Prepare for parallel region. Indicate the outlined function.
2489  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2490  CGF.EmitRuntimeCall(
2491  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2492  Args);
2493 
2494  // Create a private scope that will globalize the arguments
2495  // passed from the outside of the target region.
2496  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2497 
2498  // There's something to share.
2499  if (!CapturedVars.empty()) {
2500  // Prepare for parallel region. Indicate the outlined function.
2501  Address SharedArgs =
2502  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2503  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2504 
2505  llvm::Value *DataSharingArgs[] = {
2506  SharedArgsPtr,
2507  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2508  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2509  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2510  DataSharingArgs);
2511 
2512  // Store variable address in a list of references to pass to workers.
2513  unsigned Idx = 0;
2514  ASTContext &Ctx = CGF.getContext();
2515  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2516  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2517  .castAs<PointerType>());
2518  for (llvm::Value *V : CapturedVars) {
2519  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2520  llvm::Value *PtrV;
2521  if (V->getType()->isIntegerTy())
2522  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2523  else
2524  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2525  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2526  Ctx.getPointerType(Ctx.VoidPtrTy));
2527  ++Idx;
2528  }
2529  }
2530 
2531  // Activate workers. This barrier is used by the master to signal
2532  // work for the workers.
2533  syncCTAThreads(CGF);
2534 
2535  // OpenMP [2.5, Parallel Construct, p.49]
2536  // There is an implied barrier at the end of a parallel region. After the
2537  // end of a parallel region, only the master thread of the team resumes
2538  // execution of the enclosing task region.
2539  //
2540  // The master waits at this barrier until all workers are done.
2541  syncCTAThreads(CGF);
2542 
2543  if (!CapturedVars.empty())
2544  CGF.EmitRuntimeCall(
2545  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2546 
2547  // Remember for post-processing in worker loop.
2548  Work.emplace_back(WFn);
2549  };
2550 
2551  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2552  CodeGenFunction &CGF, PrePostActionTy &Action) {
2553  if (IsInParallelRegion) {
2554  SeqGen(CGF, Action);
2555  } else if (IsInTargetMasterThreadRegion) {
2556  L0ParallelGen(CGF, Action);
2557  } else {
2558  // Check for master and then parallelism:
2559  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2560  // Serialized execution.
2561  // } else {
2562  // Worker call.
2563  // }
2564  CGBuilderTy &Bld = CGF.Builder;
2565  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2566  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2567  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2568  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2569  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2570  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2571  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2572  // There is no need to emit line number for unconditional branch.
2573  (void)ApplyDebugLocation::CreateEmpty(CGF);
2574  CGF.EmitBlock(ParallelCheckBB);
2575  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2576  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2577  llvm::Value *PL = CGF.EmitRuntimeCall(
2578  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2579  {RTLoc, ThreadID});
2580  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2581  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2582  CGF.EmitBlock(SeqBB);
2583  SeqGen(CGF, Action);
2584  CGF.EmitBranch(ExitBB);
2585  // There is no need to emit line number for unconditional branch.
2586  (void)ApplyDebugLocation::CreateEmpty(CGF);
2587  CGF.EmitBlock(MasterBB);
2588  L0ParallelGen(CGF, Action);
2589  CGF.EmitBranch(ExitBB);
2590  // There is no need to emit line number for unconditional branch.
2591  (void)ApplyDebugLocation::CreateEmpty(CGF);
2592  // Emit the continuation block for code after the if.
2593  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2594  }
2595  };
2596 
2597  if (IfCond) {
2598  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2599  } else {
2600  CodeGenFunction::RunCleanupsScope Scope(CGF);
2601  RegionCodeGenTy ThenRCG(LNParallelGen);
2602  ThenRCG(CGF);
2603  }
2604 }
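// For a level-0 parallel region the master-thread sequence emitted by
// L0ParallelGen above is roughly (illustrative):
//   __kmpc_kernel_prepare_parallel(wrapper, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&args, n_args); // only with captures
//   __kmpc_barrier_simple_spmd(...);               // release the workers
//   __kmpc_barrier_simple_spmd(...);               // wait for the workers
//   __kmpc_end_sharing_variables();                // only with captures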
2605 
2606 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2607  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2608  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2609  // Just call the outlined function to execute the parallel region.
2610  // OutlinedFn(&GTid, &zero, CapturedStruct);
2611  //
2612  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2613 
2614  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2615  /*DestWidth=*/32, /*Signed=*/1),
2616  ".zero.addr");
2617  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2618  // ThreadId for serialized parallels is 0.
2619  Address ThreadIDAddr = ZeroAddr;
2620  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2621  &ThreadIDAddr](CodeGenFunction &CGF,
2622  PrePostActionTy &Action) {
2623  Action.Enter(CGF);
2624 
2625  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2626  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2627  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2628  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2629  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2630  };
2631  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2632  PrePostActionTy &) {
2633 
2634  RegionCodeGenTy RCG(CodeGen);
2635  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2636  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2637  llvm::Value *Args[] = {RTLoc, ThreadID};
2638 
2639  NVPTXActionTy Action(
2640  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2641  Args,
2642  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2643  Args);
2644  RCG.setAction(Action);
2645  RCG(CGF);
2646  };
2647 
2648  if (IsInTargetMasterThreadRegion) {
2649  // In the worker we need to use the real thread id.
2650  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2651  RegionCodeGenTy RCG(CodeGen);
2652  RCG(CGF);
2653  } else {
2654  // If we are not in the target region, it is definitely L2 parallelism or
2655  // more, because in SPMD mode we always have an L1 parallel level, so we don't
2656  // need to check for orphaned directives.
2657  RegionCodeGenTy RCG(SeqGen);
2658  RCG(CGF);
2659  }
2660 }
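// In SPMD mode there is no master/worker handshake: the target master region
// calls the outlined function directly with the real thread id, and any
// deeper (L2+) parallel region is serialized via __kmpc_serialized_parallel
// and __kmpc_end_serialized_parallel.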
2661 
2662 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2663  // Always emit simple barriers!
2664  if (!CGF.HaveInsertPoint())
2665  return;
2666  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2667  // This function does not use parameters, so we can emit just default values.
2668  llvm::Value *Args[] = {
2669  llvm::ConstantPointerNull::get(
2670  cast<llvm::PointerType>(getIdentTyPointerTy())),
2671  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2672  CGF.EmitRuntimeCall(
2673  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2674 }
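// Note that the arguments are intentionally dummies: the device runtime
// treats __kmpc_barrier_simple_spmd as a full CTA-wide barrier (typically a
// bar.sync on NVPTX), so a null ident_t and a zero thread id are sufficient.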
2675 
2676 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2677  SourceLocation Loc,
2678  OpenMPDirectiveKind Kind, bool,
2679  bool) {
2680  // Always emit simple barriers!
2681  if (!CGF.HaveInsertPoint())
2682  return;
2683  // Build call __kmpc_cancel_barrier(loc, thread_id);
2684  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2685  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2686  getThreadID(CGF, Loc)};
2687  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2688 }
2689 
2690 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2691  CodeGenFunction &CGF, StringRef CriticalName,
2692  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2693  const Expr *Hint) {
2694  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2695  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2696  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2697  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2698  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2699 
2700  // Fetch team-local id of the thread.
2701  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2702 
2703  // Get the width of the team.
2704  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2705 
2706  // Initialize the counter variable for the loop.
2707  QualType Int32Ty =
2708  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2709  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2710  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2711  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2712  /*isInit=*/true);
2713 
2714  // Block checks if loop counter exceeds upper bound.
2715  CGF.EmitBlock(LoopBB);
2716  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2717  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2718  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2719 
2720  // This block tests which single thread should execute the region, and which
2721  // threads should go straight to the synchronisation point.
2722  CGF.EmitBlock(TestBB);
2723  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2724  llvm::Value *CmpThreadToCounter =
2725  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2726  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2727 
2728  // Block emits the body of the critical region.
2729  CGF.EmitBlock(BodyBB);
2730 
2731  // Output the critical statement.
2732  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2733  Hint);
2734 
2735  // After the body of the critical region, the single executing thread jumps
2736  // to the synchronisation point.
2737  // This block waits for all threads in the current team to finish, then
2738  // increments the counter variable and returns to the loop.
2739  CGF.EmitBlock(SyncBB);
2740  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2741  /*ForceSimpleCall=*/true);
2742 
2743  llvm::Value *IncCounterVal =
2744  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2745  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2746  CGF.EmitBranch(LoopBB);
2747 
2748  // Block that is reached when all threads in the team complete the region.
2749  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2750 }
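// The control flow built above serializes the critical body over the team,
// roughly (illustrative):
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     barrier(); // keeps every thread of the team in lock step
//   }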
2751 
2752 /// Cast value to the specified type.
2753 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2754  QualType ValTy, QualType CastTy,
2755  SourceLocation Loc) {
2756  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2757  "Cast type must sized.");
2758  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2759  "Val type must sized.");
2760  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2761  if (ValTy == CastTy)
2762  return Val;
2763  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2764  CGF.getContext().getTypeSizeInChars(CastTy))
2765  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2766  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2767  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2768  CastTy->hasSignedIntegerRepresentation());
2769  Address CastItem = CGF.CreateMemTemp(CastTy);
2770  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2771  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2772  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2773  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2774 }
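// Casting strategy used above: identical types are returned unchanged,
// same-sized types are bitcast, integer-to-integer conversions use an integer
// cast, and any remaining combination is routed through a small memory
// temporary (store as ValTy, reload as CastTy).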
2775 
2776 /// This function creates calls to one of two shuffle functions to copy
2777 /// variables between lanes in a warp.
2778 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2779  llvm::Value *Elem,
2780  QualType ElemType,
2781  llvm::Value *Offset,
2782  SourceLocation Loc) {
2783  CodeGenModule &CGM = CGF.CGM;
2784  CGBuilderTy &Bld = CGF.Builder;
2785  CGOpenMPRuntimeNVPTX &RT =
2786  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2787 
2788  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2789  assert(Size.getQuantity() <= 8 &&
2790  "Unsupported bitwidth in shuffle instruction.");
2791 
2792  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2793  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2794  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2795 
2796  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2797  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2798  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2799  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2800  llvm::Value *WarpSize =
2801  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2802 
2803  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2804  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2805 
2806  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2807 }
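// For example (illustrative), shuffling a 'float' reduction element selects
// __kmpc_shuffle_int32: the value is cast to a 32-bit integer, fetched from
// the lane at the given offset within the warp, and cast back to 'float'.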
2808 
2809 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2810  Address DestAddr, QualType ElemType,
2811  llvm::Value *Offset, SourceLocation Loc) {
2812  CGBuilderTy &Bld = CGF.Builder;
2813 
2814  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2815  // Create the loop over the big sized data.
2816  // ptr = (void*)Elem;
2817  // ptrEnd = (void*) Elem + 1;
2818  // Step = 8;
2819  // while (ptr + Step < ptrEnd)
2820  // shuffle((int64_t)*ptr);
2821  // Step = 4;
2822  // while (ptr + Step < ptrEnd)
2823  // shuffle((int32_t)*ptr);
2824  // ...
2825  Address ElemPtr = DestAddr;
2826  Address Ptr = SrcAddr;
2827  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2828  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2829  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2830  if (Size < CharUnits::fromQuantity(IntSize))
2831  continue;
2832  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2833  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2834  /*Signed=*/1);
2835  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2836  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2837  ElemPtr =
2838  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2839  if (Size.getQuantity() / IntSize > 1) {
2840  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2841  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2842  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2843  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2844  CGF.EmitBlock(PreCondBB);
2845  llvm::PHINode *PhiSrc =
2846  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2847  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2848  llvm::PHINode *PhiDest =
2849  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2850  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2851  Ptr = Address(PhiSrc, Ptr.getAlignment());
2852  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2853  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2854  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2855  Ptr.getPointer(), CGF.VoidPtrTy));
2856  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2857  ThenBB, ExitBB);
2858  CGF.EmitBlock(ThenBB);
2859  llvm::Value *Res = createRuntimeShuffleFunction(
2860  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2861  IntType, Offset, Loc);
2862  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2863  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2864  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2865  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2866  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2867  CGF.EmitBranch(PreCondBB);
2868  CGF.EmitBlock(ExitBB);
2869  } else {
2870  llvm::Value *Res = createRuntimeShuffleFunction(
2871  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2872  IntType, Offset, Loc);
2873  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2874  Ptr = Bld.CreateConstGEP(Ptr, 1);
2875  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2876  }
2877  Size = Size % IntSize;
2878  }
2879 }
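// In effect, elements wider than a single shuffle are copied between lanes as
// a sequence of 8-, 4-, 2- and 1-byte integer shuffles: the loop emitted
// above handles repeated full-width chunks and the straight-line path handles
// the remaining tail.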
2880 
2881 namespace {
2882 enum CopyAction : unsigned {
2883  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2884  // the warp using shuffle instructions.
2885  RemoteLaneToThread,
2886  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2887  ThreadCopy,
2888  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2889  ThreadToScratchpad,
2890  // ScratchpadToThread: Copy from a scratchpad array in global memory
2891  // containing team-reduced data to a thread's stack.
2892  ScratchpadToThread,
2893 };
2894 } // namespace
2895 
2896 struct CopyOptionsTy {
2897  llvm::Value *RemoteLaneOffset;
2898  llvm::Value *ScratchpadIndex;
2899  llvm::Value *ScratchpadWidth;
2900 };
2901 
2902 /// Emit instructions to copy a Reduce list, which contains partially
2903 /// aggregated values, in the specified direction.
2904 static void emitReductionListCopy(
2905  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2906  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2907  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2908 
2909  CodeGenModule &CGM = CGF.CGM;
2910  ASTContext &C = CGM.getContext();
2911  CGBuilderTy &Bld = CGF.Builder;
2912 
2913  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2914  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2915  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2916 
2917  // Iterate, element by element, through the source Reduce list and
2918  // make a copy.
2919  unsigned Idx = 0;
2920  unsigned Size = Privates.size();
2921  for (const Expr *Private : Privates) {
2922  Address SrcElementAddr = Address::invalid();
2923  Address DestElementAddr = Address::invalid();
2924  Address DestElementPtrAddr = Address::invalid();
2925  // Should we shuffle in an element from a remote lane?
2926  bool ShuffleInElement = false;
2927  // Set to true to update the pointer in the dest Reduce list to a
2928  // newly created element.
2929  bool UpdateDestListPtr = false;
2930  // Increment the src or dest pointer to the scratchpad, for each
2931  // new element.
2932  bool IncrScratchpadSrc = false;
2933  bool IncrScratchpadDest = false;
2934 
2935  switch (Action) {
2936  case RemoteLaneToThread: {
2937  // Step 1.1: Get the address for the src element in the Reduce list.
2938  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2939  SrcElementAddr = CGF.EmitLoadOfPointer(
2940  SrcElementPtrAddr,
2941  C.getPointerType(Private->getType())->castAs<PointerType>());
2942 
2943  // Step 1.2: Create a temporary to store the element in the destination
2944  // Reduce list.
2945  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2946  DestElementAddr =
2947  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2948  ShuffleInElement = true;
2949  UpdateDestListPtr = true;
2950  break;
2951  }
2952  case ThreadCopy: {
2953  // Step 1.1: Get the address for the src element in the Reduce list.
2954  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2955  SrcElementAddr = CGF.EmitLoadOfPointer(
2956  SrcElementPtrAddr,
2957  C.getPointerType(Private->getType())->castAs<PointerType>());
2958 
2959  // Step 1.2: Get the address for dest element. The destination
2960  // element has already been created on the thread's stack.
2961  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2962  DestElementAddr = CGF.EmitLoadOfPointer(
2963  DestElementPtrAddr,
2964  C.getPointerType(Private->getType())->castAs<PointerType>());
2965  break;
2966  }
2967  case ThreadToScratchpad: {
2968  // Step 1.1: Get the address for the src element in the Reduce list.
2969  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2970  SrcElementAddr = CGF.EmitLoadOfPointer(
2971  SrcElementPtrAddr,
2972  C.getPointerType(Private->getType())->castAs<PointerType>());
2973 
2974  // Step 1.2: Get the address for dest element:
2975  // address = base + index * ElementSizeInChars.
2976  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2977  llvm::Value *CurrentOffset =
2978  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2979  llvm::Value *ScratchPadElemAbsolutePtrVal =
2980  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2981  ScratchPadElemAbsolutePtrVal =
2982  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2983  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2984  C.getTypeAlignInChars(Private->getType()));
2985  IncrScratchpadDest = true;
2986  break;
2987  }
2988  case ScratchpadToThread: {
2989  // Step 1.1: Get the address for the src element in the scratchpad.
2990  // address = base + index * ElementSizeInChars.
2991  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2992  llvm::Value *CurrentOffset =
2993  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2994  llvm::Value *ScratchPadElemAbsolutePtrVal =
2995  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2996  ScratchPadElemAbsolutePtrVal =
2997  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2998  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2999  C.getTypeAlignInChars(Private->getType()));
3000  IncrScratchpadSrc = true;
3001 
3002  // Step 1.2: Create a temporary to store the element in the destination
3003  // Reduce list.
3004  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3005  DestElementAddr =
3006  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3007  UpdateDestListPtr = true;
3008  break;
3009  }
3010  }
3011 
3012  // Regardless of src and dest of copy, we emit the load of src
3013  // element as this is required in all directions
3014  SrcElementAddr = Bld.CreateElementBitCast(
3015  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3016  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3017  SrcElementAddr.getElementType());
3018 
3019  // Now that all active lanes have read the element in the
3020  // Reduce list, shuffle over the value from the remote lane.
3021  if (ShuffleInElement) {
3022  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3023  RemoteLaneOffset, Private->getExprLoc());
3024  } else {
3025  switch (CGF.getEvaluationKind(Private->getType())) {
3026  case TEK_Scalar: {
3027  llvm::Value *Elem =
3028  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3029  Private->getType(), Private->getExprLoc());
3030  // Store the source element value to the dest element address.
3031  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3032  Private->getType());
3033  break;
3034  }
3035  case TEK_Complex: {
3036  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3037  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3038  Private->getExprLoc());
3039  CGF.EmitStoreOfComplex(
3040  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3041  /*isInit=*/false);
3042  break;
3043  }
3044  case TEK_Aggregate:
3045  CGF.EmitAggregateCopy(
3046  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3047  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3048  Private->getType(), AggValueSlot::DoesNotOverlap);
3049  break;
3050  }
3051  }
3052 
3053  // Step 3.1: Modify reference in dest Reduce list as needed.
3054  // Modifying the reference in Reduce list to point to the newly
3055  // created element. The element is live in the current function
3056  // scope and that of functions it invokes (i.e., reduce_function).
3057  // RemoteReduceData[i] = (void*)&RemoteElem
3058  if (UpdateDestListPtr) {
3059  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3060  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3061  DestElementPtrAddr, /*Volatile=*/false,
3062  C.VoidPtrTy);
3063  }
3064 
3065  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3066  // address of the next element in scratchpad memory, unless we're currently
3067  // processing the last one. Memory alignment is also taken care of here.
3068  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3069  llvm::Value *ScratchpadBasePtr =
3070  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3071  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3072  ScratchpadBasePtr = Bld.CreateNUWAdd(
3073  ScratchpadBasePtr,
3074  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3075 
3076  // Take care of global memory alignment for performance
3077  ScratchpadBasePtr = Bld.CreateNUWSub(
3078  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3079  ScratchpadBasePtr = Bld.CreateUDiv(
3080  ScratchpadBasePtr,
3081  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3082  ScratchpadBasePtr = Bld.CreateNUWAdd(
3083  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3084  ScratchpadBasePtr = Bld.CreateNUWMul(
3085  ScratchpadBasePtr,
3086  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3087 
3088  if (IncrScratchpadDest)
3089  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3090  else /* IncrScratchpadSrc = true */
3091  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3092  }
3093 
3094  ++Idx;
3095  }
3096 }
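As a reading aid (not generated code), here is roughly what the two thread-to-thread directions of emitReductionListCopy amount to when a Reduce list is modelled as a plain array of element pointers; the shuffle from the remote lane is elided and all names below are hypothetical:

#include <cstddef>
#include <memory>
#include <vector>

using ReduceList = std::vector<int *>; // one pointer per reduce element

// RemoteLaneToThread: materialize fresh temporaries (".omp.reduction.element")
// and rewrite the destination list to point at them (UpdateDestListPtr).
void remoteLaneToThread(const ReduceList &Src, ReduceList &Dest,
                        std::vector<std::unique_ptr<int>> &Temps) {
  for (std::size_t I = 0; I < Src.size(); ++I) {
    Temps.push_back(std::make_unique<int>(*Src[I])); // real value is shuffled in
    Dest[I] = Temps.back().get();
  }
}

// ThreadCopy: the destination elements already exist on the thread's stack,
// so only the values are copied.
void threadCopy(const ReduceList &Src, ReduceList &Dest) {
  for (std::size_t I = 0; I < Src.size(); ++I)
    *Dest[I] = *Src[I];
}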
3097 
3098 /// This function emits a helper that gathers Reduce lists from the first
3099 /// lane of every active warp to lanes in the first warp.
3100 ///
3101 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3102 /// shared smem[warp_size];
3103 /// For all data entries D in reduce_data:
3104 /// sync
3105 /// If (I am the first lane in each warp)
3106 /// Copy my local D to smem[warp_id]
3107 /// sync
3108 /// if (I am the first warp)
3109 /// Copy smem[thread_id] to my local D
3110 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3111  ArrayRef<const Expr *> Privates,
3112  QualType ReductionArrayTy,
3113  SourceLocation Loc) {
3114  ASTContext &C = CGM.getContext();
3115  llvm::Module &M = CGM.getModule();
3116 
3117  // ReduceList: thread local Reduce list.
3118  // At the stage of the computation when this function is called, partially
3119  // aggregated values reside in the first lane of every active warp.
3120  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3121  C.VoidPtrTy, ImplicitParamDecl::Other);
3122  // NumWarps: number of warps active in the parallel region. This could
3123  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3124  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3125  C.getIntTypeForBitwidth(32, /* Signed */ true),
3126  ImplicitParamDecl::Other);
3127  FunctionArgList Args;
3128  Args.push_back(&ReduceListArg);
3129  Args.push_back(&NumWarpsArg);
3130 
3131  const CGFunctionInfo &CGFI =
3132  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3133  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3134  llvm::GlobalValue::InternalLinkage,
3135  "_omp_reduction_inter_warp_copy_func", &M);
3136  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3137  Fn->setDoesNotRecurse();
3138  CodeGenFunction CGF(CGM);
3139  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3140 
3141  CGBuilderTy &Bld = CGF.Builder;
3142 
3143  // This array is used as a medium to transfer, one reduce element at a time,
3144  // the data from the first lane of every warp to lanes in the first warp
3145  // in order to perform the final step of a reduction in a parallel region
3146  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3147  // for reduced latency, as well as to have a distinct copy for concurrently
3148  // executing target regions. The array is declared with common linkage so
3149  // as to be shared across compilation units.
3150  StringRef TransferMediumName =
3151  "__openmp_nvptx_data_transfer_temporary_storage";
3152  llvm::GlobalVariable *TransferMedium =
3153  M.getGlobalVariable(TransferMediumName);
3154  if (!TransferMedium) {
3155  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3156  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3157  TransferMedium = new llvm::GlobalVariable(
3158  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3159  llvm::Constant::getNullValue(Ty), TransferMediumName,
3160  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3161  SharedAddressSpace);
3162  CGM.addCompilerUsedGlobal(TransferMedium);
3163  }
3164 
3165  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3166  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3167  // nvptx_lane_id = nvptx_id % warpsize
3168  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3169  // nvptx_warp_id = nvptx_id / warpsize
3170  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3171 
3172  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3173  Address LocalReduceList(
3174  Bld.CreatePointerBitCastOrAddrSpaceCast(
3175  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3176  C.VoidPtrTy, Loc),
3177  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3178  CGF.getPointerAlign());
3179 
3180  unsigned Idx = 0;
3181  for (const Expr *Private : Privates) {
3182  //
3183  // Warp master copies reduce element to transfer medium in __shared__
3184  // memory.
3185  //
3186  unsigned RealTySize =
3187  C.getTypeSizeInChars(Private->getType())
3188  .alignTo(C.getTypeAlignInChars(Private->getType()))
3189  .getQuantity();
3190  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
3191  unsigned NumIters = RealTySize / TySize;
3192  if (NumIters == 0)
3193  continue;
3194  QualType CType = C.getIntTypeForBitwidth(
3195  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3196  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3197  CharUnits Align = CharUnits::fromQuantity(TySize);
3198  llvm::Value *Cnt = nullptr;
3199  Address CntAddr = Address::invalid();
3200  llvm::BasicBlock *PrecondBB = nullptr;
3201  llvm::BasicBlock *ExitBB = nullptr;
3202  if (NumIters > 1) {
3203  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3204  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3205  /*Volatile=*/false, C.IntTy);
3206  PrecondBB = CGF.createBasicBlock("precond");
3207  ExitBB = CGF.createBasicBlock("exit");
3208  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3209  // There is no need to emit line number for unconditional branch.
3210  (void)ApplyDebugLocation::CreateEmpty(CGF);
3211  CGF.EmitBlock(PrecondBB);
3212  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3213  llvm::Value *Cmp =
3214  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3215  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3216  CGF.EmitBlock(BodyBB);
3217  }
3218  // kmpc_barrier.
3219  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3220  /*EmitChecks=*/false,
3221  /*ForceSimpleCall=*/true);
3222  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3223  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3224  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3225 
3226  // if (lane_id == 0)
3227  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3228  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3229  CGF.EmitBlock(ThenBB);
3230 
3231  // Reduce element = LocalReduceList[i]
3232  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3233  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3234  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3235  // elemptr = ((CopyType*)(elemptrptr)) + I
3236  Address ElemPtr = Address(ElemPtrPtr, Align);
3237  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3238  if (NumIters > 1) {
3239  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3240  ElemPtr.getAlignment());
3241  }
3242 
3243  // Get pointer to location in transfer medium.
3244  // MediumPtr = &medium[warp_id]
3245  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3246  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3247  Address MediumPtr(MediumPtrVal, Align);
3248  // Casting to actual data type.
3249  // MediumPtr = (CopyType*)MediumPtrAddr;
3250  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3251 
3252  // elem = *elemptr
3253  //*MediumPtr = elem
3254  llvm::Value *Elem =
3255  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3256  // Store the source element value to the dest element address.
3257  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3258 
3259  Bld.CreateBr(MergeBB);
3260 
3261  CGF.EmitBlock(ElseBB);
3262  Bld.CreateBr(MergeBB);
3263 
3264  CGF.EmitBlock(MergeBB);
3265 
3266  // kmpc_barrier.
3267  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3268  /*EmitChecks=*/false,
3269  /*ForceSimpleCall=*/true);
3270 
3271  //
3272  // Warp 0 copies reduce element from transfer medium.
3273  //
3274  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3275  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3276  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3277 
3278  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3279  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3280  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3281 
3282  // Up to 32 threads in warp 0 are active.
3283  llvm::Value *IsActiveThread =
3284  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3285  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3286 
3287  CGF.EmitBlock(W0ThenBB);
3288 
3289  // SrcMediumPtr = &medium[tid]
3290  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3291  TransferMedium,
3292  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3293  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3294  // SrcMediumVal = *SrcMediumPtr;
3295  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3296 
3297  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3298  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3299  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3300  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3301  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3302  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3303  if (NumIters > 1) {
3304  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3305  TargetElemPtr.getAlignment());
3306  }
3307 
3308  // *TargetElemPtr = SrcMediumVal;
3309  llvm::Value *SrcMediumValue =
3310  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3311  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3312  CType);
3313  Bld.CreateBr(W0MergeBB);
3314 
3315  CGF.EmitBlock(W0ElseBB);
3316  Bld.CreateBr(W0MergeBB);
3317 
3318  CGF.EmitBlock(W0MergeBB);
3319 
3320  if (NumIters > 1) {
3321  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3322  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3323  CGF.EmitBranch(PrecondBB);
3324  (void)ApplyDebugLocation::CreateEmpty(CGF);
3325  CGF.EmitBlock(ExitBB);
3326  }
3327  RealTySize %= TySize;
3328  }
3329  ++Idx;
3330  }
3331 
3332  CGF.FinishFunction();
3333  return Fn;
3334 }
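A simplified CPU model of the data movement the generated inter_warp_copy_func performs for a single 32-bit reduce element may help; barriers are implied between the two phases and all names below are illustrative, not the runtime's:

#include <array>
#include <vector>

constexpr int WarpSize = 32;

// LanePrivate holds one partial value per CUDA thread; only lane 0 of each
// warp carries a meaningful warp-reduced value on entry.
void interWarpCopyModel(std::vector<int> &LanePrivate, int NumWarps) {
  std::array<int, WarpSize> TransferMedium{}; // stands in for the __shared__ array
  // Phase 1: each warp master publishes its value into its warp's slot.
  for (int Warp = 0; Warp < NumWarps; ++Warp)
    TransferMedium[Warp] = LanePrivate[Warp * WarpSize];
  // Phase 2: the first NumWarps threads of warp 0 read the staged values back,
  // so warp 0 now holds one value per source warp and can finish the reduction.
  for (int Tid = 0; Tid < NumWarps; ++Tid)
    LanePrivate[Tid] = TransferMedium[Tid];
}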
3335 
3336 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3337 /// in the same warp. It uses shuffle instructions to copy over data from
3338 /// a remote lane's stack. The reduction algorithm performed is specified
3339 /// by the fourth parameter.
3340 ///
3341 /// Algorithm Versions.
3342 /// Full Warp Reduce (argument value 0):
3343 /// This algorithm assumes that all 32 lanes are active and gathers
3344 /// data from these 32 lanes, producing a single resultant value.
3345 /// Contiguous Partial Warp Reduce (argument value 1):
3346 /// This algorithm assumes that only a *contiguous* subset of lanes
3347 /// are active. This happens for the last warp in a parallel region
3348 /// when the user specified num_threads is not an integer multiple of
3349 /// 32. This contiguous subset always starts with the zeroth lane.
3350 /// Partial Warp Reduce (argument value 2):
3351 /// This algorithm gathers data from any number of lanes at any position.
3352 /// All reduced values are stored in the lowest possible lane. The set
3353 /// of problems every algorithm addresses is a super set of those
3354 /// addressable by algorithms with a lower version number. Overhead
3355 /// increases as algorithm version increases.
3356 ///
3357 /// Terminology
3358 /// Reduce element:
3359 /// Reduce element refers to the individual data field with primitive
3360 /// data types to be combined and reduced across threads.
3361 /// Reduce list:
3362 /// Reduce list refers to a collection of local, thread-private
3363 /// reduce elements.
3364 /// Remote Reduce list:
3365 /// Remote Reduce list refers to a collection of remote (relative to
3366 /// the current thread) reduce elements.
3367 ///
3368 /// We distinguish between three states of threads that are important to
3369 /// the implementation of this function.
3370 /// Alive threads:
3371 /// Threads in a warp executing the SIMT instruction, as distinguished from
3372 /// threads that are inactive due to divergent control flow.
3373 /// Active threads:
3374 /// The minimal set of threads that has to be alive upon entry to this
3375 /// function. The computation is correct iff active threads are alive.
3376 /// Some threads are alive but they are not active because they do not
3377 /// contribute to the computation in any useful manner. Turning them off
3378 /// may introduce control flow overheads without any tangible benefits.
3379 /// Effective threads:
3380 /// In order to comply with the argument requirements of the shuffle
3381 /// function, we must keep all lanes holding data alive. But at most
3382 /// half of them perform value aggregation; we refer to this half of
3383 /// threads as effective. The other half is simply handing off their
3384 /// data.
3385 ///
3386 /// Procedure
3387 /// Value shuffle:
3388 /// In this step active threads transfer data from higher lane positions
3389 /// in the warp to lower lane positions, creating Remote Reduce list.
3390 /// Value aggregation:
3391 /// In this step, effective threads combine their thread local Reduce list
3392 /// with Remote Reduce list and store the result in the thread local
3393 /// Reduce list.
3394 /// Value copy:
3395 /// In this step, we deal with the assumption made by algorithm 2
3396 /// (i.e. contiguity assumption). When we have an odd number of lanes
3397 /// active, say 2k+1, only k threads will be effective and therefore k
3398 /// new values will be produced. However, the Reduce list owned by the
3399 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3400 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3401 /// that the contiguity assumption still holds.
3402 static llvm::Function *emitShuffleAndReduceFunction(
3403  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3404  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3405  ASTContext &C = CGM.getContext();
3406 
3407  // Thread local Reduce list used to host the values of data to be reduced.
3408  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3409  C.VoidPtrTy, ImplicitParamDecl::Other);
3410  // Current lane id; could be logical.
3411  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3412  ImplicitParamDecl::Other);
3413  // Offset of the remote source lane relative to the current lane.
3414  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3415  C.ShortTy, ImplicitParamDecl::Other);
3416  // Algorithm version. This is expected to be known at compile time.
3417  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3418  C.ShortTy, ImplicitParamDecl::Other);
3419  FunctionArgList Args;
3420  Args.push_back(&ReduceListArg);
3421  Args.push_back(&LaneIDArg);
3422  Args.push_back(&RemoteLaneOffsetArg);
3423  Args.push_back(&AlgoVerArg);
3424 
3425  const CGFunctionInfo &CGFI =
3426  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3427  auto *Fn = llvm::Function::Create(
3428  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3429  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3430  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3431  Fn->setDoesNotRecurse();
3432  if (CGM.getLangOpts().Optimize) {
3433  Fn->removeFnAttr(llvm::Attribute::NoInline);
3434  Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
3435  Fn->addFnAttr(llvm::Attribute::AlwaysInline);
3436  }
3437 
3438  CodeGenFunction CGF(CGM);
3439  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3440 
3441  CGBuilderTy &Bld = CGF.Builder;
3442 
3443  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3444  Address LocalReduceList(
3445  Bld.CreatePointerBitCastOrAddrSpaceCast(
3446  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3447  C.VoidPtrTy, SourceLocation()),
3448  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3449  CGF.getPointerAlign());
3450 
3451  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3452  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3453  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3454 
3455  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3456  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3457  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3458 
3459  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3460  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3461  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3462 
3463  // Create a local thread-private variable to host the Reduce list
3464  // from a remote lane.
3465  Address RemoteReduceList =
3466  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3467 
3468  // This loop iterates through the list of reduce elements and copies,
3469  // element by element, from a remote lane in the warp to RemoteReduceList,
3470  // hosted on the thread's stack.
3471  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3472  LocalReduceList, RemoteReduceList,
3473  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3474  /*ScratchpadIndex=*/nullptr,
3475  /*ScratchpadWidth=*/nullptr});
3476 
3477  // The actions to be performed on the Remote Reduce list depend
3478  // on the algorithm version.
3479  //
3480  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3481  // LaneId % 2 == 0 && Offset > 0):
3482  // do the reduction value aggregation
3483  //
3484  // The thread local variable Reduce list is mutated in place to host the
3485  // reduced data, which is the aggregated value produced from local and
3486  // remote lanes.
3487  //
3488  // Note that AlgoVer is expected to be a constant integer known at compile
3489  // time.
3490  // When AlgoVer==0, the first conjunction evaluates to true, making
3491  // the entire predicate true during compile time.
3492  // When AlgoVer==1, the second conjunction has only the second part to be
3493  // evaluated during runtime. Other conjunctions evaluate to false
3494  // during compile time.
3495  // When AlgoVer==2, the third conjunction has only the second part to be
3496  // evaluated during runtime. Other conjunctions evaluate to false
3497  // during compile time.
3498  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3499 
3500  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3501  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3502  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3503 
3504  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3505  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3506  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3507  CondAlgo2 = Bld.CreateAnd(
3508  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3509 
3510  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3511  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3512 
3513  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3514  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3515  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3516  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3517 
3518  CGF.EmitBlock(ThenBB);
3519  // reduce_function(LocalReduceList, RemoteReduceList)
3520  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3521  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3522  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3523  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3524  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3525  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3526  Bld.CreateBr(MergeBB);
3527 
3528  CGF.EmitBlock(ElseBB);
3529  Bld.CreateBr(MergeBB);
3530 
3531  CGF.EmitBlock(MergeBB);
3532 
3533  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3534  // Reduce list.
3535  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3536  llvm::Value *CondCopy = Bld.CreateAnd(
3537  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3538 
3539  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3540  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3541  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3542  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3543 
3544  CGF.EmitBlock(CpyThenBB);
3545  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3546  RemoteReduceList, LocalReduceList);
3547  Bld.CreateBr(CpyMergeBB);
3548 
3549  CGF.EmitBlock(CpyElseBB);
3550  Bld.CreateBr(CpyMergeBB);
3551 
3552  CGF.EmitBlock(CpyMergeBB);
3553 
3554  CGF.FinishFunction();
3555  return Fn;
3556 }
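The guards built above can be hard to read in builder form; restated as plain C++ purely as a reading aid (in the generated IR the constant AlgoVer folds most of these conditions away):

// Does this lane aggregate the Remote Reduce list into its local one?
bool shouldReduce(int AlgoVer, int LaneId, int RemoteLaneOffset) {
  return AlgoVer == 0 ||                                            // full warp
         (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||             // contiguous
         (AlgoVer == 2 && LaneId % 2 == 0 && RemoteLaneOffset > 0); // dispersed
}

// Does this lane instead adopt the remote list wholesale (the contiguity
// fix-up for the contiguous partial warp reduce)?
bool shouldCopyRemoteToLocal(int AlgoVer, int LaneId, int RemoteLaneOffset) {
  return AlgoVer == 1 && LaneId >= RemoteLaneOffset;
}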
3557 
3558 /// This function emits a helper that copies all the reduction variables from
3559 /// the team into the provided global buffer for the reduction variables.
3560 ///
3561 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3562 /// For all data entries D in reduce_data:
3563 /// Copy local D to buffer.D[Idx]
3564 static llvm::Value *emitListToGlobalCopyFunction(
3565  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3566  QualType ReductionArrayTy, SourceLocation Loc,
3567  const RecordDecl *TeamReductionRec,
3568  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3569  &VarFieldMap) {
3570  ASTContext &C = CGM.getContext();
3571 
3572  // Buffer: global reduction buffer.
3573  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3574  C.VoidPtrTy, ImplicitParamDecl::Other);
3575  // Idx: index of the buffer.
3576  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3577  ImplicitParamDecl::Other);
3578  // ReduceList: thread local Reduce list.
3579  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3580  C.VoidPtrTy, ImplicitParamDecl::Other);
3581  FunctionArgList Args;
3582  Args.push_back(&BufferArg);
3583  Args.push_back(&IdxArg);
3584  Args.push_back(&ReduceListArg);
3585 
3586  const CGFunctionInfo &CGFI =
3587  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3588  auto *Fn = llvm::Function::Create(
3589  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3590  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3591  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3592  Fn->setDoesNotRecurse();
3593  CodeGenFunction CGF(CGM);
3594  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3595 
3596  CGBuilderTy &Bld = CGF.Builder;
3597 
3598  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3599  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3600  Address LocalReduceList(
3601  Bld.CreatePointerBitCastOrAddrSpaceCast(
3602  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3603  C.VoidPtrTy, Loc),
3604  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3605  CGF.getPointerAlign());
3606  QualType StaticTy = C.getRecordType(TeamReductionRec);
3607  llvm::Type *LLVMReductionsBufferTy =
3608  CGM.getTypes().ConvertTypeForMem(StaticTy);
3609  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3610  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3611  LLVMReductionsBufferTy->getPointerTo());
3612  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3613  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3614  /*Volatile=*/false, C.IntTy,
3615  Loc)};
3616  unsigned Idx = 0;
3617  for (const Expr *Private : Privates) {
3618  // Reduce element = LocalReduceList[i]
3619  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3620  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3621  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3622  // elemptr = ((CopyType*)(elemptrptr)) + I
3623  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3624  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3625  Address ElemPtr =
3626  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3627  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3628  // Global = Buffer.VD[Idx];
3629  const FieldDecl *FD = VarFieldMap.lookup(VD);
3630  LValue GlobLVal = CGF.EmitLValueForField(
3631  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3632  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3633  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3634  switch (CGF.getEvaluationKind(Private->getType())) {
3635  case TEK_Scalar: {
3636  llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3637  Private->getType(), Loc);
3638  CGF.EmitStoreOfScalar(V, GlobLVal);
3639  break;
3640  }
3641  case TEK_Complex: {
3642  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3643  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3644  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3645  break;
3646  }
3647  case TEK_Aggregate:
3648  CGF.EmitAggregateCopy(GlobLVal,
3649  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3650  Private->getType(), AggValueSlot::DoesNotOverlap);
3651  break;
3652  }
3653  ++Idx;
3654  }
3655 
3656  CGF.FinishFunction();
3657  return Fn;
3658 }
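To make the buffer.D[Idx] addressing concrete, here is a hand-written sketch of what the emitted list_to_global_copy_func does for a reduction over one double and one float; the types, field names and the 1024-slot capacity are assumptions chosen only for illustration:

// One row per team: each field of the record built for the reduction
// variables becomes an array indexed by the team's buffer slot Idx.
struct TeamReductionBuffer {
  double foo[1024];
  float bar[1024];
};

void listToGlobalCopy(TeamReductionBuffer *Buffer, int Idx, void **ReduceList) {
  Buffer->foo[Idx] = *static_cast<double *>(ReduceList[0]);
  Buffer->bar[Idx] = *static_cast<float *>(ReduceList[1]);
}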
3659 
3660 /// This function emits a helper that reduces all the reduction variables from
3661 /// the team into the provided global buffer for the reduction variables.
3662 ///
3663 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3664 /// void *GlobPtrs[];
3665 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3666 /// ...
3667 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3668 /// reduce_function(GlobPtrs, reduce_data);
3669 static llvm::Value *emitListToGlobalReduceFunction(
3670  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3671  QualType ReductionArrayTy, SourceLocation Loc,
3672  const RecordDecl *TeamReductionRec,
3673  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3674  &VarFieldMap,
3675  llvm::Function *ReduceFn) {
3676  ASTContext &C = CGM.getContext();
3677 
3678  // Buffer: global reduction buffer.
3679  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3680  C.VoidPtrTy, ImplicitParamDecl::Other);
3681  // Idx: index of the buffer.
3682  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3683  ImplicitParamDecl::Other);
3684  // ReduceList: thread local Reduce list.
3685  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3686  C.VoidPtrTy, ImplicitParamDecl::Other);
3687  FunctionArgList Args;
3688  Args.push_back(&BufferArg);
3689  Args.push_back(&IdxArg);
3690  Args.push_back(&ReduceListArg);
3691 
3692  const CGFunctionInfo &CGFI =
3693  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3694  auto *Fn = llvm::Function::Create(
3695  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3696  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3697  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3698  Fn->setDoesNotRecurse();
3699  CodeGenFunction CGF(CGM);
3700  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3701 
3702  CGBuilderTy &Bld = CGF.Builder;
3703 
3704  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3705  QualType StaticTy = C.getRecordType(TeamReductionRec);
3706  llvm::Type *LLVMReductionsBufferTy =
3707  CGM.getTypes().ConvertTypeForMem(StaticTy);
3708  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3709  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3710  LLVMReductionsBufferTy->getPointerTo());
3711 
3712  // 1. Build a list of reduction variables.
3713  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3714  Address ReductionList =
3715  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3716  auto IPriv = Privates.begin();
3717  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3718  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3719  /*Volatile=*/false, C.IntTy,
3720  Loc)};
3721  unsigned Idx = 0;
3722  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3723  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3724  // Global = Buffer.VD[Idx];
3725  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3726  const FieldDecl *FD = VarFieldMap.lookup(VD);
3727  LValue GlobLVal = CGF.EmitLValueForField(
3728  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3729  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3730  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3731  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3732  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3733  // Store array size.
3734  ++Idx;
3735  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3736  llvm::Value *Size = CGF.Builder.CreateIntCast(
3737  CGF.getVLASize(
3738  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3739  .NumElts,
3740  CGF.SizeTy, /*isSigned=*/false);
3741  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3742  Elem);
3743  }
3744  }
3745 
3746  // Call reduce_function(GlobalReduceList, ReduceList)
3747  llvm::Value *GlobalReduceList =
3748  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3749  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3750  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3751  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3752  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3753  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3754  CGF.FinishFunction();
3755  return Fn;
3756 }
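The companion list_to_global_reduce_func above does not copy element values; it builds a temporary list of pointers into row Idx of the buffer and lets the extracted reduce_function combine the thread's data into the buffer. A self-contained sketch under the same illustrative layout assumptions as before (column pointers and the callback shape are hypothetical):

// FooColumn/BarColumn stand for &buffer.D0[0] and &buffer.D1[0]; ReduceFn is
// the compiler-generated reduce_function taking two lists of element pointers.
void listToGlobalReduce(double *FooColumn, float *BarColumn, int Idx,
                        void **ReduceList,
                        void (*ReduceFn)(void **LHS, void **RHS)) {
  void *GlobPtrs[2] = {&FooColumn[Idx], &BarColumn[Idx]};
  ReduceFn(GlobPtrs, ReduceList); // row Idx op= the thread's reduce list
}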
3757 
3758 /// This function emits a helper that copies all the reduction variables from
3759 /// the provided global buffer into the team's reduction variables.
3760 ///
3761 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3762 /// For all data entries D in reduce_data:
3763 /// Copy buffer.D[Idx] to local D;
3764 static llvm::Value *emitGlobalToListCopyFunction(
3765  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3766  QualType ReductionArrayTy, SourceLocation Loc,
3767  const RecordDecl *TeamReductionRec,
3768  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3769  &VarFieldMap) {
3770  ASTContext &C = CGM.getContext();
3771 
3772  // Buffer: global reduction buffer.
3773  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3774  C.VoidPtrTy, ImplicitParamDecl::Other);
3775  // Idx: index of the buffer.
3776  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3777  ImplicitParamDecl::Other);
3778  // ReduceList: thread local Reduce list.
3779  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3780  C.VoidPtrTy, ImplicitParamDecl::Other);
3781  FunctionArgList Args;
3782  Args.push_back(&BufferArg);
3783  Args.push_back(&IdxArg);
3784  Args.push_back(&ReduceListArg);
3785 
3786  const CGFunctionInfo &CGFI =
3787  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3788  auto *Fn = llvm::Function::Create(
3789  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3790  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3791  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3792  Fn->setDoesNotRecurse();
3793  CodeGenFunction CGF(CGM);
3794  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3795 
3796  CGBuilderTy &Bld = CGF.Builder;
3797 
3798  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3799  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3800  Address LocalReduceList(
3801  Bld.CreatePointerBitCastOrAddrSpaceCast(
3802  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3803  C.VoidPtrTy, Loc),
3804  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3805  CGF.getPointerAlign());
3806  QualType StaticTy = C.getRecordType(TeamReductionRec);
3807  llvm::Type *LLVMReductionsBufferTy =
3808  CGM.getTypes().ConvertTypeForMem(StaticTy);
3809  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3810  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3811  LLVMReductionsBufferTy->getPointerTo());
3812 
3813  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3814  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3815  /*Volatile=*/false, C.IntTy,
3816  Loc)};
3817  unsigned Idx = 0;
3818  for (const Expr *Private : Privates) {
3819  // Reduce element = LocalReduceList[i]
3820  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3821  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3822  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3823  // elemptr = ((CopyType*)(elemptrptr)) + I
3824  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3825  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3826  Address ElemPtr =
3827  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3828  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3829  // Global = Buffer.VD[Idx];
3830  const FieldDecl *FD = VarFieldMap.lookup(VD);
3831  LValue GlobLVal = CGF.EmitLValueForField(
3832  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3833  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3834  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3835  switch (CGF.getEvaluationKind(Private->getType())) {
3836  case TEK_Scalar: {
3837  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3838  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3839  break;
3840  }
3841  case TEK_Complex: {
3842  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3843  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3844  /*isInit=*/false);
3845  break;
3846  }
3847  case TEK_Aggregate:
3848  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3849  GlobLVal, Private->getType(),
3850  AggValueSlot::DoesNotOverlap);
3851  break;
3852  }
3853  ++Idx;
3854  }
3855 
3856  CGF.FinishFunction();
3857  return Fn;
3858 }
3859 
3860 /// This function emits a helper that reduces the data in the provided global
3861 /// buffer into the team's reduction variables.
3862 ///
3863 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3864 /// void *GlobPtrs[];
3865 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3866 /// ...
3867 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3868 /// reduce_function(reduce_data, GlobPtrs);
3869 static llvm::Value *emitGlobalToListReduceFunction(
3870  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3871  QualType ReductionArrayTy, SourceLocation Loc,
3872  const RecordDecl *TeamReductionRec,
3873  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3874  &VarFieldMap,
3875  llvm::Function *ReduceFn) {
3876  ASTContext &C = CGM.getContext();
3877 
3878  // Buffer: global reduction buffer.
3879  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3880  C.VoidPtrTy, ImplicitParamDecl::Other);
3881  // Idx: index of the buffer.
3882  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3883  ImplicitParamDecl::Other);
3884  // ReduceList: thread local Reduce list.
3885  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3886  C.VoidPtrTy, ImplicitParamDecl::Other);
3887  FunctionArgList Args;
3888  Args.push_back(&BufferArg);
3889  Args.push_back(&IdxArg);
3890  Args.push_back(&ReduceListArg);
3891 
3892  const CGFunctionInfo &CGFI =
3893  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3894  auto *Fn = llvm::Function::Create(
3895  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3896  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3897  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3898  Fn->setDoesNotRecurse();
3899  CodeGenFunction CGF(CGM);
3900  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3901 
3902  CGBuilderTy &Bld = CGF.Builder;
3903 
3904  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3905  QualType StaticTy = C.getRecordType(TeamReductionRec);
3906  llvm::Type *LLVMReductionsBufferTy =
3907  CGM.getTypes().ConvertTypeForMem(StaticTy);
3908  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3909  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3910  LLVMReductionsBufferTy->getPointerTo());
3911 
3912  // 1. Build a list of reduction variables.
3913  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3914  Address ReductionList =
3915  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3916  auto IPriv = Privates.begin();
3917  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3918  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3919  /*Volatile=*/false, C.IntTy,
3920  Loc)};
3921  unsigned Idx = 0;
3922  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3923  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3924  // Global = Buffer.VD[Idx];
3925  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3926  const FieldDecl *FD = VarFieldMap.lookup(VD);
3927  LValue GlobLVal = CGF.EmitLValueForField(
3928  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3929  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3930  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3931  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3932  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3933  // Store array size.
3934  ++Idx;
3935  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3936  llvm::Value *Size = CGF.Builder.CreateIntCast(
3937  CGF.getVLASize(
3938  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3939  .NumElts,
3940  CGF.SizeTy, /*isSigned=*/false);
3941  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3942  Elem);
3943  }
3944  }
3945 
3946  // Call reduce_function(ReduceList, GlobalReduceList)
3947  llvm::Value *GlobalReduceList =
3948  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3949  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3950  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3951  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3952  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3953  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3954  CGF.FinishFunction();
3955  return Fn;
3956 }
3957 
3958 ///
3959 /// Design of OpenMP reductions on the GPU
3960 ///
3961 /// Consider a typical OpenMP program with one or more reduction
3962 /// clauses:
3963 ///
3964 /// float foo;
3965 /// double bar;
3966 /// #pragma omp target teams distribute parallel for \
3967 /// reduction(+:foo) reduction(*:bar)
3968 /// for (int i = 0; i < N; i++) {
3969 /// foo += A[i]; bar *= B[i];
3970 /// }
3971 ///
3972 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3973 /// all teams. In our OpenMP implementation on the NVPTX device an
3974 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3975 /// within a team are mapped to CUDA threads within a threadblock.
3976 /// Our goal is to efficiently aggregate values across all OpenMP
3977 /// threads such that:
3978 ///
3979 /// - the compiler and runtime are logically concise, and
3980 /// - the reduction is performed efficiently in a hierarchical
3981 /// manner as follows: within OpenMP threads in the same warp,
3982 /// across warps in a threadblock, and finally across teams on
3983 /// the NVPTX device.
3984 ///
3985 /// Introduction to Decoupling
3986 ///
3987 /// We would like to decouple the compiler and the runtime so that the
3988 /// latter is ignorant of the reduction variables (number, data types)
3989 /// and the reduction operators. This allows a simpler interface
3990 /// and implementation while still attaining good performance.
3991 ///
3992 /// Pseudocode for the aforementioned OpenMP program generated by the
3993 /// compiler is as follows:
3994 ///
3995 /// 1. Create private copies of reduction variables on each OpenMP
3996 /// thread: 'foo_private', 'bar_private'
3997 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3998 /// to it and writes the result in 'foo_private' and 'bar_private'
3999 /// respectively.
4000 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4001 /// and store the result on the team master:
4002 ///
4003 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4004 /// reduceData, shuffleReduceFn, interWarpCpyFn)
4005 ///
4006 /// where:
4007 /// struct ReduceData {
4008 /// double *foo;
4009 /// double *bar;
4010 /// } reduceData
4011 /// reduceData.foo = &foo_private
4012 /// reduceData.bar = &bar_private
4013 ///
4014 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4015 /// auxiliary functions generated by the compiler that operate on
4016 /// variables of type 'ReduceData'. They help the runtime perform
4017 /// algorithmic steps in a data agnostic manner.
4018 ///
4019 /// 'shuffleReduceFn' is a pointer to a function that reduces data
4020 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
4021 /// same warp. It takes the following arguments as input:
4022 ///
4023 /// a. variable of type 'ReduceData' on the calling lane,
4024 /// b. its lane_id,
4025 /// c. an offset relative to the current lane_id to generate a
4026 /// remote_lane_id. The remote lane contains the second
4027 /// variable of type 'ReduceData' that is to be reduced.
4028 /// d. an algorithm version parameter determining which reduction
4029 /// algorithm to use.
4030 ///
4031 /// 'shuffleReduceFn' retrieves data from the remote lane using
4032 /// efficient GPU shuffle intrinsics and reduces, using the
4033 /// algorithm specified by the 4th parameter, the two operands
4034 /// element-wise. The result is written to the first operand.
4035 ///
4036 /// Different reduction algorithms are implemented in different
4037 /// runtime functions, all calling 'shuffleReduceFn' to perform
4038 /// the essential reduction step. Therefore, based on the 4th
4039 /// parameter, this function behaves slightly differently to
4040 /// cooperate with the runtime to ensure correctness under
4041 /// different circumstances.
4042 ///
4043 /// 'InterWarpCpyFn' is a pointer to a function that transfers
4044 /// reduced variables across warps. It tunnels, through CUDA
4045 /// shared memory, the thread-private data of type 'ReduceData'
4046 /// from lane 0 of each warp to a lane in the first warp.
4047 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4048 /// The last team writes the global reduced value to memory.
4049 ///
4050 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
4051 /// reduceData, shuffleReduceFn, interWarpCpyFn,
4052 /// scratchpadCopyFn, loadAndReduceFn)
4053 ///
4054 /// 'scratchpadCopyFn' is a helper that stores reduced
4055 /// data from the team master to a scratchpad array in
4056 /// global memory.
4057 ///
4058 /// 'loadAndReduceFn' is a helper that loads data from
4059 /// the scratchpad array and reduces it with the input
4060 /// operand.
4061 ///
4062 /// These compiler generated functions hide address
4063 /// calculation and alignment information from the runtime.
4064 /// 5. if ret == 1:
4065 /// The team master of the last team stores the reduced
4066 /// result to the globals in memory.
4067 /// foo += reduceData.foo; bar *= reduceData.bar
4068 ///
4069 ///
4070 /// Warp Reduction Algorithms
4071 ///
4072 /// On the warp level, we have three algorithms implemented in the
4073 /// OpenMP runtime depending on the number of active lanes:
4074 ///
4075 /// Full Warp Reduction
4076 ///
4077 /// The reduce algorithm within a warp where all lanes are active
4078 /// is implemented in the runtime as follows:
4079 ///
4080 /// full_warp_reduce(void *reduce_data,
4081 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4082 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4083 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
4084 /// }
4085 ///
4086 /// The algorithm completes in log(2, WARPSIZE) steps.
4087 ///
4088 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4089 /// not used therefore we save instructions by not retrieving lane_id
4090 /// from the corresponding special registers. The 4th parameter, which
4091 /// represents the version of the algorithm being used, is set to 0 to
4092 /// signify full warp reduction.
4093 ///
4094 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4095 ///
4096 /// #reduce_elem refers to an element in the local lane's data structure
4097 /// #remote_elem is retrieved from a remote lane
4098 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4099 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
4100 ///
4101 /// Contiguous Partial Warp Reduction
4102 ///
4103 /// This reduce algorithm is used within a warp where only the first
4104 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
4105 /// number of OpenMP threads in a parallel region is not a multiple of
4106 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
4107 ///
4108 /// void
4109 /// contiguous_partial_reduce(void *reduce_data,
4110 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
4111 /// int size, int lane_id) {
4112 /// int curr_size;
4113 /// int offset;
4114 /// curr_size = size;
4115 /// mask = curr_size/2;
4116 /// while (offset>0) {
4117 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4118 /// curr_size = (curr_size+1)/2;
4119 /// offset = curr_size/2;
4120 /// }
4121 /// }
4122 ///
4123 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4124 ///
4125 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4126 /// if (lane_id < offset)
4127 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4128 /// else
4129 /// reduce_elem = remote_elem
4130 ///
4131 /// This algorithm assumes that the data to be reduced are located in a
4132 /// contiguous subset of lanes starting from the first. When there is
4133 /// an odd number of active lanes, the data in the last lane is not
4134 /// aggregated with any other lane's data but is instead copied over.
4135 ///
4136 /// Dispersed Partial Warp Reduction
4137 ///
4138 /// This algorithm is used within a warp when any discontiguous subset of
4139 /// lanes are active. It is used to implement the reduction operation
4140 /// across lanes in an OpenMP simd region or in a nested parallel region.
4141 ///
4142 /// void
4143 /// dispersed_partial_reduce(void *reduce_data,
4144 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4145 /// int size, remote_id;
4146 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
4147 /// do {
4148 /// remote_id = next_active_lane_id_right_after_me();
4149 /// # the above function returns 0 if no active lane
4150 /// # is present right after the current lane.
4151 /// size = number_of_active_lanes_in_this_warp();
4152 /// logical_lane_id /= 2;
4153 /// ShuffleReduceFn(reduce_data, logical_lane_id,
4154 /// remote_id-1-threadIdx.x, 2);
4155 /// } while (logical_lane_id % 2 == 0 && size > 1);
4156 /// }
4157 ///
4158 /// There is no assumption made about the initial state of the reduction.
4159 /// Any number of lanes (>=1) could be active at any position. The reduction
4160 /// result is returned in the first active lane.
4161 ///
4162 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4163 ///
4164 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4165 /// if (lane_id % 2 == 0 && offset > 0)
4166 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4167 /// else
4168 /// reduce_elem = remote_elem
4169 ///
4170 ///
4171 /// Intra-Team Reduction
4172 ///
4173 /// This function, as implemented in the runtime call
4174 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4175 /// threads in a team. It first reduces within a warp using the
4176 /// aforementioned algorithms. We then proceed to gather all such
4177 /// reduced values at the first warp.
4178 ///
4179 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4180 /// data from each of the "warp master" (zeroth lane of each warp, where
4181 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
4182 /// a mathematical sense) the problem of reduction across warp masters in
4183 /// a block to the problem of warp reduction.
4184 ///
4185 ///
4186 /// Inter-Team Reduction
4187 ///
4188 /// Once a team has reduced its data to a single value, it is stored in
4189 /// a global scratchpad array. Since each team has a distinct slot, this
4190 /// can be done without locking.
4191 ///
4192 /// The last team to write to the scratchpad array proceeds to reduce the
4193 /// scratchpad array. One or more workers in the last team use the helper
4194 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4195 /// the k'th worker reduces every k'th element.
4196 ///
4197 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4198 /// reduce across workers and compute a globally reduced value.
4199 ///
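To make the full-warp algorithm described above concrete, a small CPU simulation of the shuffle-based tree sum; shuffle_down is modelled as a wrapping array lookup, so this is an illustration of the data flow, not the runtime's code:

#include <array>

constexpr int WarpSize = 32;

void fullWarpReduceModel(std::array<int, WarpSize> &ReduceElem) {
  for (int Offset = WarpSize / 2; Offset > 0; Offset /= 2) {
    std::array<int, WarpSize> Remote = ReduceElem; // snapshot = shuffle source
    for (int Lane = 0; Lane < WarpSize; ++Lane) {
      int RemoteElem = Remote[(Lane + Offset) % WarpSize]; // shuffle_down
      ReduceElem[Lane] += RemoteElem;                      // REDUCE_OP is '+'
    }
  }
  // After log2(32) = 5 rounds lane 0 holds the sum of all 32 inputs.
}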
4200 void CGOpenMPRuntimeNVPTX::emitReduction(
4201  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4202  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4203  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4204  if (!CGF.HaveInsertPoint())
4205  return;
4206 
4207  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4208 #ifndef NDEBUG
4209  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4210 #endif
4211 
4212  if (Options.SimpleReduction) {
4213  assert(!TeamsReduction && !ParallelReduction &&
4214  "Invalid reduction selection in emitReduction.");
4215  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4216  ReductionOps, Options);
4217  return;
4218  }
4219 
4220  assert((TeamsReduction || ParallelReduction) &&
4221  "Invalid reduction selection in emitReduction.");
4222 
4223  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4224  // RedList, shuffle_reduce_func, interwarp_copy_func);
4225  // or
4226  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4227  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4228  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4229 
4230  llvm::Value *Res;
4231  ASTContext &C = CGM.getContext();
4232  // 1. Build a list of reduction variables.
4233  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4234  auto Size = RHSExprs.size();
4235  for (const Expr *E : Privates) {
4236  if (E->getType()->isVariablyModifiedType())
4237  // Reserve place for array size.
4238  ++Size;
4239  }
4240  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4241  QualType ReductionArrayTy =
4242  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
4243  /*IndexTypeQuals=*/0);
4244  Address ReductionList =
4245  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4246  auto IPriv = Privates.begin();
4247  unsigned Idx = 0;
4248  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4249  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4250  CGF.Builder.CreateStore(
4251  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4252  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4253  Elem);
4254  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4255  // Store array size.
4256  ++Idx;
4257  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4258  llvm::Value *Size = CGF.Builder.CreateIntCast(
4259  CGF.getVLASize(
4260  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4261  .NumElts,
4262  CGF.SizeTy, /*isSigned=*/false);
4263  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4264  Elem);
4265  }
4266  }
4267 
4268  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4269  ReductionList.getPointer(), CGF.VoidPtrTy);
4270  llvm::Function *ReductionFn = emitReductionFunction(
4271  Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4272  LHSExprs, RHSExprs, ReductionOps);
4273  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4274  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4275  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4276  llvm::Value *InterWarpCopyFn =
4277  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4278 
4279  if (ParallelReduction) {
4280  llvm::Value *Args[] = {RTLoc,
4281  ThreadId,
4282  CGF.Builder.getInt32(RHSExprs.size()),
4283  ReductionArrayTySize,
4284  RL,
4285  ShuffleAndReduceFn,
4286  InterWarpCopyFn};
4287 
4288  Res = CGF.EmitRuntimeCall(
4289  createNVPTXRuntimeFunction(
4290  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4291  Args);
4292  } else {
4293  assert(TeamsReduction && "expected teams reduction.");
4294  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4295  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4296  int Cnt = 0;
4297  for (const Expr *DRE : Privates) {
4298  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4299  ++Cnt;
4300  }
4301  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4302  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4303  C.getLangOpts().OpenMPCUDAReductionBufNum);
4304  TeamsReductions.push_back(TeamReductionRec);
4305  if (!KernelTeamsReductionPtr) {
4306  KernelTeamsReductionPtr = new llvm::GlobalVariable(
4307  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4308  llvm::GlobalValue::InternalLinkage, llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
4309  "_openmp_teams_reductions_buffer_$_$ptr");
4310  }
4311  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4312  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4313  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4314  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4315  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4316  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4317  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4318  ReductionFn);
4319  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4320  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4321  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4322  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4323  ReductionFn);
4324 
4325  llvm::Value *Args[] = {
4326  RTLoc,
4327  ThreadId,
4328  GlobalBufferPtr,
4329  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4330  RL,
4331  ShuffleAndReduceFn,
4332  InterWarpCopyFn,
4333  GlobalToBufferCpyFn,
4334  GlobalToBufferRedFn,
4335  BufferToGlobalCpyFn,
4336  BufferToGlobalRedFn};
4337 
4338  Res = CGF.EmitRuntimeCall(
4339  createNVPTXRuntimeFunction(
4340  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4341  Args);
4342  }
4343 
4344  // 5. Build if (res == 1)
4345  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4346  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4347  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4348  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4349  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4350 
4351  // 6. Build then branch: where we have reduced values in the master
4352  // thread in each team.
4353  // __kmpc_end_reduce{_nowait}(<gtid>);
4354  // break;
4355  CGF.EmitBlock(ThenBB);
4356 
4357  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4358  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4359  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4360  auto IPriv = Privates.begin();
4361  auto ILHS = LHSExprs.begin();
4362  auto IRHS = RHSExprs.begin();
4363  for (const Expr *E : ReductionOps) {
4364  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4365  cast<DeclRefExpr>(*IRHS));
4366  ++IPriv;
4367  ++ILHS;
4368  ++IRHS;
4369  }
4370  };
4371  llvm::Value *EndArgs[] = {ThreadId};
4372  RegionCodeGenTy RCG(CodeGen);
4373  NVPTXActionTy Action(
4374  nullptr, llvm::None,
4375  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4376  EndArgs);
4377  RCG.setAction(Action);
4378  RCG(CGF);
4379  // There is no need to emit line number for unconditional branch.
4380  (void)ApplyDebugLocation::CreateEmpty(CGF);
4381  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4382 }
4383 
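// Illustration (hypothetical user code, not part of this file): the kind of
// source that is lowered through emitReduction() above when offloading to an
// NVPTX device. The first function exercises the parallel path
// (__kmpc_nvptx_parallel_reduce_nowait_v2); the second additionally uses teams
// and so also exercises the teams path with the global buffer and the
// list<->global copy/reduce helpers emitted above.
double sum_parallel(const double *A, int N) {
  double Sum = 0.0;
#pragma omp target parallel for reduction(+ : Sum) map(to : A[0:N])
  for (int I = 0; I < N; ++I)
    Sum += A[I];
  return Sum;
}

double sum_teams(const double *A, int N) {
  double Sum = 0.0;
#pragma omp target teams distribute parallel for reduction(+ : Sum) \
    map(to : A[0:N])
  for (int I = 0; I < N; ++I)
    Sum += A[I];
  return Sum;
}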
4384 const VarDecl *
4385 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4386  const VarDecl *NativeParam) const {
4387  if (!NativeParam->getType()->isReferenceType())
4388  return NativeParam;
4389  QualType ArgType = NativeParam->getType();
4390  QualifierCollector QC;
4391  const Type *NonQualTy = QC.strip(ArgType);
4392  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4393  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4394  if (Attr->getCaptureKind() == OMPC_map) {
4395  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4396  LangAS::opencl_global);
4397  } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4398  PointeeTy.isConstant(CGM.getContext())) {
4399  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4400  LangAS::opencl_constant);
4401  }
4402  }
4403  ArgType = CGM.getContext().getPointerType(PointeeTy);
4404  QC.addRestrict();
4405  enum { NVPTX_local_addr = 5 };
4406  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4407  ArgType = QC.apply(CGM.getContext(), ArgType);
4408  if (isa<ImplicitParamDecl>(NativeParam))
4409  return ImplicitParamDecl::Create(
4410  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4411  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4412  return ParmVarDecl::Create(
4413  CGM.getContext(),
4414  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4415  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4416  NativeParam->getIdentifier(), ArgType,
4417  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4418 }
4419 
4420 Address
4421 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4422  const VarDecl *NativeParam,
4423  const VarDecl *TargetParam) const {
4424  assert(NativeParam != TargetParam &&
4425  NativeParam->getType()->isReferenceType() &&
4426  "Native arg must not be the same as target arg.");
4427  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4428  QualType NativeParamType = NativeParam->getType();
4429  QualifierCollector QC;
4430  const Type *NonQualTy = QC.strip(NativeParamType);
4431  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4432  unsigned NativePointeeAddrSpace =
4433  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4434  QualType TargetTy = TargetParam->getType();
4435  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4436  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4437  // First cast to generic.
4438  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4439  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4440  /*AddrSpace=*/0));
4441  // Cast from generic to native address space.
4442  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4443  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4444  NativePointeeAddrSpace));
4445  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4446  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4447  NativeParamType);
4448  return NativeParamAddr;
4449 }
4450 
4451 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4452  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4453  ArrayRef<llvm::Value *> Args) const {
4454  SmallVector<llvm::Value *, 4> TargetArgs;
4455  TargetArgs.reserve(Args.size());
4456  auto *FnType = OutlinedFn.getFunctionType();
4457  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4458  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4459  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4460  break;
4461  }
4462  llvm::Type *TargetType = FnType->getParamType(I);
4463  llvm::Value *NativeArg = Args[I];
4464  if (!TargetType->isPointerTy()) {
4465  TargetArgs.emplace_back(NativeArg);
4466  continue;
4467  }
4468  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4469  NativeArg,
4470  NativeArg->getType()->getPointerElementType()->getPointerTo());
4471  TargetArgs.emplace_back(
4472  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4473  }
4474  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4475 }
4476 
4477 /// Emit function which wraps the outlined parallel region
4478 /// and controls the arguments which are passed to this function.
4479 /// The wrapper ensures that the outlined function is called
4480 /// with the correct arguments when data is shared.
4481 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4482  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4483  ASTContext &Ctx = CGM.getContext();
4484  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4485 
4486  // Create a function that takes as argument the source thread.
4487  FunctionArgList WrapperArgs;
4488  QualType Int16QTy =
4489  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4490  QualType Int32QTy =
4491  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4492  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4493  /*Id=*/nullptr, Int16QTy,
4494  ImplicitParamDecl::Other);
4495  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4496  /*Id=*/nullptr, Int32QTy,
4497  ImplicitParamDecl::Other);
4498  WrapperArgs.emplace_back(&ParallelLevelArg);
4499  WrapperArgs.emplace_back(&WrapperArg);
4500 
4501  const CGFunctionInfo &CGFI =
4502  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4503 
4504  auto *Fn = llvm::Function::Create(
4505  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4506  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4507  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4508  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4509  Fn->setDoesNotRecurse();
4510 
4511  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4512  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4513  D.getBeginLoc(), D.getBeginLoc());
4514 
4515  const auto *RD = CS.getCapturedRecordDecl();
4516  auto CurField = RD->field_begin();
4517 
4518  Address ZeroAddr = CGF.CreateMemTemp(
4519  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4520  /*Name*/ ".zero.addr");
4521  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4522  // Get the array of arguments.
4523  SmallVector<llvm::Value *, 8> Args;
4524 
4525  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4526  Args.emplace_back(ZeroAddr.getPointer());
4527 
4528  CGBuilderTy &Bld = CGF.Builder;
4529  auto CI = CS.capture_begin();
4530 
4531  // Use global memory for data sharing.
4532  // Handle passing of global args to workers.
4533  Address GlobalArgs =
4534  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4535  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4536  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4537  CGF.EmitRuntimeCall(
4538  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4539  DataSharingArgs);
4540 
4541  // Retrieve the shared variables from the list of references returned
4542  // by the runtime. Pass the variables to the outlined function.
4543  Address SharedArgListAddress = Address::invalid();
4544  if (CS.capture_size() > 0 ||
4545  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4546  SharedArgListAddress = CGF.EmitLoadOfPointer(
4547  GlobalArgs, CGF.getContext()
4548  .getPointerType(CGF.getContext().getPointerType(
4549  CGF.getContext().VoidPtrTy))
4550  .castAs<PointerType>());
4551  }
4552  unsigned Idx = 0;
4553  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4554  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4555  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4556  Src, CGF.SizeTy->getPointerTo());
4557  llvm::Value *LB = CGF.EmitLoadOfScalar(
4558  TypedAddress,
4559  /*Volatile=*/false,
4560  CGF.getContext().getSizeType(),
4561  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4562  Args.emplace_back(LB);
4563  ++Idx;
4564  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4565  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4566  Src, CGF.SizeTy->getPointerTo());
4567  llvm::Value *UB = CGF.EmitLoadOfScalar(
4568  TypedAddress,
4569  /*Volatile=*/false,
4570  CGF.getContext().getSizeType(),
4571  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4572  Args.emplace_back(UB);
4573  ++Idx;
4574  }
4575  if (CS.capture_size() > 0) {
4576  ASTContext &CGFContext = CGF.getContext();
4577  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4578  QualType ElemTy = CurField->getType();
4579  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4580  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4581  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4582  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4583  /*Volatile=*/false,
4584  CGFContext.getPointerType(ElemTy),
4585  CI->getLocation());
4586  if (CI->capturesVariableByCopy() &&
4587  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4588  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4589  CI->getLocation());
4590  }
4591  Args.emplace_back(Arg);
4592  }
4593  }
4594 
4595  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4596  CGF.FinishFunction();
4597  return Fn;
4598 }
4599 
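// Illustration (a rough C++ transcription, not the actual generated IR): the
// shape of the wrapper produced by createParallelDataSharingWrapper() above,
// written out for an outlined region with two shared captures. The names
// outlined_parallel_fn and the capture types are invented for the example;
// __kmpc_get_shared_variables is the runtime entry point called above.
extern "C" void __kmpc_get_shared_variables(void ***GlobalArgs);

static void outlined_parallel_fn(int *GlobalTid, int *BoundTid, int *X,
                                 double *Y) {
  // Body of the outlined '#pragma omp parallel' region would live here.
  (void)GlobalTid; (void)BoundTid; (void)X; (void)Y;
}

static void outlined_parallel_fn_wrapper(unsigned short /*ParallelLevel*/,
                                         unsigned WrapperTid) {
  int Zero = 0;                              // the ".zero.addr" temporary
  int GlobalTid = static_cast<int>(WrapperTid);
  void **SharedArgs = nullptr;
  __kmpc_get_shared_variables(&SharedArgs);  // team-shared argument list
  // Forward each shared capture, cast to the parameter type expected by the
  // outlined function, mirroring emitOutlinedFunctionCall() above.
  outlined_parallel_fn(&GlobalTid, &Zero, static_cast<int *>(SharedArgs[0]),
                       static_cast<double *>(SharedArgs[1]));
}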
4600 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4601  const Decl *D) {
4602  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4603  return;
4604 
4605  assert(D && "Expected function or captured|block decl.");
4606  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4607  "Function is registered already.");
4608  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4609  "Team is set but not processed.");
4610  const Stmt *Body = nullptr;
4611  bool NeedToDelayGlobalization = false;
4612  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4613  Body = FD->getBody();
4614  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4615  Body = BD->getBody();
4616  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4617  Body = CD->getBody();
4618  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4619  if (NeedToDelayGlobalization &&
4620  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4621  return;
4622  }
4623  if (!Body)
4624  return;
4625  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4626  VarChecker.Visit(Body);
4627  const RecordDecl *GlobalizedVarsRecord =
4628  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4629  TeamAndReductions.first = nullptr;
4630  TeamAndReductions.second.clear();
4631  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4632  VarChecker.getEscapedVariableLengthDecls();
4633  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4634  return;
4635  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4636  I->getSecond().MappedParams =
4637  llvm::make_unique<CodeGenFunction::OMPMapVars>();
4638  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4639  I->getSecond().EscapedParameters.insert(
4640  VarChecker.getEscapedParameters().begin(),
4641  VarChecker.getEscapedParameters().end());
4642  I->getSecond().EscapedVariableLengthDecls.append(
4643  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4644  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4645  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4646  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4647  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4648  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4649  }
4650  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4651  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4652  VarChecker.Visit(Body);
4653  I->getSecond().SecondaryGlobalRecord =
4654  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4655  I->getSecond().SecondaryLocalVarData.emplace();
4656  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4657  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4658  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4659  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4660  Data.insert(
4661  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4662  }
4663  }
4664  if (!NeedToDelayGlobalization) {
4665  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4666  struct GlobalizationScope final : EHScopeStack::Cleanup {
4667  GlobalizationScope() = default;
4668 
4669  void Emit(CodeGenFunction &CGF, Flags flags) override {
4670  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4671  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4672  }
4673  };
4674  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4675  }
4676 }
4677 
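// Illustration (hypothetical user code): a local variable that escapes into a
// nested parallel region, which is what emitFunctionProlog() above globalizes
// when the target region runs in generic (non-SPMD) mode. 'Scratch' is
// declared by the sequential team-master code but updated through a pointer by
// the workers, so it cannot live in thread-private local storage and is moved
// into the globalized record instead.
int escapes_into_parallel(int N) {
  int Result = 0;
#pragma omp target teams num_teams(1) map(tofrom : Result)
  {
    int Scratch = 0;   // escapes into the parallel region below -> globalized
    int *P = &Scratch;
#pragma omp parallel for
    for (int I = 0; I < N; ++I) {
#pragma omp atomic
      *P += 1;
    }
    Result = Scratch;
  }
  return Result;
}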
4678 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4679  const VarDecl *VD) {
4680  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4681  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4682  switch (A->getAllocatorType()) {
4683  // Use the default allocator here as by default local vars are
4684  // threadlocal.
4685  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4686  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4687  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4688  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4689  // Follow the user decision - use default allocation.
4690  return Address::invalid();
4691  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4692  // TODO: implement support for user-defined allocators.
4693  return Address::invalid();
4694  case OMPAllocateDeclAttr::OMPConstMemAlloc: {
4695  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4696  auto *GV = new llvm::GlobalVariable(
4697  CGM.getModule(), VarTy, /*isConstant=*/false,
4698  llvm::GlobalValue::InternalLinkage,
4699  llvm::Constant::getNullValue(VarTy), VD->getName(),
4700  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4701  CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
4702  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4703  GV->setAlignment(Align.getQuantity());
4704  return Address(GV, Align);
4705  }
4706  case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
4707  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4708  auto *GV = new llvm::GlobalVariable(
4709  CGM.getModule(), VarTy, /*isConstant=*/false,
4710  llvm::GlobalValue::InternalLinkage,
4711  llvm::Constant::getNullValue(VarTy), VD->getName(),
4712  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4713  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
4714  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4715  GV->setAlignment(Align.getQuantity());
4716  return Address(GV, Align);
4717  }
4718  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4719  case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
4720  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4721  auto *GV = new llvm::GlobalVariable(
4722  CGM.getModule(), VarTy, /*isConstant=*/false,
4723  llvm::GlobalValue::InternalLinkage,
4724  llvm::Constant::getNullValue(VarTy), VD->getName());
4725  CharUnits Align = CGM.getContext().getDeclAlign(VD);
4726  GV->setAlignment(Align.getQuantity());
4727  return Address(GV, Align);
4728  }
4729  }
4730  }
4731 
4732  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4733  return Address::invalid();
4734 
4735  VD = VD->getCanonicalDecl();
4736  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4737  if (I == FunctionGlobalizedDecls.end())
4738  return Address::invalid();
4739  auto VDI = I->getSecond().LocalVarData.find(VD);
4740  if (VDI != I->getSecond().LocalVarData.end())
4741  return VDI->second.PrivateAddr;
4742  if (VD->hasAttrs()) {
4743  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4744  E(VD->attr_end());
4745  IT != E; ++IT) {
4746  auto VDI = I->getSecond().LocalVarData.find(
4747  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4748  ->getCanonicalDecl());
4749  if (VDI != I->getSecond().LocalVarData.end())
4750  return VDI->second.PrivateAddr;
4751  }
4752  }
4753 
4754  return Address::invalid();
4755 }
4756 
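// Illustration (hypothetical user code): predefined OpenMP allocators that the
// switch in getAddressOfLocalVariable() above maps onto NVPTX memory spaces.
// A minimal sketch assuming <omp.h>: omp_pteam_mem_alloc is placed in
// __shared__ (cuda_shared) memory, omp_cgroup_mem_alloc in plain global
// memory, and the default/thread allocators keep ordinary local storage.
#include <omp.h>

void allocator_placement() {
#pragma omp target teams
  {
    int TeamBuf[32];
#pragma omp allocate(TeamBuf) allocator(omp_pteam_mem_alloc)
    int GroupBuf[32];
#pragma omp allocate(GroupBuf) allocator(omp_cgroup_mem_alloc)
#pragma omp parallel for
    for (int I = 0; I < 32; ++I)
      TeamBuf[I] = GroupBuf[I] = I;
  }
}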
4757 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4758  FunctionGlobalizedDecls.erase(CGF.CurFn);
4759  CGOpenMPRuntime::functionFinished(CGF);
4760 }
4761 
4762 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4763  CodeGenFunction &CGF, const OMPLoopDirective &S,
4764  OpenMPDistScheduleClauseKind &ScheduleKind,
4765  llvm::Value *&Chunk) const {
4766  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4767  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4768  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4769  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4770  S.getIterationVariable()->getType(), S.getBeginLoc());
4771  return;
4772  }
4773  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4774  CGF, S, ScheduleKind, Chunk);
4775 }
4776 
4777 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4778  CodeGenFunction &CGF, const OMPLoopDirective &S,
4779  OpenMPScheduleClauseKind &ScheduleKind,
4780  const Expr *&ChunkExpr) const {
4781  ScheduleKind = OMPC_SCHEDULE_static;
4782  // Chunk size is 1 in this case.
4783  llvm::APInt ChunkSize(32, 1);
4784  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4785  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4786  SourceLocation());
4787 }
4788 
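// Illustration (hypothetical user code): the defaults chosen by the two
// functions above. On this target an unannotated distribute/worksharing loop
// behaves roughly as if the explicit clauses below had been written, where
// ThreadsPerTeam stands for the runtime value of getNVPTXNumThreads() and is
// invented for the example (the dist_schedule default applies in SPMD mode).
void default_schedules(float *A, int N, int ThreadsPerTeam) {
#pragma omp target teams distribute parallel for map(tofrom : A[0:N])
  for (int I = 0; I < N; ++I)
    A[I] += 1.0f;

#pragma omp target teams distribute parallel for \
    dist_schedule(static, ThreadsPerTeam) schedule(static, 1) \
    map(tofrom : A[0:N])
  for (int I = 0; I < N; ++I)
    A[I] += 1.0f;
}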
4789 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4790  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4791  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4792  " Expected target-based directive.");
4793  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4794  for (const CapturedStmt::Capture &C : CS->captures()) {
4795  // Capture variables captured by reference in lambdas for target-based
4796  // directives.
4797  if (!C.capturesVariable())
4798  continue;
4799  const VarDecl *VD = C.getCapturedVar();
4800  const auto *RD = VD->getType()
4801  .getCanonicalType()
4802  .getNonReferenceType()
4803  ->getAsCXXRecordDecl();
4804  if (!RD || !RD->isLambda())
4805  continue;
4806  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4807  LValue VDLVal;
4808  if (VD->getType().getCanonicalType()->isReferenceType())
4809  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4810  else
4811  VDLVal = CGF.MakeAddrLValue(
4812  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4813  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4814  FieldDecl *ThisCapture = nullptr;
4815  RD->getCaptureFields(Captures, ThisCapture);
4816  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4817  LValue ThisLVal =
4818  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4819  llvm::Value *CXXThis = CGF.LoadCXXThis();
4820  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4821  }
4822  for (const LambdaCapture &LC : RD->captures()) {
4823  if (LC.getCaptureKind() != LCK_ByRef)
4824  continue;
4825  const VarDecl *VD = LC.getCapturedVar();
4826  if (!CS->capturesVariable(VD))
4827  continue;
4828  auto It = Captures.find(VD);
4829  assert(It != Captures.end() && "Found lambda capture without field.");
4830  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4831  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4832  if (VD->getType().getCanonicalType()->isReferenceType())
4833  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4834  VD->getType().getCanonicalType())
4835  .getAddress();
4836  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4837  }
4838  }
4839 }
4840 
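// Illustration (hypothetical user code): a lambda with a by-reference capture
// used inside a target region. adjustTargetSpecificDataForLambdas() above
// rewires the lambda's by-reference capture fields (and the captured 'this',
// if any) to the device copies of the captured variables, so the call operator
// dereferences device addresses rather than stale host pointers.
void lambda_capture_on_device(int N) {
  int Acc = 0;
  auto AddOne = [&Acc]() { Acc += 1; };
#pragma omp target map(tofrom : Acc)
  {
    for (int I = 0; I < N; ++I)
      AddOne();
  }
}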
4843 }
4844 
4845 bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4846  LangAS &AS) {
4847  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4848  return false;
4849  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4850  switch (A->getAllocatorType()) {
4851  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4852  // Not supported, fallback to the default mem space.
4853  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4854  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4855  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4856  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4857  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4858  AS = LangAS::Default;
4859  return true;
4860  case OMPAllocateDeclAttr::OMPConstMemAlloc:
4861  AS = LangAS::cuda_constant;
4862  return true;
4863  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4864  AS = LangAS::cuda_shared;
4865  return true;
4866  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4867  llvm_unreachable("Expected predefined allocator for the variables with the "
4868  "static storage.");
4869  }
4870  return false;
4871 }
4872 
4873 // Get current CudaArch and ignore any unknown values
4874 static CudaArch getCudaArch(CodeGenModule &CGM) {
4875  if (!CGM.getTarget().hasFeature("ptx"))
4876  return CudaArch::UNKNOWN;
4877  llvm::StringMap<bool> Features;
4878  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4879  CGM.getTarget().getTargetOpts().CPU,
4880  CGM.getTarget().getTargetOpts().Features);
4881  for (const auto &Feature : Features) {
4882  if (Feature.getValue()) {
4883  CudaArch Arch = StringToCudaArch(Feature.getKey());
4884  if (Arch != CudaArch::UNKNOWN)
4885  return Arch;
4886  }
4887  }
4888  return CudaArch::UNKNOWN;
4889 }
4890 
4891 /// Check to see if target architecture supports unified addressing which is
4892 /// a restriction for OpenMP requires clause "unified_shared_memory".
4893 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4894  const OMPRequiresDecl *D) {
4895  for (const OMPClause *Clause : D->clauselists()) {
4896  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4897  switch (getCudaArch(CGM)) {
4898  case CudaArch::SM_20:
4899  case CudaArch::SM_21:
4900  case CudaArch::SM_30:
4901  case CudaArch::SM_32:
4902  case CudaArch::SM_35:
4903  case CudaArch::SM_37:
4904  case CudaArch::SM_50:
4905  case CudaArch::SM_52:
4906  case CudaArch::SM_53:
4907  case CudaArch::SM_60:
4908  case CudaArch::SM_61:
4909  case CudaArch::SM_62:
4910  CGM.Error(Clause->getBeginLoc(),
4911  "Target architecture does not support unified addressing");
4912  return;
4913  case CudaArch::SM_70:
4914  case CudaArch::SM_72:
4915  case CudaArch::SM_75:
4916  case CudaArch::GFX600:
4917  case CudaArch::GFX601:
4918  case CudaArch::GFX700:
4919  case CudaArch::GFX701:
4920  case CudaArch::GFX702:
4921  case CudaArch::GFX703:
4922  case CudaArch::GFX704:
4923  case CudaArch::GFX801:
4924  case CudaArch::GFX802:
4925  case CudaArch::GFX803:
4926  case CudaArch::GFX810:
4927  case CudaArch::GFX900:
4928  case CudaArch::GFX902:
4929  case CudaArch::GFX904:
4930  case CudaArch::GFX906:
4931  case CudaArch::GFX908:
4932  case CudaArch::GFX909:
4933  case CudaArch::GFX1010:
4934  case CudaArch::GFX1011:
4935  case CudaArch::GFX1012:
4936  case CudaArch::UNKNOWN:
4937  break;
4938  case CudaArch::LAST:
4939  llvm_unreachable("Unexpected Cuda arch.");
4940  }
4941  }
4942  }
4943  CGOpenMPRuntime::checkArchForUnifiedAddressing(D);
4944 }
4945 
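// Illustration (hypothetical user code): the requires directive checked by
// checkArchForUnifiedAddressing() above. With it in the translation unit,
// offloading to anything older than sm_70 is diagnosed, since pre-Volta parts
// cannot provide unified shared memory; no map clauses are needed because host
// pointers are directly dereferenceable on the device under this requirement.
#pragma omp requires unified_shared_memory

void saxpy_usm(float *X, float *Y, float A, int N) {
#pragma omp target teams distribute parallel for
  for (int I = 0; I < N; ++I)
    Y[I] = A * X[I] + Y[I];
}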
4946 /// Get number of SMs and number of blocks per SM.
4947 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4948  std::pair<unsigned, unsigned> Data;
4949  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4950  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4951  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4952  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4953  if (Data.first && Data.second)
4954  return Data;
4955  switch (getCudaArch(CGM)) {
4956  case CudaArch::SM_20:
4957  case CudaArch::SM_21:
4958  case CudaArch::SM_30:
4959  case CudaArch::SM_32:
4960  case CudaArch::SM_35:
4961  case CudaArch::SM_37:
4962  case CudaArch::SM_50:
4963  case CudaArch::SM_52:
4964  case CudaArch::SM_53:
4965  return {16, 16};
4966  case CudaArch::SM_60:
4967  case CudaArch::SM_61:
4968  case CudaArch::SM_62:
4969  return {56, 32};
4970  case CudaArch::SM_70:
4971  case CudaArch::SM_72:
4972  case CudaArch::SM_75:
4973  return {84, 32};
4974  case CudaArch::GFX600:
4975  case CudaArch::GFX601:
4976  case CudaArch::GFX700:
4977  case CudaArch::GFX701:
4978  case CudaArch::GFX702:
4979  case CudaArch::GFX703:
4980  case CudaArch::GFX704:
4981  case CudaArch::GFX801:
4982  case CudaArch::GFX802:
4983  case CudaArch::GFX803:
4984  case CudaArch::GFX810:
4985  case CudaArch::GFX900:
4986  case CudaArch::GFX902:
4987  case CudaArch::GFX904:
4988  case CudaArch::GFX906:
4989  case CudaArch::GFX908:
4990  case CudaArch::GFX909:
4991  case CudaArch::GFX1010:
4992  case CudaArch::GFX1011:
4993  case CudaArch::GFX1012:
4994  case CudaArch::UNKNOWN:
4995  break;
4996  case CudaArch::LAST:
4997  llvm_unreachable("Unexpected Cuda arch.");
4998  }
4999  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5000 }
5001 
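// Worked example (illustration): for sm_70 the table above yields 84 SMs and
// 32 blocks per SM, so clear() below lays the statically allocated
// globalization buffer out as an [84][32] array of the per-team record union,
// i.e. 84 * 32 = 2688 team slots. Values supplied through
// LangOpts.OpenMPCUDANumSMs / LangOpts.OpenMPCUDABlocksPerSM take precedence.
constexpr unsigned NumSMs_sm70 = 84;
constexpr unsigned BlocksPerSM_sm70 = 32;
constexpr unsigned TeamSlots_sm70 = NumSMs_sm70 * BlocksPerSM_sm70; // 2688
static_assert(TeamSlots_sm70 == 2688, "84 SMs x 32 blocks per SM");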
5002 void CGOpenMPRuntimeNVPTX::clear() {
5003  if (!GlobalizedRecords.empty()) {
5004  ASTContext &C = CGM.getContext();
5005  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5006  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5007  RecordDecl *StaticRD = C.buildImplicitRecord(
5008  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5009  StaticRD->startDefinition();
5010  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5011  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5012  SharedStaticRD->startDefinition();
5013  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5014  if (Records.Records.empty())
5015  continue;
5016  unsigned Size = 0;
5017  unsigned RecAlignment = 0;
5018  for (const RecordDecl *RD : Records.Records) {
5019  QualType RDTy = C.getRecordType(RD);
5020  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5021  RecAlignment = std::max(RecAlignment, Alignment);
5022  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5023  Size =
5024  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5025  }
5026  Size = llvm::alignTo(Size, RecAlignment);
5027  llvm::APInt ArySize(/*numBits=*/64, Size);
5028  QualType SubTy = C.getConstantArrayType(
5029  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5030  const bool UseSharedMemory = Size <= SharedMemorySize;
5031  auto *Field =
5032  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5033  SourceLocation(), SourceLocation(), nullptr, SubTy,
5034  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5035  /*BW=*/nullptr, /*Mutable=*/false,
5036  /*InitStyle=*/ICIS_NoInit);
5037  Field->setAccess(AS_public);
5038  if (UseSharedMemory) {
5039  SharedStaticRD->addDecl(Field);
5040  SharedRecs.push_back(&Records);
5041  } else {
5042  StaticRD->addDecl(Field);
5043  GlobalRecs.push_back(&Records);
5044  }
5045  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5046  Records.UseSharedMemory->setInitializer(
5047  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5048  }
5049  // Allocate SharedMemorySize buffer for the shared memory.
5050  // FIXME: nvlink does not handle weak linkage correctly (objects with
5051  // different sizes are reported as erroneous).
5052  // Restore this code as soon as nvlink is fixed.
5053  if (!SharedStaticRD->field_empty()) {
5054  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5055  QualType SubTy = C.getConstantArrayType(
5056  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5057  auto *Field = FieldDecl::Create(
5058  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5059  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5060  /*BW=*/nullptr, /*Mutable=*/false,
5061  /*InitStyle=*/ICIS_NoInit);
5062  Field->setAccess(AS_public);
5063  SharedStaticRD->addDecl(Field);
5064  }
5065  SharedStaticRD->completeDefinition();
5066  if (!SharedStaticRD->field_empty()) {
5067  QualType StaticTy = C.getRecordType(SharedStaticRD);
5068  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5069  auto *GV = new llvm::GlobalVariable(
5070  CGM.getModule(), LLVMStaticTy,
5071  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5072  llvm::Constant::getNullValue(LLVMStaticTy),
5073  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5074  llvm::GlobalValue::NotThreadLocal,
5075  C.getTargetAddressSpace(LangAS::cuda_shared));
5076  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5077  GV, CGM.VoidPtrTy);
5078  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5079  Rec->Buffer->replaceAllUsesWith(Replacement);
5080  Rec->Buffer->eraseFromParent();
5081  }
5082  }
5083  StaticRD->completeDefinition();
5084  if (!StaticRD->field_empty()) {
5085  QualType StaticTy = C.getRecordType(StaticRD);
5086  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5087  llvm::APInt Size1(32, SMsBlockPerSM.second);
5088  QualType Arr1Ty =
5089  C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5090  /*IndexTypeQuals=*/0);
5091  llvm::APInt Size2(32, SMsBlockPerSM.first);
5092  QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5093  /*IndexTypeQuals=*/0);
5094  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5095  // FIXME: nvlink does not handle weak linkage correctly (objects with
5096  // different sizes are reported as erroneous).
5097  // Restore CommonLinkage as soon as nvlink is fixed.
5098  auto *GV = new llvm::GlobalVariable(
5099  CGM.getModule(), LLVMArr2Ty,
5100  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5101  llvm::Constant::getNullValue(LLVMArr2Ty),
5102  "_openmp_static_glob_rd_$_");
5103  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5104  GV, CGM.VoidPtrTy);
5105  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5106  Rec->Buffer->replaceAllUsesWith(Replacement);
5107  Rec->Buffer->eraseFromParent();
5108  }
5109  }
5110  }
5111  if (!TeamsReductions.empty()) {
5112  ASTContext &C = CGM.getContext();
5113  RecordDecl *StaticRD = C.buildImplicitRecord(
5114  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5115  StaticRD->startDefinition();
5116  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5117  QualType RecTy = C.getRecordType(TeamReductionRec);
5118  auto *Field = FieldDecl::Create(
5119  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5120  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5121  /*BW=*/nullptr, /*Mutable=*/false,
5122  /*InitStyle=*/ICIS_NoInit);
5123  Field->setAccess(AS_public);
5124  StaticRD->addDecl(Field);
5125  }
5126  StaticRD->completeDefinition();
5127  QualType StaticTy = C.getRecordType(StaticRD);
5128  llvm::Type *LLVMReductionsBufferTy =
5129  CGM.getTypes().ConvertTypeForMem(StaticTy);
5130  // FIXME: nvlink does not handle weak linkage correctly (objects with
5131  // different sizes are reported as erroneous).
5132  // Restore CommonLinkage as soon as nvlink is fixed.
5133  auto *GV = new llvm::GlobalVariable(
5134  CGM.getModule(), LLVMReductionsBufferTy,
5135  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5136  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5137  "_openmp_teams_reductions_buffer_$_");
5138  KernelTeamsReductionPtr->setInitializer(
5139  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5140  CGM.VoidPtrTy));
5141  }
5142  CGOpenMPRuntime::clear();
5143 }