clang  9.0.0svn
CGOpenMPRuntimeNVPTX.cpp
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for OpenMP runtime code generation specialized to NVPTX
10 // targets.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeNVPTX.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "clang/AST/StmtOpenMP.h"
18 #include "clang/AST/StmtVisitor.h"
19 #include "clang/Basic/Cuda.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
64  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
65  /// global_tid, void *global_buffer, int32_t num_of_records, void*
66  /// reduce_data,
67  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
68  /// lane_offset, int16_t shortCircuit),
69  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
70  /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
71  /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
72  /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
73  /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
74  /// *buffer, int idx, void *reduce_data));
75  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
76  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
77  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
78  /// Call to void __kmpc_data_sharing_init_stack();
79  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
80  /// Call to void __kmpc_data_sharing_init_stack_spmd();
81  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
82  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
83  /// int16_t UseSharedMemory);
84  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
85  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
86  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
87  /// Call to void __kmpc_begin_sharing_variables(void ***args,
88  /// size_t n_args);
89  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
90  /// Call to void __kmpc_end_sharing_variables();
91  OMPRTL_NVPTX__kmpc_end_sharing_variables,
92  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
93  OMPRTL_NVPTX__kmpc_get_shared_variables,
94  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
95  /// global_tid);
96  OMPRTL_NVPTX__kmpc_parallel_level,
97  /// Call to int8_t __kmpc_is_spmd_exec_mode();
98  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
99  /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
100  /// const void *buf, size_t size, int16_t is_shared, const void **res);
101  OMPRTL_NVPTX__kmpc_get_team_static_memory,
102  /// Call to void __kmpc_restore_team_static_memory(int16_t
103  /// isSPMDExecutionMode, int16_t is_shared);
104  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
105  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
106  OMPRTL__kmpc_barrier,
107  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
108  /// global_tid);
109  OMPRTL__kmpc_barrier_simple_spmd,
110 };
111 
112 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
113 class NVPTXActionTy final : public PrePostActionTy {
114  llvm::FunctionCallee EnterCallee = nullptr;
115  ArrayRef<llvm::Value *> EnterArgs;
116  llvm::FunctionCallee ExitCallee = nullptr;
117  ArrayRef<llvm::Value *> ExitArgs;
118  bool Conditional = false;
119  llvm::BasicBlock *ContBlock = nullptr;
120 
121 public:
122  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
123  ArrayRef<llvm::Value *> EnterArgs,
124  llvm::FunctionCallee ExitCallee,
125  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
126  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
127  ExitArgs(ExitArgs), Conditional(Conditional) {}
128  void Enter(CodeGenFunction &CGF) override {
129  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
130  if (Conditional) {
131  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
132  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
133  ContBlock = CGF.createBasicBlock("omp_if.end");
134  // Generate the branch (If-stmt)
135  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
136  CGF.EmitBlock(ThenBlock);
137  }
138  }
139  void Done(CodeGenFunction &CGF) {
140  // Emit the rest of blocks/branches
141  CGF.EmitBranch(ContBlock);
142  CGF.EmitBlock(ContBlock, true);
143  }
144  void Exit(CodeGenFunction &CGF) override {
145  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
146  }
147 };
148 
149 /// A class to track the execution mode when codegening directives within
150 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
151 /// to the target region and used by containing directives such as 'parallel'
152 /// to emit optimized code.
153 class ExecutionRuntimeModesRAII {
154 private:
155  CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
156      CGOpenMPRuntimeNVPTX::EM_Unknown;
157  CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
158  bool SavedRuntimeMode = false;
159  bool *RuntimeMode = nullptr;
160 
161 public:
162  /// Constructor for Non-SPMD mode.
163  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
164  : ExecMode(ExecMode) {
165  SavedExecMode = ExecMode;
166  ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
167  }
168  /// Constructor for SPMD mode.
169  ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
170  bool &RuntimeMode, bool FullRuntimeMode)
171  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
172  SavedExecMode = ExecMode;
173  SavedRuntimeMode = RuntimeMode;
174  ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
175  RuntimeMode = FullRuntimeMode;
176  }
177  ~ExecutionRuntimeModesRAII() {
178  ExecMode = SavedExecMode;
179  if (RuntimeMode)
180  *RuntimeMode = SavedRuntimeMode;
181  }
182 };
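// Illustrative usage sketch (not part of the original source; the function
// name below is a placeholder): the RAII helper above is how the kernel
// emitters later in this file flip the execution mode for the duration of a
// target region:
//
//   void emitSomeKernel(...) {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); // Non-SPMD ctor
//     // ... emit the region; CurrentExecutionMode is EM_NonSPMD here ...
//   } // destructor restores the previous mode (and RuntimeMode, if tracked)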
183 
184 /// GPU Configuration: This information can be derived from cuda registers,
185 /// however, providing compile time constants helps generate more efficient
186 /// code. For all practical purposes this is fine because the configuration
187 /// is the same for all known NVPTX architectures.
188 enum MachineConfiguration : unsigned {
189  WarpSize = 32,
190  /// Number of bits required to represent a lane identifier, which is
191  /// computed as log_2(WarpSize).
192  LaneIDBits = 5,
193  LaneIDMask = WarpSize - 1,
194 
195  /// Global memory alignment for performance.
196  GlobalMemoryAlignment = 128,
197 
198  /// Maximal size of the shared memory buffer.
199  SharedMemorySize = 128,
200 };
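// Illustrative note (not in the original source): with WarpSize = 32 these
// constants decompose a linear thread id into a warp id and a lane id, which
// is exactly what getNVPTXWarpID/getNVPTXLaneID emit further below:
//
//   warp_id = tid >> LaneIDBits;   // e.g. tid 70 -> warp 2
//   lane_id = tid & LaneIDMask;    // e.g. tid 70 -> lane 6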
201 
202 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
203  RefExpr = RefExpr->IgnoreParens();
204  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
205  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
206  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
207  Base = TempASE->getBase()->IgnoreParenImpCasts();
208  RefExpr = Base;
209  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
210  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
211  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
212  Base = TempOASE->getBase()->IgnoreParenImpCasts();
213  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
214  Base = TempASE->getBase()->IgnoreParenImpCasts();
215  RefExpr = Base;
216  }
217  RefExpr = RefExpr->IgnoreParenImpCasts();
218  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
219  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
220  const auto *ME = cast<MemberExpr>(RefExpr);
221  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
222 }
223 
224 typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
225 static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
226  return P1.first > P2.first;
227 }
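// Illustrative note (not in the original source): sorting by descending
// alignment lays out the most strictly aligned globalized variables first and
// minimizes padding in the record built below; e.g. declarations with
// alignments {4, 128, 16} are emitted in the order 128, 16, 4.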
228 
229 static RecordDecl *buildRecordForGlobalizedVars(
230  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
231  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
232  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
233  &MappedDeclsFields, int BufSize) {
234  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
235  return nullptr;
236  SmallVector<VarsDataTy, 4> GlobalizedVars;
237  for (const ValueDecl *D : EscapedDecls)
238  GlobalizedVars.emplace_back(
239  CharUnits::fromQuantity(std::max(
240  C.getDeclAlign(D).getQuantity(),
241  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
242  D);
243  for (const ValueDecl *D : EscapedDeclsForTeams)
244  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
245  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
246  stable_sort_comparator);
247  // Build struct _globalized_locals_ty {
248  // /* globalized vars */[WarpSize] align (max(decl_align,
249  // GlobalMemoryAlignment))
250  // /* globalized vars */ for EscapedDeclsForTeams
251  // };
252  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
253  GlobalizedRD->startDefinition();
254  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
255  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
256  for (const auto &Pair : GlobalizedVars) {
257  const ValueDecl *VD = Pair.second;
258  QualType Type = VD->getType();
259  if (Type->isLValueReferenceType())
260  Type = C.getPointerType(Type.getNonReferenceType());
261  else
262  Type = Type.getNonReferenceType();
263  SourceLocation Loc = VD->getLocation();
264  FieldDecl *Field;
265  if (SingleEscaped.count(VD)) {
266  Field = FieldDecl::Create(
267  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
268  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
269  /*BW=*/nullptr, /*Mutable=*/false,
270  /*InitStyle=*/ICIS_NoInit);
271  Field->setAccess(AS_public);
272  if (VD->hasAttrs()) {
273  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
274  E(VD->getAttrs().end());
275  I != E; ++I)
276  Field->addAttr(*I);
277  }
278  } else {
279  llvm::APInt ArraySize(32, BufSize);
280  Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
281  Field = FieldDecl::Create(
282  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
283  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
284  /*BW=*/nullptr, /*Mutable=*/false,
285  /*InitStyle=*/ICIS_NoInit);
286  Field->setAccess(AS_public);
287  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
288  static_cast<CharUnits::QuantityType>(
289  GlobalMemoryAlignment)));
290  Field->addAttr(AlignedAttr::CreateImplicit(
291  C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
292  IntegerLiteral::Create(C, Align,
293  C.getIntTypeForBitwidth(32, /*Signed=*/0),
294  SourceLocation())));
295  }
296  GlobalizedRD->addDecl(Field);
297  MappedDeclsFields.try_emplace(VD, Field);
298  }
299  GlobalizedRD->completeDefinition();
300  return GlobalizedRD;
301 }
302 
303 /// Get the list of variables that can escape their declaration context.
304 class CheckVarsEscapingDeclContext final
305  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
306  CodeGenFunction &CGF;
307  llvm::SetVector<const ValueDecl *> EscapedDecls;
308  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
309  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
310  RecordDecl *GlobalizedRD = nullptr;
311  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
312  bool AllEscaped = false;
313  bool IsForCombinedParallelRegion = false;
314 
315  void markAsEscaped(const ValueDecl *VD) {
316  // Do not globalize declare target variables.
317  if (!isa<VarDecl>(VD) ||
318  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
319  return;
320  VD = cast<ValueDecl>(VD->getCanonicalDecl());
321  // Variables captured by value must be globalized.
322  if (auto *CSI = CGF.CapturedStmtInfo) {
323  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
324  // Check if need to capture the variable that was already captured by
325  // value in the outer region.
326  if (!IsForCombinedParallelRegion) {
327  if (!FD->hasAttrs())
328  return;
329  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
330  if (!Attr)
331  return;
332  if (((Attr->getCaptureKind() != OMPC_map) &&
333  !isOpenMPPrivate(
334  static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
335  ((Attr->getCaptureKind() == OMPC_map) &&
336  !FD->getType()->isAnyPointerType()))
337  return;
338  }
339  if (!FD->getType()->isReferenceType()) {
340  assert(!VD->getType()->isVariablyModifiedType() &&
341  "Parameter captured by value with variably modified type");
342  EscapedParameters.insert(VD);
343  } else if (!IsForCombinedParallelRegion) {
344  return;
345  }
346  }
347  }
348  if ((!CGF.CapturedStmtInfo ||
349  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
350  VD->getType()->isReferenceType())
351  // Do not globalize variables with reference type.
352  return;
353  if (VD->getType()->isVariablyModifiedType())
354  EscapedVariableLengthDecls.insert(VD);
355  else
356  EscapedDecls.insert(VD);
357  }
358 
359  void VisitValueDecl(const ValueDecl *VD) {
360  if (VD->getType()->isLValueReferenceType())
361  markAsEscaped(VD);
362  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
363  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
364  const bool SavedAllEscaped = AllEscaped;
365  AllEscaped = VD->getType()->isLValueReferenceType();
366  Visit(VarD->getInit());
367  AllEscaped = SavedAllEscaped;
368  }
369  }
370  }
371  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
372  ArrayRef<OMPClause *> Clauses,
373  bool IsCombinedParallelRegion) {
374  if (!S)
375  return;
376  for (const CapturedStmt::Capture &C : S->captures()) {
377  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
378  const ValueDecl *VD = C.getCapturedVar();
379  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
380  if (IsCombinedParallelRegion) {
381  // Check if the variable is privatized in the combined construct and
382  // those private copies must be shared in the inner parallel
383  // directive.
384  IsForCombinedParallelRegion = false;
385  for (const OMPClause *C : Clauses) {
386  if (!isOpenMPPrivate(C->getClauseKind()) ||
387  C->getClauseKind() == OMPC_reduction ||
388  C->getClauseKind() == OMPC_linear ||
389  C->getClauseKind() == OMPC_private)
390  continue;
391  ArrayRef<const Expr *> Vars;
392  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
393  Vars = PC->getVarRefs();
394  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
395  Vars = PC->getVarRefs();
396  else
397  llvm_unreachable("Unexpected clause.");
398  for (const auto *E : Vars) {
399  const Decl *D =
400  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
401  if (D == VD->getCanonicalDecl()) {
402  IsForCombinedParallelRegion = true;
403  break;
404  }
405  }
406  if (IsForCombinedParallelRegion)
407  break;
408  }
409  }
410  markAsEscaped(VD);
411  if (isa<OMPCapturedExprDecl>(VD))
412  VisitValueDecl(VD);
413  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
414  }
415  }
416  }
417 
418  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
419  assert(!GlobalizedRD &&
420  "Record for globalized variables is built already.");
421  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
422  if (IsInTTDRegion)
423  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
424  else
425  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
426  GlobalizedRD = ::buildRecordForGlobalizedVars(
427  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
428  MappedDeclsFields, WarpSize);
429  }
430 
431 public:
432  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
433  ArrayRef<const ValueDecl *> TeamsReductions)
434  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
435  }
436  virtual ~CheckVarsEscapingDeclContext() = default;
437  void VisitDeclStmt(const DeclStmt *S) {
438  if (!S)
439  return;
440  for (const Decl *D : S->decls())
441  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
442  VisitValueDecl(VD);
443  }
444  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
445  if (!D)
446  return;
447  if (!D->hasAssociatedStmt())
448  return;
449  if (const auto *S =
450  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
451  // Do not analyze directives that do not actually require capturing,
452  // like `omp for` or `omp simd` directives.
453  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
454  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
455  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
456  VisitStmt(S->getCapturedStmt());
457  return;
458  }
459  VisitOpenMPCapturedStmt(
460  S, D->clauses(),
461  CaptureRegions.back() == OMPD_parallel &&
462  isOpenMPDistributeDirective(D->getDirectiveKind()));
463  }
464  }
465  void VisitCapturedStmt(const CapturedStmt *S) {
466  if (!S)
467  return;
468  for (const CapturedStmt::Capture &C : S->captures()) {
469  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
470  const ValueDecl *VD = C.getCapturedVar();
471  markAsEscaped(VD);
472  if (isa<OMPCapturedExprDecl>(VD))
473  VisitValueDecl(VD);
474  }
475  }
476  }
477  void VisitLambdaExpr(const LambdaExpr *E) {
478  if (!E)
479  return;
480  for (const LambdaCapture &C : E->captures()) {
481  if (C.capturesVariable()) {
482  if (C.getCaptureKind() == LCK_ByRef) {
483  const ValueDecl *VD = C.getCapturedVar();
484  markAsEscaped(VD);
485  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
486  VisitValueDecl(VD);
487  }
488  }
489  }
490  }
491  void VisitBlockExpr(const BlockExpr *E) {
492  if (!E)
493  return;
494  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
495  if (C.isByRef()) {
496  const VarDecl *VD = C.getVariable();
497  markAsEscaped(VD);
498  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
499  VisitValueDecl(VD);
500  }
501  }
502  }
503  void VisitCallExpr(const CallExpr *E) {
504  if (!E)
505  return;
506  for (const Expr *Arg : E->arguments()) {
507  if (!Arg)
508  continue;
509  if (Arg->isLValue()) {
510  const bool SavedAllEscaped = AllEscaped;
511  AllEscaped = true;
512  Visit(Arg);
513  AllEscaped = SavedAllEscaped;
514  } else {
515  Visit(Arg);
516  }
517  }
518  Visit(E->getCallee());
519  }
520  void VisitDeclRefExpr(const DeclRefExpr *E) {
521  if (!E)
522  return;
523  const ValueDecl *VD = E->getDecl();
524  if (AllEscaped)
525  markAsEscaped(VD);
526  if (isa<OMPCapturedExprDecl>(VD))
527  VisitValueDecl(VD);
528  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
529  if (VarD->isInitCapture())
530  VisitValueDecl(VD);
531  }
532  void VisitUnaryOperator(const UnaryOperator *E) {
533  if (!E)
534  return;
535  if (E->getOpcode() == UO_AddrOf) {
536  const bool SavedAllEscaped = AllEscaped;
537  AllEscaped = true;
538  Visit(E->getSubExpr());
539  AllEscaped = SavedAllEscaped;
540  } else {
541  Visit(E->getSubExpr());
542  }
543  }
544  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
545  if (!E)
546  return;
547  if (E->getCastKind() == CK_ArrayToPointerDecay) {
548  const bool SavedAllEscaped = AllEscaped;
549  AllEscaped = true;
550  Visit(E->getSubExpr());
551  AllEscaped = SavedAllEscaped;
552  } else {
553  Visit(E->getSubExpr());
554  }
555  }
556  void VisitExpr(const Expr *E) {
557  if (!E)
558  return;
559  bool SavedAllEscaped = AllEscaped;
560  if (!E->isLValue())
561  AllEscaped = false;
562  for (const Stmt *Child : E->children())
563  if (Child)
564  Visit(Child);
565  AllEscaped = SavedAllEscaped;
566  }
567  void VisitStmt(const Stmt *S) {
568  if (!S)
569  return;
570  for (const Stmt *Child : S->children())
571  if (Child)
572  Visit(Child);
573  }
574 
575  /// Returns the record that handles all the escaped local variables and is
576  /// used instead of their original storage.
577  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
578  if (!GlobalizedRD)
579  buildRecordForGlobalizedVars(IsInTTDRegion);
580  return GlobalizedRD;
581  }
582 
583  /// Returns the field in the globalized record for the escaped variable.
584  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
585  assert(GlobalizedRD &&
586  "Record for globalized variables must be generated already.");
587  auto I = MappedDeclsFields.find(VD);
588  if (I == MappedDeclsFields.end())
589  return nullptr;
590  return I->getSecond();
591  }
592 
593  /// Returns the list of the escaped local variables/parameters.
594  ArrayRef<const ValueDecl *> getEscapedDecls() const {
595  return EscapedDecls.getArrayRef();
596  }
597 
598  /// Checks if the escaped local variable is actually a parameter passed by
599  /// value.
600  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
601  return EscapedParameters;
602  }
603 
604  /// Returns the list of the escaped variables with the variably modified
605  /// types.
606  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
607  return EscapedVariableLengthDecls.getArrayRef();
608  }
609 };
610 } // anonymous namespace
611 
612 /// Get the GPU warp size.
613 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
614  return CGF.EmitRuntimeCall(
615  llvm::Intrinsic::getDeclaration(
616  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
617  "nvptx_warp_size");
618 }
619 
620 /// Get the id of the current thread on the GPU.
621 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
622  return CGF.EmitRuntimeCall(
623  llvm::Intrinsic::getDeclaration(
624  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
625  "nvptx_tid");
626 }
627 
628 /// Get the id of the warp in the block.
629 /// We assume that the warp size is 32, which is always the case
630 /// on the NVPTX device, to generate more efficient code.
631 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
632  CGBuilderTy &Bld = CGF.Builder;
633  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
634 }
635 
636 /// Get the id of the current lane in the Warp.
637 /// We assume that the warp size is 32, which is always the case
638 /// on the NVPTX device, to generate more efficient code.
639 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
640  CGBuilderTy &Bld = CGF.Builder;
641  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
642  "nvptx_lane_id");
643 }
644 
645 /// Get the maximum number of threads in a block of the GPU.
646 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
647  return CGF.EmitRuntimeCall(
648  llvm::Intrinsic::getDeclaration(
649  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
650  "nvptx_num_threads");
651 }
652 
653 /// Get the value of the thread_limit clause in the teams directive.
654 /// For the 'generic' execution mode, the runtime encodes thread_limit in
655 /// the launch parameters, always starting thread_limit+warpSize threads per
656 /// CTA. The threads in the last warp are reserved for master execution.
657 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
658 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
659  bool IsInSPMDExecutionMode = false) {
660  CGBuilderTy &Bld = CGF.Builder;
661  return IsInSPMDExecutionMode
662  ? getNVPTXNumThreads(CGF)
663  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
664  "thread_limit");
665 }
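// Illustrative note (not in the original source): in generic (non-SPMD) mode a
// CTA launched with ntid.x = 128 yields thread_limit = 128 - 32 = 96 worker
// threads, with the last warp reserved for the master; in SPMD mode all 128
// threads are reported as part of the team.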
666 
667 /// Get the thread id of the OMP master thread.
668 /// The master thread id is the first thread (lane) of the last warp in the
669 /// GPU block. Warp size is assumed to be some power of 2.
670 /// Thread id is 0 indexed.
671 /// E.g: If NumThreads is 33, master id is 32.
672 /// If NumThreads is 64, master id is 32.
673 /// If NumThreads is 1024, master id is 992.
674 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
675  CGBuilderTy &Bld = CGF.Builder;
676  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
677 
678  // We assume that the warp size is a power of 2.
679  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
680 
681  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
682  Bld.CreateNot(Mask), "master_tid");
683 }
684 
685 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
686  CodeGenModule &CGM, SourceLocation Loc)
687  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
688  Loc(Loc) {
689  createWorkerFunction(CGM);
690 }
691 
692 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
693  CodeGenModule &CGM) {
694  // Create a worker function with no arguments.
695 
696  WorkerFn = llvm::Function::Create(
697  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
698  /*placeholder=*/"_worker", &CGM.getModule());
699  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
700  WorkerFn->setDoesNotRecurse();
701 }
702 
703 CGOpenMPRuntimeNVPTX::ExecutionMode
704 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
705  return CurrentExecutionMode;
706 }
707 
708 static CGOpenMPRuntimeNVPTX::DataSharingMode
709 getDataSharingMode(CodeGenModule &CGM) {
710  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
711  : CGOpenMPRuntimeNVPTX::Generic;
712 }
713 
714 /// Checks if the expression is constant or does not have non-trivial function
715 /// calls.
716 static bool isTrivial(ASTContext &Ctx, const Expr * E) {
717  // We can skip constant expressions.
718  // We can skip expressions with trivial calls or simple expressions.
719  return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
720  !E->hasNonTrivialCall(Ctx)) &&
721  !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
722 }
723 
724 /// Checks if the \p Body is the \a CompoundStmt and returns its child statement
725 /// iff there is only one that is not evaluatable at compile time.
726 static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
727  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
728  const Stmt *Child = nullptr;
729  for (const Stmt *S : C->body()) {
730  if (const auto *E = dyn_cast<Expr>(S)) {
731  if (isTrivial(Ctx, E))
732  continue;
733  }
734  // Some of the statements can be ignored.
735  if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
736  isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
737  continue;
738  // Analyze declarations.
739  if (const auto *DS = dyn_cast<DeclStmt>(S)) {
740  if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
741  if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
742  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
743  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
744  isa<UsingDirectiveDecl>(D) ||
745  isa<OMPDeclareReductionDecl>(D) ||
746  isa<OMPThreadPrivateDecl>(D) || isa<OMPAllocateDecl>(D))
747  return true;
748  const auto *VD = dyn_cast<VarDecl>(D);
749  if (!VD)
750  return false;
751  return VD->isConstexpr() ||
752  ((VD->getType().isTrivialType(Ctx) ||
753  VD->getType()->isReferenceType()) &&
754  (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
755  }))
756  continue;
757  }
758  // Found multiple children - cannot get the one child only.
759  if (Child)
760  return Body;
761  Child = S;
762  }
763  if (Child)
764  return Child;
765  }
766  return Body;
767 }
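// Illustrative example (not in the original source) of what the helper above
// accepts: for a target body such as
//
//   {
//     int tmp = 0;                       // trivial declaration, skipped
//     #pragma omp parallel for
//     for (int i = 0; i < n; ++i) ...    // the single "real" child
//   }
//
// the parallel-for directive is returned as the only child, which lets the
// SPMD/lightweight-runtime detection below look through wrapper statements.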
768 
769 /// Check if the parallel directive has an 'if' clause with non-constant or
770 /// false condition. Also, check if the number of threads is strictly specified
771 /// and run those directives in non-SPMD mode.
772 static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
773  const OMPExecutableDirective &D) {
774  if (D.hasClausesOfKind<OMPNumThreadsClause>())
775  return true;
776  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
777  OpenMPDirectiveKind NameModifier = C->getNameModifier();
778  if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
779  continue;
780  const Expr *Cond = C->getCondition();
781  bool Result;
782  if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
783  return true;
784  }
785  return false;
786 }
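// Illustrative examples (not in the original source) of directives this helper
// flags, which forces non-SPMD code generation:
//
//   #pragma omp target parallel if(parallel: n > 10)  // non-constant condition
//   #pragma omp target parallel num_threads(64)       // explicit thread count
//
// whereas "#pragma omp target parallel" with neither clause returns false.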
787 
788 /// Check for inner (nested) SPMD construct, if any
789 static bool hasNestedSPMDDirective(ASTContext &Ctx,
790  const OMPExecutableDirective &D) {
791  const auto *CS = D.getInnermostCapturedStmt();
792  const auto *Body =
793  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
794  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
795 
796  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
797  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
798  switch (D.getDirectiveKind()) {
799  case OMPD_target:
800  if (isOpenMPParallelDirective(DKind) &&
801  !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
802  return true;
803  if (DKind == OMPD_teams) {
804  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
805  /*IgnoreCaptured=*/true);
806  if (!Body)
807  return false;
808  ChildStmt = getSingleCompoundChild(Ctx, Body);
809  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
810  DKind = NND->getDirectiveKind();
811  if (isOpenMPParallelDirective(DKind) &&
812  !hasParallelIfNumThreadsClause(Ctx, *NND))
813  return true;
814  }
815  }
816  return false;
817  case OMPD_target_teams:
818  return isOpenMPParallelDirective(DKind) &&
819  !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
820  case OMPD_target_simd:
821  case OMPD_target_parallel:
822  case OMPD_target_parallel_for:
823  case OMPD_target_parallel_for_simd:
824  case OMPD_target_teams_distribute:
825  case OMPD_target_teams_distribute_simd:
826  case OMPD_target_teams_distribute_parallel_for:
827  case OMPD_target_teams_distribute_parallel_for_simd:
828  case OMPD_parallel:
829  case OMPD_for:
830  case OMPD_parallel_for:
831  case OMPD_parallel_sections:
832  case OMPD_for_simd:
833  case OMPD_parallel_for_simd:
834  case OMPD_cancel:
835  case OMPD_cancellation_point:
836  case OMPD_ordered:
837  case OMPD_threadprivate:
838  case OMPD_allocate:
839  case OMPD_task:
840  case OMPD_simd:
841  case OMPD_sections:
842  case OMPD_section:
843  case OMPD_single:
844  case OMPD_master:
845  case OMPD_critical:
846  case OMPD_taskyield:
847  case OMPD_barrier:
848  case OMPD_taskwait:
849  case OMPD_taskgroup:
850  case OMPD_atomic:
851  case OMPD_flush:
852  case OMPD_teams:
853  case OMPD_target_data:
854  case OMPD_target_exit_data:
855  case OMPD_target_enter_data:
856  case OMPD_distribute:
857  case OMPD_distribute_simd:
858  case OMPD_distribute_parallel_for:
859  case OMPD_distribute_parallel_for_simd:
860  case OMPD_teams_distribute:
861  case OMPD_teams_distribute_simd:
862  case OMPD_teams_distribute_parallel_for:
863  case OMPD_teams_distribute_parallel_for_simd:
864  case OMPD_target_update:
865  case OMPD_declare_simd:
866  case OMPD_declare_target:
867  case OMPD_end_declare_target:
868  case OMPD_declare_reduction:
869  case OMPD_declare_mapper:
870  case OMPD_taskloop:
871  case OMPD_taskloop_simd:
872  case OMPD_requires:
873  case OMPD_unknown:
874  llvm_unreachable("Unexpected directive.");
875  }
876  }
877 
878  return false;
879 }
880 
881 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
882  const OMPExecutableDirective &D) {
883  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
884  switch (DirectiveKind) {
885  case OMPD_target:
886  case OMPD_target_teams:
887  return hasNestedSPMDDirective(Ctx, D);
888  case OMPD_target_parallel:
889  case OMPD_target_parallel_for:
890  case OMPD_target_parallel_for_simd:
891  case OMPD_target_teams_distribute_parallel_for:
892  case OMPD_target_teams_distribute_parallel_for_simd:
893  return !hasParallelIfNumThreadsClause(Ctx, D);
894  case OMPD_target_simd:
895  case OMPD_target_teams_distribute:
896  case OMPD_target_teams_distribute_simd:
897  return false;
898  case OMPD_parallel:
899  case OMPD_for:
900  case OMPD_parallel_for:
901  case OMPD_parallel_sections:
902  case OMPD_for_simd:
903  case OMPD_parallel_for_simd:
904  case OMPD_cancel:
905  case OMPD_cancellation_point:
906  case OMPD_ordered:
907  case OMPD_threadprivate:
908  case OMPD_allocate:
909  case OMPD_task:
910  case OMPD_simd:
911  case OMPD_sections:
912  case OMPD_section:
913  case OMPD_single:
914  case OMPD_master:
915  case OMPD_critical:
916  case OMPD_taskyield:
917  case OMPD_barrier:
918  case OMPD_taskwait:
919  case OMPD_taskgroup:
920  case OMPD_atomic:
921  case OMPD_flush:
922  case OMPD_teams:
923  case OMPD_target_data:
924  case OMPD_target_exit_data:
925  case OMPD_target_enter_data:
926  case OMPD_distribute:
927  case OMPD_distribute_simd:
928  case OMPD_distribute_parallel_for:
929  case OMPD_distribute_parallel_for_simd:
930  case OMPD_teams_distribute:
931  case OMPD_teams_distribute_simd:
932  case OMPD_teams_distribute_parallel_for:
933  case OMPD_teams_distribute_parallel_for_simd:
934  case OMPD_target_update:
935  case OMPD_declare_simd:
936  case OMPD_declare_target:
937  case OMPD_end_declare_target:
938  case OMPD_declare_reduction:
939  case OMPD_declare_mapper:
940  case OMPD_taskloop:
941  case OMPD_taskloop_simd:
942  case OMPD_requires:
943  case OMPD_unknown:
944  break;
945  }
946  llvm_unreachable(
947  "Unknown programming model for OpenMP directive on NVPTX target.");
948 }
949 
950 /// Check if the directive is loops based and has schedule clause at all or has
951 /// static scheduling.
952 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
953  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
954  isOpenMPLoopDirective(D.getDirectiveKind()) &&
955  "Expected loop-based directive.");
956  return !D.hasClausesOfKind<OMPOrderedClause>() &&
957  (!D.hasClausesOfKind<OMPScheduleClause>() ||
958  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
959  [](const OMPScheduleClause *C) {
960  return C->getScheduleKind() == OMPC_SCHEDULE_static;
961  }));
962 }
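// Illustrative note (not in the original source) for the check above:
//
//   #pragma omp parallel for schedule(static)      // qualifies
//   #pragma omp parallel for                       // no schedule clause: qualifies
//   #pragma omp parallel for schedule(dynamic, 4)  // does not qualify
//   #pragma omp parallel for ordered               // does not qualify
//
// Only statically scheduled, non-ordered loops allow the lightweight runtime.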
963 
964 /// Check for inner (nested) lightweight runtime construct, if any
965 static bool hasNestedLightweightDirective(ASTContext &Ctx,
966  const OMPExecutableDirective &D) {
967  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
968  const auto *CS = D.getInnermostCapturedStmt();
969  const auto *Body =
970  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
971  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
972 
973  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
974  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
975  switch (D.getDirectiveKind()) {
976  case OMPD_target:
977  if (isOpenMPParallelDirective(DKind) &&
978  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
979  hasStaticScheduling(*NestedDir))
980  return true;
981  if (DKind == OMPD_parallel) {
982  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
983  /*IgnoreCaptured=*/true);
984  if (!Body)
985  return false;
986  ChildStmt = getSingleCompoundChild(Ctx, Body);
987  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
988  DKind = NND->getDirectiveKind();
989  if (isOpenMPWorksharingDirective(DKind) &&
990  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
991  return true;
992  }
993  } else if (DKind == OMPD_teams) {
994  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
995  /*IgnoreCaptured=*/true);
996  if (!Body)
997  return false;
998  ChildStmt = getSingleCompoundChild(Ctx, Body);
999  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1000  DKind = NND->getDirectiveKind();
1001  if (isOpenMPParallelDirective(DKind) &&
1002  isOpenMPWorksharingDirective(DKind) &&
1003  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1004  return true;
1005  if (DKind == OMPD_parallel) {
1006  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
1007  /*IgnoreCaptured=*/true);
1008  if (!Body)
1009  return false;
1010  ChildStmt = getSingleCompoundChild(Ctx, Body);
1011  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1012  DKind = NND->getDirectiveKind();
1013  if (isOpenMPWorksharingDirective(DKind) &&
1014  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1015  return true;
1016  }
1017  }
1018  }
1019  }
1020  return false;
1021  case OMPD_target_teams:
1022  if (isOpenMPParallelDirective(DKind) &&
1023  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
1024  hasStaticScheduling(*NestedDir))
1025  return true;
1026  if (DKind == OMPD_parallel) {
1027  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
1028  /*IgnoreCaptured=*/true);
1029  if (!Body)
1030  return false;
1031  ChildStmt = getSingleCompoundChild(Ctx, Body);
1032  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
1033  DKind = NND->getDirectiveKind();
1034  if (isOpenMPWorksharingDirective(DKind) &&
1035  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
1036  return true;
1037  }
1038  }
1039  return false;
1040  case OMPD_target_parallel:
1041  return isOpenMPWorksharingDirective(DKind) &&
1042  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
1043  case OMPD_target_teams_distribute:
1044  case OMPD_target_simd:
1045  case OMPD_target_parallel_for:
1046  case OMPD_target_parallel_for_simd:
1047  case OMPD_target_teams_distribute_simd:
1048  case OMPD_target_teams_distribute_parallel_for:
1049  case OMPD_target_teams_distribute_parallel_for_simd:
1050  case OMPD_parallel:
1051  case OMPD_for:
1052  case OMPD_parallel_for:
1053  case OMPD_parallel_sections:
1054  case OMPD_for_simd:
1055  case OMPD_parallel_for_simd:
1056  case OMPD_cancel:
1057  case OMPD_cancellation_point:
1058  case OMPD_ordered:
1059  case OMPD_threadprivate:
1060  case OMPD_allocate:
1061  case OMPD_task:
1062  case OMPD_simd:
1063  case OMPD_sections:
1064  case OMPD_section:
1065  case OMPD_single:
1066  case OMPD_master:
1067  case OMPD_critical:
1068  case OMPD_taskyield:
1069  case OMPD_barrier:
1070  case OMPD_taskwait:
1071  case OMPD_taskgroup:
1072  case OMPD_atomic:
1073  case OMPD_flush:
1074  case OMPD_teams:
1075  case OMPD_target_data:
1076  case OMPD_target_exit_data:
1077  case OMPD_target_enter_data:
1078  case OMPD_distribute:
1079  case OMPD_distribute_simd:
1080  case OMPD_distribute_parallel_for:
1081  case OMPD_distribute_parallel_for_simd:
1082  case OMPD_teams_distribute:
1083  case OMPD_teams_distribute_simd:
1084  case OMPD_teams_distribute_parallel_for:
1085  case OMPD_teams_distribute_parallel_for_simd:
1086  case OMPD_target_update:
1087  case OMPD_declare_simd:
1088  case OMPD_declare_target:
1089  case OMPD_end_declare_target:
1090  case OMPD_declare_reduction:
1091  case OMPD_declare_mapper:
1092  case OMPD_taskloop:
1093  case OMPD_taskloop_simd:
1094  case OMPD_requires:
1095  case OMPD_unknown:
1096  llvm_unreachable("Unexpected directive.");
1097  }
1098  }
1099 
1100  return false;
1101 }
1102 
1103 /// Checks if the construct supports lightweight runtime. It must be SPMD
1104 /// construct + inner loop-based construct with static scheduling.
1105 static bool supportsLightweightRuntime(ASTContext &Ctx,
1106  const OMPExecutableDirective &D) {
1107  if (!supportsSPMDExecutionMode(Ctx, D))
1108  return false;
1109  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1110  switch (DirectiveKind) {
1111  case OMPD_target:
1112  case OMPD_target_teams:
1113  case OMPD_target_parallel:
1114  return hasNestedLightweightDirective(Ctx, D);
1115  case OMPD_target_parallel_for:
1116  case OMPD_target_parallel_for_simd:
1117  case OMPD_target_teams_distribute_parallel_for:
1118  case OMPD_target_teams_distribute_parallel_for_simd:
1119  // (Last|First)-privates must be shared in parallel region.
1120  return hasStaticScheduling(D);
1121  case OMPD_target_simd:
1122  case OMPD_target_teams_distribute:
1123  case OMPD_target_teams_distribute_simd:
1124  return false;
1125  case OMPD_parallel:
1126  case OMPD_for:
1127  case OMPD_parallel_for:
1128  case OMPD_parallel_sections:
1129  case OMPD_for_simd:
1130  case OMPD_parallel_for_simd:
1131  case OMPD_cancel:
1132  case OMPD_cancellation_point:
1133  case OMPD_ordered:
1134  case OMPD_threadprivate:
1135  case OMPD_allocate:
1136  case OMPD_task:
1137  case OMPD_simd:
1138  case OMPD_sections:
1139  case OMPD_section:
1140  case OMPD_single:
1141  case OMPD_master:
1142  case OMPD_critical:
1143  case OMPD_taskyield:
1144  case OMPD_barrier:
1145  case OMPD_taskwait:
1146  case OMPD_taskgroup:
1147  case OMPD_atomic:
1148  case OMPD_flush:
1149  case OMPD_teams:
1150  case OMPD_target_data:
1151  case OMPD_target_exit_data:
1152  case OMPD_target_enter_data:
1153  case OMPD_distribute:
1154  case OMPD_distribute_simd:
1155  case OMPD_distribute_parallel_for:
1156  case OMPD_distribute_parallel_for_simd:
1157  case OMPD_teams_distribute:
1158  case OMPD_teams_distribute_simd:
1159  case OMPD_teams_distribute_parallel_for:
1160  case OMPD_teams_distribute_parallel_for_simd:
1161  case OMPD_target_update:
1162  case OMPD_declare_simd:
1163  case OMPD_declare_target:
1164  case OMPD_end_declare_target:
1165  case OMPD_declare_reduction:
1166  case OMPD_declare_mapper:
1167  case OMPD_taskloop:
1168  case OMPD_taskloop_simd:
1169  case OMPD_requires:
1170  case OMPD_unknown:
1171  break;
1172  }
1173  llvm_unreachable(
1174  "Unknown programming model for OpenMP directive on NVPTX target.");
1175 }
1176 
1177 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1178  StringRef ParentName,
1179  llvm::Function *&OutlinedFn,
1180  llvm::Constant *&OutlinedFnID,
1181  bool IsOffloadEntry,
1182  const RegionCodeGenTy &CodeGen) {
1183  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1184  EntryFunctionState EST;
1185  WorkerFunctionState WST(CGM, D.getBeginLoc());
1186  Work.clear();
1187  WrapperFunctionsMap.clear();
1188 
1189  // Emit target region as a standalone region.
1190  class NVPTXPrePostActionTy : public PrePostActionTy {
1191  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1192  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1193 
1194  public:
1195  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1196  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1197  : EST(EST), WST(WST) {}
1198  void Enter(CodeGenFunction &CGF) override {
1199  auto &RT =
1200  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1201  RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1202  // Skip target region initialization.
1203  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1204  }
1205  void Exit(CodeGenFunction &CGF) override {
1206  auto &RT =
1207  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1208  RT.clearLocThreadIdInsertPt(CGF);
1209  RT.emitNonSPMDEntryFooter(CGF, EST);
1210  }
1211  } Action(EST, WST);
1212  CodeGen.setAction(Action);
1213  IsInTTDRegion = true;
1214  // Reserve place for the globalized memory.
1215  GlobalizedRecords.emplace_back();
1216  if (!KernelStaticGlobalized) {
1217  KernelStaticGlobalized = new llvm::GlobalVariable(
1218  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1219  llvm::GlobalValue::InternalLinkage,
1220  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1221  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1222  llvm::GlobalValue::NotThreadLocal,
1223  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1224  }
1225  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1226  IsOffloadEntry, CodeGen);
1227  IsInTTDRegion = false;
1228 
1229  // Now change the name of the worker function to correspond to this target
1230  // region's entry function.
1231  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1232 
1233  // Create the worker function
1234  emitWorkerFunction(WST);
1235 }
1236 
1237 // Setup NVPTX threads for master-worker OpenMP scheme.
1238 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1239  EntryFunctionState &EST,
1240  WorkerFunctionState &WST) {
1241  CGBuilderTy &Bld = CGF.Builder;
1242 
1243  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1244  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1245  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1246  EST.ExitBB = CGF.createBasicBlock(".exit");
1247 
1248  llvm::Value *IsWorker =
1249  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1250  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1251 
1252  CGF.EmitBlock(WorkerBB);
1253  emitCall(CGF, WST.Loc, WST.WorkerFn);
1254  CGF.EmitBranch(EST.ExitBB);
1255 
1256  CGF.EmitBlock(MasterCheckBB);
1257  llvm::Value *IsMaster =
1258  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1259  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1260 
1261  CGF.EmitBlock(MasterBB);
1262  IsInTargetMasterThreadRegion = true;
1263  // SEQUENTIAL (MASTER) REGION START
1264  // First action in sequential region:
1265  // Initialize the state of the OpenMP runtime library on the GPU.
1266  // TODO: Optimize runtime initialization and pass in correct value.
1267  llvm::Value *Args[] = {getThreadLimit(CGF),
1268  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1269  CGF.EmitRuntimeCall(
1270  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1271 
1272  // For data sharing, we need to initialize the stack.
1273  CGF.EmitRuntimeCall(
1274  createNVPTXRuntimeFunction(
1275  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1276 
1277  emitGenericVarsProlog(CGF, WST.Loc);
1278 }
1279 
1280 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1281  EntryFunctionState &EST) {
1282  IsInTargetMasterThreadRegion = false;
1283  if (!CGF.HaveInsertPoint())
1284  return;
1285 
1286  emitGenericVarsEpilog(CGF);
1287 
1288  if (!EST.ExitBB)
1289  EST.ExitBB = CGF.createBasicBlock(".exit");
1290 
1291  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1292  CGF.EmitBranch(TerminateBB);
1293 
1294  CGF.EmitBlock(TerminateBB);
1295  // Signal termination condition.
1296  // TODO: Optimize runtime initialization and pass in correct value.
1297  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1298  CGF.EmitRuntimeCall(
1299  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1300  // Barrier to terminate worker threads.
1301  syncCTAThreads(CGF);
1302  // Master thread jumps to exit point.
1303  CGF.EmitBranch(EST.ExitBB);
1304 
1305  CGF.EmitBlock(EST.ExitBB);
1306  EST.ExitBB = nullptr;
1307 }
1308 
1309 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1310  StringRef ParentName,
1311  llvm::Function *&OutlinedFn,
1312  llvm::Constant *&OutlinedFnID,
1313  bool IsOffloadEntry,
1314  const RegionCodeGenTy &CodeGen) {
1315  ExecutionRuntimeModesRAII ModeRAII(
1316  CurrentExecutionMode, RequiresFullRuntime,
1317  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1318  !supportsLightweightRuntime(CGM.getContext(), D));
1319  EntryFunctionState EST;
1320 
1321  // Emit target region as a standalone region.
1322  class NVPTXPrePostActionTy : public PrePostActionTy {
1323  CGOpenMPRuntimeNVPTX &RT;
1324  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1325  const OMPExecutableDirective &D;
1326 
1327  public:
1328  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1329  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1330  const OMPExecutableDirective &D)
1331  : RT(RT), EST(EST), D(D) {}
1332  void Enter(CodeGenFunction &CGF) override {
1333  RT.emitSPMDEntryHeader(CGF, EST, D);
1334  // Skip target region initialization.
1335  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1336  }
1337  void Exit(CodeGenFunction &CGF) override {
1338  RT.clearLocThreadIdInsertPt(CGF);
1339  RT.emitSPMDEntryFooter(CGF, EST);
1340  }
1341  } Action(*this, EST, D);
1342  CodeGen.setAction(Action);
1343  IsInTTDRegion = true;
1344  // Reserve place for the globalized memory.
1345  GlobalizedRecords.emplace_back();
1346  if (!KernelStaticGlobalized) {
1347  KernelStaticGlobalized = new llvm::GlobalVariable(
1348  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1349  llvm::GlobalValue::InternalLinkage,
1350  llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1351  "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1352  llvm::GlobalValue::NotThreadLocal,
1353  CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1354  }
1355  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1356  IsOffloadEntry, CodeGen);
1357  IsInTTDRegion = false;
1358 }
1359 
1360 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1361  CodeGenFunction &CGF, EntryFunctionState &EST,
1362  const OMPExecutableDirective &D) {
1363  CGBuilderTy &Bld = CGF.Builder;
1364 
1365  // Setup BBs in entry function.
1366  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1367  EST.ExitBB = CGF.createBasicBlock(".exit");
1368 
1369  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1370  /*RequiresOMPRuntime=*/
1371  Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1372  /*RequiresDataSharing=*/Bld.getInt16(0)};
1373  CGF.EmitRuntimeCall(
1374  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1375 
1376  if (RequiresFullRuntime) {
1377  // For data sharing, we need to initialize the stack.
1378  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1379  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1380  }
1381 
1382  CGF.EmitBranch(ExecuteBB);
1383 
1384  CGF.EmitBlock(ExecuteBB);
1385 
1386  IsInTargetMasterThreadRegion = true;
1387 }
1388 
1389 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1390  EntryFunctionState &EST) {
1391  IsInTargetMasterThreadRegion = false;
1392  if (!CGF.HaveInsertPoint())
1393  return;
1394 
1395  if (!EST.ExitBB)
1396  EST.ExitBB = CGF.createBasicBlock(".exit");
1397 
1398  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1399  CGF.EmitBranch(OMPDeInitBB);
1400 
1401  CGF.EmitBlock(OMPDeInitBB);
1402  // DeInitialize the OMP state in the runtime; called by all active threads.
1403  llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1404  CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1405  CGF.EmitRuntimeCall(
1406  createNVPTXRuntimeFunction(
1407  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1408  CGF.EmitBranch(EST.ExitBB);
1409 
1410  CGF.EmitBlock(EST.ExitBB);
1411  EST.ExitBB = nullptr;
1412 }
1413 
1414 // Create a unique global variable to indicate the execution mode of this target
1415 // region. The execution mode is either 'generic', or 'spmd' depending on the
1416 // target directive. This variable is picked up by the offload library to setup
1417 // the device appropriately before kernel launch. If the execution mode is
1418 // 'generic', the runtime reserves one warp for the master, otherwise, all
1419 // warps participate in parallel work.
1420 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1421  bool Mode) {
1422  auto *GVMode =
1423  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1424  llvm::GlobalValue::WeakAnyLinkage,
1425  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1426  Twine(Name, "_exec_mode"));
1427  CGM.addCompilerUsedGlobal(GVMode);
1428 }
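// Illustrative sketch (not in the original source) of the global this emits.
// The exact symbol name depends on the offload entry; the names below are made
// up for illustration:
//
//   @__omp_offloading_xx_foo_l10_exec_mode = weak constant i8 1   ; generic
//   @__omp_offloading_xx_bar_l20_exec_mode = weak constant i8 0   ; SPMD
//
// The offload runtime reads this byte before launch to decide whether a warp
// must be reserved for the master thread.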
1429 
1430 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1431  ASTContext &Ctx = CGM.getContext();
1432 
1433  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1434  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1435  WST.Loc, WST.Loc);
1436  emitWorkerLoop(CGF, WST);
1437  CGF.FinishFunction();
1438 }
1439 
1440 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1441  WorkerFunctionState &WST) {
1442  //
1443  // The workers enter this loop and wait for parallel work from the master.
1444  // When the master encounters a parallel region it sets up the work + variable
1445  // arguments, and wakes up the workers. The workers first check to see if
1446  // they are required for the parallel region, i.e., within the # of requested
1447  // parallel threads. The activated workers load the variable arguments and
1448  // execute the parallel work.
1449  //
1450 
1451  CGBuilderTy &Bld = CGF.Builder;
1452 
1453  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1454  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1455  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1456  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1457  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1458  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1459 
1460  CGF.EmitBranch(AwaitBB);
1461 
1462  // Workers wait for work from master.
1463  CGF.EmitBlock(AwaitBB);
1464  // Wait for parallel work
1465  syncCTAThreads(CGF);
1466 
1467  Address WorkFn =
1468  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1469  Address ExecStatus =
1470  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1471  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1472  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1473 
1474  // TODO: Optimize runtime initialization and pass in correct value.
1475  llvm::Value *Args[] = {WorkFn.getPointer(),
1476  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1477  llvm::Value *Ret = CGF.EmitRuntimeCall(
1478  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1479  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1480 
1481  // On termination condition (workid == 0), exit loop.
1482  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1483  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1484  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1485 
1486  // Activate requested workers.
1487  CGF.EmitBlock(SelectWorkersBB);
1488  llvm::Value *IsActive =
1489  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1490  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1491 
1492  // Signal start of parallel region.
1493  CGF.EmitBlock(ExecuteBB);
1494  // Skip initialization.
1495  setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1496 
1497  // Process work items: outlined parallel functions.
1498  for (llvm::Function *W : Work) {
1499  // Try to match this outlined function.
1500  llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1501 
1502  llvm::Value *WorkFnMatch =
1503  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1504 
1505  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1506  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1507  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1508 
1509  // Execute this outlined function.
1510  CGF.EmitBlock(ExecuteFNBB);
1511 
1512  // Insert call to work function via shared wrapper. The shared
1513  // wrapper takes two arguments:
1514  // - the parallelism level;
1515  // - the thread ID;
1516  emitCall(CGF, WST.Loc, W,
1517  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1518 
1519  // Go to end of parallel region.
1520  CGF.EmitBranch(TerminateBB);
1521 
1522  CGF.EmitBlock(CheckNextBB);
1523  }
1524  // Default case: call to outlined function through pointer if the target
1525  // region makes a declare target call that may contain an orphaned parallel
1526  // directive.
1527  auto *ParallelFnTy =
1528  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1529  /*isVarArg=*/false);
1530  llvm::Value *WorkFnCast =
1531  Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1532  // Insert call to work function via shared wrapper. The shared
1533  // wrapper takes two arguments:
1534  // - the parallelism level;
1535  // - the thread ID;
1536  emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1537  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1538  // Go to end of parallel region.
1539  CGF.EmitBranch(TerminateBB);
1540 
1541  // Signal end of parallel region.
1542  CGF.EmitBlock(TerminateBB);
1543  CGF.EmitRuntimeCall(
1544  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1545  llvm::None);
1546  CGF.EmitBranch(BarrierBB);
1547 
1548  // All active and inactive workers wait at a barrier after parallel region.
1549  CGF.EmitBlock(BarrierBB);
1550  // Barrier after parallel region.
1551  syncCTAThreads(CGF);
1552  CGF.EmitBranch(AwaitBB);
1553 
1554  // Exit target region.
1555  CGF.EmitBlock(ExitBB);
1556  // Skip initialization.
1557  clearLocThreadIdInsertPt(CGF);
1558 }
1559 
1560 /// Returns specified OpenMP runtime function for the current OpenMP
1561 /// implementation. Specialized for the NVPTX device.
1562 /// \param Function OpenMP runtime function.
1563 /// \return Specified function.
1564 llvm::FunctionCallee
1565 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1566  llvm::FunctionCallee RTLFn = nullptr;
1567  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1568  case OMPRTL_NVPTX__kmpc_kernel_init: {
1569  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1570  // RequiresOMPRuntime);
1571  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1572  auto *FnTy =
1573  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1574  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1575  break;
1576  }
1577  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1578  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1579  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1580  auto *FnTy =
1581  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1583  break;
1584  }
1585  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1586  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1587  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1588  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1589  auto *FnTy =
1590  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1591  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1592  break;
1593  }
1594  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1595  // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1596  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1597  auto *FnTy =
1598  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1599  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1600  break;
1601  }
1602  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1603  /// Build void __kmpc_kernel_prepare_parallel(
1604  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1605  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1606  auto *FnTy =
1607  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1608  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1609  break;
1610  }
1611  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1612  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1613  /// int16_t IsOMPRuntimeInitialized);
1614  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1615  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1616  auto *FnTy =
1617  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1618  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1619  break;
1620  }
1621  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1622  /// Build void __kmpc_kernel_end_parallel();
1623  auto *FnTy =
1624  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1625  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1626  break;
1627  }
1628  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1629  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1630  // global_tid);
1631  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1632  auto *FnTy =
1633  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1634  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1635  break;
1636  }
1637  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1638  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1639  // global_tid);
1640  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1641  auto *FnTy =
1642  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1643  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1644  break;
1645  }
1646  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1647  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1648  // int16_t lane_offset, int16_t warp_size);
1649  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1650  auto *FnTy =
1651  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1652  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1653  break;
1654  }
1655  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1656  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1657  // int16_t lane_offset, int16_t warp_size);
1658  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1659  auto *FnTy =
1660  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1661  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1662  break;
1663  }
1664  case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1665  // Build int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1666  // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1667  // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1668  // lane_id, int16_t lane_offset, int16_t shortCircuit), void
1669  // (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
1670  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1671  CGM.Int16Ty, CGM.Int16Ty};
1672  auto *ShuffleReduceFnTy =
1673  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1674  /*isVarArg=*/false);
1675  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1676  auto *InterWarpCopyFnTy =
1677  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1678  /*isVarArg=*/false);
1679  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1680  CGM.Int32Ty,
1681  CGM.Int32Ty,
1682  CGM.SizeTy,
1683  CGM.VoidPtrTy,
1684  ShuffleReduceFnTy->getPointerTo(),
1685  InterWarpCopyFnTy->getPointerTo()};
1686  auto *FnTy =
1687  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1688  RTLFn = CGM.CreateRuntimeFunction(
1689  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1690  break;
1691  }
1692  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1693  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1694  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1695  auto *FnTy =
1696  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1697  RTLFn = CGM.CreateRuntimeFunction(
1698  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1699  break;
1700  }
1701  case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1702  // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1703  // global_tid, void *global_buffer, int32_t num_of_records, void*
1704  // reduce_data,
1705  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1706  // lane_offset, int16_t shortCircuit),
1707  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1708  // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1709  // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1710  // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1711  // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1712  // *buffer, int idx, void *reduce_data));
1713  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1714  CGM.Int16Ty, CGM.Int16Ty};
1715  auto *ShuffleReduceFnTy =
1716  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1717  /*isVarArg=*/false);
1718  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1719  auto *InterWarpCopyFnTy =
1720  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1721  /*isVarArg=*/false);
1722  llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1723  CGM.VoidPtrTy};
1724  auto *GlobalListFnTy =
1725  llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1726  /*isVarArg=*/false);
1727  llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1728  CGM.Int32Ty,
1729  CGM.VoidPtrTy,
1730  CGM.Int32Ty,
1731  CGM.VoidPtrTy,
1732  ShuffleReduceFnTy->getPointerTo(),
1733  InterWarpCopyFnTy->getPointerTo(),
1734  GlobalListFnTy->getPointerTo(),
1735  GlobalListFnTy->getPointerTo(),
1736  GlobalListFnTy->getPointerTo(),
1737  GlobalListFnTy->getPointerTo()};
1738  auto *FnTy =
1739  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1740  RTLFn = CGM.CreateRuntimeFunction(
1741  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1742  break;
1743  }
1744  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1745  /// Build void __kmpc_data_sharing_init_stack();
1746  auto *FnTy =
1747  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1748  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1749  break;
1750  }
1751  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1752  /// Build void __kmpc_data_sharing_init_stack_spmd();
1753  auto *FnTy =
1754  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1755  RTLFn =
1756  CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1757  break;
1758  }
1759  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1760  // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1761  // int16_t UseSharedMemory);
1762  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1763  auto *FnTy =
1764  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1765  RTLFn = CGM.CreateRuntimeFunction(
1766  FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1767  break;
1768  }
1769  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1770  // Build void __kmpc_data_sharing_pop_stack(void *a);
1771  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1772  auto *FnTy =
1773  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1774  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1775  /*Name=*/"__kmpc_data_sharing_pop_stack");
1776  break;
1777  }
1778  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1779  /// Build void __kmpc_begin_sharing_variables(void ***args,
1780  /// size_t n_args);
1781  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1782  auto *FnTy =
1783  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1784  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1785  break;
1786  }
1787  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1788  /// Build void __kmpc_end_sharing_variables();
1789  auto *FnTy =
1790  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1791  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1792  break;
1793  }
1794  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1795  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1796  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1797  auto *FnTy =
1798  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1799  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1800  break;
1801  }
1802  case OMPRTL_NVPTX__kmpc_parallel_level: {
1803  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1804  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1805  auto *FnTy =
1806  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1807  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1808  break;
1809  }
1810  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1811  // Build int8_t __kmpc_is_spmd_exec_mode();
1812  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1813  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1814  break;
1815  }
1816  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1817  // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1818  // const void *buf, size_t size, int16_t is_shared, const void **res);
1819  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1820  CGM.Int16Ty, CGM.VoidPtrPtrTy};
1821  auto *FnTy =
1822  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1823  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1824  break;
1825  }
1826  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1827  // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1828  // int16_t is_shared);
1829  llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1830  auto *FnTy =
1831  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1832  RTLFn =
1833  CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1834  break;
1835  }
1836  case OMPRTL__kmpc_barrier: {
1837  // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1838  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1839  auto *FnTy =
1840  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1841  RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1842  cast<llvm::Function>(RTLFn.getCallee())
1843  ->addFnAttr(llvm::Attribute::Convergent);
1844  break;
1845  }
1846  case OMPRTL__kmpc_barrier_simple_spmd: {
1847  // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1848  // global_tid);
1849  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1850  auto *FnTy =
1851  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1852  RTLFn =
1853  CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1854  cast<llvm::Function>(RTLFn.getCallee())
1855  ->addFnAttr(llvm::Attribute::Convergent);
1856  break;
1857  }
1858  }
1859  return RTLFn;
1860 }
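// Editorial note (not in the original source): as an example, requesting
// OMPRTL_NVPTX__kmpc_shuffle_int32 through this switch yields a declaration
// roughly equivalent to
//   declare i32 @__kmpc_shuffle_int32(i32, i16, i16)
// in the device module; later helpers (e.g. createRuntimeShuffleFunction
// below) emit calls that bind to these declarations.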
1861 
1862 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1863  llvm::Constant *Addr,
1864  uint64_t Size, int32_t,
1865  llvm::GlobalValue::LinkageTypes) {
1866  // TODO: Add support for global variables on the device after declare target
1867  // support.
1868  if (!isa<llvm::Function>(Addr))
1869  return;
1870  llvm::Module &M = CGM.getModule();
1871  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1872 
1873  // Get "nvvm.annotations" metadata node
1874  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1875 
1876  llvm::Metadata *MDVals[] = {
1877  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1878  llvm::ConstantAsMetadata::get(
1879  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1880  // Append metadata to nvvm.annotations
1881  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1882 }
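// Editorial note (not in the original source): for a hypothetical entry point
// @__omp_offloading_xx_foo, the operand appended above is roughly equivalent to
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @__omp_offloading_xx_foo, !"kernel", i32 1}
// which the NVPTX backend reads in order to emit the function as a CUDA kernel.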
1883 
1884 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1885  const OMPExecutableDirective &D, StringRef ParentName,
1886  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1887  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1888  if (!IsOffloadEntry) // Nothing to do.
1889  return;
1890 
1891  assert(!ParentName.empty() && "Invalid target region parent name!");
1892 
1893  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1894  if (Mode)
1895  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1896  CodeGen);
1897  else
1898  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1899  CodeGen);
1900 
1901  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1902 }
1903 
1904 namespace {
1906 /// Enum for accessing the reserved_2 field of the ident_t struct.
1907 enum ModeFlagsTy : unsigned {
1908  /// Bit set to 1 when in SPMD mode.
1909  KMP_IDENT_SPMD_MODE = 0x01,
1910  /// Bit set to 1 when a simplified runtime is used.
1911  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1912  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1913 };
1914 
1915 /// Special Undefined mode: the combination of Non-SPMD mode and the simple runtime.
1916 static const ModeFlagsTy UndefinedMode =
1917  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1918 } // anonymous namespace
1919 
1920 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1921  switch (getExecutionMode()) {
1922  case EM_SPMD:
1923  if (requiresFullRuntime())
1924  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1925  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1926  case EM_NonSPMD:
1927  assert(requiresFullRuntime() && "Expected full runtime.");
1928  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1929  case EM_Unknown:
1930  return UndefinedMode;
1931  }
1932  llvm_unreachable("Unknown flags are requested.");
1933 }
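// Editorial note (not in the original source): the encodings returned above
// summarize as follows:
//   SPMD, full runtime      -> KMP_IDENT_SPMD_MODE
//   SPMD, simple runtime    -> KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE
//   Non-SPMD (full runtime) -> 0
//   Unknown                 -> UndefinedMode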
1934 
1935 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1936  : CGOpenMPRuntime(CGM, "_", "$") {
1937  if (!CGM.getLangOpts().OpenMPIsDevice)
1938  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1939 }
1940 
1941 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1942  OpenMPProcBindClauseKind ProcBind,
1943  SourceLocation Loc) {
1944  // Do nothing in case of SPMD mode and L0 parallel.
1945  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1946  return;
1947 
1948  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1949 }
1950 
1951 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1952  llvm::Value *NumThreads,
1953  SourceLocation Loc) {
1954  // Do nothing in case of SPMD mode and L0 parallel.
1955  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1956  return;
1957 
1958  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1959 }
1960 
1961 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1962  const Expr *NumTeams,
1963  const Expr *ThreadLimit,
1964  SourceLocation Loc) {}
1965 
1966 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1967  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1968  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1969  // Emit target region as a standalone region.
1970  class NVPTXPrePostActionTy : public PrePostActionTy {
1971  bool &IsInParallelRegion;
1972  bool PrevIsInParallelRegion;
1973 
1974  public:
1975  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1976  : IsInParallelRegion(IsInParallelRegion) {}
1977  void Enter(CodeGenFunction &CGF) override {
1978  PrevIsInParallelRegion = IsInParallelRegion;
1979  IsInParallelRegion = true;
1980  }
1981  void Exit(CodeGenFunction &CGF) override {
1982  IsInParallelRegion = PrevIsInParallelRegion;
1983  }
1984  } Action(IsInParallelRegion);
1985  CodeGen.setAction(Action);
1986  bool PrevIsInTTDRegion = IsInTTDRegion;
1987  IsInTTDRegion = false;
1988  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1989  IsInTargetMasterThreadRegion = false;
1990  auto *OutlinedFun =
1991  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1992  D, ThreadIDVar, InnermostKind, CodeGen));
1993  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1994  IsInTTDRegion = PrevIsInTTDRegion;
1995  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1996  !IsInParallelRegion) {
1997  llvm::Function *WrapperFun =
1998  createParallelDataSharingWrapper(OutlinedFun, D);
1999  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
2000  }
2001 
2002  return OutlinedFun;
2003 }
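// Editorial note (not in the original source): in non-SPMD mode a top-level
// parallel region's outlined function gets a companion wrapper
// (createParallelDataSharingWrapper) so the workers can invoke it with just
// (ParallelLevel, ThreadID) and recover the captured variables through
// __kmpc_get_shared_variables instead of taking them as direct arguments.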
2004 
2005 /// Get list of lastprivate variables from the teams distribute ... or
2006 /// teams {distribute ...} directives.
2007 static void
2008 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2009  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2010  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2011  "expected teams directive.");
2012  const OMPExecutableDirective *Dir = &D;
2013  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
2014  if (const Stmt *S = getSingleCompoundChild(
2015  Ctx,
2016  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
2017  /*IgnoreCaptured=*/true))) {
2018  Dir = dyn_cast<OMPExecutableDirective>(S);
2019  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
2020  Dir = nullptr;
2021  }
2022  }
2023  if (!Dir)
2024  return;
2025  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
2026  for (const Expr *E : C->getVarRefs())
2027  Vars.push_back(getPrivateItem(E));
2028  }
2029 }
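// Editorial note (not in the original source): as an example, for
//   #pragma omp target teams distribute parallel for lastprivate(x)
// the private copy of 'x' is collected here so the SPMD path in
// emitTeamsOutlinedFunction below can place it in the globalized record.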
2030 
2031 /// Get list of reduction variables from the teams ... directives.
2032 static void
2033 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
2034  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
2035  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
2036  "expected teams directive.");
2037  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
2038  for (const Expr *E : C->privates())
2039  Vars.push_back(getPrivateItem(E));
2040  }
2041 }
2042 
2043 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
2044  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
2045  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
2046  SourceLocation Loc = D.getBeginLoc();
2047 
2048  const RecordDecl *GlobalizedRD = nullptr;
2049  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
2050  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
2051  // Globalize team reduction variables unconditionally in all modes.
2052  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2053  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
2054  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
2055  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2056  if (!LastPrivatesReductions.empty()) {
2057  GlobalizedRD = ::buildRecordForGlobalizedVars(
2058  CGM.getContext(), llvm::None, LastPrivatesReductions,
2059  MappedDeclsFields, WarpSize);
2060  }
2061  } else if (!LastPrivatesReductions.empty()) {
2062  assert(!TeamAndReductions.first &&
2063  "Previous team declaration is not expected.");
2064  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2065  std::swap(TeamAndReductions.second, LastPrivatesReductions);
2066  }
2067 
2068  // Emit target region as a standalone region.
2069  class NVPTXPrePostActionTy : public PrePostActionTy {
2070  SourceLocation &Loc;
2071  const RecordDecl *GlobalizedRD;
2072  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2073  &MappedDeclsFields;
2074 
2075  public:
2076  NVPTXPrePostActionTy(
2077  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2078  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2079  &MappedDeclsFields)
2080  : Loc(Loc), GlobalizedRD(GlobalizedRD),
2081  MappedDeclsFields(MappedDeclsFields) {}
2082  void Enter(CodeGenFunction &CGF) override {
2083  auto &Rt =
2084  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2085  if (GlobalizedRD) {
2086  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2087  I->getSecond().GlobalRecord = GlobalizedRD;
2088  I->getSecond().MappedParams =
2089  llvm::make_unique<CodeGenFunction::OMPMapVars>();
2090  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2091  for (const auto &Pair : MappedDeclsFields) {
2092  assert(Pair.getFirst()->isCanonicalDecl() &&
2093  "Expected canonical declaration");
2094  Data.insert(std::make_pair(Pair.getFirst(),
2095  MappedVarData(Pair.getSecond(),
2096  /*IsOnePerTeam=*/true)));
2097  }
2098  }
2099  Rt.emitGenericVarsProlog(CGF, Loc);
2100  }
2101  void Exit(CodeGenFunction &CGF) override {
2102  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2103  .emitGenericVarsEpilog(CGF);
2104  }
2105  } Action(Loc, GlobalizedRD, MappedDeclsFields);
2106  CodeGen.setAction(Action);
2107  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2108  D, ThreadIDVar, InnermostKind, CodeGen);
2109  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2110  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2111  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2112 
2113  return OutlinedFun;
2114 }
2115 
2116 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2117  SourceLocation Loc,
2118  bool WithSPMDCheck) {
2120  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2121  return;
2122 
2123  CGBuilderTy &Bld = CGF.Builder;
2124 
2125  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2126  if (I == FunctionGlobalizedDecls.end())
2127  return;
2128  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2129  QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2130  QualType SecGlobalRecTy;
2131 
2132  // Recover pointer to this function's global record. The runtime will
2133  // handle the specifics of the allocation of the memory.
2134  // Use actual memory size of the record including the padding
2135  // for alignment purposes.
2136  unsigned Alignment =
2137  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2138  unsigned GlobalRecordSize =
2139  CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2140  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2141 
2142  llvm::PointerType *GlobalRecPtrTy =
2143  CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2144  llvm::Value *GlobalRecCastAddr;
2145  llvm::Value *IsTTD = nullptr;
2146  if (!IsInTTDRegion &&
2147  (WithSPMDCheck ||
2148  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2149  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2150  llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2151  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2152  if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2153  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2154  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2155  llvm::Value *PL = CGF.EmitRuntimeCall(
2156  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2157  {RTLoc, ThreadID});
2158  IsTTD = Bld.CreateIsNull(PL);
2159  }
2160  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2161  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2162  Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2163  // There is no need to emit line number for unconditional branch.
2164  (void)ApplyDebugLocation::CreateEmpty(CGF);
2165  CGF.EmitBlock(SPMDBB);
2166  Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2167  CharUnits::fromQuantity(Alignment));
2168  CGF.EmitBranch(ExitBB);
2169  // There is no need to emit line number for unconditional branch.
2170  (void)ApplyDebugLocation::CreateEmpty(CGF);
2171  CGF.EmitBlock(NonSPMDBB);
2172  llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2173  if (const RecordDecl *SecGlobalizedVarsRecord =
2174  I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2175  SecGlobalRecTy =
2176  CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2177 
2178  // Recover pointer to this function's global record. The runtime will
2179  // handle the specifics of the allocation of the memory.
2180  // Use actual memory size of the record including the padding
2181  // for alignment purposes.
2182  unsigned Alignment =
2183  CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2184  unsigned GlobalRecordSize =
2185  CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2186  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2187  Size = Bld.CreateSelect(
2188  IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2189  }
2190  // TODO: allow the usage of shared memory to be controlled by
2191  // the user, for now, default to global.
2192  llvm::Value *GlobalRecordSizeArg[] = {
2193  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2194  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2195  createNVPTXRuntimeFunction(
2196  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2197  GlobalRecordSizeArg);
2198  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2199  GlobalRecValue, GlobalRecPtrTy);
2200  CGF.EmitBlock(ExitBB);
2201  auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2202  /*NumReservedValues=*/2, "_select_stack");
2203  Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2204  Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2205  GlobalRecCastAddr = Phi;
2206  I->getSecond().GlobalRecordAddr = Phi;
2207  I->getSecond().IsInSPMDModeFlag = IsSPMD;
2208  } else if (IsInTTDRegion) {
2209  assert(GlobalizedRecords.back().Records.size() < 2 &&
2210  "Expected less than 2 globalized records: one for target and one "
2211  "for teams.");
2212  unsigned Offset = 0;
2213  for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2214  QualType RDTy = CGM.getContext().getRecordType(RD);
2215  unsigned Alignment =
2216  CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2217  unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2218  Offset =
2219  llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2220  }
2221  unsigned Alignment =
2222  CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2223  Offset = llvm::alignTo(Offset, Alignment);
2224  GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2225  ++GlobalizedRecords.back().RegionCounter;
2226  if (GlobalizedRecords.back().Records.size() == 1) {
2227  assert(KernelStaticGlobalized &&
2228  "Kernel static pointer must be initialized already.");
2229  auto *UseSharedMemory = new llvm::GlobalVariable(
2230  CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2232  "_openmp_static_kernel$is_shared");
2233  UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2234  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2235  /*DestWidth=*/16, /*Signed=*/0);
2236  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2237  Address(UseSharedMemory,
2238  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2239  /*Volatile=*/false, Int16Ty, Loc);
2240  auto *StaticGlobalized = new llvm::GlobalVariable(
2241  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2242  llvm::GlobalValue::CommonLinkage, nullptr);
2243  auto *RecSize = new llvm::GlobalVariable(
2244  CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2246  "_openmp_static_kernel$size");
2247  RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2248  llvm::Value *Ld = CGF.EmitLoadOfScalar(
2249  Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2250  CGM.getContext().getSizeType(), Loc);
2251  llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2252  KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2253  llvm::Value *GlobalRecordSizeArg[] = {
2254  llvm::ConstantInt::get(
2255  CGM.Int16Ty,
2256  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2257  StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2258  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2259  OMPRTL_NVPTX__kmpc_get_team_static_memory),
2260  GlobalRecordSizeArg);
2261  GlobalizedRecords.back().Buffer = StaticGlobalized;
2262  GlobalizedRecords.back().RecSize = RecSize;
2263  GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2264  GlobalizedRecords.back().Loc = Loc;
2265  }
2266  assert(KernelStaticGlobalized && "Global address must be set already.");
2267  Address FrameAddr = CGF.EmitLoadOfPointer(
2268  Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2269  CGM.getContext()
2270  .getPointerType(CGM.getContext().VoidPtrTy)
2271  .castAs<PointerType>());
2272  llvm::Value *GlobalRecValue =
2273  Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2274  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2275  I->getSecond().IsInSPMDModeFlag = nullptr;
2276  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2277  GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2278  } else {
2279  // TODO: allow the usage of shared memory to be controlled by
2280  // the user, for now, default to global.
2281  llvm::Value *GlobalRecordSizeArg[] = {
2282  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2283  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2284  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2285  createNVPTXRuntimeFunction(
2286  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2287  GlobalRecordSizeArg);
2288  GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2289  GlobalRecValue, GlobalRecPtrTy);
2290  I->getSecond().GlobalRecordAddr = GlobalRecValue;
2291  I->getSecond().IsInSPMDModeFlag = nullptr;
2292  }
2293  LValue Base =
2294  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2295 
2296  // Emit the "global alloca" which is a GEP from the global declaration
2297  // record using the pointer returned by the runtime.
2298  LValue SecBase;
2299  decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2300  if (IsTTD) {
2301  SecIt = I->getSecond().SecondaryLocalVarData->begin();
2302  llvm::PointerType *SecGlobalRecPtrTy =
2303  CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2304  SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2305  Bld.CreatePointerBitCastOrAddrSpaceCast(
2306  I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2307  SecGlobalRecTy);
2308  }
2309  for (auto &Rec : I->getSecond().LocalVarData) {
2310  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2311  llvm::Value *ParValue;
2312  if (EscapedParam) {
2313  const auto *VD = cast<VarDecl>(Rec.first);
2314  LValue ParLVal =
2315  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2316  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2317  }
2318  LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2319  // Emit VarAddr based on the lane id if required.
2320  QualType VarTy;
2321  if (Rec.second.IsOnePerTeam) {
2322  VarTy = Rec.second.FD->getType();
2323  } else {
2324  llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2325  VarAddr.getAddress().getPointer(),
2326  {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2327  VarTy =
2328  Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2329  VarAddr = CGF.MakeAddrLValue(
2330  Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2331  AlignmentSource::Decl);
2332  }
2333  Rec.second.PrivateAddr = VarAddr.getAddress();
2334  if (!IsInTTDRegion &&
2335  (WithSPMDCheck ||
2336  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2337  assert(I->getSecond().IsInSPMDModeFlag &&
2338  "Expected unknown execution mode or required SPMD check.");
2339  if (IsTTD) {
2340  assert(SecIt->second.IsOnePerTeam &&
2341  "Secondary glob data must be one per team.");
2342  LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2343  VarAddr.setAddress(
2344  Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2345  VarAddr.getPointer()),
2346  VarAddr.getAlignment()));
2347  Rec.second.PrivateAddr = VarAddr.getAddress();
2348  }
2349  Address GlobalPtr = Rec.second.PrivateAddr;
2350  Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2351  Rec.second.PrivateAddr = Address(
2352  Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2353  LocalAddr.getPointer(), GlobalPtr.getPointer()),
2354  LocalAddr.getAlignment());
2355  }
2356  if (EscapedParam) {
2357  const auto *VD = cast<VarDecl>(Rec.first);
2358  CGF.EmitStoreOfScalar(ParValue, VarAddr);
2359  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2360  }
2361  if (IsTTD)
2362  ++SecIt;
2363  }
2364  }
2365  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2366  // Recover pointer to this function's global record. The runtime will
2367  // handle the specifics of the allocation of the memory.
2368  // Use actual memory size of the record including the padding
2369  // for alignment purposes.
2370  CGBuilderTy &Bld = CGF.Builder;
2371  llvm::Value *Size = CGF.getTypeSize(VD->getType());
2372  CharUnits Align = CGM.getContext().getDeclAlign(VD);
2373  Size = Bld.CreateNUWAdd(
2374  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2375  llvm::Value *AlignVal =
2376  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2377  Size = Bld.CreateUDiv(Size, AlignVal);
2378  Size = Bld.CreateNUWMul(Size, AlignVal);
2379  // TODO: allow the usage of shared memory to be controlled by
2380  // the user, for now, default to global.
2381  llvm::Value *GlobalRecordSizeArg[] = {
2382  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2383  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2384  createNVPTXRuntimeFunction(
2385  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2386  GlobalRecordSizeArg);
2387  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2388  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2389  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2390  CGM.getContext().getDeclAlign(VD),
2391  AlignmentSource::Decl);
2392  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2393  Base.getAddress());
2394  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2395  }
2396  I->getSecond().MappedParams->apply(CGF);
2397 }
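// Editorial note (not in the original source): as an illustration, in a
// generic-mode region such as
//   #pragma omp target teams
//   { int x = 0;
//     #pragma omp parallel
//     x = omp_get_thread_num(); }
// 'x' escapes into the parallel region, so the prolog above places it in a
// globalized record obtained from __kmpc_data_sharing_coalesced_push_stack
// (or in the statically allocated team buffer when inside the target/teams
// master region), and emitGenericVarsEpilog below releases that record.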
2398 
2399 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2400  bool WithSPMDCheck) {
2402  getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2403  return;
2404 
2405  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2406  if (I != FunctionGlobalizedDecls.end()) {
2407  I->getSecond().MappedParams->restore(CGF);
2408  if (!CGF.HaveInsertPoint())
2409  return;
2410  for (llvm::Value *Addr :
2411  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2412  CGF.EmitRuntimeCall(
2413  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2414  Addr);
2415  }
2416  if (I->getSecond().GlobalRecordAddr) {
2417  if (!IsInTTDRegion &&
2418  (WithSPMDCheck ||
2419  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2420  CGBuilderTy &Bld = CGF.Builder;
2421  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2422  llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2423  Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2424  // There is no need to emit line number for unconditional branch.
2425  (void)ApplyDebugLocation::CreateEmpty(CGF);
2426  CGF.EmitBlock(NonSPMDBB);
2427  CGF.EmitRuntimeCall(
2428  createNVPTXRuntimeFunction(
2429  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2430  CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2431  CGF.EmitBlock(ExitBB);
2432  } else if (IsInTTDRegion) {
2433  assert(GlobalizedRecords.back().RegionCounter > 0 &&
2434  "region counter must be > 0.");
2435  --GlobalizedRecords.back().RegionCounter;
2436  // Emit the restore function only in the target region.
2437  if (GlobalizedRecords.back().RegionCounter == 0) {
2438  QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2439  /*DestWidth=*/16, /*Signed=*/0);
2440  llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2441  Address(GlobalizedRecords.back().UseSharedMemory,
2442  CGM.getContext().getTypeAlignInChars(Int16Ty)),
2443  /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2444  llvm::Value *Args[] = {
2445  llvm::ConstantInt::get(
2446  CGM.Int16Ty,
2447  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2448  IsInSharedMemory};
2449  CGF.EmitRuntimeCall(
2450  createNVPTXRuntimeFunction(
2451  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2452  Args);
2453  }
2454  } else {
2455  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2456  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2457  I->getSecond().GlobalRecordAddr);
2458  }
2459  }
2460  }
2461 }
2462 
2463 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2464  const OMPExecutableDirective &D,
2465  SourceLocation Loc,
2466  llvm::Function *OutlinedFn,
2467  ArrayRef<llvm::Value *> CapturedVars) {
2468  if (!CGF.HaveInsertPoint())
2469  return;
2470 
2471  Address ZeroAddr = CGF.CreateMemTemp(
2472  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2473  /*Name*/ ".zero.addr");
2474  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2475  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2476  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2477  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2478  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2479  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2480 }
2481 
2482 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2483  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2484  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2485  if (!CGF.HaveInsertPoint())
2486  return;
2487 
2488  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2489  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2490  else
2491  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2492 }
2493 
2494 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2495  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2496  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2497  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2498 
2499  // Force inline this outlined function at its call site.
2500  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2501 
2502  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2503  /*DestWidth=*/32, /*Signed=*/1),
2504  ".zero.addr");
2505  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2506  // ThreadId for serialized parallels is 0.
2507  Address ThreadIDAddr = ZeroAddr;
2508  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2509  CodeGenFunction &CGF, PrePostActionTy &Action) {
2510  Action.Enter(CGF);
2511 
2512  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2513  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2514  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2515  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2516  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2517  };
2518  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2519  PrePostActionTy &) {
2520 
2521  RegionCodeGenTy RCG(CodeGen);
2522  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2523  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2524  llvm::Value *Args[] = {RTLoc, ThreadID};
2525 
2526  NVPTXActionTy Action(
2527  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2528  Args,
2529  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2530  Args);
2531  RCG.setAction(Action);
2532  RCG(CGF);
2533  };
2534 
2535  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2536  PrePostActionTy &Action) {
2537  CGBuilderTy &Bld = CGF.Builder;
2538  llvm::Function *WFn = WrapperFunctionsMap[Fn];
2539  assert(WFn && "Wrapper function does not exist!");
2540  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2541 
2542  // Prepare for parallel region. Indicate the outlined function.
2543  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2544  CGF.EmitRuntimeCall(
2545  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2546  Args);
2547 
2548  // Create a private scope that will globalize the arguments
2549  // passed from the outside of the target region.
2550  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2551 
2552  // There's something to share.
2553  if (!CapturedVars.empty()) {
2554  // Begin sharing the captured variables with the workers.
2555  Address SharedArgs =
2556  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2557  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2558 
2559  llvm::Value *DataSharingArgs[] = {
2560  SharedArgsPtr,
2561  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2562  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2563  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2564  DataSharingArgs);
2565 
2566  // Store variable address in a list of references to pass to workers.
2567  unsigned Idx = 0;
2568  ASTContext &Ctx = CGF.getContext();
2569  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2570  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2571  .castAs<PointerType>());
2572  for (llvm::Value *V : CapturedVars) {
2573  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2574  llvm::Value *PtrV;
2575  if (V->getType()->isIntegerTy())
2576  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2577  else
2578  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2579  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2580  Ctx.getPointerType(Ctx.VoidPtrTy));
2581  ++Idx;
2582  }
2583  }
2584 
2585  // Activate workers. This barrier is used by the master to signal
2586  // work for the workers.
2587  syncCTAThreads(CGF);
2588 
2589  // OpenMP [2.5, Parallel Construct, p.49]
2590  // There is an implied barrier at the end of a parallel region. After the
2591  // end of a parallel region, only the master thread of the team resumes
2592  // execution of the enclosing task region.
2593  //
2594  // The master waits at this barrier until all workers are done.
2595  syncCTAThreads(CGF);
2596 
2597  if (!CapturedVars.empty())
2598  CGF.EmitRuntimeCall(
2599  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2600 
2601  // Remember for post-processing in worker loop.
2602  Work.emplace_back(WFn);
2603  };
2604 
2605  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2606  CodeGenFunction &CGF, PrePostActionTy &Action) {
2607  if (IsInParallelRegion) {
2608  SeqGen(CGF, Action);
2609  } else if (IsInTargetMasterThreadRegion) {
2610  L0ParallelGen(CGF, Action);
2611  } else {
2612  // Check for master and then parallelism:
2613  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2614  // Serialized execution.
2615  // } else {
2616  // Worker call.
2617  // }
2618  CGBuilderTy &Bld = CGF.Builder;
2619  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2620  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2621  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2622  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2623  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2624  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2625  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2626  // There is no need to emit line number for unconditional branch.
2627  (void)ApplyDebugLocation::CreateEmpty(CGF);
2628  CGF.EmitBlock(ParallelCheckBB);
2629  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2630  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2631  llvm::Value *PL = CGF.EmitRuntimeCall(
2632  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2633  {RTLoc, ThreadID});
2634  llvm::Value *Res = Bld.CreateIsNotNull(PL);
2635  Bld.CreateCondBr(Res, SeqBB, MasterBB);
2636  CGF.EmitBlock(SeqBB);
2637  SeqGen(CGF, Action);
2638  CGF.EmitBranch(ExitBB);
2639  // There is no need to emit line number for unconditional branch.
2640  (void)ApplyDebugLocation::CreateEmpty(CGF);
2641  CGF.EmitBlock(MasterBB);
2642  L0ParallelGen(CGF, Action);
2643  CGF.EmitBranch(ExitBB);
2644  // There is no need to emit line number for unconditional branch.
2645  (void)ApplyDebugLocation::CreateEmpty(CGF);
2646  // Emit the continuation block for code after the if.
2647  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2648  }
2649  };
2650 
2651  if (IfCond) {
2652  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2653  } else {
2654  CodeGenFunction::RunCleanupsScope Scope(CGF);
2655  RegionCodeGenTy ThenRCG(LNParallelGen);
2656  ThenRCG(CGF);
2657  }
2658 }
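// Editorial note (not in the original source): summarizing the paths chosen
// above for a non-SPMD '#pragma omp parallel': nested inside another parallel
// region it is serialized (SeqGen); in the target master thread the work is
// posted to the workers via __kmpc_kernel_prepare_parallel and the CTA barrier
// (L0ParallelGen); otherwise a runtime check on __kmpc_is_spmd_exec_mode() and
// __kmpc_parallel_level() selects between the two at execution time.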
2659 
2660 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2661  CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2662  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2663  // Just call the outlined function to execute the parallel region.
2664  // OutlinedFn(&GTid, &zero, CapturedStruct);
2665  //
2666  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2667 
2668  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2669  /*DestWidth=*/32, /*Signed=*/1),
2670  ".zero.addr");
2671  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2672  // ThreadId for serialized parallels is 0.
2673  Address ThreadIDAddr = ZeroAddr;
2674  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2675  &ThreadIDAddr](CodeGenFunction &CGF,
2676  PrePostActionTy &Action) {
2677  Action.Enter(CGF);
2678 
2679  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2680  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2681  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2682  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2683  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2684  };
2685  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2686  PrePostActionTy &) {
2687 
2688  RegionCodeGenTy RCG(CodeGen);
2689  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2690  llvm::Value *ThreadID = getThreadID(CGF, Loc);
2691  llvm::Value *Args[] = {RTLoc, ThreadID};
2692 
2693  NVPTXActionTy Action(
2694  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2695  Args,
2696  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2697  Args);
2698  RCG.setAction(Action);
2699  RCG(CGF);
2700  };
2701 
2702  if (IsInTargetMasterThreadRegion) {
2703  // In the worker we need to use the real thread id.
2704  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2705  RegionCodeGenTy RCG(CodeGen);
2706  RCG(CGF);
2707  } else {
2708  // If we are not in the target master thread region, it is definitely L2
2709  // parallelism or more, because in SPMD mode we always have an L1 parallel
2710  // level, so we don't need to check for orphaned directives.
2711  RegionCodeGenTy RCG(SeqGen);
2712  RCG(CGF);
2713  }
2714 }
2715 
2716 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2717  // Always emit simple barriers!
2718  if (!CGF.HaveInsertPoint())
2719  return;
2720  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2721  // This function does not use parameters, so we can emit just default values.
2722  llvm::Value *Args[] = {
2723  llvm::ConstantPointerNull::get(
2724  cast<llvm::PointerType>(getIdentTyPointerTy())),
2725  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2726  CGF.EmitRuntimeCall(
2727  createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2728 }
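// Editorial note (not in the original source): the emitted call is simply
//   __kmpc_barrier_simple_spmd(nullptr, 0);
// i.e. a CTA-wide barrier used to hand work between the master thread and the
// workers; the null ident_t and zero thread id are default values the simple
// barrier does not use.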
2729 
2730 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2731  SourceLocation Loc,
2732  OpenMPDirectiveKind Kind, bool,
2733  bool) {
2734  // Always emit simple barriers!
2735  if (!CGF.HaveInsertPoint())
2736  return;
2737  // Build call __kmpc_barrier(loc, thread_id);
2738  unsigned Flags = getDefaultFlagsForBarriers(Kind);
2739  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2740  getThreadID(CGF, Loc)};
2741  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2742 }
2743 
2744 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2745  CodeGenFunction &CGF, StringRef CriticalName,
2746  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2747  const Expr *Hint) {
2748  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2749  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2750  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2751  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2752  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2753 
2754  // Fetch team-local id of the thread.
2755  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2756 
2757  // Get the width of the team.
2758  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2759 
2760  // Initialize the counter variable for the loop.
2761  QualType Int32Ty =
2762  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2763  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2764  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2765  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2766  /*isInit=*/true);
2767 
2768  // Block checks if loop counter exceeds upper bound.
2769  CGF.EmitBlock(LoopBB);
2770  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2771  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2772  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2773 
2774  // Block tests which single thread should execute region, and which threads
2775  // should go straight to synchronisation point.
2776  CGF.EmitBlock(TestBB);
2777  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2778  llvm::Value *CmpThreadToCounter =
2779  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2780  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2781 
2782  // Block emits the body of the critical region.
2783  CGF.EmitBlock(BodyBB);
2784 
2785  // Output the critical statement.
2786  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2787  Hint);
2788 
2789  // After the body surrounded by the critical region, the single executing
2790  // thread will jump to the synchronisation point.
2791  // Block waits for all threads in current team to finish then increments the
2792  // counter variable and returns to the loop.
2793  CGF.EmitBlock(SyncBB);
2794  emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2795  /*ForceSimpleCall=*/true);
2796 
2797  llvm::Value *IncCounterVal =
2798  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2799  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2800  CGF.EmitBranch(LoopBB);
2801 
2802  // Block that is reached when all threads in the team complete the region.
2803  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2804 }
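// Editorial note (not in the original source): pseudocode for the sequence
// emitted above, per team:
//   for (counter = 0; counter < team_width; ++counter) {  // omp.critical.loop
//     if (thread_id == counter)                           // omp.critical.test
//       <critical body>;                                  // omp.critical.body
//     barrier();                                          // omp.critical.sync
//   }
// so at most one thread of the team is inside the critical region at a time.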
2805 
2806 /// Cast value to the specified type.
2807 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2808  QualType ValTy, QualType CastTy,
2809  SourceLocation Loc) {
2810  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2811  "Cast type must be sized.");
2812  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2813  "Val type must be sized.");
2814  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2815  if (ValTy == CastTy)
2816  return Val;
2817  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2818  CGF.getContext().getTypeSizeInChars(CastTy))
2819  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2820  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2821  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2822  CastTy->hasSignedIntegerRepresentation());
2823  Address CastItem = CGF.CreateMemTemp(CastTy);
2824  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2825  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2826  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2827  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2828 }
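// Editorial note (not in the original source): for example, a 'float' value is
// bit-cast to an i32 of the same width before being shuffled, while an i16 is
// integer-extended to i32 and truncated back after the shuffle; values that
// are neither the same size as the 32/64-bit shuffle width nor integers take
// the memory round-trip through CastItem above.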
2829 
2830 /// This function creates calls to one of two shuffle functions to copy
2831 /// variables between lanes in a warp.
2832 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2833  llvm::Value *Elem,
2834  QualType ElemType,
2835  llvm::Value *Offset,
2836  SourceLocation Loc) {
2837  CodeGenModule &CGM = CGF.CGM;
2838  CGBuilderTy &Bld = CGF.Builder;
2839  CGOpenMPRuntimeNVPTX &RT =
2840  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2841 
2842  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2843  assert(Size.getQuantity() <= 8 &&
2844  "Unsupported bitwidth in shuffle instruction.");
2845 
2846  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2847  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2848  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2849 
2850  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2851  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2852  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2853  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2854  llvm::Value *WarpSize =
2855  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2856 
2857  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2858  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2859 
2860  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2861 }
2862 
2863 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2864  Address DestAddr, QualType ElemType,
2865  llvm::Value *Offset, SourceLocation Loc) {
2866  CGBuilderTy &Bld = CGF.Builder;
2867 
2868  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2869  // Create the loop over the big sized data.
2870  // ptr = (void*)Elem;
2871  // ptrEnd = (void*) Elem + 1;
2872  // Step = 8;
2873  // while (ptr + Step < ptrEnd)
2874  // shuffle((int64_t)*ptr);
2875  // Step = 4;
2876  // while (ptr + Step < ptrEnd)
2877  // shuffle((int32_t)*ptr);
2878  // ...
2879  Address ElemPtr = DestAddr;
2880  Address Ptr = SrcAddr;
2881  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2882  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2883  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2884  if (Size < CharUnits::fromQuantity(IntSize))
2885  continue;
2886  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2887  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2888  /*Signed=*/1);
2889  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2890  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2891  ElemPtr =
2892  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2893  if (Size.getQuantity() / IntSize > 1) {
2894  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2895  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2896  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2897  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2898  CGF.EmitBlock(PreCondBB);
2899  llvm::PHINode *PhiSrc =
2900  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2901  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2902  llvm::PHINode *PhiDest =
2903  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2904  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2905  Ptr = Address(PhiSrc, Ptr.getAlignment());
2906  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2907  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2908  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2909  Ptr.getPointer(), CGF.VoidPtrTy));
2910  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2911  ThenBB, ExitBB);
2912  CGF.EmitBlock(ThenBB);
2913  llvm::Value *Res = createRuntimeShuffleFunction(
2914  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2915  IntType, Offset, Loc);
2916  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2917  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2918  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2919  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2920  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2921  CGF.EmitBranch(PreCondBB);
2922  CGF.EmitBlock(ExitBB);
2923  } else {
2924  llvm::Value *Res = createRuntimeShuffleFunction(
2925  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2926  IntType, Offset, Loc);
2927  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2928  Ptr = Bld.CreateConstGEP(Ptr, 1);
2929  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2930  }
2931  Size = Size % IntSize;
2932  }
2933 }
2934 
2935 namespace {
2936 enum CopyAction : unsigned {
2937  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2938  // the warp using shuffle instructions.
2939  RemoteLaneToThread,
2940  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2941  ThreadCopy,
2942  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2943  ThreadToScratchpad,
2944  // ScratchpadToThread: Copy from a scratchpad array in global memory
2945  // containing team-reduced data to a thread's stack.
2946  ScratchpadToThread,
2947 };
2948 } // namespace
2949 
2950 struct CopyOptionsTy {
2951  llvm::Value *RemoteLaneOffset;
2952  llvm::Value *ScratchpadIndex;
2953  llvm::Value *ScratchpadWidth;
2954 };
2955 
2956 /// Emit instructions to copy a Reduce list, which contains partially
2957 /// aggregated values, in the specified direction.
2958 static void emitReductionListCopy(
2959  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2960  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2961  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2962 
2963  CodeGenModule &CGM = CGF.CGM;
2964  ASTContext &C = CGM.getContext();
2965  CGBuilderTy &Bld = CGF.Builder;
2966 
2967  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2968  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2969  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2970 
2971  // Iterate, element-by-element, through the source Reduce list and
2972  // make a copy.
2973  unsigned Idx = 0;
2974  unsigned Size = Privates.size();
2975  for (const Expr *Private : Privates) {
2976  Address SrcElementAddr = Address::invalid();
2977  Address DestElementAddr = Address::invalid();
2978  Address DestElementPtrAddr = Address::invalid();
2979  // Should we shuffle in an element from a remote lane?
2980  bool ShuffleInElement = false;
2981  // Set to true to update the pointer in the dest Reduce list to a
2982  // newly created element.
2983  bool UpdateDestListPtr = false;
2984  // Increment the src or dest pointer to the scratchpad, for each
2985  // new element.
2986  bool IncrScratchpadSrc = false;
2987  bool IncrScratchpadDest = false;
2988 
2989  switch (Action) {
2990  case RemoteLaneToThread: {
2991  // Step 1.1: Get the address for the src element in the Reduce list.
2992  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2993  SrcElementAddr = CGF.EmitLoadOfPointer(
2994  SrcElementPtrAddr,
2995  C.getPointerType(Private->getType())->castAs<PointerType>());
2996 
2997  // Step 1.2: Create a temporary to store the element in the destination
2998  // Reduce list.
2999  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3000  DestElementAddr =
3001  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3002  ShuffleInElement = true;
3003  UpdateDestListPtr = true;
3004  break;
3005  }
3006  case ThreadCopy: {
3007  // Step 1.1: Get the address for the src element in the Reduce list.
3008  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3009  SrcElementAddr = CGF.EmitLoadOfPointer(
3010  SrcElementPtrAddr,
3011  C.getPointerType(Private->getType())->castAs<PointerType>());
3012 
3013  // Step 1.2: Get the address for dest element. The destination
3014  // element has already been created on the thread's stack.
3015  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3016  DestElementAddr = CGF.EmitLoadOfPointer(
3017  DestElementPtrAddr,
3018  C.getPointerType(Private->getType())->castAs<PointerType>());
3019  break;
3020  }
3021  case ThreadToScratchpad: {
3022  // Step 1.1: Get the address for the src element in the Reduce list.
3023  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
3024  SrcElementAddr = CGF.EmitLoadOfPointer(
3025  SrcElementPtrAddr,
3026  C.getPointerType(Private->getType())->castAs<PointerType>());
3027 
3028  // Step 1.2: Get the address for dest element:
3029  // address = base + index * ElementSizeInChars.
3030  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3031  llvm::Value *CurrentOffset =
3032  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3033  llvm::Value *ScratchPadElemAbsolutePtrVal =
3034  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
3035  ScratchPadElemAbsolutePtrVal =
3036  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3037  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3038  C.getTypeAlignInChars(Private->getType()));
3039  IncrScratchpadDest = true;
3040  break;
3041  }
3042  case ScratchpadToThread: {
3043  // Step 1.1: Get the address for the src element in the scratchpad.
3044  // address = base + index * ElementSizeInChars.
3045  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3046  llvm::Value *CurrentOffset =
3047  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
3048  llvm::Value *ScratchPadElemAbsolutePtrVal =
3049  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
3050  ScratchPadElemAbsolutePtrVal =
3051  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
3052  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
3053  C.getTypeAlignInChars(Private->getType()));
3054  IncrScratchpadSrc = true;
3055 
3056  // Step 1.2: Create a temporary to store the element in the destination
3057  // Reduce list.
3058  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3059  DestElementAddr =
3060  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3061  UpdateDestListPtr = true;
3062  break;
3063  }
3064  }
3065 
3066  // Regardless of src and dest of copy, we emit the load of src
3067  // element as this is required in all directions
3068  SrcElementAddr = Bld.CreateElementBitCast(
3069  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3070  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3071  SrcElementAddr.getElementType());
3072 
3073  // Now that all active lanes have read the element in the
3074  // Reduce list, shuffle over the value from the remote lane.
3075  if (ShuffleInElement) {
3076  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3077  RemoteLaneOffset, Private->getExprLoc());
3078  } else {
3079  switch (CGF.getEvaluationKind(Private->getType())) {
3080  case TEK_Scalar: {
3081  llvm::Value *Elem =
3082  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3083  Private->getType(), Private->getExprLoc());
3084  // Store the source element value to the dest element address.
3085  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3086  Private->getType());
3087  break;
3088  }
3089  case TEK_Complex: {
3090  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3091  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3092  Private->getExprLoc());
3093  CGF.EmitStoreOfComplex(
3094  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3095  /*isInit=*/false);
3096  break;
3097  }
3098  case TEK_Aggregate:
3099  CGF.EmitAggregateCopy(
3100  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3101  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3102  Private->getType(), AggValueSlot::DoesNotOverlap);
3103  break;
3104  }
3105  }
3106 
3107  // Step 3.1: Modify reference in dest Reduce list as needed.
3108  // Modifying the reference in Reduce list to point to the newly
3109  // created element. The element is live in the current function
3110  // scope and that of functions it invokes (i.e., reduce_function).
3111  // RemoteReduceData[i] = (void*)&RemoteElem
3112  if (UpdateDestListPtr) {
3113  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3114  DestElementAddr.getPointer(), CGF.VoidPtrTy),
3115  DestElementPtrAddr, /*Volatile=*/false,
3116  C.VoidPtrTy);
3117  }
3118 
3119  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3120  // address of the next element in scratchpad memory, unless we're currently
3121  // processing the last one. Memory alignment is also taken care of here.
3122  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3123  llvm::Value *ScratchpadBasePtr =
3124  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3125  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3126  ScratchpadBasePtr = Bld.CreateNUWAdd(
3127  ScratchpadBasePtr,
3128  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3129 
3130  // Take care of global memory alignment for performance
3131  ScratchpadBasePtr = Bld.CreateNUWSub(
3132  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3133  ScratchpadBasePtr = Bld.CreateUDiv(
3134  ScratchpadBasePtr,
3135  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3136  ScratchpadBasePtr = Bld.CreateNUWAdd(
3137  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3138  ScratchpadBasePtr = Bld.CreateNUWMul(
3139  ScratchpadBasePtr,
3140  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3141 
3142  if (IncrScratchpadDest)
3143  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3144  else /* IncrScratchpadSrc = true */
3145  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3146  }
3147 
3148  ++Idx;
3149  }
3150 }
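// To make the RemoteLaneToThread direction concrete, assume a Reduce list with
// a single 'double' element (all names below are illustrative pseudocode, not
// emitted symbols).  The generated copy is roughly:
//
//   double RemoteElem;                     // ".omp.reduction.element" temp
//   RemoteElem = shuffle_down(*(double *)SrcList[0], RemoteLaneOffset);
//   DestList[0] = (void *)&RemoteElem;     // UpdateDestListPtr
//
// so the destination Reduce list ends up pointing at a freshly shuffled-in
// private copy rather than at the remote lane's storage.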
3151 
3152 /// This function emits a helper that gathers Reduce lists from the first
3153 /// lane of every active warp to lanes in the first warp.
3154 ///
3155 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3156 /// shared smem[warp_size];
3157 /// For all data entries D in reduce_data:
3158 /// sync
3159 /// If (I am the first lane in each warp)
3160 /// Copy my local D to smem[warp_id]
3161 /// sync
3162 /// if (I am the first warp)
3163 /// Copy smem[thread_id] to my local D
3164 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3165  ArrayRef<const Expr *> Privates,
3166  QualType ReductionArrayTy,
3167  SourceLocation Loc) {
3168  ASTContext &C = CGM.getContext();
3169  llvm::Module &M = CGM.getModule();
3170 
3171  // ReduceList: thread local Reduce list.
3172  // At the stage of the computation when this function is called, partially
3173  // aggregated values reside in the first lane of every active warp.
3174  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3175  C.VoidPtrTy, ImplicitParamDecl::Other);
3176  // NumWarps: number of warps active in the parallel region. This could
3177  // be smaller than 32 (max warps in a CTA) for partial block reduction.
3178  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3179  C.getIntTypeForBitwidth(32, /* Signed */ true),
3180  ImplicitParamDecl::Other);
3181  FunctionArgList Args;
3182  Args.push_back(&ReduceListArg);
3183  Args.push_back(&NumWarpsArg);
3184 
3185  const CGFunctionInfo &CGFI =
3186  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3187  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3188  llvm::GlobalValue::InternalLinkage,
3189  "_omp_reduction_inter_warp_copy_func", &M);
3190  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3191  Fn->setDoesNotRecurse();
3192  CodeGenFunction CGF(CGM);
3193  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3194 
3195  CGBuilderTy &Bld = CGF.Builder;
3196 
3197  // This array is used as a medium to transfer, one reduce element at a time,
3198  // the data from the first lane of every warp to lanes in the first warp
3199  // in order to perform the final step of a reduction in a parallel region
3200  // (reduction across warps). The array is placed in NVPTX __shared__ memory
3201  // for reduced latency, as well as to have a distinct copy for concurrently
3202  // executing target regions. The array is declared with common linkage so
3203  // as to be shared across compilation units.
3204  StringRef TransferMediumName =
3205  "__openmp_nvptx_data_transfer_temporary_storage";
3206  llvm::GlobalVariable *TransferMedium =
3207  M.getGlobalVariable(TransferMediumName);
3208  if (!TransferMedium) {
3209  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3210  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3211  TransferMedium = new llvm::GlobalVariable(
3212  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3213  llvm::Constant::getNullValue(Ty), TransferMediumName,
3214  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3215  SharedAddressSpace);
3216  CGM.addCompilerUsedGlobal(TransferMedium);
3217  }
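// Each CTA thus gets one 32-slot int32 staging array in __shared__ memory; the
// per-element loop below moves larger reduce elements through it in chunks of
// at most four bytes.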
3218 
3219  // Get the CUDA thread id of the current OpenMP thread on the GPU.
3220  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3221  // nvptx_lane_id = nvptx_id % warpsize
3222  llvm::Value *LaneID = getNVPTXLaneID(CGF);
3223  // nvptx_warp_id = nvptx_id / warpsize
3224  llvm::Value *WarpID = getNVPTXWarpID(CGF);
3225 
3226  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3227  Address LocalReduceList(
3228  Bld.CreatePointerBitCastOrAddrSpaceCast(
3229  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3230  C.VoidPtrTy, Loc),
3231  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3232  CGF.getPointerAlign());
3233 
3234  unsigned Idx = 0;
3235  for (const Expr *Private : Privates) {
3236  //
3237  // Warp master copies reduce element to transfer medium in __shared__
3238  // memory.
3239  //
3240  unsigned RealTySize =
3241  C.getTypeSizeInChars(Private->getType())
3242  .alignTo(C.getTypeAlignInChars(Private->getType()))
3243  .getQuantity();
3244  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
3245  unsigned NumIters = RealTySize / TySize;
3246  if (NumIters == 0)
3247  continue;
3248  QualType CType = C.getIntTypeForBitwidth(
3249  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3250  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3251  CharUnits Align = CharUnits::fromQuantity(TySize);
3252  llvm::Value *Cnt = nullptr;
3253  Address CntAddr = Address::invalid();
3254  llvm::BasicBlock *PrecondBB = nullptr;
3255  llvm::BasicBlock *ExitBB = nullptr;
3256  if (NumIters > 1) {
3257  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3258  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3259  /*Volatile=*/false, C.IntTy);
3260  PrecondBB = CGF.createBasicBlock("precond");
3261  ExitBB = CGF.createBasicBlock("exit");
3262  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3263  // There is no need to emit line number for unconditional branch.
3264  (void)ApplyDebugLocation::CreateEmpty(CGF);
3265  CGF.EmitBlock(PrecondBB);
3266  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3267  llvm::Value *Cmp =
3268  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3269  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3270  CGF.EmitBlock(BodyBB);
3271  }
3272  // kmpc_barrier.
3273  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3274  /*EmitChecks=*/false,
3275  /*ForceSimpleCall=*/true);
3276  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3277  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3278  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3279 
3280  // if (lane_id == 0)
3281  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3282  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3283  CGF.EmitBlock(ThenBB);
3284 
3285  // Reduce element = LocalReduceList[i]
3286  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3287  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3288  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3289  // elemptr = ((CopyType*)(elemptrptr)) + I
3290  Address ElemPtr = Address(ElemPtrPtr, Align);
3291  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3292  if (NumIters > 1) {
3293  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3294  ElemPtr.getAlignment());
3295  }
3296 
3297  // Get pointer to location in transfer medium.
3298  // MediumPtr = &medium[warp_id]
3299  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3300  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3301  Address MediumPtr(MediumPtrVal, Align);
3302  // Casting to actual data type.
3303  // MediumPtr = (CopyType*)MediumPtrAddr;
3304  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3305 
3306  // elem = *elemptr
3307  //*MediumPtr = elem
3308  llvm::Value *Elem =
3309  CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3310  // Store the source element value to the dest element address.
3311  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3312 
3313  Bld.CreateBr(MergeBB);
3314 
3315  CGF.EmitBlock(ElseBB);
3316  Bld.CreateBr(MergeBB);
3317 
3318  CGF.EmitBlock(MergeBB);
3319 
3320  // kmpc_barrier.
3321  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3322  /*EmitChecks=*/false,
3323  /*ForceSimpleCall=*/true);
3324 
3325  //
3326  // Warp 0 copies reduce element from transfer medium.
3327  //
3328  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3329  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3330  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3331 
3332  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3333  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3334  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3335 
3336  // Up to 32 threads in warp 0 are active.
3337  llvm::Value *IsActiveThread =
3338  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3339  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3340 
3341  CGF.EmitBlock(W0ThenBB);
3342 
3343  // SrcMediumPtr = &medium[tid]
3344  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3345  TransferMedium,
3346  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3347  Address SrcMediumPtr(SrcMediumPtrVal, Align);
3348  // SrcMediumVal = *SrcMediumPtr;
3349  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3350 
3351  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3352  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3353  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3354  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3355  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3356  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3357  if (NumIters > 1) {
3358  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3359  TargetElemPtr.getAlignment());
3360  }
3361 
3362  // *TargetElemPtr = SrcMediumVal;
3363  llvm::Value *SrcMediumValue =
3364  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3365  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3366  CType);
3367  Bld.CreateBr(W0MergeBB);
3368 
3369  CGF.EmitBlock(W0ElseBB);
3370  Bld.CreateBr(W0MergeBB);
3371 
3372  CGF.EmitBlock(W0MergeBB);
3373 
3374  if (NumIters > 1) {
3375  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3376  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3377  CGF.EmitBranch(PrecondBB);
3378  (void)ApplyDebugLocation::CreateEmpty(CGF);
3379  CGF.EmitBlock(ExitBB);
3380  }
3381  RealTySize %= TySize;
3382  }
3383  ++Idx;
3384  }
3385 
3386  CGF.FinishFunction();
3387  return Fn;
3388 }
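// Sketch of what the emitted helper does for one 8-byte reduce element (the
// shared array name is the global created above; everything else is
// illustrative pseudocode).  Because the transfer medium is an int32 array,
// the element moves in two 4-byte rounds, each bracketed by barriers:
//
//   for (Cnt = 0; Cnt < 2; ++Cnt) {        // NumIters = 8 / 4
//     barrier();
//     if (lane_id == 0)                    // warp master publishes its chunk
//       __openmp_nvptx_data_transfer_temporary_storage[warp_id] =
//           ((int32_t *)elem)[Cnt];
//     barrier();
//     if (thread_id < num_warps)           // first warp gathers the chunks
//       ((int32_t *)elem)[Cnt] =
//           __openmp_nvptx_data_transfer_temporary_storage[thread_id];
//   }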
3389 
3390 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3391 /// in the same warp. It uses shuffle instructions to copy over data from
3392 /// a remote lane's stack. The reduction algorithm performed is specified
3393 /// by the fourth parameter.
3394 ///
3395 /// Algorithm Versions.
3396 /// Full Warp Reduce (argument value 0):
3397 /// This algorithm assumes that all 32 lanes are active and gathers
3398 /// data from these 32 lanes, producing a single resultant value.
3399 /// Contiguous Partial Warp Reduce (argument value 1):
3400 /// This algorithm assumes that only a *contiguous* subset of lanes
3401 /// are active. This happens for the last warp in a parallel region
3402 /// when the user specified num_threads is not an integer multiple of
3403 /// 32. This contiguous subset always starts with the zeroth lane.
3404 /// Partial Warp Reduce (argument value 2):
3405 /// This algorithm gathers data from any number of lanes at any position.
3406 /// All reduced values are stored in the lowest possible lane. The set
3407 /// of problems every algorithm addresses is a superset of those
3408 /// addressable by algorithms with a lower version number. Overhead
3409 /// increases as algorithm version increases.
3410 ///
3411 /// Terminology
3412 /// Reduce element:
3413 /// Reduce element refers to the individual data field with primitive
3414 /// data types to be combined and reduced across threads.
3415 /// Reduce list:
3416 /// Reduce list refers to a collection of local, thread-private
3417 /// reduce elements.
3418 /// Remote Reduce list:
3419 /// Remote Reduce list refers to a collection of remote (relative to
3420 /// the current thread) reduce elements.
3421 ///
3422 /// We distinguish between three states of threads that are important to
3423 /// the implementation of this function.
3424 /// Alive threads:
3425 /// Threads in a warp executing the SIMT instruction, as distinguished from
3426 /// threads that are inactive due to divergent control flow.
3427 /// Active threads:
3428 /// The minimal set of threads that has to be alive upon entry to this
3429 /// function. The computation is correct iff active threads are alive.
3430 /// Some threads are alive but they are not active because they do not
3431 /// contribute to the computation in any useful manner. Turning them off
3432 /// may introduce control flow overheads without any tangible benefits.
3433 /// Effective threads:
3434 /// In order to comply with the argument requirements of the shuffle
3435 /// function, we must keep all lanes holding data alive. But at most
3436 /// half of them perform value aggregation; we refer to this half of
3437 /// threads as effective. The other half is simply handing off their
3438 /// data.
3439 ///
3440 /// Procedure
3441 /// Value shuffle:
3442 /// In this step active threads transfer data from higher lane positions
3443 /// in the warp to lower lane positions, creating Remote Reduce list.
3444 /// Value aggregation:
3445 /// In this step, effective threads combine their thread local Reduce list
3446 /// with Remote Reduce list and store the result in the thread local
3447 /// Reduce list.
3448 /// Value copy:
3449 /// In this step, we deal with the assumption made by algorithm 2
3450 /// (i.e. contiguity assumption). When we have an odd number of lanes
3451 /// active, say 2k+1, only k threads will be effective and therefore k
3452 /// new values will be produced. However, the Reduce list owned by the
3453 /// (2k+1)th thread is ignored in the value aggregation. Therefore
3454 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3455 /// that the contiguity assumption still holds.
3456 static llvm::Function *emitShuffleAndReduceFunction(
3457  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3458  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3459  ASTContext &C = CGM.getContext();
3460 
3461  // Thread local Reduce list used to host the values of data to be reduced.
3462  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3463  C.VoidPtrTy, ImplicitParamDecl::Other);
3464  // Current lane id; could be logical.
3465  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3466  ImplicitParamDecl::Other);
3467  // Offset of the remote source lane relative to the current lane.
3468  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3469  C.ShortTy, ImplicitParamDecl::Other);
3470  // Algorithm version. This is expected to be known at compile time.
3471  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3472  C.ShortTy, ImplicitParamDecl::Other);
3473  FunctionArgList Args;
3474  Args.push_back(&ReduceListArg);
3475  Args.push_back(&LaneIDArg);
3476  Args.push_back(&RemoteLaneOffsetArg);
3477  Args.push_back(&AlgoVerArg);
3478 
3479  const CGFunctionInfo &CGFI =
3480  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3481  auto *Fn = llvm::Function::Create(
3482  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3483  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3484  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3485  Fn->setDoesNotRecurse();
3486  CodeGenFunction CGF(CGM);
3487  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3488 
3489  CGBuilderTy &Bld = CGF.Builder;
3490 
3491  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3492  Address LocalReduceList(
3493  Bld.CreatePointerBitCastOrAddrSpaceCast(
3494  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3495  C.VoidPtrTy, SourceLocation()),
3496  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3497  CGF.getPointerAlign());
3498 
3499  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3500  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3501  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3502 
3503  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3504  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3505  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3506 
3507  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3508  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3509  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3510 
3511  // Create a local thread-private variable to host the Reduce list
3512  // from a remote lane.
3513  Address RemoteReduceList =
3514  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3515 
3516  // This loop iterates through the list of reduce elements and copies,
3517  // element by element, from a remote lane in the warp to RemoteReduceList,
3518  // hosted on the thread's stack.
3519  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3520  LocalReduceList, RemoteReduceList,
3521  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3522  /*ScratchpadIndex=*/nullptr,
3523  /*ScratchpadWidth=*/nullptr});
3524 
3525  // The actions to be performed on the Remote Reduce list are dependent
3526  // on the algorithm version.
3527  //
3528  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3529  // LaneId % 2 == 0 && Offset > 0):
3530  // do the reduction value aggregation
3531  //
3532  // The thread local variable Reduce list is mutated in place to host the
3533  // reduced data, which is the aggregated value produced from local and
3534  // remote lanes.
3535  //
3536  // Note that AlgoVer is expected to be a constant integer known at compile
3537  // time.
3538  // When AlgoVer==0, the first conjunction evaluates to true, making
3539  // the entire predicate true during compile time.
3540  // When AlgoVer==1, the second conjunction has only the second part to be
3541  // evaluated during runtime. Other conjunctions evaluate to false
3542  // during compile time.
3543  // When AlgoVer==2, the third conjunction has only the second part to be
3544  // evaluated during runtime. Other conjunctions evaluate to false
3545  // during compile time.
3546  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3547 
3548  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3549  llvm::Value *CondAlgo1 = Bld.CreateAnd(
3550  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3551 
3552  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3553  llvm::Value *CondAlgo2 = Bld.CreateAnd(
3554  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3555  CondAlgo2 = Bld.CreateAnd(
3556  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3557 
3558  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3559  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3560 
3561  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3562  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3563  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3564  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3565 
3566  CGF.EmitBlock(ThenBB);
3567  // reduce_function(LocalReduceList, RemoteReduceList)
3568  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3569  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3570  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3571  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3572  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3573  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3574  Bld.CreateBr(MergeBB);
3575 
3576  CGF.EmitBlock(ElseBB);
3577  Bld.CreateBr(MergeBB);
3578 
3579  CGF.EmitBlock(MergeBB);
3580 
3581  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3582  // Reduce list.
3583  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3584  llvm::Value *CondCopy = Bld.CreateAnd(
3585  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3586 
3587  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3588  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3589  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3590  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3591 
3592  CGF.EmitBlock(CpyThenBB);
3593  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3594  RemoteReduceList, LocalReduceList);
3595  Bld.CreateBr(CpyMergeBB);
3596 
3597  CGF.EmitBlock(CpyElseBB);
3598  Bld.CreateBr(CpyMergeBB);
3599 
3600  CGF.EmitBlock(CpyMergeBB);
3601 
3602  CGF.FinishFunction();
3603  return Fn;
3604 }
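// Summary of the two guards materialized above (a restatement of the emitted
// conditions, not additional code):
//
//   reduce:  (AlgoVer == 0)
//         || (AlgoVer == 1 && LaneId < Offset)
//         || (AlgoVer == 2 && LaneId % 2 == 0 && Offset > 0)
//   copy:     AlgoVer == 1 && LaneId >= Offset
//
// Because AlgoVer is expected to be a compile-time constant, the unused
// conjunctions fold away and each specialization carries only the check it
// actually needs.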
3605 
3606 /// This function emits a helper that copies all the reduction variables from
3607 /// the team into the provided global buffer for the reduction variables.
3608 ///
3609 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3610 /// For all data entries D in reduce_data:
3611 /// Copy local D to buffer.D[Idx]
3612 static llvm::Value *emitListToGlobalCopyFunction(
3613  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3614  QualType ReductionArrayTy, SourceLocation Loc,
3615  const RecordDecl *TeamReductionRec,
3616  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3617  &VarFieldMap) {
3618  ASTContext &C = CGM.getContext();
3619 
3620  // Buffer: global reduction buffer.
3621  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3622  C.VoidPtrTy, ImplicitParamDecl::Other);
3623  // Idx: index of the buffer.
3624  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3625  ImplicitParamDecl::Other);
3626  // ReduceList: thread local Reduce list.
3627  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3628  C.VoidPtrTy, ImplicitParamDecl::Other);
3629  FunctionArgList Args;
3630  Args.push_back(&BufferArg);
3631  Args.push_back(&IdxArg);
3632  Args.push_back(&ReduceListArg);
3633 
3634  const CGFunctionInfo &CGFI =
3635  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3636  auto *Fn = llvm::Function::Create(
3637  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3638  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3639  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3640  Fn->setDoesNotRecurse();
3641  CodeGenFunction CGF(CGM);
3642  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3643 
3644  CGBuilderTy &Bld = CGF.Builder;
3645 
3646  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3647  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3648  Address LocalReduceList(
3649  Bld.CreatePointerBitCastOrAddrSpaceCast(
3650  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3651  C.VoidPtrTy, Loc),
3652  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3653  CGF.getPointerAlign());
3654  QualType StaticTy = C.getRecordType(TeamReductionRec);
3655  llvm::Type *LLVMReductionsBufferTy =
3656  CGM.getTypes().ConvertTypeForMem(StaticTy);
3657  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3658  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3659  LLVMReductionsBufferTy->getPointerTo());
3660  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3661  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3662  /*Volatile=*/false, C.IntTy,
3663  Loc)};
3664  unsigned Idx = 0;
3665  for (const Expr *Private : Privates) {
3666  // Reduce element = LocalReduceList[i]
3667  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3668  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3669  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3670  // elemptr = ((CopyType*)(elemptrptr)) + I
3671  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3672  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3673  Address ElemPtr =
3674  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3675  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3676  // Global = Buffer.VD[Idx];
3677  const FieldDecl *FD = VarFieldMap.lookup(VD);
3678  LValue GlobLVal = CGF.EmitLValueForField(
3679  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3680  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3681  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3682  switch (CGF.getEvaluationKind(Private->getType())) {
3683  case TEK_Scalar: {
3684  llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3685  Private->getType(), Loc);
3686  CGF.EmitStoreOfScalar(V, GlobLVal);
3687  break;
3688  }
3689  case TEK_Complex: {
3690  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3691  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3692  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3693  break;
3694  }
3695  case TEK_Aggregate:
3696  CGF.EmitAggregateCopy(GlobLVal,
3697  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3698  Private->getType(), AggValueSlot::DoesNotOverlap);
3699  break;
3700  }
3701  ++Idx;
3702  }
3703 
3704  CGF.FinishFunction();
3705  return Fn;
3706 }
3707 
3708 /// This function emits a helper that reduces all the reduction variables from
3709 /// the team into the provided global buffer for the reduction variables.
3710 ///
3711 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3712 /// void *GlobPtrs[];
3713 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3714 /// ...
3715 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3716 /// reduce_function(GlobPtrs, reduce_data);
3717 static llvm::Value *emitListToGlobalReduceFunction(
3718  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3719  QualType ReductionArrayTy, SourceLocation Loc,
3720  const RecordDecl *TeamReductionRec,
3721  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3722  &VarFieldMap,
3723  llvm::Function *ReduceFn) {
3724  ASTContext &C = CGM.getContext();
3725 
3726  // Buffer: global reduction buffer.
3727  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3728  C.VoidPtrTy, ImplicitParamDecl::Other);
3729  // Idx: index of the buffer.
3730  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3731  ImplicitParamDecl::Other);
3732  // ReduceList: thread local Reduce list.
3733  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3734  C.VoidPtrTy, ImplicitParamDecl::Other);
3735  FunctionArgList Args;
3736  Args.push_back(&BufferArg);
3737  Args.push_back(&IdxArg);
3738  Args.push_back(&ReduceListArg);
3739 
3740  const CGFunctionInfo &CGFI =
3741  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3742  auto *Fn = llvm::Function::Create(
3743  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3744  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3745  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3746  Fn->setDoesNotRecurse();
3747  CodeGenFunction CGF(CGM);
3748  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3749 
3750  CGBuilderTy &Bld = CGF.Builder;
3751 
3752  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3753  QualType StaticTy = C.getRecordType(TeamReductionRec);
3754  llvm::Type *LLVMReductionsBufferTy =
3755  CGM.getTypes().ConvertTypeForMem(StaticTy);
3756  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3757  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3758  LLVMReductionsBufferTy->getPointerTo());
3759 
3760  // 1. Build a list of reduction variables.
3761  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3762  Address ReductionList =
3763  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3764  auto IPriv = Privates.begin();
3765  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3766  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3767  /*Volatile=*/false, C.IntTy,
3768  Loc)};
3769  unsigned Idx = 0;
3770  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3771  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3772  // Global = Buffer.VD[Idx];
3773  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3774  const FieldDecl *FD = VarFieldMap.lookup(VD);
3775  LValue GlobLVal = CGF.EmitLValueForField(
3776  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3777  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3778  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3779  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3780  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3781  // Store array size.
3782  ++Idx;
3783  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3784  llvm::Value *Size = CGF.Builder.CreateIntCast(
3785  CGF.getVLASize(
3786  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3787  .NumElts,
3788  CGF.SizeTy, /*isSigned=*/false);
3789  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3790  Elem);
3791  }
3792  }
3793 
3794  // Call reduce_function(GlobalReduceList, ReduceList)
3795  llvm::Value *GlobalReduceList =
3796  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3797  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3798  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3799  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3800  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3801  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3802  CGF.FinishFunction();
3803  return Fn;
3804 }
3805 
3806 /// This function emits a helper that copies all the reduction variables from
3807 /// the provided global buffer back into the team's thread-local reduce data.
3808 ///
3809 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
3810 /// For all data entries D in reduce_data:
3811 /// Copy buffer.D[Idx] to local D;
3812 static llvm::Value *emitGlobalToListCopyFunction(
3813  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3814  QualType ReductionArrayTy, SourceLocation Loc,
3815  const RecordDecl *TeamReductionRec,
3816  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3817  &VarFieldMap) {
3818  ASTContext &C = CGM.getContext();
3819 
3820  // Buffer: global reduction buffer.
3821  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3822  C.VoidPtrTy, ImplicitParamDecl::Other);
3823  // Idx: index of the buffer.
3824  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3825  ImplicitParamDecl::Other);
3826  // ReduceList: thread local Reduce list.
3827  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3828  C.VoidPtrTy, ImplicitParamDecl::Other);
3829  FunctionArgList Args;
3830  Args.push_back(&BufferArg);
3831  Args.push_back(&IdxArg);
3832  Args.push_back(&ReduceListArg);
3833 
3834  const CGFunctionInfo &CGFI =
3835  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3836  auto *Fn = llvm::Function::Create(
3837  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3838  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3839  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3840  Fn->setDoesNotRecurse();
3841  CodeGenFunction CGF(CGM);
3842  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3843 
3844  CGBuilderTy &Bld = CGF.Builder;
3845 
3846  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3847  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3848  Address LocalReduceList(
3849  Bld.CreatePointerBitCastOrAddrSpaceCast(
3850  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3851  C.VoidPtrTy, Loc),
3852  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3853  CGF.getPointerAlign());
3854  QualType StaticTy = C.getRecordType(TeamReductionRec);
3855  llvm::Type *LLVMReductionsBufferTy =
3856  CGM.getTypes().ConvertTypeForMem(StaticTy);
3857  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3858  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3859  LLVMReductionsBufferTy->getPointerTo());
3860 
3861  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3862  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3863  /*Volatile=*/false, C.IntTy,
3864  Loc)};
3865  unsigned Idx = 0;
3866  for (const Expr *Private : Privates) {
3867  // Reduce element = LocalReduceList[i]
3868  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3869  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3870  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3871  // elemptr = ((CopyType*)(elemptrptr)) + I
3872  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3873  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3874  Address ElemPtr =
3875  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3876  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3877  // Global = Buffer.VD[Idx];
3878  const FieldDecl *FD = VarFieldMap.lookup(VD);
3879  LValue GlobLVal = CGF.EmitLValueForField(
3880  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3881  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3882  GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3883  switch (CGF.getEvaluationKind(Private->getType())) {
3884  case TEK_Scalar: {
3885  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3886  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3887  break;
3888  }
3889  case TEK_Complex: {
3890  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3891  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3892  /*isInit=*/false);
3893  break;
3894  }
3895  case TEK_Aggregate:
3896  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3897  GlobLVal, Private->getType(),
3898  AggValueSlot::DoesNotOverlap);
3899  break;
3900  }
3901  ++Idx;
3902  }
3903 
3904  CGF.FinishFunction();
3905  return Fn;
3906 }
3907 
3908 /// This function emits a helper that reduces the reduction variables in the
3909 /// provided global buffer into the team's thread-local reduce data.
3910 ///
3911 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3912 /// void *GlobPtrs[];
3913 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
3914 /// ...
3915 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
3916 /// reduce_function(reduce_data, GlobPtrs);
3917 static llvm::Value *emitGlobalToListReduceFunction(
3918  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3919  QualType ReductionArrayTy, SourceLocation Loc,
3920  const RecordDecl *TeamReductionRec,
3921  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3922  &VarFieldMap,
3923  llvm::Function *ReduceFn) {
3924  ASTContext &C = CGM.getContext();
3925 
3926  // Buffer: global reduction buffer.
3927  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3928  C.VoidPtrTy, ImplicitParamDecl::Other);
3929  // Idx: index of the buffer.
3930  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3931  ImplicitParamDecl::Other);
3932  // ReduceList: thread local Reduce list.
3933  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3934  C.VoidPtrTy, ImplicitParamDecl::Other);
3935  FunctionArgList Args;
3936  Args.push_back(&BufferArg);
3937  Args.push_back(&IdxArg);
3938  Args.push_back(&ReduceListArg);
3939 
3940  const CGFunctionInfo &CGFI =
3941  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3942  auto *Fn = llvm::Function::Create(
3943  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3944  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3945  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3946  Fn->setDoesNotRecurse();
3947  CodeGenFunction CGF(CGM);
3948  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3949 
3950  CGBuilderTy &Bld = CGF.Builder;
3951 
3952  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3953  QualType StaticTy = C.getRecordType(TeamReductionRec);
3954  llvm::Type *LLVMReductionsBufferTy =
3955  CGM.getTypes().ConvertTypeForMem(StaticTy);
3956  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3957  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3958  LLVMReductionsBufferTy->getPointerTo());
3959 
3960  // 1. Build a list of reduction variables.
3961  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3962  Address ReductionList =
3963  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3964  auto IPriv = Privates.begin();
3965  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3966  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3967  /*Volatile=*/false, C.IntTy,
3968  Loc)};
3969  unsigned Idx = 0;
3970  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3971  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3972  // Global = Buffer.VD[Idx];
3973  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3974  const FieldDecl *FD = VarFieldMap.lookup(VD);
3975  LValue GlobLVal = CGF.EmitLValueForField(
3976  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3977  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3978  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3979  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3980  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3981  // Store array size.
3982  ++Idx;
3983  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3984  llvm::Value *Size = CGF.Builder.CreateIntCast(
3985  CGF.getVLASize(
3986  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3987  .NumElts,
3988  CGF.SizeTy, /*isSigned=*/false);
3989  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3990  Elem);
3991  }
3992  }
3993 
3994  // Call reduce_function(ReduceList, GlobalReduceList)
3995  llvm::Value *GlobalReduceList =
3996  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3997  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3998  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3999  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
4000  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
4001  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
4002  CGF.FinishFunction();
4003  return Fn;
4004 }
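// Taken together, the four helpers above implement both directions of the
// team-reduction buffer protocol (names are the emitted function names;
// semantics assume the usual convention that reduce_function accumulates its
// second argument into its first):
//
//   _omp_reduction_list_to_global_copy_func    local Reduce list -> buffer[Idx]
//   _omp_reduction_list_to_global_reduce_func  buffer[Idx] op= local Reduce list
//   _omp_reduction_global_to_list_copy_func    buffer[Idx] -> local Reduce list
//   _omp_reduction_global_to_list_reduce_func  local Reduce list op= buffer[Idx]
//
// The copy variants move data unchanged; the reduce variants build a list of
// pointers into the buffer and hand it to the generated reduce_function.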
4005 
4006 ///
4007 /// Design of OpenMP reductions on the GPU
4008 ///
4009 /// Consider a typical OpenMP program with one or more reduction
4010 /// clauses:
4011 ///
4012 /// float foo;
4013 /// double bar;
4014 /// #pragma omp target teams distribute parallel for \
4015 /// reduction(+:foo) reduction(*:bar)
4016 /// for (int i = 0; i < N; i++) {
4017 /// foo += A[i]; bar *= B[i];
4018 /// }
4019 ///
4020 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
4021 /// all teams. In our OpenMP implementation on the NVPTX device an
4022 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
4023 /// within a team are mapped to CUDA threads within a threadblock.
4024 /// Our goal is to efficiently aggregate values across all OpenMP
4025 /// threads such that:
4026 ///
4027 /// - the compiler and runtime are logically concise, and
4028 /// - the reduction is performed efficiently in a hierarchical
4029 /// manner as follows: within OpenMP threads in the same warp,
4030 /// across warps in a threadblock, and finally across teams on
4031 /// the NVPTX device.
4032 ///
4033 /// Introduction to Decoupling
4034 ///
4035 /// We would like to decouple the compiler and the runtime so that the
4036 /// latter is ignorant of the reduction variables (number, data types)
4037 /// and the reduction operators. This allows a simpler interface
4038 /// and implementation while still attaining good performance.
4039 ///
4040 /// Pseudocode for the aforementioned OpenMP program generated by the
4041 /// compiler is as follows:
4042 ///
4043 /// 1. Create private copies of reduction variables on each OpenMP
4044 /// thread: 'foo_private', 'bar_private'
4045 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
4046 /// to it and writes the result in 'foo_private' and 'bar_private'
4047 /// respectively.
4048 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4049 /// and store the result on the team master:
4050 ///
4051 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4052 /// reduceData, shuffleReduceFn, interWarpCpyFn)
4053 ///
4054 /// where:
4055 /// struct ReduceData {
4056 /// double *foo;
4057 /// double *bar;
4058 /// } reduceData
4059 /// reduceData.foo = &foo_private
4060 /// reduceData.bar = &bar_private
4061 ///
4062 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4063 /// auxiliary functions generated by the compiler that operate on
4064 /// variables of type 'ReduceData'. They help the runtime perform
4065 /// algorithmic steps in a data agnostic manner.
4066 ///
4067 /// 'shuffleReduceFn' is a pointer to a function that reduces data
4068 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
4069 /// same warp. It takes the following arguments as input:
4070 ///
4071 /// a. variable of type 'ReduceData' on the calling lane,
4072 /// b. its lane_id,
4073 /// c. an offset relative to the current lane_id to generate a
4074 /// remote_lane_id. The remote lane contains the second
4075 /// variable of type 'ReduceData' that is to be reduced.
4076 /// d. an algorithm version parameter determining which reduction
4077 /// algorithm to use.
4078 ///
4079 /// 'shuffleReduceFn' retrieves data from the remote lane using
4080 /// efficient GPU shuffle intrinsics and reduces, using the
4081 /// algorithm specified by the 4th parameter, the two operands
4082 /// element-wise. The result is written to the first operand.
4083 ///
4084 /// Different reduction algorithms are implemented in different
4085 /// runtime functions, all calling 'shuffleReduceFn' to perform
4086 /// the essential reduction step. Therefore, based on the 4th
4087 /// parameter, this function behaves slightly differently to
4088 /// cooperate with the runtime to ensure correctness under
4089 /// different circumstances.
4090 ///
4091 /// 'InterWarpCpyFn' is a pointer to a function that transfers
4092 /// reduced variables across warps. It tunnels, through CUDA
4093 /// shared memory, the thread-private data of type 'ReduceData'
4094 /// from lane 0 of each warp to a lane in the first warp.
4095 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4096 /// The last team writes the global reduced value to memory.
4097 ///
4098 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
4099 /// reduceData, shuffleReduceFn, interWarpCpyFn,
4100 /// scratchpadCopyFn, loadAndReduceFn)
4101 ///
4102 /// 'scratchpadCopyFn' is a helper that stores reduced
4103 /// data from the team master to a scratchpad array in
4104 /// global memory.
4105 ///
4106 /// 'loadAndReduceFn' is a helper that loads data from
4107 /// the scratchpad array and reduces it with the input
4108 /// operand.
4109 ///
4110 /// These compiler generated functions hide address
4111 /// calculation and alignment information from the runtime.
4112 /// 5. if ret == 1:
4113 /// The team master of the last team stores the reduced
4114 /// result to the globals in memory.
4115 /// foo += reduceData.foo; bar *= reduceData.bar
4116 ///
4117 ///
4118 /// Warp Reduction Algorithms
4119 ///
4120 /// On the warp level, we have three algorithms implemented in the
4121 /// OpenMP runtime depending on the number of active lanes:
4122 ///
4123 /// Full Warp Reduction
4124 ///
4125 /// The reduce algorithm within a warp where all lanes are active
4126 /// is implemented in the runtime as follows:
4127 ///
4128 /// full_warp_reduce(void *reduce_data,
4129 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4130 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4131 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
4132 /// }
4133 ///
4134 /// The algorithm completes in log(2, WARPSIZE) steps.
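/// For example, with WARPSIZE == 32 the loop above performs five shuffle
/// rounds with offsets 16, 8, 4, 2 and 1, after which lane 0 holds the fully
/// reduced value for the warp.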
4135 ///
4136 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4137 /// not used; we therefore save instructions by not retrieving lane_id
4138 /// from the corresponding special registers. The 4th parameter, which
4139 /// represents the version of the algorithm being used, is set to 0 to
4140 /// signify full warp reduction.
4141 ///
4142 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4143 ///
4144 /// #reduce_elem refers to an element in the local lane's data structure
4145 /// #remote_elem is retrieved from a remote lane
4146 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4147 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
4148 ///
4149 /// Contiguous Partial Warp Reduction
4150 ///
4151 /// This reduce algorithm is used within a warp where only the first
4152 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
4153 /// number of OpenMP threads in a parallel region is not a multiple of
4154 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
4155 ///
4156 /// void
4157 /// contiguous_partial_reduce(void *reduce_data,
4158 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
4159 /// int size, int lane_id) {
4160 /// int curr_size;
4161 /// int offset;
4162 /// curr_size = size;
4163 /// offset = curr_size/2;
4164 /// while (offset>0) {
4165 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4166 /// curr_size = (curr_size+1)/2;
4167 /// offset = curr_size/2;
4168 /// }
4169 /// }
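///
/// As a worked example, for size == 5 the loop above issues shuffles with
/// offsets 2, 1 and 1 (curr_size shrinking 5 -> 3 -> 2 -> 1), i.e. roughly
/// ceil(log2(size)) rounds, after which lane 0 holds the reduced value.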
4170 ///
4171 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4172 ///
4173 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4174 /// if (lane_id < offset)
4175 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4176 /// else
4177 /// reduce_elem = remote_elem
4178 ///
4179 /// This algorithm assumes that the data to be reduced are located in a
4180 /// contiguous subset of lanes starting from the first. When there is
4181 /// an odd number of active lanes, the data in the last lane is not
4182 /// aggregated with any other lane's data but is instead copied over.
4183 ///
4184 /// Dispersed Partial Warp Reduction
4185 ///
4186 /// This algorithm is used within a warp when any discontiguous subset of
4187 /// lanes are active. It is used to implement the reduction operation
4188 /// across lanes in an OpenMP simd region or in a nested parallel region.
4189 ///
4190 /// void
4191 /// dispersed_partial_reduce(void *reduce_data,
4192 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4193 /// int size, remote_id;
4194 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
4195 /// do {
4196 /// remote_id = next_active_lane_id_right_after_me();
4197 /// # the above function returns 0 if no active lane
4198 /// # is present right after the current lane.
4199 /// size = number_of_active_lanes_in_this_warp();
4200 /// logical_lane_id /= 2;
4201 /// ShuffleReduceFn(reduce_data, logical_lane_id,
4202 /// remote_id-1-threadIdx.x, 2);
4203 /// } while (logical_lane_id % 2 == 0 && size > 1);
4204 /// }
4205 ///
4206 /// There is no assumption made about the initial state of the reduction.
4207 /// Any number of lanes (>=1) could be active at any position. The reduction
4208 /// result is returned in the first active lane.
4209 ///
4210 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4211 ///
4212 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4213 /// if (lane_id % 2 == 0 && offset > 0)
4214 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
4215 /// else
4216 /// reduce_elem = remote_elem
4217 ///
4218 ///
4219 /// Intra-Team Reduction
4220 ///
4221 /// This function, as implemented in the runtime call
4222 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4223 /// threads in a team. It first reduces within a warp using the
4224 /// aforementioned algorithms. We then proceed to gather all such
4225 /// reduced values at the first warp.
4226 ///
4227 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4228 /// data from each of the "warp masters" (zeroth lane of each warp, where
4229 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
4230 /// a mathematical sense) the problem of reduction across warp masters in
4231 /// a block to the problem of warp reduction.
4232 ///
4233 ///
4234 /// Inter-Team Reduction
4235 ///
4236 /// Once a team has reduced its data to a single value, it is stored in
4237 /// a global scratchpad array. Since each team has a distinct slot, this
4238 /// can be done without locking.
4239 ///
4240 /// The last team to write to the scratchpad array proceeds to reduce the
4241 /// scratchpad array. One or more workers in the last team use the helper
4242 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4243 /// the k'th worker reduces every k'th element.
4244 ///
4245 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4246 /// reduce across workers and compute a globally reduced value.
4247 ///
4248 void CGOpenMPRuntimeNVPTX::emitReduction(
4249  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4250  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4251  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4252  if (!CGF.HaveInsertPoint())
4253  return;
4254 
4255  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4256 #ifndef NDEBUG
4257  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4258 #endif
4259 
4260  if (Options.SimpleReduction) {
4261  assert(!TeamsReduction && !ParallelReduction &&
4262  "Invalid reduction selection in emitReduction.");
4263  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4264  ReductionOps, Options);
4265  return;
4266  }
4267 
4268  assert((TeamsReduction || ParallelReduction) &&
4269  "Invalid reduction selection in emitReduction.");
4270 
4271  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4272  // RedList, shuffle_reduce_func, interwarp_copy_func);
4273  // or
4274  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4275  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4276  llvm::Value *ThreadId = getThreadID(CGF, Loc);
4277 
4278  llvm::Value *Res;
4279  ASTContext &C = CGM.getContext();
4280  // 1. Build a list of reduction variables.
4281  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4282  auto Size = RHSExprs.size();
4283  for (const Expr *E : Privates) {
4284  if (E->getType()->isVariablyModifiedType())
4285  // Reserve place for array size.
4286  ++Size;
4287  }
4288  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4289  QualType ReductionArrayTy =
4290  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
4291  /*IndexTypeQuals=*/0);
4292  Address ReductionList =
4293  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4294  auto IPriv = Privates.begin();
4295  unsigned Idx = 0;
4296  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4297  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4298  CGF.Builder.CreateStore(
4299  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4300  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4301  Elem);
4302  if ((*IPriv)->getType()->isVariablyModifiedType()) {
4303  // Store array size.
4304  ++Idx;
4305  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4306  llvm::Value *Size = CGF.Builder.CreateIntCast(
4307  CGF.getVLASize(
4308  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4309  .NumElts,
4310  CGF.SizeTy, /*isSigned=*/false);
4311  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4312  Elem);
4313  }
4314  }
4315 
4316  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4317  ReductionList.getPointer(), CGF.VoidPtrTy);
4318  llvm::Function *ReductionFn = emitReductionFunction(
4319  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
4320  Privates, LHSExprs, RHSExprs, ReductionOps);
4321  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4322  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4323  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4324  llvm::Value *InterWarpCopyFn =
4325  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4326 
4327  if (ParallelReduction) {
4328  llvm::Value *Args[] = {RTLoc,
4329  ThreadId,
4330  CGF.Builder.getInt32(RHSExprs.size()),
4331  ReductionArrayTySize,
4332  RL,
4333  ShuffleAndReduceFn,
4334  InterWarpCopyFn};
4335 
4336  Res = CGF.EmitRuntimeCall(
4337  createNVPTXRuntimeFunction(
4338  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4339  Args);
4340  } else {
4341  assert(TeamsReduction && "expected teams reduction.");
4342  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4343  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4344  int Cnt = 0;
4345  for (const Expr *DRE : Privates) {
4346  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4347  ++Cnt;
4348  }
4349  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4350  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4351  C.getLangOpts().OpenMPCUDAReductionBufNum);
4352  TeamsReductions.push_back(TeamReductionRec);
4353  if (!KernelTeamsReductionPtr) {
4354  KernelTeamsReductionPtr = new llvm::GlobalVariable(
4355  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4356  llvm::GlobalValue::InternalLinkage, nullptr,
4357  "_openmp_teams_reductions_buffer_$_$ptr");
4358  }
4359  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4360  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4361  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4362  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4363  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4364  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4365  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4366  ReductionFn);
4367  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4368  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4369  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4370  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4371  ReductionFn);
4372 
4373  llvm::Value *Args[] = {
4374  RTLoc,
4375  ThreadId,
4376  GlobalBufferPtr,
4377  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4378  RL,
4379  ShuffleAndReduceFn,
4380  InterWarpCopyFn,
4381  GlobalToBufferCpyFn,
4382  GlobalToBufferRedFn,
4383  BufferToGlobalCpyFn,
4384  BufferToGlobalRedFn};
4385 
4386  Res = CGF.EmitRuntimeCall(
4387  createNVPTXRuntimeFunction(
4388  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4389  Args);
4390  }
4391 
4392  // 5. Build if (res == 1)
4393  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4394  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4395  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4396  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4397  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4398 
4399  // 6. Build then branch: where we have reduced values in the master
4400  // thread in each team.
4401  // __kmpc_end_reduce{_nowait}(<gtid>);
4402  // break;
4403  CGF.EmitBlock(ThenBB);
4404 
4405  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4406  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4407  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4408  auto IPriv = Privates.begin();
4409  auto ILHS = LHSExprs.begin();
4410  auto IRHS = RHSExprs.begin();
4411  for (const Expr *E : ReductionOps) {
4412  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4413  cast<DeclRefExpr>(*IRHS));
4414  ++IPriv;
4415  ++ILHS;
4416  ++IRHS;
4417  }
4418  };
4419  llvm::Value *EndArgs[] = {ThreadId};
4420  RegionCodeGenTy RCG(CodeGen);
4421  NVPTXActionTy Action(
4422  nullptr, llvm::None,
4423  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4424  EndArgs);
4425  RCG.setAction(Action);
4426  RCG(CGF);
4427  // There is no need to emit line number for unconditional branch.
4428  (void)ApplyDebugLocation::CreateEmpty(CGF);
4429  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4430 }
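As an aside for readers coming from the OpenMP source level: the following hypothetical snippet (not part of this file; names are illustrative) shows the kind of construct whose reduction clause is lowered through emitReduction above. Depending on the directive kind, it can exercise the parallel path ('__kmpc_nvptx_parallel_reduce_nowait_v2') and the teams path ('__kmpc_nvptx_teams_reduce_nowait_v2' with the global scratchpad buffer) described in the comment block preceding this function.

// Hypothetical user code, shown only to illustrate what the codegen above
// serves; it is not part of CGOpenMPRuntimeNVPTX.cpp.
int sum_on_device(const int *a, int n) {
  int sum = 0;
  // A combined target teams ... parallel for reduction: per-warp and per-team
  // partial results are combined through the runtime entry points emitted above.
  #pragma omp target teams distribute parallel for reduction(+ : sum) \
      map(to : a[0 : n]) map(tofrom : sum)
  for (int i = 0; i < n; ++i)
    sum += a[i];
  return sum;
}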
4431 
4432 const VarDecl *
4433 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4434  const VarDecl *NativeParam) const {
4435  if (!NativeParam->getType()->isReferenceType())
4436  return NativeParam;
4437  QualType ArgType = NativeParam->getType();
4438  QualifierCollector QC;
4439  const Type *NonQualTy = QC.strip(ArgType);
4440  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4441  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4442  if (Attr->getCaptureKind() == OMPC_map) {
4443  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4444  LangAS::cuda_device);
4445  } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4446  PointeeTy.isConstant(CGM.getContext())) {
4447  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4448  LangAS::cuda_constant);
4449  }
4450  }
4451  ArgType = CGM.getContext().getPointerType(PointeeTy);
4452  QC.addRestrict();
4453  enum { NVPTX_local_addr = 5 };
4454  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4455  ArgType = QC.apply(CGM.getContext(), ArgType);
4456  if (isa<ImplicitParamDecl>(NativeParam))
4457  return ImplicitParamDecl::Create(
4458  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4459  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4460  return ParmVarDecl::Create(
4461  CGM.getContext(),
4462  const_cast<DeclContext *>(NativeParam->getDeclContext()),
4463  NativeParam->getBeginLoc(), NativeParam->getLocation(),
4464  NativeParam->getIdentifier(), ArgType,
4465  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4466 }
4467 
4468 Address
4469 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4470  const VarDecl *NativeParam,
4471  const VarDecl *TargetParam) const {
4472  assert(NativeParam != TargetParam &&
4473  NativeParam->getType()->isReferenceType() &&
4474  "Native arg must not be the same as target arg.");
4475  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4476  QualType NativeParamType = NativeParam->getType();
4477  QualifierCollector QC;
4478  const Type *NonQualTy = QC.strip(NativeParamType);
4479  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4480  unsigned NativePointeeAddrSpace =
4481  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4482  QualType TargetTy = TargetParam->getType();
4483  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4484  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4485  // First cast to generic.
4486  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4487  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4488  /*AddrSpace=*/0));
4489  // Cast from generic to native address space.
4490  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4491  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4492  NativePointeeAddrSpace));
4493  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4494  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4495  NativeParamType);
4496  return NativeParamAddr;
4497 }
4498 
4499 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4500  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4501  ArrayRef<llvm::Value *> Args) const {
4502  SmallVector<llvm::Value *, 4> TargetArgs;
4503  TargetArgs.reserve(Args.size());
4504  auto *FnType = OutlinedFn.getFunctionType();
4505  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4506  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4507  TargetArgs.append(std::next(Args.begin(), I), Args.end());
4508  break;
4509  }
4510  llvm::Type *TargetType = FnType->getParamType(I);
4511  llvm::Value *NativeArg = Args[I];
4512  if (!TargetType->isPointerTy()) {
4513  TargetArgs.emplace_back(NativeArg);
4514  continue;
4515  }
4516  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4517  NativeArg,
4518  NativeArg->getType()->getPointerElementType()->getPointerTo());
4519  TargetArgs.emplace_back(
4520  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4521  }
4522  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4523 }
4524 
4525 /// Emit function which wraps the outlined parallel region
4526 /// and controls the arguments which are passed to this function.
4527 /// The wrapper ensures that the outlined function is called
4528 /// with the correct arguments when data is shared.
4529 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4530  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4531  ASTContext &Ctx = CGM.getContext();
4532  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4533 
4534  // Create a function that takes as argument the source thread.
4535  FunctionArgList WrapperArgs;
4536  QualType Int16QTy =
4537  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4538  QualType Int32QTy =
4539  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4540  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4541  /*Id=*/nullptr, Int16QTy,
4542  ImplicitParamDecl::Other);
4543  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4544  /*Id=*/nullptr, Int32QTy,
4545  ImplicitParamDecl::Other);
4546  WrapperArgs.emplace_back(&ParallelLevelArg);
4547  WrapperArgs.emplace_back(&WrapperArg);
4548 
4549  const CGFunctionInfo &CGFI =
4550  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4551 
4552  auto *Fn = llvm::Function::Create(
4553  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4554  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4555  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4556  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4557  Fn->setDoesNotRecurse();
4558 
4559  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4560  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4561  D.getBeginLoc(), D.getBeginLoc());
4562 
4563  const auto *RD = CS.getCapturedRecordDecl();
4564  auto CurField = RD->field_begin();
4565 
4566  Address ZeroAddr = CGF.CreateMemTemp(
4567  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4568  /*Name*/ ".zero.addr");
4569  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4570  // Get the array of arguments.
4571  SmallVector<llvm::Value *, 20> Args;
4572 
4573  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4574  Args.emplace_back(ZeroAddr.getPointer());
4575 
4576  CGBuilderTy &Bld = CGF.Builder;
4577  auto CI = CS.capture_begin();
4578 
4579  // Use global memory for data sharing.
4580  // Handle passing of global args to workers.
4581  Address GlobalArgs =
4582  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4583  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4584  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4585  CGF.EmitRuntimeCall(
4586  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4587  DataSharingArgs);
4588 
4589  // Retrieve the shared variables from the list of references returned
4590  // by the runtime. Pass the variables to the outlined function.
4591  Address SharedArgListAddress = Address::invalid();
4592  if (CS.capture_size() > 0 ||
4593  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4594  SharedArgListAddress = CGF.EmitLoadOfPointer(
4595  GlobalArgs, CGF.getContext()
4596  .getPointerType(CGF.getContext().getPointerType(
4597  CGF.getContext().VoidPtrTy))
4598  .castAs<PointerType>());
4599  }
4600  unsigned Idx = 0;
4601  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4602  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4603  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4604  Src, CGF.SizeTy->getPointerTo());
4605  llvm::Value *LB = CGF.EmitLoadOfScalar(
4606  TypedAddress,
4607  /*Volatile=*/false,
4608  CGF.getContext().getSizeType(),
4609  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4610  Args.emplace_back(LB);
4611  ++Idx;
4612  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4613  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4614  Src, CGF.SizeTy->getPointerTo());
4615  llvm::Value *UB = CGF.EmitLoadOfScalar(
4616  TypedAddress,
4617  /*Volatile=*/false,
4618  CGF.getContext().getSizeType(),
4619  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4620  Args.emplace_back(UB);
4621  ++Idx;
4622  }
4623  if (CS.capture_size() > 0) {
4624  ASTContext &CGFContext = CGF.getContext();
4625  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4626  QualType ElemTy = CurField->getType();
4627  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4628  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4629  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4630  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4631  /*Volatile=*/false,
4632  CGFContext.getPointerType(ElemTy),
4633  CI->getLocation());
4634  if (CI->capturesVariableByCopy() &&
4635  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4636  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4637  CI->getLocation());
4638  }
4639  Args.emplace_back(Arg);
4640  }
4641  }
4642 
4643  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4644  CGF.FinishFunction();
4645  return Fn;
4646 }
4647 
4648 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4649  const Decl *D) {
4650  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4651  return;
4652 
4653  assert(D && "Expected function or captured|block decl.");
4654  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4655  "Function is registered already.");
4656  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4657  "Team is set but not processed.");
4658  const Stmt *Body = nullptr;
4659  bool NeedToDelayGlobalization = false;
4660  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4661  Body = FD->getBody();
4662  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4663  Body = BD->getBody();
4664  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4665  Body = CD->getBody();
4666  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4667  if (NeedToDelayGlobalization &&
4668  getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4669  return;
4670  }
4671  if (!Body)
4672  return;
4673  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4674  VarChecker.Visit(Body);
4675  const RecordDecl *GlobalizedVarsRecord =
4676  VarChecker.getGlobalizedRecord(IsInTTDRegion);
4677  TeamAndReductions.first = nullptr;
4678  TeamAndReductions.second.clear();
4679  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4680  VarChecker.getEscapedVariableLengthDecls();
4681  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4682  return;
4683  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4684  I->getSecond().MappedParams =
4685  llvm::make_unique<CodeGenFunction::OMPMapVars>();
4686  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4687  I->getSecond().EscapedParameters.insert(
4688  VarChecker.getEscapedParameters().begin(),
4689  VarChecker.getEscapedParameters().end());
4690  I->getSecond().EscapedVariableLengthDecls.append(
4691  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4692  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4693  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4694  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4695  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4696  Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4697  }
4698  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4699  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4700  VarChecker.Visit(Body);
4701  I->getSecond().SecondaryGlobalRecord =
4702  VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4703  I->getSecond().SecondaryLocalVarData.emplace();
4704  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4705  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4706  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4707  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4708  Data.insert(
4709  std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4710  }
4711  }
4712  if (!NeedToDelayGlobalization) {
4713  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4714  struct GlobalizationScope final : EHScopeStack::Cleanup {
4715  GlobalizationScope() = default;
4716 
4717  void Emit(CodeGenFunction &CGF, Flags flags) override {
4718  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4719  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4720  }
4721  };
4722  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4723  }
4724 }
4725 
4726 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4727  const VarDecl *VD) {
4728  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4729  return Address::invalid();
4730 
4731  VD = VD->getCanonicalDecl();
4732  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4733  if (I == FunctionGlobalizedDecls.end())
4734  return Address::invalid();
4735  auto VDI = I->getSecond().LocalVarData.find(VD);
4736  if (VDI != I->getSecond().LocalVarData.end())
4737  return VDI->second.PrivateAddr;
4738  if (VD->hasAttrs()) {
4739  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4740  E(VD->attr_end());
4741  IT != E; ++IT) {
4742  auto VDI = I->getSecond().LocalVarData.find(
4743  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4744  ->getCanonicalDecl());
4745  if (VDI != I->getSecond().LocalVarData.end())
4746  return VDI->second.PrivateAddr;
4747  }
4748  }
4749  return Address::invalid();
4750 }
4751 
4752 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4753  FunctionGlobalizedDecls.erase(CGF.CurFn);
4754  CGOpenMPRuntime::functionFinished(CGF);
4755 }
4756 
4757 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4758  CodeGenFunction &CGF, const OMPLoopDirective &S,
4759  OpenMPDistScheduleClauseKind &ScheduleKind,
4760  llvm::Value *&Chunk) const {
4761  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4762  ScheduleKind = OMPC_DIST_SCHEDULE_static;
4763  Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4764  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4765  S.getIterationVariable()->getType(), S.getBeginLoc());
4766  return;
4767  }
4768  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4769  CGF, S, ScheduleKind, Chunk);
4770 }
4771 
4772 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4773  CodeGenFunction &CGF, const OMPLoopDirective &S,
4774  OpenMPScheduleClauseKind &ScheduleKind,
4775  const Expr *&ChunkExpr) const {
4776  ScheduleKind = OMPC_SCHEDULE_static;
4777  // Chunk size is 1 in this case.
4778  llvm::APInt ChunkSize(32, 1);
4779  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4780  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4781  SourceLocation());
4782 }
4783 
4784 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4785  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4786  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4787  " Expected target-based directive.");
4788  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4789  for (const CapturedStmt::Capture &C : CS->captures()) {
4790  // Capture variables captured by reference in lambdas for target-based
4791  // directives.
4792  if (!C.capturesVariable())
4793  continue;
4794  const VarDecl *VD = C.getCapturedVar();
4795  const auto *RD = VD->getType()
4796  .getCanonicalType()
4797  .getNonReferenceType()
4798  ->getAsCXXRecordDecl();
4799  if (!RD || !RD->isLambda())
4800  continue;
4801  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4802  LValue VDLVal;
4803  if (VD->getType().getCanonicalType()->isReferenceType())
4804  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4805  else
4806  VDLVal = CGF.MakeAddrLValue(
4807  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4808  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4809  FieldDecl *ThisCapture = nullptr;
4810  RD->getCaptureFields(Captures, ThisCapture);
4811  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4812  LValue ThisLVal =
4813  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4814  llvm::Value *CXXThis = CGF.LoadCXXThis();
4815  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4816  }
4817  for (const LambdaCapture &LC : RD->captures()) {
4818  if (LC.getCaptureKind() != LCK_ByRef)
4819  continue;
4820  const VarDecl *VD = LC.getCapturedVar();
4821  if (!CS->capturesVariable(VD))
4822  continue;
4823  auto It = Captures.find(VD);
4824  assert(It != Captures.end() && "Found lambda capture without field.");
4825  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4826  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4827  if (VD->getType().getCanonicalType()->isReferenceType())
4828  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4829  VD->getType().getCanonicalType())
4830  .getAddress();
4831  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4832  }
4833  }
4834 }
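For context, the lambda adjustment above corresponds to user code of the following hypothetical shape (illustrative only, not part of this file): a lambda capturing a variable by reference is invoked inside a target-based directive, so the captured reference has to be redirected to the device copy of that variable.

// Hypothetical user code; names are illustrative only.
int apply_offset(int *v, int n, int offset) {
  auto add_offset = [&offset](int &x) { x += offset; }; // by-reference capture
  #pragma omp target teams distribute parallel for \
      map(tofrom : v[0 : n]) map(to : offset)
  for (int i = 0; i < n; ++i)
    add_offset(v[i]);
  return n;
}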
4835 
4838 }
4839 
4840 // Get current CudaArch and ignore any unknown values
4841 static CudaArch getCudaArch(CodeGenModule &CGM) {
4842  if (!CGM.getTarget().hasFeature("ptx"))
4843  return CudaArch::UNKNOWN;
4844  llvm::StringMap<bool> Features;
4845  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4846  CGM.getTarget().getTargetOpts().CPU,
4847  CGM.getTarget().getTargetOpts().Features);
4848  for (const auto &Feature : Features) {
4849  if (Feature.getValue()) {
4850  CudaArch Arch = StringToCudaArch(Feature.getKey());
4851  if (Arch != CudaArch::UNKNOWN)
4852  return Arch;
4853  }
4854  }
4855  return CudaArch::UNKNOWN;
4856 }
4857 
4858 /// Check to see if the target architecture supports unified addressing, which
4859 /// is a restriction for the OpenMP requires clause "unified_shared_memory".
4860 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4861  CodeGenModule &CGM, const OMPRequiresDecl *D) const {
4862  for (const OMPClause *Clause : D->clauselists()) {
4863  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4864  switch (getCudaArch(CGM)) {
4865  case CudaArch::SM_20:
4866  case CudaArch::SM_21:
4867  case CudaArch::SM_30:
4868  case CudaArch::SM_32:
4869  case CudaArch::SM_35:
4870  case CudaArch::SM_37:
4871  case CudaArch::SM_50:
4872  case CudaArch::SM_52:
4873  case CudaArch::SM_53:
4874  case CudaArch::SM_60:
4875  case CudaArch::SM_61:
4876  case CudaArch::SM_62:
4877  CGM.Error(Clause->getBeginLoc(),
4878  "Target architecture does not support unified addressing");
4879  return;
4880  case CudaArch::SM_70:
4881  case CudaArch::SM_72:
4882  case CudaArch::SM_75:
4883  case CudaArch::GFX600:
4884  case CudaArch::GFX601:
4885  case CudaArch::GFX700:
4886  case CudaArch::GFX701:
4887  case CudaArch::GFX702:
4888  case CudaArch::GFX703:
4889  case CudaArch::GFX704:
4890  case CudaArch::GFX801:
4891  case CudaArch::GFX802:
4892  case CudaArch::GFX803:
4893  case CudaArch::GFX810:
4894  case CudaArch::GFX900:
4895  case CudaArch::GFX902:
4896  case CudaArch::GFX904:
4897  case CudaArch::GFX906:
4898  case CudaArch::GFX909:
4899  case CudaArch::UNKNOWN:
4900  break;
4901  case CudaArch::LAST:
4902  llvm_unreachable("Unexpected Cuda arch.");
4903  }
4904  }
4905  }
4906 }
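For reference, the diagnostic above is reachable from a 'requires' directive such as the following hypothetical translation unit (not part of this file). Per the switch above, NVPTX architectures below sm_70 are rejected, while sm_70 and newer (and the listed AMD GCN targets) are accepted.

// Hypothetical user code; compiling this for an NVPTX target older than sm_70
// triggers "Target architecture does not support unified addressing".
#pragma omp requires unified_shared_memory

void saxpy(float a, const float *x, float *y, int n) {
  #pragma omp target teams distribute parallel for
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}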
4907 
4908 /// Get number of SMs and number of blocks per SM.
4909 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4910  std::pair<unsigned, unsigned> Data;
4911  if (CGM.getLangOpts().OpenMPCUDANumSMs)
4912  Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4913  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4914  Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4915  if (Data.first && Data.second)
4916  return Data;
4917  switch (getCudaArch(CGM)) {
4918  case CudaArch::SM_20:
4919  case CudaArch::SM_21:
4920  case CudaArch::SM_30:
4921  case CudaArch::SM_32:
4922  case CudaArch::SM_35:
4923  case CudaArch::SM_37:
4924  case CudaArch::SM_50:
4925  case CudaArch::SM_52:
4926  case CudaArch::SM_53:
4927  return {16, 16};
4928  case CudaArch::SM_60:
4929  case CudaArch::SM_61:
4930  case CudaArch::SM_62:
4931  return {56, 32};
4932  case CudaArch::SM_70:
4933  case CudaArch::SM_72:
4934  case CudaArch::SM_75:
4935  return {84, 32};
4936  case CudaArch::GFX600:
4937  case CudaArch::GFX601:
4938  case CudaArch::GFX700:
4939  case CudaArch::GFX701:
4940  case CudaArch::GFX702:
4941  case CudaArch::GFX703:
4942  case CudaArch::GFX704:
4943  case CudaArch::GFX801:
4944  case CudaArch::GFX802:
4945  case CudaArch::GFX803:
4946  case CudaArch::GFX810:
4947  case CudaArch::GFX900:
4948  case CudaArch::GFX902:
4949  case CudaArch::GFX904:
4950  case CudaArch::GFX906:
4951  case CudaArch::GFX909:
4952  case CudaArch::UNKNOWN:
4953  break;
4954  case CudaArch::LAST:
4955  llvm_unreachable("Unexpected Cuda arch.");
4956  }
4957  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
4958 }
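As a worked example of how these pairs are consumed: clear() below builds a two-dimensional array of the globalized record type sized [SMs][blocks-per-SM], so per the table above an sm_70 device gets 84 * 32 = 2688 copies of the record in the '_openmp_static_glob_rd_$_' buffer, an sm_60 device gets 56 * 32 = 1792, and an sm_35 device gets 16 * 16 = 256.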
4959 
4960 void CGOpenMPRuntimeNVPTX::clear() {
4961  if (!GlobalizedRecords.empty()) {
4962  ASTContext &C = CGM.getContext();
4963  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
4964  llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
4965  RecordDecl *StaticRD = C.buildImplicitRecord(
4966  "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4967  StaticRD->startDefinition();
4968  RecordDecl *SharedStaticRD = C.buildImplicitRecord(
4969  "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
4970  SharedStaticRD->startDefinition();
4971  for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
4972  if (Records.Records.empty())
4973  continue;
4974  unsigned Size = 0;
4975  unsigned RecAlignment = 0;
4976  for (const RecordDecl *RD : Records.Records) {
4977  QualType RDTy = C.getRecordType(RD);
4978  unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
4979  RecAlignment = std::max(RecAlignment, Alignment);
4980  unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
4981  Size =
4982  llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
4983  }
4984  Size = llvm::alignTo(Size, RecAlignment);
4985  llvm::APInt ArySize(/*numBits=*/64, Size);
4986  QualType SubTy = C.getConstantArrayType(
4987  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
4988  const bool UseSharedMemory = Size <= SharedMemorySize;
4989  auto *Field =
4990  FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
4991  SourceLocation(), SourceLocation(), nullptr, SubTy,
4992  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
4993  /*BW=*/nullptr, /*Mutable=*/false,
4994  /*InitStyle=*/ICIS_NoInit);
4995  Field->setAccess(AS_public);
4996  if (UseSharedMemory) {
4997  SharedStaticRD->addDecl(Field);
4998  SharedRecs.push_back(&Records);
4999  } else {
5000  StaticRD->addDecl(Field);
5001  GlobalRecs.push_back(&Records);
5002  }
5003  Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5004  Records.UseSharedMemory->setInitializer(
5005  llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5006  }
5007  // Allocate SharedMemorySize buffer for the shared memory.
5008  // FIXME: nvlink does not handle weak linkage correctly (objects with
5009  // different sizes are reported as erroneous).
5010  // Restore this code as soon as nvlink is fixed.
5011  if (!SharedStaticRD->field_empty()) {
5012  llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5013  QualType SubTy = C.getConstantArrayType(
5014  C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5015  auto *Field = FieldDecl::Create(
5016  C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5017  C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5018  /*BW=*/nullptr, /*Mutable=*/false,
5019  /*InitStyle=*/ICIS_NoInit);
5020  Field->setAccess(AS_public);
5021  SharedStaticRD->addDecl(Field);
5022  }
5023  SharedStaticRD->completeDefinition();
5024  if (!SharedStaticRD->field_empty()) {
5025  QualType StaticTy = C.getRecordType(SharedStaticRD);
5026  llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5027  auto *GV = new llvm::GlobalVariable(
5028  CGM.getModule(), LLVMStaticTy,
5029  /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5030  llvm::Constant::getNullValue(LLVMStaticTy),
5031  "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5032  llvm::GlobalValue::NotThreadLocal,
5033  C.getTargetAddressSpace(LangAS::cuda_shared));
5034  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5035  GV, CGM.VoidPtrTy);
5036  for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5037  Rec->Buffer->replaceAllUsesWith(Replacement);
5038  Rec->Buffer->eraseFromParent();
5039  }
5040  }
5041  StaticRD->completeDefinition();
5042  if (!StaticRD->field_empty()) {
5043  QualType StaticTy = C.getRecordType(StaticRD);
5044  std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5045  llvm::APInt Size1(32, SMsBlockPerSM.second);
5046  QualType Arr1Ty =
5047  C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5048  /*IndexTypeQuals=*/0);
5049  llvm::APInt Size2(32, SMsBlockPerSM.first);
5050  QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5051  /*IndexTypeQuals=*/0);
5052  llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5053  // FIXME: nvlink does not handle weak linkage correctly (objects with
5054  // different sizes are reported as erroneous).
5055  // Restore CommonLinkage as soon as nvlink is fixed.
5056  auto *GV = new llvm::GlobalVariable(
5057  CGM.getModule(), LLVMArr2Ty,
5058  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5059  llvm::Constant::getNullValue(LLVMArr2Ty),
5060  "_openmp_static_glob_rd_$_");
5061  auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5062  GV, CGM.VoidPtrTy);
5063  for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5064  Rec->Buffer->replaceAllUsesWith(Replacement);
5065  Rec->Buffer->eraseFromParent();
5066  }
5067  }
5068  }
5069  if (!TeamsReductions.empty()) {
5070  ASTContext &C = CGM.getContext();
5071  RecordDecl *StaticRD = C.buildImplicitRecord(
5072  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5073  StaticRD->startDefinition();
5074  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5075  QualType RecTy = C.getRecordType(TeamReductionRec);
5076  auto *Field = FieldDecl::Create(
5077  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5078  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5079  /*BW=*/nullptr, /*Mutable=*/false,
5080  /*InitStyle=*/ICIS_NoInit);
5081  Field->setAccess(AS_public);
5082  StaticRD->addDecl(Field);
5083  }
5084  StaticRD->completeDefinition();
5085  QualType StaticTy = C.getRecordType(StaticRD);
5086  llvm::Type *LLVMReductionsBufferTy =
5087  CGM.getTypes().ConvertTypeForMem(StaticTy);
5088  // FIXME: nvlink does not handle weak linkage correctly (objects with
5089  // different sizes are reported as erroneous).
5090  // Restore CommonLinkage as soon as nvlink is fixed.
5091  auto *GV = new llvm::GlobalVariable(
5092  CGM.getModule(), LLVMReductionsBufferTy,
5093  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5094  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5095  "_openmp_teams_reductions_buffer_$_");
5096  KernelTeamsReductionPtr->setInitializer(
5097  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5098  CGM.VoidPtrTy));
5099  }
5100  CGOpenMPRuntime::clear();
5101 }