clang  14.0.0git
CGOpenMPRuntimeGPU.cpp
1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "CodeGenFunction.h"
17 #include "clang/AST/Attr.h"
18 #include "clang/AST/DeclOpenMP.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/IR/IntrinsicsNVPTX.h"
25 #include "llvm/Support/MathExtras.h"
26 
27 using namespace clang;
28 using namespace CodeGen;
29 using namespace llvm::omp;
30 
31 namespace {
32 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33 class NVPTXActionTy final : public PrePostActionTy {
34  llvm::FunctionCallee EnterCallee = nullptr;
35  ArrayRef<llvm::Value *> EnterArgs;
36  llvm::FunctionCallee ExitCallee = nullptr;
37  ArrayRef<llvm::Value *> ExitArgs;
38  bool Conditional = false;
39  llvm::BasicBlock *ContBlock = nullptr;
40 
41 public:
42  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43  ArrayRef<llvm::Value *> EnterArgs,
44  llvm::FunctionCallee ExitCallee,
45  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
46  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
47  ExitArgs(ExitArgs), Conditional(Conditional) {}
48  void Enter(CodeGenFunction &CGF) override {
49  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
50  if (Conditional) {
51  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
52  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
53  ContBlock = CGF.createBasicBlock("omp_if.end");
54  // Generate the branch (If-stmt)
55  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
56  CGF.EmitBlock(ThenBlock);
57  }
58  }
59  void Done(CodeGenFunction &CGF) {
60  // Emit the rest of blocks/branches
61  CGF.EmitBranch(ContBlock);
62  CGF.EmitBlock(ContBlock, true);
63  }
64  void Exit(CodeGenFunction &CGF) override {
65  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
66  }
67 };
68 
69 /// A class to track the execution mode when codegening directives within
70 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
71 /// to the target region and used by containing directives such as 'parallel'
72 /// to emit optimized code.
73 class ExecutionRuntimeModesRAII {
74 private:
75  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
76  CGOpenMPRuntimeGPU::EM_Unknown;
77  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
78  bool SavedRuntimeMode = false;
79  bool *RuntimeMode = nullptr;
80 
81 public:
82  /// Constructor for Non-SPMD mode.
83  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
84  : ExecMode(ExecMode) {
85  SavedExecMode = ExecMode;
86  ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
87  }
88  /// Constructor for SPMD mode.
89  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
90  bool &RuntimeMode, bool FullRuntimeMode)
91  : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
92  SavedExecMode = ExecMode;
93  SavedRuntimeMode = RuntimeMode;
94  ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
95  RuntimeMode = FullRuntimeMode;
96  }
97  ~ExecutionRuntimeModesRAII() {
98  ExecMode = SavedExecMode;
99  if (RuntimeMode)
100  *RuntimeMode = SavedRuntimeMode;
101  }
102 };
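// Illustrative usage (mirrors emitSPMDKernel/emitNonSPMDKernel later in this
// file): the RAII helper switches the execution mode for the duration of one
// kernel's codegen and restores the previous state on scope exit, roughly:
//
//   {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); // Non-SPMD
//     // ... emit the target region; nested directives query the mode ...
//   } // previous execution (and runtime) mode restored here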
103 
104 /// GPU Configuration: This information can be derived from cuda registers,
105 /// however, providing compile time constants helps generate more efficient
106 /// code. For all practical purposes this is fine because the configuration
107 /// is the same for all known NVPTX architectures.
108 enum MachineConfiguration : unsigned {
109  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
110  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
111 
112  /// Global memory alignment for performance.
113  GlobalMemoryAlignment = 128,
114 
115  /// Maximal size of the shared memory buffer.
116  SharedMemorySize = 128,
117 };
118 
119 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
120  RefExpr = RefExpr->IgnoreParens();
121  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
122  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
123  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
124  Base = TempASE->getBase()->IgnoreParenImpCasts();
125  RefExpr = Base;
126  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
127  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
128  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
129  Base = TempOASE->getBase()->IgnoreParenImpCasts();
130  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
131  Base = TempASE->getBase()->IgnoreParenImpCasts();
132  RefExpr = Base;
133  }
134  RefExpr = RefExpr->IgnoreParenImpCasts();
135  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
136  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
137  const auto *ME = cast<MemberExpr>(RefExpr);
138  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
139 }
140 
141 
142 static RecordDecl *buildRecordForGlobalizedVars(
143  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
144  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
145  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
146  &MappedDeclsFields, int BufSize) {
147  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
148  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
149  return nullptr;
150  SmallVector<VarsDataTy, 4> GlobalizedVars;
151  for (const ValueDecl *D : EscapedDecls)
152  GlobalizedVars.emplace_back(
153  CharUnits::fromQuantity(std::max(
154  C.getDeclAlign(D).getQuantity(),
155  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
156  D);
157  for (const ValueDecl *D : EscapedDeclsForTeams)
158  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
159  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
160  return L.first > R.first;
161  });
162 
163  // Build struct _globalized_locals_ty {
164  // /* globalized vars */[WarpSize] align (max(decl_align,
165  // GlobalMemoryAlignment))
166  // /* globalized vars */ for EscapedDeclsForTeams
167  // };
168  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
169  GlobalizedRD->startDefinition();
170  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
171  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
172  for (const auto &Pair : GlobalizedVars) {
173  const ValueDecl *VD = Pair.second;
174  QualType Type = VD->getType();
175  if (Type->isLValueReferenceType())
176  Type = C.getPointerType(Type.getNonReferenceType());
177  else
178  Type = Type.getNonReferenceType();
179  SourceLocation Loc = VD->getLocation();
180  FieldDecl *Field;
181  if (SingleEscaped.count(VD)) {
182  Field = FieldDecl::Create(
183  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
184  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
185  /*BW=*/nullptr, /*Mutable=*/false,
186  /*InitStyle=*/ICIS_NoInit);
187  Field->setAccess(AS_public);
188  if (VD->hasAttrs()) {
189  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
190  E(VD->getAttrs().end());
191  I != E; ++I)
192  Field->addAttr(*I);
193  }
194  } else {
195  llvm::APInt ArraySize(32, BufSize);
196  Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
197  0);
198  Field = FieldDecl::Create(
199  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
200  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
201  /*BW=*/nullptr, /*Mutable=*/false,
202  /*InitStyle=*/ICIS_NoInit);
203  Field->setAccess(AS_public);
204  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
205  static_cast<CharUnits::QuantityType>(
206  GlobalMemoryAlignment)));
207  Field->addAttr(AlignedAttr::CreateImplicit(
208  C, /*IsAlignmentExpr=*/true,
209  IntegerLiteral::Create(C, Align,
210  C.getIntTypeForBitwidth(32, /*Signed=*/0),
211  SourceLocation()),
212  {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
213  }
214  GlobalizedRD->addDecl(Field);
215  MappedDeclsFields.try_emplace(VD, Field);
216  }
217  GlobalizedRD->completeDefinition();
218  return GlobalizedRD;
219 }
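// Illustrative result (hypothetical input): with one escaped parallel-region
// variable 'int x', one teams-level variable 'double d', and BufSize equal to
// the warp size, the record built above looks roughly like:
//
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128))); // EscapedDecls: one slot per lane
//     double d;                                // EscapedDeclsForTeams: single field
//   };
//
// Fields are laid out in order of decreasing alignment by the stable_sort above.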
220 
221 /// Get the list of variables that can escape their declaration context.
222 class CheckVarsEscapingDeclContext final
223  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
224  CodeGenFunction &CGF;
225  llvm::SetVector<const ValueDecl *> EscapedDecls;
226  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
227  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
228  RecordDecl *GlobalizedRD = nullptr;
229  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
230  bool AllEscaped = false;
231  bool IsForCombinedParallelRegion = false;
232 
233  void markAsEscaped(const ValueDecl *VD) {
234  // Do not globalize declare target variables.
235  if (!isa<VarDecl>(VD) ||
236  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
237  return;
238  VD = cast<ValueDecl>(VD->getCanonicalDecl());
239  // Use user-specified allocation.
240  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
241  return;
242  // Variables captured by value must be globalized.
243  if (auto *CSI = CGF.CapturedStmtInfo) {
244  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
245  // Check if need to capture the variable that was already captured by
246  // value in the outer region.
247  if (!IsForCombinedParallelRegion) {
248  if (!FD->hasAttrs())
249  return;
250  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
251  if (!Attr)
252  return;
253  if (((Attr->getCaptureKind() != OMPC_map) &&
254  !isOpenMPPrivate(Attr->getCaptureKind())) ||
255  ((Attr->getCaptureKind() == OMPC_map) &&
256  !FD->getType()->isAnyPointerType()))
257  return;
258  }
259  if (!FD->getType()->isReferenceType()) {
260  assert(!VD->getType()->isVariablyModifiedType() &&
261  "Parameter captured by value with variably modified type");
262  EscapedParameters.insert(VD);
263  } else if (!IsForCombinedParallelRegion) {
264  return;
265  }
266  }
267  }
268  if ((!CGF.CapturedStmtInfo ||
269  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
270  VD->getType()->isReferenceType())
271  // Do not globalize variables with reference type.
272  return;
273  if (VD->getType()->isVariablyModifiedType())
274  EscapedVariableLengthDecls.insert(VD);
275  else
276  EscapedDecls.insert(VD);
277  }
278 
279  void VisitValueDecl(const ValueDecl *VD) {
280  if (VD->getType()->isLValueReferenceType())
281  markAsEscaped(VD);
282  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
283  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
284  const bool SavedAllEscaped = AllEscaped;
285  AllEscaped = VD->getType()->isLValueReferenceType();
286  Visit(VarD->getInit());
287  AllEscaped = SavedAllEscaped;
288  }
289  }
290  }
291  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
292  ArrayRef<OMPClause *> Clauses,
293  bool IsCombinedParallelRegion) {
294  if (!S)
295  return;
296  for (const CapturedStmt::Capture &C : S->captures()) {
297  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
298  const ValueDecl *VD = C.getCapturedVar();
299  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
300  if (IsCombinedParallelRegion) {
301  // Check if the variable is privatized in the combined construct and
302  // those private copies must be shared in the inner parallel
303  // directive.
304  IsForCombinedParallelRegion = false;
305  for (const OMPClause *C : Clauses) {
306  if (!isOpenMPPrivate(C->getClauseKind()) ||
307  C->getClauseKind() == OMPC_reduction ||
308  C->getClauseKind() == OMPC_linear ||
309  C->getClauseKind() == OMPC_private)
310  continue;
311  ArrayRef<const Expr *> Vars;
312  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
313  Vars = PC->getVarRefs();
314  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
315  Vars = PC->getVarRefs();
316  else
317  llvm_unreachable("Unexpected clause.");
318  for (const auto *E : Vars) {
319  const Decl *D =
320  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
321  if (D == VD->getCanonicalDecl()) {
322  IsForCombinedParallelRegion = true;
323  break;
324  }
325  }
326  if (IsForCombinedParallelRegion)
327  break;
328  }
329  }
330  markAsEscaped(VD);
331  if (isa<OMPCapturedExprDecl>(VD))
332  VisitValueDecl(VD);
333  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
334  }
335  }
336  }
337 
338  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
339  assert(!GlobalizedRD &&
340  "Record for globalized variables is built already.");
341  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
342  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
343  if (IsInTTDRegion)
344  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
345  else
346  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
347  GlobalizedRD = ::buildRecordForGlobalizedVars(
348  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
349  MappedDeclsFields, WarpSize);
350  }
351 
352 public:
353  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
354  ArrayRef<const ValueDecl *> TeamsReductions)
355  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
356  }
357  virtual ~CheckVarsEscapingDeclContext() = default;
358  void VisitDeclStmt(const DeclStmt *S) {
359  if (!S)
360  return;
361  for (const Decl *D : S->decls())
362  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
363  VisitValueDecl(VD);
364  }
365  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
366  if (!D)
367  return;
368  if (!D->hasAssociatedStmt())
369  return;
370  if (const auto *S =
371  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
372  // Do not analyze directives that do not actually require capturing,
373  // like `omp for` or `omp simd` directives.
374  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
375  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
376  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
377  VisitStmt(S->getCapturedStmt());
378  return;
379  }
380  VisitOpenMPCapturedStmt(
381  S, D->clauses(),
382  CaptureRegions.back() == OMPD_parallel &&
383  isOpenMPDistributeDirective(D->getDirectiveKind()));
384  }
385  }
386  void VisitCapturedStmt(const CapturedStmt *S) {
387  if (!S)
388  return;
389  for (const CapturedStmt::Capture &C : S->captures()) {
390  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
391  const ValueDecl *VD = C.getCapturedVar();
392  markAsEscaped(VD);
393  if (isa<OMPCapturedExprDecl>(VD))
394  VisitValueDecl(VD);
395  }
396  }
397  }
398  void VisitLambdaExpr(const LambdaExpr *E) {
399  if (!E)
400  return;
401  for (const LambdaCapture &C : E->captures()) {
402  if (C.capturesVariable()) {
403  if (C.getCaptureKind() == LCK_ByRef) {
404  const ValueDecl *VD = C.getCapturedVar();
405  markAsEscaped(VD);
406  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
407  VisitValueDecl(VD);
408  }
409  }
410  }
411  }
412  void VisitBlockExpr(const BlockExpr *E) {
413  if (!E)
414  return;
415  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
416  if (C.isByRef()) {
417  const VarDecl *VD = C.getVariable();
418  markAsEscaped(VD);
419  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
420  VisitValueDecl(VD);
421  }
422  }
423  }
424  void VisitCallExpr(const CallExpr *E) {
425  if (!E)
426  return;
427  for (const Expr *Arg : E->arguments()) {
428  if (!Arg)
429  continue;
430  if (Arg->isLValue()) {
431  const bool SavedAllEscaped = AllEscaped;
432  AllEscaped = true;
433  Visit(Arg);
434  AllEscaped = SavedAllEscaped;
435  } else {
436  Visit(Arg);
437  }
438  }
439  Visit(E->getCallee());
440  }
441  void VisitDeclRefExpr(const DeclRefExpr *E) {
442  if (!E)
443  return;
444  const ValueDecl *VD = E->getDecl();
445  if (AllEscaped)
446  markAsEscaped(VD);
447  if (isa<OMPCapturedExprDecl>(VD))
448  VisitValueDecl(VD);
449  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
450  if (VarD->isInitCapture())
451  VisitValueDecl(VD);
452  }
453  void VisitUnaryOperator(const UnaryOperator *E) {
454  if (!E)
455  return;
456  if (E->getOpcode() == UO_AddrOf) {
457  const bool SavedAllEscaped = AllEscaped;
458  AllEscaped = true;
459  Visit(E->getSubExpr());
460  AllEscaped = SavedAllEscaped;
461  } else {
462  Visit(E->getSubExpr());
463  }
464  }
465  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
466  if (!E)
467  return;
468  if (E->getCastKind() == CK_ArrayToPointerDecay) {
469  const bool SavedAllEscaped = AllEscaped;
470  AllEscaped = true;
471  Visit(E->getSubExpr());
472  AllEscaped = SavedAllEscaped;
473  } else {
474  Visit(E->getSubExpr());
475  }
476  }
477  void VisitExpr(const Expr *E) {
478  if (!E)
479  return;
480  bool SavedAllEscaped = AllEscaped;
481  if (!E->isLValue())
482  AllEscaped = false;
483  for (const Stmt *Child : E->children())
484  if (Child)
485  Visit(Child);
486  AllEscaped = SavedAllEscaped;
487  }
488  void VisitStmt(const Stmt *S) {
489  if (!S)
490  return;
491  for (const Stmt *Child : S->children())
492  if (Child)
493  Visit(Child);
494  }
495 
496  /// Returns the record that handles all the escaped local variables and used
497  /// instead of their original storage.
498  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
499  if (!GlobalizedRD)
500  buildRecordForGlobalizedVars(IsInTTDRegion);
501  return GlobalizedRD;
502  }
503 
504  /// Returns the field in the globalized record for the escaped variable.
505  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
506  assert(GlobalizedRD &&
507  "Record for globalized variables must be generated already.");
508  auto I = MappedDeclsFields.find(VD);
509  if (I == MappedDeclsFields.end())
510  return nullptr;
511  return I->getSecond();
512  }
513 
514  /// Returns the list of the escaped local variables/parameters.
515  ArrayRef<const ValueDecl *> getEscapedDecls() const {
516  return EscapedDecls.getArrayRef();
517  }
518 
519  /// Checks if the escaped local variable is actually a parameter passed by
520  /// value.
521  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
522  return EscapedParameters;
523  }
524 
525  /// Returns the list of the escaped variables with the variably modified
526  /// types.
527  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
528  return EscapedVariableLengthDecls.getArrayRef();
529  }
530 };
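// Illustrative case (hypothetical source): in
//
//   #pragma omp target
//   { int a = 0; foo(&a); }
//
// 'a' escapes via the address-of argument, so the visitor above records it in
// EscapedDecls and it is later placed in the globalized record instead of a
// purely private (stack/register) location.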
531 } // anonymous namespace
532 
533 /// Get the id of the warp in the block.
534 /// We assume that the warp size is 32, which is always the case
535 /// on the NVPTX device, to generate more efficient code.
536 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
537  CGBuilderTy &Bld = CGF.Builder;
538  unsigned LaneIDBits =
539  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
540  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
541  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542 }
543 
544 /// Get the id of the current lane in the Warp.
545 /// We assume that the warp size is 32, which is always the case
546 /// on the NVPTX device, to generate more efficient code.
547 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548  CGBuilderTy &Bld = CGF.Builder;
549  unsigned LaneIDBits =
550  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
551  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
552  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
553  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
554  "nvptx_lane_id");
555 }
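// Worked example: with GV_Warp_Size == 32, LaneIDBits == 5, so for GPU thread
// id 70 the helpers above yield warp id 70 >> 5 == 2 and lane id
// 70 & 0x1f == 6.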
556 
557 CGOpenMPRuntimeGPU::ExecutionMode
558 CGOpenMPRuntimeGPU::getExecutionMode() const {
559  return CurrentExecutionMode;
560 }
561 
562 static CGOpenMPRuntimeGPU::DataSharingMode
563 getDataSharingMode(CodeGenModule &CGM) {
564  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
565  : CGOpenMPRuntimeGPU::Generic;
566 }
567 
568 /// Check for inner (nested) SPMD construct, if any
569 static bool hasNestedSPMDDirective(ASTContext &Ctx,
570  const OMPExecutableDirective &D) {
571  const auto *CS = D.getInnermostCapturedStmt();
572  const auto *Body =
573  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
574  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
575 
576  if (const auto *NestedDir =
577  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
578  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
579  switch (D.getDirectiveKind()) {
580  case OMPD_target:
581  if (isOpenMPParallelDirective(DKind))
582  return true;
583  if (DKind == OMPD_teams) {
584  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
585  /*IgnoreCaptured=*/true);
586  if (!Body)
587  return false;
588  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
589  if (const auto *NND =
590  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
591  DKind = NND->getDirectiveKind();
592  if (isOpenMPParallelDirective(DKind))
593  return true;
594  }
595  }
596  return false;
597  case OMPD_target_teams:
598  return isOpenMPParallelDirective(DKind);
599  case OMPD_target_simd:
600  case OMPD_target_parallel:
601  case OMPD_target_parallel_for:
602  case OMPD_target_parallel_for_simd:
603  case OMPD_target_teams_distribute:
604  case OMPD_target_teams_distribute_simd:
605  case OMPD_target_teams_distribute_parallel_for:
606  case OMPD_target_teams_distribute_parallel_for_simd:
607  case OMPD_parallel:
608  case OMPD_for:
609  case OMPD_parallel_for:
610  case OMPD_parallel_master:
611  case OMPD_parallel_sections:
612  case OMPD_for_simd:
613  case OMPD_parallel_for_simd:
614  case OMPD_cancel:
615  case OMPD_cancellation_point:
616  case OMPD_ordered:
617  case OMPD_threadprivate:
618  case OMPD_allocate:
619  case OMPD_task:
620  case OMPD_simd:
621  case OMPD_sections:
622  case OMPD_section:
623  case OMPD_single:
624  case OMPD_master:
625  case OMPD_critical:
626  case OMPD_taskyield:
627  case OMPD_barrier:
628  case OMPD_taskwait:
629  case OMPD_taskgroup:
630  case OMPD_atomic:
631  case OMPD_flush:
632  case OMPD_depobj:
633  case OMPD_scan:
634  case OMPD_teams:
635  case OMPD_target_data:
636  case OMPD_target_exit_data:
637  case OMPD_target_enter_data:
638  case OMPD_distribute:
639  case OMPD_distribute_simd:
640  case OMPD_distribute_parallel_for:
641  case OMPD_distribute_parallel_for_simd:
642  case OMPD_teams_distribute:
643  case OMPD_teams_distribute_simd:
644  case OMPD_teams_distribute_parallel_for:
645  case OMPD_teams_distribute_parallel_for_simd:
646  case OMPD_target_update:
647  case OMPD_declare_simd:
648  case OMPD_declare_variant:
649  case OMPD_begin_declare_variant:
650  case OMPD_end_declare_variant:
651  case OMPD_declare_target:
652  case OMPD_end_declare_target:
653  case OMPD_declare_reduction:
654  case OMPD_declare_mapper:
655  case OMPD_taskloop:
656  case OMPD_taskloop_simd:
657  case OMPD_master_taskloop:
658  case OMPD_master_taskloop_simd:
659  case OMPD_parallel_master_taskloop:
660  case OMPD_parallel_master_taskloop_simd:
661  case OMPD_requires:
662  case OMPD_unknown:
663  default:
664  llvm_unreachable("Unexpected directive.");
665  }
666  }
667 
668  return false;
669 }
670 
671 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
672  const OMPExecutableDirective &D) {
673  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
674  switch (DirectiveKind) {
675  case OMPD_target:
676  case OMPD_target_teams:
677  return hasNestedSPMDDirective(Ctx, D);
678  case OMPD_target_parallel:
679  case OMPD_target_parallel_for:
680  case OMPD_target_parallel_for_simd:
681  case OMPD_target_teams_distribute_parallel_for:
682  case OMPD_target_teams_distribute_parallel_for_simd:
683  case OMPD_target_simd:
684  case OMPD_target_teams_distribute_simd:
685  return true;
686  case OMPD_target_teams_distribute:
687  return false;
688  case OMPD_parallel:
689  case OMPD_for:
690  case OMPD_parallel_for:
691  case OMPD_parallel_master:
692  case OMPD_parallel_sections:
693  case OMPD_for_simd:
694  case OMPD_parallel_for_simd:
695  case OMPD_cancel:
696  case OMPD_cancellation_point:
697  case OMPD_ordered:
698  case OMPD_threadprivate:
699  case OMPD_allocate:
700  case OMPD_task:
701  case OMPD_simd:
702  case OMPD_sections:
703  case OMPD_section:
704  case OMPD_single:
705  case OMPD_master:
706  case OMPD_critical:
707  case OMPD_taskyield:
708  case OMPD_barrier:
709  case OMPD_taskwait:
710  case OMPD_taskgroup:
711  case OMPD_atomic:
712  case OMPD_flush:
713  case OMPD_depobj:
714  case OMPD_scan:
715  case OMPD_teams:
716  case OMPD_target_data:
717  case OMPD_target_exit_data:
718  case OMPD_target_enter_data:
719  case OMPD_distribute:
720  case OMPD_distribute_simd:
721  case OMPD_distribute_parallel_for:
722  case OMPD_distribute_parallel_for_simd:
723  case OMPD_teams_distribute:
724  case OMPD_teams_distribute_simd:
725  case OMPD_teams_distribute_parallel_for:
726  case OMPD_teams_distribute_parallel_for_simd:
727  case OMPD_target_update:
728  case OMPD_declare_simd:
729  case OMPD_declare_variant:
730  case OMPD_begin_declare_variant:
731  case OMPD_end_declare_variant:
732  case OMPD_declare_target:
733  case OMPD_end_declare_target:
734  case OMPD_declare_reduction:
735  case OMPD_declare_mapper:
736  case OMPD_taskloop:
737  case OMPD_taskloop_simd:
738  case OMPD_master_taskloop:
739  case OMPD_master_taskloop_simd:
740  case OMPD_parallel_master_taskloop:
741  case OMPD_parallel_master_taskloop_simd:
742  case OMPD_requires:
743  case OMPD_unknown:
744  default:
745  break;
746  }
747  llvm_unreachable(
748  "Unknown programming model for OpenMP directive on NVPTX target.");
749 }
750 
751 /// Check if the directive is loops based and has schedule clause at all or has
752 /// static scheduling.
753 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
754  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
755  isOpenMPLoopDirective(D.getDirectiveKind()) &&
756  "Expected loop-based directive.");
757  return !D.hasClausesOfKind<OMPOrderedClause>() &&
758  (!D.hasClausesOfKind<OMPScheduleClause>() ||
759  llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
760  [](const OMPScheduleClause *C) {
761  return C->getScheduleKind() == OMPC_SCHEDULE_static;
762  }));
763 }
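// Illustrative OpenMP source (hypothetical): hasStaticScheduling() returns
// true for a loop such as
//
//   #pragma omp target teams distribute parallel for schedule(static)
//   for (int i = 0; i < N; ++i) ...
//
// (no 'ordered' clause; 'schedule' absent or static), and false once an
// 'ordered' clause or a non-static 'schedule' (e.g. dynamic) appears.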
764 
765 /// Check for inner (nested) lightweight runtime construct, if any
766 static bool hasNestedLightweightDirective(ASTContext &Ctx,
767  const OMPExecutableDirective &D) {
768  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
769  const auto *CS = D.getInnermostCapturedStmt();
770  const auto *Body =
771  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
772  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
773 
774  if (const auto *NestedDir =
775  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
776  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
777  switch (D.getDirectiveKind()) {
778  case OMPD_target:
779  if (isOpenMPParallelDirective(DKind) &&
780  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
781  hasStaticScheduling(*NestedDir))
782  return true;
783  if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
784  return true;
785  if (DKind == OMPD_parallel) {
786  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
787  /*IgnoreCaptured=*/true);
788  if (!Body)
789  return false;
790  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
791  if (const auto *NND =
792  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
793  DKind = NND->getDirectiveKind();
794  if (isOpenMPWorksharingDirective(DKind) &&
795  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
796  return true;
797  }
798  } else if (DKind == OMPD_teams) {
799  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
800  /*IgnoreCaptured=*/true);
801  if (!Body)
802  return false;
803  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
804  if (const auto *NND =
805  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
806  DKind = NND->getDirectiveKind();
807  if (isOpenMPParallelDirective(DKind) &&
808  isOpenMPWorksharingDirective(DKind) &&
809  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
810  return true;
811  if (DKind == OMPD_parallel) {
812  Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
813  /*IgnoreCaptured=*/true);
814  if (!Body)
815  return false;
816  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
817  if (const auto *NND =
818  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
819  DKind = NND->getDirectiveKind();
820  if (isOpenMPWorksharingDirective(DKind) &&
821  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
822  return true;
823  }
824  }
825  }
826  }
827  return false;
828  case OMPD_target_teams:
829  if (isOpenMPParallelDirective(DKind) &&
830  isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
831  hasStaticScheduling(*NestedDir))
832  return true;
833  if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
834  return true;
835  if (DKind == OMPD_parallel) {
836  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
837  /*IgnoreCaptured=*/true);
838  if (!Body)
839  return false;
840  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
841  if (const auto *NND =
842  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
843  DKind = NND->getDirectiveKind();
844  if (isOpenMPWorksharingDirective(DKind) &&
845  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
846  return true;
847  }
848  }
849  return false;
850  case OMPD_target_parallel:
851  if (DKind == OMPD_simd)
852  return true;
853  return isOpenMPWorksharingDirective(DKind) &&
854  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
855  case OMPD_target_teams_distribute:
856  case OMPD_target_simd:
857  case OMPD_target_parallel_for:
858  case OMPD_target_parallel_for_simd:
859  case OMPD_target_teams_distribute_simd:
860  case OMPD_target_teams_distribute_parallel_for:
861  case OMPD_target_teams_distribute_parallel_for_simd:
862  case OMPD_parallel:
863  case OMPD_for:
864  case OMPD_parallel_for:
865  case OMPD_parallel_master:
866  case OMPD_parallel_sections:
867  case OMPD_for_simd:
868  case OMPD_parallel_for_simd:
869  case OMPD_cancel:
870  case OMPD_cancellation_point:
871  case OMPD_ordered:
872  case OMPD_threadprivate:
873  case OMPD_allocate:
874  case OMPD_task:
875  case OMPD_simd:
876  case OMPD_sections:
877  case OMPD_section:
878  case OMPD_single:
879  case OMPD_master:
880  case OMPD_critical:
881  case OMPD_taskyield:
882  case OMPD_barrier:
883  case OMPD_taskwait:
884  case OMPD_taskgroup:
885  case OMPD_atomic:
886  case OMPD_flush:
887  case OMPD_depobj:
888  case OMPD_scan:
889  case OMPD_teams:
890  case OMPD_target_data:
891  case OMPD_target_exit_data:
892  case OMPD_target_enter_data:
893  case OMPD_distribute:
894  case OMPD_distribute_simd:
895  case OMPD_distribute_parallel_for:
896  case OMPD_distribute_parallel_for_simd:
897  case OMPD_teams_distribute:
898  case OMPD_teams_distribute_simd:
899  case OMPD_teams_distribute_parallel_for:
900  case OMPD_teams_distribute_parallel_for_simd:
901  case OMPD_target_update:
902  case OMPD_declare_simd:
903  case OMPD_declare_variant:
904  case OMPD_begin_declare_variant:
905  case OMPD_end_declare_variant:
906  case OMPD_declare_target:
907  case OMPD_end_declare_target:
908  case OMPD_declare_reduction:
909  case OMPD_declare_mapper:
910  case OMPD_taskloop:
911  case OMPD_taskloop_simd:
912  case OMPD_master_taskloop:
913  case OMPD_master_taskloop_simd:
914  case OMPD_parallel_master_taskloop:
915  case OMPD_parallel_master_taskloop_simd:
916  case OMPD_requires:
917  case OMPD_unknown:
918  default:
919  llvm_unreachable("Unexpected directive.");
920  }
921  }
922 
923  return false;
924 }
925 
926 /// Checks if the construct supports lightweight runtime. It must be SPMD
927 /// construct + inner loop-based construct with static scheduling.
928 static bool supportsLightweightRuntime(ASTContext &Ctx,
929  const OMPExecutableDirective &D) {
930  if (!supportsSPMDExecutionMode(Ctx, D))
931  return false;
932  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
933  switch (DirectiveKind) {
934  case OMPD_target:
935  case OMPD_target_teams:
936  case OMPD_target_parallel:
937  return hasNestedLightweightDirective(Ctx, D);
938  case OMPD_target_parallel_for:
939  case OMPD_target_parallel_for_simd:
940  case OMPD_target_teams_distribute_parallel_for:
941  case OMPD_target_teams_distribute_parallel_for_simd:
942  // (Last|First)-privates must be shared in parallel region.
943  return hasStaticScheduling(D);
944  case OMPD_target_simd:
945  case OMPD_target_teams_distribute_simd:
946  return true;
947  case OMPD_target_teams_distribute:
948  return false;
949  case OMPD_parallel:
950  case OMPD_for:
951  case OMPD_parallel_for:
952  case OMPD_parallel_master:
953  case OMPD_parallel_sections:
954  case OMPD_for_simd:
955  case OMPD_parallel_for_simd:
956  case OMPD_cancel:
957  case OMPD_cancellation_point:
958  case OMPD_ordered:
959  case OMPD_threadprivate:
960  case OMPD_allocate:
961  case OMPD_task:
962  case OMPD_simd:
963  case OMPD_sections:
964  case OMPD_section:
965  case OMPD_single:
966  case OMPD_master:
967  case OMPD_critical:
968  case OMPD_taskyield:
969  case OMPD_barrier:
970  case OMPD_taskwait:
971  case OMPD_taskgroup:
972  case OMPD_atomic:
973  case OMPD_flush:
974  case OMPD_depobj:
975  case OMPD_scan:
976  case OMPD_teams:
977  case OMPD_target_data:
978  case OMPD_target_exit_data:
979  case OMPD_target_enter_data:
980  case OMPD_distribute:
981  case OMPD_distribute_simd:
982  case OMPD_distribute_parallel_for:
983  case OMPD_distribute_parallel_for_simd:
984  case OMPD_teams_distribute:
985  case OMPD_teams_distribute_simd:
986  case OMPD_teams_distribute_parallel_for:
987  case OMPD_teams_distribute_parallel_for_simd:
988  case OMPD_target_update:
989  case OMPD_declare_simd:
990  case OMPD_declare_variant:
991  case OMPD_begin_declare_variant:
992  case OMPD_end_declare_variant:
993  case OMPD_declare_target:
994  case OMPD_end_declare_target:
995  case OMPD_declare_reduction:
996  case OMPD_declare_mapper:
997  case OMPD_taskloop:
998  case OMPD_taskloop_simd:
999  case OMPD_master_taskloop:
1000  case OMPD_master_taskloop_simd:
1001  case OMPD_parallel_master_taskloop:
1002  case OMPD_parallel_master_taskloop_simd:
1003  case OMPD_requires:
1004  case OMPD_unknown:
1005  default:
1006  break;
1007  }
1008  llvm_unreachable(
1009  "Unknown programming model for OpenMP directive on NVPTX target.");
1010 }
1011 
1012 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1013  StringRef ParentName,
1014  llvm::Function *&OutlinedFn,
1015  llvm::Constant *&OutlinedFnID,
1016  bool IsOffloadEntry,
1017  const RegionCodeGenTy &CodeGen) {
1018  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1019  EntryFunctionState EST;
1020  WrapperFunctionsMap.clear();
1021 
1022  // Emit target region as a standalone region.
1023  class NVPTXPrePostActionTy : public PrePostActionTy {
1024  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1025 
1026  public:
1027  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1028  : EST(EST) {}
1029  void Enter(CodeGenFunction &CGF) override {
1030  auto &RT =
1031  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1032  RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
1033  // Skip target region initialization.
1034  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1035  }
1036  void Exit(CodeGenFunction &CGF) override {
1037  auto &RT =
1038  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1039  RT.clearLocThreadIdInsertPt(CGF);
1040  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
1041  }
1042  } Action(EST);
1043  CodeGen.setAction(Action);
1044  IsInTTDRegion = true;
1045  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1046  IsOffloadEntry, CodeGen);
1047  IsInTTDRegion = false;
1048 }
1049 
1050 void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
1051  EntryFunctionState &EST, bool IsSPMD) {
1052  CGBuilderTy &Bld = CGF.Builder;
1053  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1054  IsInTargetMasterThreadRegion = IsSPMD;
1055  if (!IsSPMD)
1056  emitGenericVarsProlog(CGF, EST.Loc);
1057 }
1058 
1059 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
1060  EntryFunctionState &EST,
1061  bool IsSPMD) {
1062  if (!IsSPMD)
1063  emitGenericVarsEpilog(CGF);
1064 
1065  CGBuilderTy &Bld = CGF.Builder;
1066  OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
1067 }
1068 
1069 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1070  StringRef ParentName,
1071  llvm::Function *&OutlinedFn,
1072  llvm::Constant *&OutlinedFnID,
1073  bool IsOffloadEntry,
1074  const RegionCodeGenTy &CodeGen) {
1075  ExecutionRuntimeModesRAII ModeRAII(
1076  CurrentExecutionMode, RequiresFullRuntime,
1077  CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1078  !supportsLightweightRuntime(CGM.getContext(), D));
1079  EntryFunctionState EST;
1080 
1081  // Emit target region as a standalone region.
1082  class NVPTXPrePostActionTy : public PrePostActionTy {
1083  CGOpenMPRuntimeGPU &RT;
1084  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1085 
1086  public:
1087  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1088  CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1089  : RT(RT), EST(EST) {}
1090  void Enter(CodeGenFunction &CGF) override {
1091  RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
1092  // Skip target region initialization.
1093  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1094  }
1095  void Exit(CodeGenFunction &CGF) override {
1096  RT.clearLocThreadIdInsertPt(CGF);
1097  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
1098  }
1099  } Action(*this, EST);
1100  CodeGen.setAction(Action);
1101  IsInTTDRegion = true;
1102  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1103  IsOffloadEntry, CodeGen);
1104  IsInTTDRegion = false;
1105 }
1106 
1107 // Create a unique global variable to indicate the execution mode of this target
1108 // region. The execution mode is either 'generic', or 'spmd' depending on the
1109 // target directive. This variable is picked up by the offload library to setup
1110 // the device appropriately before kernel launch. If the execution mode is
1111 // 'generic', the runtime reserves one warp for the master, otherwise, all
1112 // warps participate in parallel work.
1113 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1114  bool Mode) {
1115  auto *GVMode = new llvm::GlobalVariable(
1116  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1117  llvm::GlobalValue::WeakAnyLinkage,
1118  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1119  : OMP_TGT_EXEC_MODE_GENERIC),
1120  Twine(Name, "_exec_mode"));
1121  CGM.addCompilerUsedGlobal(GVMode);
1122 }
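// Illustrative output (approximate): for a target region outlined as
// '__omp_offloading_<...>_foo_l10', the code above emits an i8 global named
// '__omp_offloading_<...>_foo_l10_exec_mode' holding OMP_TGT_EXEC_MODE_SPMD or
// OMP_TGT_EXEC_MODE_GENERIC; the offload plugin inspects it before launching
// the kernel.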
1123 
1124 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1125  llvm::Constant *Addr,
1126  uint64_t Size, int32_t,
1127  llvm::GlobalValue::LinkageTypes) {
1128  // TODO: Add support for global variables on the device after declare target
1129  // support.
1130  if (!isa<llvm::Function>(Addr))
1131  return;
1132  llvm::Module &M = CGM.getModule();
1133  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1134 
1135  // Get "nvvm.annotations" metadata node
1136  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1137 
1138  llvm::Metadata *MDVals[] = {
1139  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1140  llvm::ConstantAsMetadata::get(
1141  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1142  // Append metadata to nvvm.annotations
1143  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1144 }
1145 
1146 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1147  const OMPExecutableDirective &D, StringRef ParentName,
1148  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1149  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1150  if (!IsOffloadEntry) // Nothing to do.
1151  return;
1152 
1153  assert(!ParentName.empty() && "Invalid target region parent name!");
1154 
1155  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1156  if (Mode)
1157  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1158  CodeGen);
1159  else
1160  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1161  CodeGen);
1162 
1163  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1164 }
1165 
1166 namespace {
1167 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1168 /// Enum for accessing the reserved_2 field of the ident_t struct.
1169 enum ModeFlagsTy : unsigned {
1170  /// Bit set to 1 when in SPMD mode.
1171  KMP_IDENT_SPMD_MODE = 0x01,
1172  /// Bit set to 1 when a simplified runtime is used.
1173  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1174  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1175 };
1176 
1177 /// Special mode Undefined: the combination of Non-SPMD mode + SimpleRuntime.
1178 static const ModeFlagsTy UndefinedMode =
1179  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1180 } // anonymous namespace
1181 
1182 unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1183  switch (getExecutionMode()) {
1184  case EM_SPMD:
1185  if (requiresFullRuntime())
1186  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1187  return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1188  case EM_NonSPMD:
1189  assert(requiresFullRuntime() && "Expected full runtime.");
1190  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1191  case EM_Unknown:
1192  return UndefinedMode;
1193  }
1194  llvm_unreachable("Unknown flags are requested.");
1195 }
1196 
1197 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1198  : CGOpenMPRuntime(CGM, "_", "$") {
1199  if (!CGM.getLangOpts().OpenMPIsDevice)
1200  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1201 
1202  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
1203  if (CGM.getLangOpts().OpenMPTargetNewRuntime) {
1204  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
1205  "__omp_rtl_debug_kind");
1206  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
1207  "__omp_rtl_assume_teams_oversubscription");
1208  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
1209  "__omp_rtl_assume_threads_oversubscription");
1210  }
1211 }
1212 
1213 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1214  ProcBindKind ProcBind,
1215  SourceLocation Loc) {
1216  // Do nothing in case of SPMD mode and L0 parallel.
1217  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1218  return;
1219 
1220  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1221 }
1222 
1223 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1224  llvm::Value *NumThreads,
1225  SourceLocation Loc) {
1226  // Do nothing in case of SPMD mode and L0 parallel.
1227  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1228  return;
1229 
1230  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1231 }
1232 
1233 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1234  const Expr *NumTeams,
1235  const Expr *ThreadLimit,
1236  SourceLocation Loc) {}
1237 
1238 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1239  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1240  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1241  // Emit target region as a standalone region.
1242  class NVPTXPrePostActionTy : public PrePostActionTy {
1243  bool &IsInParallelRegion;
1244  bool PrevIsInParallelRegion;
1245 
1246  public:
1247  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1248  : IsInParallelRegion(IsInParallelRegion) {}
1249  void Enter(CodeGenFunction &CGF) override {
1250  PrevIsInParallelRegion = IsInParallelRegion;
1251  IsInParallelRegion = true;
1252  }
1253  void Exit(CodeGenFunction &CGF) override {
1254  IsInParallelRegion = PrevIsInParallelRegion;
1255  }
1256  } Action(IsInParallelRegion);
1257  CodeGen.setAction(Action);
1258  bool PrevIsInTTDRegion = IsInTTDRegion;
1259  IsInTTDRegion = false;
1260  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1261  IsInTargetMasterThreadRegion = false;
1262  auto *OutlinedFun =
1263  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1264  D, ThreadIDVar, InnermostKind, CodeGen));
1265  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1266  IsInTTDRegion = PrevIsInTTDRegion;
1267  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1268  !IsInParallelRegion) {
1269  llvm::Function *WrapperFun =
1270  createParallelDataSharingWrapper(OutlinedFun, D);
1271  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1272  }
1273 
1274  return OutlinedFun;
1275 }
1276 
1277 /// Get list of lastprivate variables from the teams distribute ... or
1278 /// teams {distribute ...} directives.
1279 static void
1280 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1281  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1282  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1283  "expected teams directive.");
1284  const OMPExecutableDirective *Dir = &D;
1285  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1286  if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1287  Ctx,
1288  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1289  /*IgnoreCaptured=*/true))) {
1290  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1291  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1292  Dir = nullptr;
1293  }
1294  }
1295  if (!Dir)
1296  return;
1297  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1298  for (const Expr *E : C->getVarRefs())
1299  Vars.push_back(getPrivateItem(E));
1300  }
1301 }
1302 
1303 /// Get list of reduction variables from the teams ... directives.
1304 static void
1305 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1306  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1307  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1308  "expected teams directive.");
1309  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1310  for (const Expr *E : C->privates())
1311  Vars.push_back(getPrivateItem(E));
1312  }
1313 }
1314 
1315 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1316  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1317  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1318  SourceLocation Loc = D.getBeginLoc();
1319 
1320  const RecordDecl *GlobalizedRD = nullptr;
1321  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1322  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1323  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1324  // Globalize team reductions variable unconditionally in all modes.
1325  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1326  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1327  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1328  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1329  if (!LastPrivatesReductions.empty()) {
1330  GlobalizedRD = ::buildRecordForGlobalizedVars(
1331  CGM.getContext(), llvm::None, LastPrivatesReductions,
1332  MappedDeclsFields, WarpSize);
1333  }
1334  } else if (!LastPrivatesReductions.empty()) {
1335  assert(!TeamAndReductions.first &&
1336  "Previous team declaration is not expected.");
1337  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1338  std::swap(TeamAndReductions.second, LastPrivatesReductions);
1339  }
1340 
1341  // Emit target region as a standalone region.
1342  class NVPTXPrePostActionTy : public PrePostActionTy {
1343  SourceLocation &Loc;
1344  const RecordDecl *GlobalizedRD;
1345  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1346  &MappedDeclsFields;
1347 
1348  public:
1349  NVPTXPrePostActionTy(
1350  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1351  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1352  &MappedDeclsFields)
1353  : Loc(Loc), GlobalizedRD(GlobalizedRD),
1354  MappedDeclsFields(MappedDeclsFields) {}
1355  void Enter(CodeGenFunction &CGF) override {
1356  auto &Rt =
1357  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1358  if (GlobalizedRD) {
1359  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1360  I->getSecond().MappedParams =
1361  std::make_unique<CodeGenFunction::OMPMapVars>();
1362  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1363  for (const auto &Pair : MappedDeclsFields) {
1364  assert(Pair.getFirst()->isCanonicalDecl() &&
1365  "Expected canonical declaration");
1366  Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1367  }
1368  }
1369  Rt.emitGenericVarsProlog(CGF, Loc);
1370  }
1371  void Exit(CodeGenFunction &CGF) override {
1372  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1373  .emitGenericVarsEpilog(CGF);
1374  }
1375  } Action(Loc, GlobalizedRD, MappedDeclsFields);
1376  CodeGen.setAction(Action);
1377  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1378  D, ThreadIDVar, InnermostKind, CodeGen);
1379 
1380  return OutlinedFun;
1381 }
1382 
1383 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1384  SourceLocation Loc,
1385  bool WithSPMDCheck) {
1386  if (getDataSharingMode(CGM) == CGOpenMPRuntimeGPU::CUDA &&
1387  getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1388  return;
1389 
1390  CGBuilderTy &Bld = CGF.Builder;
1391 
1392  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1393  if (I == FunctionGlobalizedDecls.end())
1394  return;
1395 
1396  for (auto &Rec : I->getSecond().LocalVarData) {
1397  const auto *VD = cast<VarDecl>(Rec.first);
1398  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1399  QualType VarTy = VD->getType();
1400 
1401  // Get the local allocation of a firstprivate variable before sharing
1402  llvm::Value *ParValue;
1403  if (EscapedParam) {
1404  LValue ParLVal =
1405  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1406  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1407  }
1408 
1409  // Allocate space for the variable to be globalized
1410  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1411  llvm::Instruction *VoidPtr =
1412  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1413  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1414  AllocArgs, VD->getName());
1415 
1416  // Cast the void pointer and get the address of the globalized variable.
1417  llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1418  llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1419  VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1420  LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1421  Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1422  Rec.second.GlobalizedVal = VoidPtr;
1423 
1424  // Assign the local allocation to the newly globalized location.
1425  if (EscapedParam) {
1426  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1427  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1428  }
1429  if (auto *DI = CGF.getDebugInfo())
1430  VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1431  }
1432  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1433  // Use actual memory size of the VLA object including the padding
1434  // for alignment purposes.
1435  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1436  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1437  Size = Bld.CreateNUWAdd(
1438  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1439  llvm::Value *AlignVal =
1440  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1441 
1442  Size = Bld.CreateUDiv(Size, AlignVal);
1443  Size = Bld.CreateNUWMul(Size, AlignVal);
1444 
1445  // Allocate space for this VLA object to be globalized.
1446  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1447  llvm::Instruction *VoidPtr =
1448  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1449  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1450  AllocArgs, VD->getName());
1451 
1452  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1453  std::pair<llvm::Value *, llvm::Value *>(
1454  {VoidPtr, CGF.getTypeSize(VD->getType())}));
1455  LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1456  CGM.getContext().getDeclAlign(VD),
1457  AlignmentSource::Decl);
1458  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1459  Base.getAddress(CGF));
1460  }
1461  I->getSecond().MappedParams->apply(CGF);
1462 }
1463 
1464 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1465  bool WithSPMDCheck) {
1466  if (getDataSharingMode(CGM) == CGOpenMPRuntimeGPU::CUDA &&
1467  getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1468  return;
1469 
1470  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1471  if (I != FunctionGlobalizedDecls.end()) {
1472  // Deallocate the memory for each globalized VLA object
1473  for (auto AddrSizePair :
1474  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1475  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1476  CGM.getModule(), OMPRTL___kmpc_free_shared),
1477  {AddrSizePair.first, AddrSizePair.second});
1478  }
1479  // Deallocate the memory for each globalized value
1480  for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1481  const auto *VD = cast<VarDecl>(Rec.first);
1482  I->getSecond().MappedParams->restore(CGF);
1483 
1484  llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1485  CGF.getTypeSize(VD->getType())};
1486  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1487  CGM.getModule(), OMPRTL___kmpc_free_shared),
1488  FreeArgs);
1489  }
1490  }
1491 }
1492 
1493 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1494  const OMPExecutableDirective &D,
1495  SourceLocation Loc,
1496  llvm::Function *OutlinedFn,
1497  ArrayRef<llvm::Value *> CapturedVars) {
1498  if (!CGF.HaveInsertPoint())
1499  return;
1500 
1501  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1502  /*Name=*/".zero.addr");
1503  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1504  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1505  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1506  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1507  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1508  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1509 }
1510 
1511 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1512  SourceLocation Loc,
1513  llvm::Function *OutlinedFn,
1514  ArrayRef<llvm::Value *> CapturedVars,
1515  const Expr *IfCond) {
1516  if (!CGF.HaveInsertPoint())
1517  return;
1518 
1519  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
1520  IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
1521  CGBuilderTy &Bld = CGF.Builder;
1522  llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1523  llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1524  if (WFn)
1525  ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1526  llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1527 
1528  // Create a private scope that will globalize the arguments
1529  // passed from the outside of the target region.
1530  // TODO: Is that needed?
1531  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1532 
1533  Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1534  llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1535  "captured_vars_addrs");
1536  // There's something to share.
1537  if (!CapturedVars.empty()) {
1538  // Prepare for parallel region. Indicate the outlined function.
1539  ASTContext &Ctx = CGF.getContext();
1540  unsigned Idx = 0;
1541  for (llvm::Value *V : CapturedVars) {
1542  Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1543  llvm::Value *PtrV;
1544  if (V->getType()->isIntegerTy())
1545  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1546  else
1547  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1548  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1549  Ctx.getPointerType(Ctx.VoidPtrTy));
1550  ++Idx;
1551  }
1552  }
1553 
1554  llvm::Value *IfCondVal = nullptr;
1555  if (IfCond)
1556  IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1557  /* isSigned */ false);
1558  else
1559  IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1560 
1561  assert(IfCondVal && "Expected a value");
1562  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1563  llvm::Value *Args[] = {
1564  RTLoc,
1565  getThreadID(CGF, Loc),
1566  IfCondVal,
1567  llvm::ConstantInt::get(CGF.Int32Ty, -1),
1568  llvm::ConstantInt::get(CGF.Int32Ty, -1),
1569  FnPtr,
1570  ID,
1571  Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1572  CGF.VoidPtrPtrTy),
1573  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1574  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1575  CGM.getModule(), OMPRTL___kmpc_parallel_51),
1576  Args);
1577  };
1578 
1579  RegionCodeGenTy RCG(ParallelGen);
1580  RCG(CGF);
1581 }
1582 
1583 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1584  // Always emit simple barriers!
1585  if (!CGF.HaveInsertPoint())
1586  return;
1587  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1588  // This function does not use parameters, so we can emit just default values.
1589  llvm::Value *Args[] = {
1590  llvm::ConstantPointerNull::get(
1591  cast<llvm::PointerType>(getIdentTyPointerTy())),
1592  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1593  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1594  CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1595  Args);
1596 }
1597 
1598 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1599  SourceLocation Loc,
1600  OpenMPDirectiveKind Kind, bool,
1601  bool) {
1602  // Always emit simple barriers!
1603  if (!CGF.HaveInsertPoint())
1604  return;
1605  // Build call __kmpc_cancel_barrier(loc, thread_id);
1606  unsigned Flags = getDefaultFlagsForBarriers(Kind);
1607  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1608  getThreadID(CGF, Loc)};
1609 
1610  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1611  CGM.getModule(), OMPRTL___kmpc_barrier),
1612  Args);
1613 }
1614 
1615 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1616  CodeGenFunction &CGF, StringRef CriticalName,
1617  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1618  const Expr *Hint) {
1619  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1620  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1621  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1622  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1623  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1624 
1625  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1626 
1627  // Get the mask of active threads in the warp.
1628  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1629  CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1630  // Fetch team-local id of the thread.
1631  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1632 
1633  // Get the width of the team.
1634  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1635 
1636  // Initialize the counter variable for the loop.
1637  QualType Int32Ty =
1638  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1639  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1640  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1641  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1642  /*isInit=*/true);
1643 
1644  // Block checks if loop counter exceeds upper bound.
1645  CGF.EmitBlock(LoopBB);
1646  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1647  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1648  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1649 
1650  // Block tests which single thread should execute region, and which threads
1651  // should go straight to synchronisation point.
1652  CGF.EmitBlock(TestBB);
1653  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1654  llvm::Value *CmpThreadToCounter =
1655  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1656  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1657 
1658  // Block emits the body of the critical region.
1659  CGF.EmitBlock(BodyBB);
1660 
1661  // Output the critical statement.
1662  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1663  Hint);
1664 
1665  // After the body surrounded by the critical region, the single executing
1666  // thread will jump to the synchronisation point.
1667  // Block waits for all threads in current team to finish then increments the
1668  // counter variable and returns to the loop.
1669  CGF.EmitBlock(SyncBB);
1670  // Reconverge active threads in the warp.
1671  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1672  CGM.getModule(), OMPRTL___kmpc_syncwarp),
1673  Mask);
1674 
1675  llvm::Value *IncCounterVal =
1676  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1677  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1678  CGF.EmitBranch(LoopBB);
1679 
1680  // Block that is reached when all threads in the team complete the region.
1681  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1682 }
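// The control flow emitted above corresponds roughly to this sketch
// (illustrative pseudocode, not the exact IR):
//   mask = __kmpc_warp_active_thread_mask();
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical region body, guarded by the base class's critical calls>;
//     __kmpc_syncwarp(mask);
//   }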
1683 
1684 /// Cast value to the specified type.
1685 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1686  QualType ValTy, QualType CastTy,
1687  SourceLocation Loc) {
1688  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1689  "Cast type must be sized.");
1690  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1691  "Val type must be sized.");
1692  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1693  if (ValTy == CastTy)
1694  return Val;
1695  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1696  CGF.getContext().getTypeSizeInChars(CastTy))
1697  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1698  if (CastTy->isIntegerType() && ValTy->isIntegerType())
1699  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1700  CastTy->hasSignedIntegerRepresentation());
1701  Address CastItem = CGF.CreateMemTemp(CastTy);
1702  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1703  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
1704  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1705  LValueBaseInfo(AlignmentSource::Type),
1706  TBAAAccessInfo());
1707  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1708  LValueBaseInfo(AlignmentSource::Type),
1709  TBAAAccessInfo());
1710 }
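// Illustrative behaviour of the cases above: a 'float' cast to a 32-bit
// integer is a plain bitcast (same size), a 'bool' or 'char' cast to a 32-bit
// integer is an integer extension, and any other combination round-trips
// through a stack temporary of the cast type.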
1711 
1712 /// This function creates calls to one of two shuffle functions to copy
1713 /// variables between lanes in a warp.
1714 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1715  llvm::Value *Elem,
1716  QualType ElemType,
1717  llvm::Value *Offset,
1718  SourceLocation Loc) {
1719  CodeGenModule &CGM = CGF.CGM;
1720  CGBuilderTy &Bld = CGF.Builder;
1721  CGOpenMPRuntimeGPU &RT =
1722  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1723  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1724 
1725  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1726  assert(Size.getQuantity() <= 8 &&
1727  "Unsupported bitwidth in shuffle instruction.");
1728 
1729  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1730  ? OMPRTL___kmpc_shuffle_int32
1731  : OMPRTL___kmpc_shuffle_int64;
1732 
1733  // Cast all types to 32- or 64-bit values before calling shuffle routines.
1734  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1735  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1736  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1737  llvm::Value *WarpSize =
1738  Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1739 
1740  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1741  OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1742  {ElemCast, Offset, WarpSize});
1743 
1744  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1745 }
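// For example, a 4-byte element is shuffled with a call of roughly this form
// (a sketch, not exact IR); 8-byte elements use __kmpc_shuffle_int64 instead:
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %warpsize)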
1746 
1747 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1748  Address DestAddr, QualType ElemType,
1749  llvm::Value *Offset, SourceLocation Loc) {
1750  CGBuilderTy &Bld = CGF.Builder;
1751 
1752  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1753  // Create the loop over the big sized data.
1754  // ptr = (void*)Elem;
1755  // ptrEnd = (void*) Elem + 1;
1756  // Step = 8;
1757  // while (ptr + Step < ptrEnd)
1758  // shuffle((int64_t)*ptr);
1759  // Step = 4;
1760  // while (ptr + Step < ptrEnd)
1761  // shuffle((int32_t)*ptr);
1762  // ...
1763  Address ElemPtr = DestAddr;
1764  Address Ptr = SrcAddr;
1765  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1766  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
1767  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1768  if (Size < CharUnits::fromQuantity(IntSize))
1769  continue;
1770  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1771  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1772  /*Signed=*/1);
1773  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1774  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
1775  ElemPtr =
1776  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
1777  if (Size.getQuantity() / IntSize > 1) {
1778  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1779  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1780  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1781  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1782  CGF.EmitBlock(PreCondBB);
1783  llvm::PHINode *PhiSrc =
1784  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1785  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1786  llvm::PHINode *PhiDest =
1787  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1788  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1789  Ptr = Address(PhiSrc, Ptr.getAlignment());
1790  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
1791  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1792  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
1793  Ptr.getPointer(), CGF.VoidPtrTy));
1794  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1795  ThenBB, ExitBB);
1796  CGF.EmitBlock(ThenBB);
1797  llvm::Value *Res = createRuntimeShuffleFunction(
1798  CGF,
1799  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1800  LValueBaseInfo(AlignmentSource::Type),
1801  TBAAAccessInfo()),
1802  IntType, Offset, Loc);
1803  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1804  LValueBaseInfo(AlignmentSource::Type),
1805  TBAAAccessInfo());
1806  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1807  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1808  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1809  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1810  CGF.EmitBranch(PreCondBB);
1811  CGF.EmitBlock(ExitBB);
1812  } else {
1813  llvm::Value *Res = createRuntimeShuffleFunction(
1814  CGF,
1815  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1816  LValueBaseInfo(AlignmentSource::Type),
1817  TBAAAccessInfo()),
1818  IntType, Offset, Loc);
1819  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1820  LValueBaseInfo(AlignmentSource::Type),
1821  TBAAAccessInfo());
1822  Ptr = Bld.CreateConstGEP(Ptr, 1);
1823  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1824  }
1825  Size = Size % IntSize;
1826  }
1827 }
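// Worked example for the loop above (illustrative): a 12-byte reduce element
// is moved as one 8-byte chunk (IntSize == 8) followed by one 4-byte chunk
// (IntSize == 4); Size then reaches zero and the smaller widths are skipped.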
1828 
1829 namespace {
1830 enum CopyAction : unsigned {
1831  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1832  // the warp using shuffle instructions.
1833  RemoteLaneToThread,
1834  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1835  ThreadCopy,
1836  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1837  ThreadToScratchpad,
1838  // ScratchpadToThread: Copy from a scratchpad array in global memory
1839  // containing team-reduced data to a thread's stack.
1840  ScratchpadToThread,
1841 };
1842 } // namespace
1843 
1844 struct CopyOptionsTy {
1845  llvm::Value *RemoteLaneOffset;
1846  llvm::Value *ScratchpadIndex;
1847  llvm::Value *ScratchpadWidth;
1848 };
1849 
1850 /// Emit instructions to copy a Reduce list, which contains partially
1851 /// aggregated values, in the specified direction.
1852 static void emitReductionListCopy(
1853  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1854  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1855  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1856 
1857  CodeGenModule &CGM = CGF.CGM;
1858  ASTContext &C = CGM.getContext();
1859  CGBuilderTy &Bld = CGF.Builder;
1860 
1861  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1862  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1863  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1864 
1865  // Iterates, element-by-element, through the source Reduce list and
1866  // make a copy.
1867  unsigned Idx = 0;
1868  unsigned Size = Privates.size();
1869  for (const Expr *Private : Privates) {
1870  Address SrcElementAddr = Address::invalid();
1871  Address DestElementAddr = Address::invalid();
1872  Address DestElementPtrAddr = Address::invalid();
1873  // Should we shuffle in an element from a remote lane?
1874  bool ShuffleInElement = false;
1875  // Set to true to update the pointer in the dest Reduce list to a
1876  // newly created element.
1877  bool UpdateDestListPtr = false;
1878  // Increment the src or dest pointer to the scratchpad, for each
1879  // new element.
1880  bool IncrScratchpadSrc = false;
1881  bool IncrScratchpadDest = false;
1882 
1883  switch (Action) {
1884  case RemoteLaneToThread: {
1885  // Step 1.1: Get the address for the src element in the Reduce list.
1886  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1887  SrcElementAddr = CGF.EmitLoadOfPointer(
1888  SrcElementPtrAddr,
1889  C.getPointerType(Private->getType())->castAs<PointerType>());
1890 
1891  // Step 1.2: Create a temporary to store the element in the destination
1892  // Reduce list.
1893  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1894  DestElementAddr =
1895  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1896  ShuffleInElement = true;
1897  UpdateDestListPtr = true;
1898  break;
1899  }
1900  case ThreadCopy: {
1901  // Step 1.1: Get the address for the src element in the Reduce list.
1902  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1903  SrcElementAddr = CGF.EmitLoadOfPointer(
1904  SrcElementPtrAddr,
1905  C.getPointerType(Private->getType())->castAs<PointerType>());
1906 
1907  // Step 1.2: Get the address for dest element. The destination
1908  // element has already been created on the thread's stack.
1909  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1910  DestElementAddr = CGF.EmitLoadOfPointer(
1911  DestElementPtrAddr,
1912  C.getPointerType(Private->getType())->castAs<PointerType>());
1913  break;
1914  }
1915  case ThreadToScratchpad: {
1916  // Step 1.1: Get the address for the src element in the Reduce list.
1917  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1918  SrcElementAddr = CGF.EmitLoadOfPointer(
1919  SrcElementPtrAddr,
1920  C.getPointerType(Private->getType())->castAs<PointerType>());
1921 
1922  // Step 1.2: Get the address for dest element:
1923  // address = base + index * ElementSizeInChars.
1924  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1925  llvm::Value *CurrentOffset =
1926  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1927  llvm::Value *ScratchPadElemAbsolutePtrVal =
1928  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1929  ScratchPadElemAbsolutePtrVal =
1930  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1931  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1932  C.getTypeAlignInChars(Private->getType()));
1933  IncrScratchpadDest = true;
1934  break;
1935  }
1936  case ScratchpadToThread: {
1937  // Step 1.1: Get the address for the src element in the scratchpad.
1938  // address = base + index * ElementSizeInChars.
1939  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1940  llvm::Value *CurrentOffset =
1941  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1942  llvm::Value *ScratchPadElemAbsolutePtrVal =
1943  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1944  ScratchPadElemAbsolutePtrVal =
1945  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1946  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1947  C.getTypeAlignInChars(Private->getType()));
1948  IncrScratchpadSrc = true;
1949 
1950  // Step 1.2: Create a temporary to store the element in the destination
1951  // Reduce list.
1952  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1953  DestElementAddr =
1954  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1955  UpdateDestListPtr = true;
1956  break;
1957  }
1958  }
1959 
1960  // Regardless of src and dest of copy, we emit the load of src
1961  // element as this is required in all directions
1962  SrcElementAddr = Bld.CreateElementBitCast(
1963  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1964  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1965  SrcElementAddr.getElementType());
1966 
1967  // Now that all active lanes have read the element in the
1968  // Reduce list, shuffle over the value from the remote lane.
1969  if (ShuffleInElement) {
1970  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1971  RemoteLaneOffset, Private->getExprLoc());
1972  } else {
1973  switch (CGF.getEvaluationKind(Private->getType())) {
1974  case TEK_Scalar: {
1975  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1976  SrcElementAddr, /*Volatile=*/false, Private->getType(),
1977  Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1978  TBAAAccessInfo());
1979  // Store the source element value to the dest element address.
1980  CGF.EmitStoreOfScalar(
1981  Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1982  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1983  break;
1984  }
1985  case TEK_Complex: {
1986  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1987  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1988  Private->getExprLoc());
1989  CGF.EmitStoreOfComplex(
1990  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1991  /*isInit=*/false);
1992  break;
1993  }
1994  case TEK_Aggregate:
1995  CGF.EmitAggregateCopy(
1996  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1997  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1998  Private->getType(), AggValueSlot::DoesNotOverlap);
1999  break;
2000  }
2001  }
2002 
2003  // Step 3.1: Modify reference in dest Reduce list as needed.
2004  // Modifying the reference in Reduce list to point to the newly
2005  // created element. The element is live in the current function
2006  // scope and that of functions it invokes (i.e., reduce_function).
2007  // RemoteReduceData[i] = (void*)&RemoteElem
2008  if (UpdateDestListPtr) {
2009  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2010  DestElementAddr.getPointer(), CGF.VoidPtrTy),
2011  DestElementPtrAddr, /*Volatile=*/false,
2012  C.VoidPtrTy);
2013  }
2014 
2015  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2016  // address of the next element in scratchpad memory, unless we're currently
2017  // processing the last one. Memory alignment is also taken care of here.
2018  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2019  llvm::Value *ScratchpadBasePtr =
2020  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2021  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2022  ScratchpadBasePtr = Bld.CreateNUWAdd(
2023  ScratchpadBasePtr,
2024  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2025 
2026  // Take care of global memory alignment for performance
2027  ScratchpadBasePtr = Bld.CreateNUWSub(
2028  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2029  ScratchpadBasePtr = Bld.CreateUDiv(
2030  ScratchpadBasePtr,
2031  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2032  ScratchpadBasePtr = Bld.CreateNUWAdd(
2033  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2034  ScratchpadBasePtr = Bld.CreateNUWMul(
2035  ScratchpadBasePtr,
2036  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2037 
2038  if (IncrScratchpadDest)
2039  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2040  else /* IncrScratchpadSrc = true */
2041  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2042  }
2043 
2044  ++Idx;
2045  }
2046 }
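// For example (illustrative), a RemoteLaneToThread copy of a reduce list
// {float, double} shuffles each element in from the remote lane into a fresh
// stack temporary and rewrites the matching pointer slot in the destination
// list, while a ThreadCopy simply loads from the source slot and stores into
// the destination slot that already exists on the thread's stack.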
2047 
2048 /// This function emits a helper that gathers Reduce lists from the first
2049 /// lane of every active warp to lanes in the first warp.
2050 ///
2051 /// void inter_warp_copy_func(void* reduce_data, num_warps)
2052 ///   shared smem[warp_size];
2053 ///   For all data entries D in reduce_data:
2054 ///     sync
2055 ///     If (I am the first lane in each warp)
2056 ///       Copy my local D to smem[warp_id]
2057 ///     sync
2058 ///     if (I am the first warp)
2059 ///       Copy smem[thread_id] to my local D
2060 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2061  ArrayRef<const Expr *> Privates,
2062  QualType ReductionArrayTy,
2063  SourceLocation Loc) {
2064  ASTContext &C = CGM.getContext();
2065  llvm::Module &M = CGM.getModule();
2066 
2067  // ReduceList: thread local Reduce list.
2068  // At the stage of the computation when this function is called, partially
2069  // aggregated values reside in the first lane of every active warp.
2070  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2071  C.VoidPtrTy, ImplicitParamDecl::Other);
2072  // NumWarps: number of warps active in the parallel region. This could
2073  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2074  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2075  C.getIntTypeForBitwidth(32, /* Signed */ true),
2076  ImplicitParamDecl::Other);
2077  FunctionArgList Args;
2078  Args.push_back(&ReduceListArg);
2079  Args.push_back(&NumWarpsArg);
2080 
2081  const CGFunctionInfo &CGFI =
2082  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2083  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2084  llvm::GlobalValue::InternalLinkage,
2085  "_omp_reduction_inter_warp_copy_func", &M);
2086  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2087  Fn->setDoesNotRecurse();
2088  CodeGenFunction CGF(CGM);
2089  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2090 
2091  CGBuilderTy &Bld = CGF.Builder;
2092 
2093  // This array is used as a medium to transfer, one reduce element at a time,
2094  // the data from the first lane of every warp to lanes in the first warp
2095  // in order to perform the final step of a reduction in a parallel region
2096  // (reduction across warps). The array is placed in NVPTX __shared__ memory
2097  // for reduced latency, as well as to have a distinct copy for concurrently
2098  // executing target regions. The array is declared with weak linkage so
2099  // as to be shared across compilation units.
2100  StringRef TransferMediumName =
2101  "__openmp_nvptx_data_transfer_temporary_storage";
2102  llvm::GlobalVariable *TransferMedium =
2103  M.getGlobalVariable(TransferMediumName);
2104  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2105  if (!TransferMedium) {
2106  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2107  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2108  TransferMedium = new llvm::GlobalVariable(
2109  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2110  llvm::UndefValue::get(Ty), TransferMediumName,
2111  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2112  SharedAddressSpace);
2113  CGM.addCompilerUsedGlobal(TransferMedium);
2114  }
2115 
2116  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2117  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2118  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2119  // nvptx_lane_id = nvptx_id % warpsize
2120  llvm::Value *LaneID = getNVPTXLaneID(CGF);
2121  // nvptx_warp_id = nvptx_id / warpsize
2122  llvm::Value *WarpID = getNVPTXWarpID(CGF);
2123 
2124  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2125  Address LocalReduceList(
2126  Bld.CreatePointerBitCastOrAddrSpaceCast(
2127  CGF.EmitLoadOfScalar(
2128  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2129  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2130  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2131  CGF.getPointerAlign());
2132 
2133  unsigned Idx = 0;
2134  for (const Expr *Private : Privates) {
2135  //
2136  // Warp master copies reduce element to transfer medium in __shared__
2137  // memory.
2138  //
2139  unsigned RealTySize =
2140  C.getTypeSizeInChars(Private->getType())
2141  .alignTo(C.getTypeAlignInChars(Private->getType()))
2142  .getQuantity();
2143  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2144  unsigned NumIters = RealTySize / TySize;
2145  if (NumIters == 0)
2146  continue;
2147  QualType CType = C.getIntTypeForBitwidth(
2148  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2149  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2150  CharUnits Align = CharUnits::fromQuantity(TySize);
2151  llvm::Value *Cnt = nullptr;
2152  Address CntAddr = Address::invalid();
2153  llvm::BasicBlock *PrecondBB = nullptr;
2154  llvm::BasicBlock *ExitBB = nullptr;
2155  if (NumIters > 1) {
2156  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2157  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2158  /*Volatile=*/false, C.IntTy);
2159  PrecondBB = CGF.createBasicBlock("precond");
2160  ExitBB = CGF.createBasicBlock("exit");
2161  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2162  // There is no need to emit line number for unconditional branch.
2163  (void)ApplyDebugLocation::CreateEmpty(CGF);
2164  CGF.EmitBlock(PrecondBB);
2165  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2166  llvm::Value *Cmp =
2167  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2168  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2169  CGF.EmitBlock(BodyBB);
2170  }
2171  // kmpc_barrier.
2172  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2173  /*EmitChecks=*/false,
2174  /*ForceSimpleCall=*/true);
2175  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2176  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2177  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2178 
2179  // if (lane_id == 0)
2180  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2181  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2182  CGF.EmitBlock(ThenBB);
2183 
2184  // Reduce element = LocalReduceList[i]
2185  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2186  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2187  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2188  // elemptr = ((CopyType*)(elemptrptr)) + I
2189  Address ElemPtr = Address(ElemPtrPtr, Align);
2190  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2191  if (NumIters > 1) {
2192  ElemPtr = Address(Bld.CreateGEP(ElemPtr.getElementType(),
2193  ElemPtr.getPointer(), Cnt),
2194  ElemPtr.getAlignment());
2195  }
2196 
2197  // Get pointer to location in transfer medium.
2198  // MediumPtr = &medium[warp_id]
2199  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2200  TransferMedium->getValueType(), TransferMedium,
2201  {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2202  Address MediumPtr(MediumPtrVal, Align);
2203  // Casting to actual data type.
2204  // MediumPtr = (CopyType*)MediumPtrAddr;
2205  MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2206 
2207  // elem = *elemptr
2208  //*MediumPtr = elem
2209  llvm::Value *Elem = CGF.EmitLoadOfScalar(
2210  ElemPtr, /*Volatile=*/false, CType, Loc,
2211  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2212  // Store the source element value to the dest element address.
2213  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2214  LValueBaseInfo(AlignmentSource::Type),
2215  TBAAAccessInfo());
2216 
2217  Bld.CreateBr(MergeBB);
2218 
2219  CGF.EmitBlock(ElseBB);
2220  Bld.CreateBr(MergeBB);
2221 
2222  CGF.EmitBlock(MergeBB);
2223 
2224  // kmpc_barrier.
2225  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2226  /*EmitChecks=*/false,
2227  /*ForceSimpleCall=*/true);
2228 
2229  //
2230  // Warp 0 copies reduce element from transfer medium.
2231  //
2232  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2233  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2234  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2235 
2236  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2237  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2238  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2239 
2240  // Up to 32 threads in warp 0 are active.
2241  llvm::Value *IsActiveThread =
2242  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2243  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2244 
2245  CGF.EmitBlock(W0ThenBB);
2246 
2247  // SrcMediumPtr = &medium[tid]
2248  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2249  TransferMedium->getValueType(), TransferMedium,
2250  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2251  Address SrcMediumPtr(SrcMediumPtrVal, Align);
2252  // SrcMediumVal = *SrcMediumPtr;
2253  SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2254 
2255  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2256  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2257  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2258  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2259  Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2260  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2261  if (NumIters > 1) {
2262  TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getElementType(),
2263  TargetElemPtr.getPointer(), Cnt),
2264  TargetElemPtr.getAlignment());
2265  }
2266 
2267  // *TargetElemPtr = SrcMediumVal;
2268  llvm::Value *SrcMediumValue =
2269  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2270  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2271  CType);
2272  Bld.CreateBr(W0MergeBB);
2273 
2274  CGF.EmitBlock(W0ElseBB);
2275  Bld.CreateBr(W0MergeBB);
2276 
2277  CGF.EmitBlock(W0MergeBB);
2278 
2279  if (NumIters > 1) {
2280  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2281  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2282  CGF.EmitBranch(PrecondBB);
2283  (void)ApplyDebugLocation::CreateEmpty(CGF);
2284  CGF.EmitBlock(ExitBB);
2285  }
2286  RealTySize %= TySize;
2287  }
2288  ++Idx;
2289  }
2290 
2291  CGF.FinishFunction();
2292  return Fn;
2293 }
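// Note on the transfer medium used above: the __shared__ array provides one
// 32-bit slot per warp, so, for example, an 8-byte 'double' element is moved
// in two 4-byte passes (NumIters == 2), each pass bracketed by the simple
// barriers emitted above.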
2294 
2295 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2296 /// in the same warp. It uses shuffle instructions to copy over data from
2297 /// a remote lane's stack. The reduction algorithm performed is specified
2298 /// by the fourth parameter.
2299 ///
2300 /// Algorithm Versions.
2301 /// Full Warp Reduce (argument value 0):
2302 /// This algorithm assumes that all 32 lanes are active and gathers
2303 /// data from these 32 lanes, producing a single resultant value.
2304 /// Contiguous Partial Warp Reduce (argument value 1):
2305 /// This algorithm assumes that only a *contiguous* subset of lanes
2306 /// are active. This happens for the last warp in a parallel region
2307 /// when the user specified num_threads is not an integer multiple of
2308 /// 32. This contiguous subset always starts with the zeroth lane.
2309 /// Partial Warp Reduce (argument value 2):
2310 /// This algorithm gathers data from any number of lanes at any position.
2311 /// All reduced values are stored in the lowest possible lane. The set
2312 /// of problems every algorithm addresses is a super set of those
2313 /// addressable by algorithms with a lower version number. Overhead
2314 /// increases as algorithm version increases.
2315 ///
2316 /// Terminology
2317 /// Reduce element:
2318 /// Reduce element refers to the individual data field with primitive
2319 /// data types to be combined and reduced across threads.
2320 /// Reduce list:
2321 /// Reduce list refers to a collection of local, thread-private
2322 /// reduce elements.
2323 /// Remote Reduce list:
2324 /// Remote Reduce list refers to a collection of remote (relative to
2325 /// the current thread) reduce elements.
2326 ///
2327 /// We distinguish between three states of threads that are important to
2328 /// the implementation of this function.
2329 /// Alive threads:
2330 /// Threads in a warp executing the SIMT instruction, as distinguished from
2331 /// threads that are inactive due to divergent control flow.
2332 /// Active threads:
2333 /// The minimal set of threads that has to be alive upon entry to this
2334 /// function. The computation is correct iff active threads are alive.
2335 /// Some threads are alive but they are not active because they do not
2336 /// contribute to the computation in any useful manner. Turning them off
2337 /// may introduce control flow overheads without any tangible benefits.
2338 /// Effective threads:
2339 /// In order to comply with the argument requirements of the shuffle
2340 /// function, we must keep all lanes holding data alive. But at most
2341 /// half of them perform value aggregation; we refer to this half of
2342 /// threads as effective. The other half is simply handing off their
2343 /// data.
2344 ///
2345 /// Procedure
2346 /// Value shuffle:
2347 /// In this step active threads transfer data from higher lane positions
2348 /// in the warp to lower lane positions, creating Remote Reduce list.
2349 /// Value aggregation:
2350 /// In this step, effective threads combine their thread local Reduce list
2351 /// with Remote Reduce list and store the result in the thread local
2352 /// Reduce list.
2353 /// Value copy:
2354 /// In this step, we deal with the assumption made by algorithm 2
2355 /// (i.e. contiguity assumption). When we have an odd number of lanes
2356 /// active, say 2k+1, only k threads will be effective and therefore k
2357 /// new values will be produced. However, the Reduce list owned by the
2358 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2359 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2360 /// that the contiguity assumption still holds.
2361 static llvm::Function *emitShuffleAndReduceFunction(
2362  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2363  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2364  ASTContext &C = CGM.getContext();
2365 
2366  // Thread local Reduce list used to host the values of data to be reduced.
2367  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2368  C.VoidPtrTy, ImplicitParamDecl::Other);
2369  // Current lane id; could be logical.
2370  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2371  ImplicitParamDecl::Other);
2372  // Offset of the remote source lane relative to the current lane.
2373  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2374  C.ShortTy, ImplicitParamDecl::Other);
2375  // Algorithm version. This is expected to be known at compile time.
2376  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2377  C.ShortTy, ImplicitParamDecl::Other);
2378  FunctionArgList Args;
2379  Args.push_back(&ReduceListArg);
2380  Args.push_back(&LaneIDArg);
2381  Args.push_back(&RemoteLaneOffsetArg);
2382  Args.push_back(&AlgoVerArg);
2383 
2384  const CGFunctionInfo &CGFI =
2385  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2386  auto *Fn = llvm::Function::Create(
2387  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2388  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2389  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2390  Fn->setDoesNotRecurse();
2391 
2392  CodeGenFunction CGF(CGM);
2393  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2394 
2395  CGBuilderTy &Bld = CGF.Builder;
2396 
2397  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2398  Address LocalReduceList(
2399  Bld.CreatePointerBitCastOrAddrSpaceCast(
2400  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2401  C.VoidPtrTy, SourceLocation()),
2402  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2403  CGF.getPointerAlign());
2404 
2405  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2406  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2407  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2408 
2409  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2410  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2411  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2412 
2413  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2414  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2415  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2416 
2417  // Create a local thread-private variable to host the Reduce list
2418  // from a remote lane.
2419  Address RemoteReduceList =
2420  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2421 
2422  // This loop iterates through the list of reduce elements and copies,
2423  // element by element, from a remote lane in the warp to RemoteReduceList,
2424  // hosted on the thread's stack.
2425  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2426  LocalReduceList, RemoteReduceList,
2427  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2428  /*ScratchpadIndex=*/nullptr,
2429  /*ScratchpadWidth=*/nullptr});
2430 
2431  // The actions to be performed on the Remote Reduce list are dependent
2432  // on the algorithm version.
2433  //
2434  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2435  // LaneId % 2 == 0 && Offset > 0):
2436  // do the reduction value aggregation
2437  //
2438  // The thread local variable Reduce list is mutated in place to host the
2439  // reduced data, which is the aggregated value produced from local and
2440  // remote lanes.
2441  //
2442  // Note that AlgoVer is expected to be a constant integer known at compile
2443  // time.
2444  // When AlgoVer==0, the first conjunction evaluates to true, making
2445  // the entire predicate true during compile time.
2446  // When AlgoVer==1, the second conjunction has only the second part to be
2447 /// evaluated during runtime. Other conjunctions evaluate to false
2448  // during compile time.
2449  // When AlgoVer==2, the third conjunction has only the second part to be
2450 /// evaluated during runtime. Other conjunctions evaluate to false
2451  // during compile time.
2452  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2453 
2454  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2455  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2456  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2457 
2458  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2459  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2460  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2461  CondAlgo2 = Bld.CreateAnd(
2462  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2463 
2464  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2465  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2466 
2467  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2468  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2469  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2470  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2471 
2472  CGF.EmitBlock(ThenBB);
2473  // reduce_function(LocalReduceList, RemoteReduceList)
2474  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2475  LocalReduceList.getPointer(), CGF.VoidPtrTy);
2476  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2477  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2478  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2479  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2480  Bld.CreateBr(MergeBB);
2481 
2482  CGF.EmitBlock(ElseBB);
2483  Bld.CreateBr(MergeBB);
2484 
2485  CGF.EmitBlock(MergeBB);
2486 
2487  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2488  // Reduce list.
2489  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2490  llvm::Value *CondCopy = Bld.CreateAnd(
2491  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2492 
2493  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2494  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2495  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2496  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2497 
2498  CGF.EmitBlock(CpyThenBB);
2499  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2500  RemoteReduceList, LocalReduceList);
2501  Bld.CreateBr(CpyMergeBB);
2502 
2503  CGF.EmitBlock(CpyElseBB);
2504  Bld.CreateBr(CpyMergeBB);
2505 
2506  CGF.EmitBlock(CpyMergeBB);
2507 
2508  CGF.FinishFunction();
2509  return Fn;
2510 }
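// Sketch of how the runtime is expected to drive the helper generated above
// for a full-warp reduction (algorithm version 0); illustrative only, the
// authoritative loop lives in the device runtime:
//   for (offset = warpsize / 2; offset > 0; offset /= 2)
//     shuffleReduceFn(reduce_data, lane_id, offset, /*AlgoVer=*/0);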
2511 
2512 /// This function emits a helper that copies all the reduction variables from
2513 /// the team into the provided global buffer for the reduction variables.
2514 ///
2515 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2516 /// For all data entries D in reduce_data:
2517 /// Copy local D to buffer.D[Idx]
2518 static llvm::Value *emitListToGlobalCopyFunction(
2519  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2520  QualType ReductionArrayTy, SourceLocation Loc,
2521  const RecordDecl *TeamReductionRec,
2522  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2523  &VarFieldMap) {
2524  ASTContext &C = CGM.getContext();
2525 
2526  // Buffer: global reduction buffer.
2527  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2528  C.VoidPtrTy, ImplicitParamDecl::Other);
2529  // Idx: index of the buffer.
2530  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2531  ImplicitParamDecl::Other);
2532  // ReduceList: thread local Reduce list.
2533  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2534  C.VoidPtrTy, ImplicitParamDecl::Other);
2535  FunctionArgList Args;
2536  Args.push_back(&BufferArg);
2537  Args.push_back(&IdxArg);
2538  Args.push_back(&ReduceListArg);
2539 
2540  const CGFunctionInfo &CGFI =
2541  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2542  auto *Fn = llvm::Function::Create(
2543  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2544  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2545  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2546  Fn->setDoesNotRecurse();
2547  CodeGenFunction CGF(CGM);
2548  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2549 
2550  CGBuilderTy &Bld = CGF.Builder;
2551 
2552  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2553  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2554  Address LocalReduceList(
2555  Bld.CreatePointerBitCastOrAddrSpaceCast(
2556  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2557  C.VoidPtrTy, Loc),
2558  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2559  CGF.getPointerAlign());
2560  QualType StaticTy = C.getRecordType(TeamReductionRec);
2561  llvm::Type *LLVMReductionsBufferTy =
2562  CGM.getTypes().ConvertTypeForMem(StaticTy);
2563  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2564  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2565  LLVMReductionsBufferTy->getPointerTo());
2566  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2567  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2568  /*Volatile=*/false, C.IntTy,
2569  Loc)};
2570  unsigned Idx = 0;
2571  for (const Expr *Private : Privates) {
2572  // Reduce element = LocalReduceList[i]
2573  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2574  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2575  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2576  // elemptr = ((CopyType*)(elemptrptr)) + I
2577  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2578  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2579  Address ElemPtr =
2580  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2581  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2582  // Global = Buffer.VD[Idx];
2583  const FieldDecl *FD = VarFieldMap.lookup(VD);
2584  LValue GlobLVal = CGF.EmitLValueForField(
2585  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2586  Address GlobAddr = GlobLVal.getAddress(CGF);
2587  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2588  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2589  GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2590  switch (CGF.getEvaluationKind(Private->getType())) {
2591  case TEK_Scalar: {
2592  llvm::Value *V = CGF.EmitLoadOfScalar(
2593  ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2594  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2595  CGF.EmitStoreOfScalar(V, GlobLVal);
2596  break;
2597  }
2598  case TEK_Complex: {
2599  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2600  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2601  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2602  break;
2603  }
2604  case TEK_Aggregate:
2605  CGF.EmitAggregateCopy(GlobLVal,
2606  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2607  Private->getType(), AggValueSlot::DoesNotOverlap);
2608  break;
2609  }
2610  ++Idx;
2611  }
2612 
2613  CGF.FinishFunction();
2614  return Fn;
2615 }
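// In the helper above, each reduction variable VD owns a field of
// TeamReductionRec, and the slot written is effectively &buffer->VD[Idx]
// (an illustrative reading of the GEP with indices {0, Idx}), where Idx
// selects the current team's entry in the global buffer.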
2616 
2617 /// This function emits a helper that reduces all the reduction variables from
2618 /// the team into the provided global buffer for the reduction variables.
2619 ///
2620 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2621 /// void *GlobPtrs[];
2622 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2623 /// ...
2624 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2625 /// reduce_function(GlobPtrs, reduce_data);
2626 static llvm::Value *emitListToGlobalReduceFunction(
2627  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2628  QualType ReductionArrayTy, SourceLocation Loc,
2629  const RecordDecl *TeamReductionRec,
2630  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2631  &VarFieldMap,
2632  llvm::Function *ReduceFn) {
2633  ASTContext &C = CGM.getContext();
2634 
2635  // Buffer: global reduction buffer.
2636  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2637  C.VoidPtrTy, ImplicitParamDecl::Other);
2638  // Idx: index of the buffer.
2639  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2640  ImplicitParamDecl::Other);
2641  // ReduceList: thread local Reduce list.
2642  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2643  C.VoidPtrTy, ImplicitParamDecl::Other);
2644  FunctionArgList Args;
2645  Args.push_back(&BufferArg);
2646  Args.push_back(&IdxArg);
2647  Args.push_back(&ReduceListArg);
2648 
2649  const CGFunctionInfo &CGFI =
2650  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2651  auto *Fn = llvm::Function::Create(
2652  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2653  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2654  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2655  Fn->setDoesNotRecurse();
2656  CodeGenFunction CGF(CGM);
2657  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2658 
2659  CGBuilderTy &Bld = CGF.Builder;
2660 
2661  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2662  QualType StaticTy = C.getRecordType(TeamReductionRec);
2663  llvm::Type *LLVMReductionsBufferTy =
2664  CGM.getTypes().ConvertTypeForMem(StaticTy);
2665  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2666  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2667  LLVMReductionsBufferTy->getPointerTo());
2668 
2669  // 1. Build a list of reduction variables.
2670  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2671  Address ReductionList =
2672  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2673  auto IPriv = Privates.begin();
2674  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2675  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2676  /*Volatile=*/false, C.IntTy,
2677  Loc)};
2678  unsigned Idx = 0;
2679  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2680  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2681  // Global = Buffer.VD[Idx];
2682  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2683  const FieldDecl *FD = VarFieldMap.lookup(VD);
2684  LValue GlobLVal = CGF.EmitLValueForField(
2685  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2686  Address GlobAddr = GlobLVal.getAddress(CGF);
2687  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2688  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2689  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2690  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2691  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2692  // Store array size.
2693  ++Idx;
2694  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2695  llvm::Value *Size = CGF.Builder.CreateIntCast(
2696  CGF.getVLASize(
2697  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2698  .NumElts,
2699  CGF.SizeTy, /*isSigned=*/false);
2700  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2701  Elem);
2702  }
2703  }
2704 
2705  // Call reduce_function(GlobalReduceList, ReduceList)
2706  llvm::Value *GlobalReduceList =
2707  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2708  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2709  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2710  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2711  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2712  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2713  CGF.FinishFunction();
2714  return Fn;
2715 }
2716 
2717 /// This function emits a helper that copies all the reduction variables from
2718 /// the team into the provided global buffer for the reduction variables.
2719 ///
2720 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2721 /// For all data entries D in reduce_data:
2722 /// Copy buffer.D[Idx] to local D;
2723 static llvm::Value *emitGlobalToListCopyFunction(
2724  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2725  QualType ReductionArrayTy, SourceLocation Loc,
2726  const RecordDecl *TeamReductionRec,
2727  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2728  &VarFieldMap) {
2729  ASTContext &C = CGM.getContext();
2730 
2731  // Buffer: global reduction buffer.
2732  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2733  C.VoidPtrTy, ImplicitParamDecl::Other);
2734  // Idx: index of the buffer.
2735  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2736  ImplicitParamDecl::Other);
2737  // ReduceList: thread local Reduce list.
2738  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2739  C.VoidPtrTy, ImplicitParamDecl::Other);
2740  FunctionArgList Args;
2741  Args.push_back(&BufferArg);
2742  Args.push_back(&IdxArg);
2743  Args.push_back(&ReduceListArg);
2744 
2745  const CGFunctionInfo &CGFI =
2746  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2747  auto *Fn = llvm::Function::Create(
2748  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2749  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2750  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2751  Fn->setDoesNotRecurse();
2752  CodeGenFunction CGF(CGM);
2753  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2754 
2755  CGBuilderTy &Bld = CGF.Builder;
2756 
2757  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2758  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2759  Address LocalReduceList(
2760  Bld.CreatePointerBitCastOrAddrSpaceCast(
2761  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2762  C.VoidPtrTy, Loc),
2763  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2764  CGF.getPointerAlign());
2765  QualType StaticTy = C.getRecordType(TeamReductionRec);
2766  llvm::Type *LLVMReductionsBufferTy =
2767  CGM.getTypes().ConvertTypeForMem(StaticTy);
2768  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2769  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2770  LLVMReductionsBufferTy->getPointerTo());
2771 
2772  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2773  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2774  /*Volatile=*/false, C.IntTy,
2775  Loc)};
2776  unsigned Idx = 0;
2777  for (const Expr *Private : Privates) {
2778  // Reduce element = LocalReduceList[i]
2779  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2780  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2781  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2782  // elemptr = ((CopyType*)(elemptrptr)) + I
2783  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2784  ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2785  Address ElemPtr =
2786  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2787  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2788  // Global = Buffer.VD[Idx];
2789  const FieldDecl *FD = VarFieldMap.lookup(VD);
2790  LValue GlobLVal = CGF.EmitLValueForField(
2791  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2792  Address GlobAddr = GlobLVal.getAddress(CGF);
2793  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2794  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2795  GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2796  switch (CGF.getEvaluationKind(Private->getType())) {
2797  case TEK_Scalar: {
2798  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2799  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2800  LValueBaseInfo(AlignmentSource::Type),
2801  TBAAAccessInfo());
2802  break;
2803  }
2804  case TEK_Complex: {
2805  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2806  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2807  /*isInit=*/false);
2808  break;
2809  }
2810  case TEK_Aggregate:
2811  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2812  GlobLVal, Private->getType(),
2813  AggValueSlot::DoesNotOverlap);
2814  break;
2815  }
2816  ++Idx;
2817  }
2818 
2819  CGF.FinishFunction();
2820  return Fn;
2821 }
2822 
2823 /// This function emits a helper that reduces all the reduction variables from
2824 /// the team into the provided global buffer for the reduction variables.
2825 ///
2826 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2827 /// void *GlobPtrs[];
2828 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2829 /// ...
2830 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2831 /// reduce_function(reduce_data, GlobPtrs);
2832 static llvm::Value *emitGlobalToListReduceFunction(
2833  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2834  QualType ReductionArrayTy, SourceLocation Loc,
2835  const RecordDecl *TeamReductionRec,
2836  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2837  &VarFieldMap,
2838  llvm::Function *ReduceFn) {
2839  ASTContext &C = CGM.getContext();
2840 
2841  // Buffer: global reduction buffer.
2842  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2843  C.VoidPtrTy, ImplicitParamDecl::Other);
2844  // Idx: index of the buffer.
2845  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2846  ImplicitParamDecl::Other);
2847  // ReduceList: thread local Reduce list.
2848  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2849  C.VoidPtrTy, ImplicitParamDecl::Other);
2850  FunctionArgList Args;
2851  Args.push_back(&BufferArg);
2852  Args.push_back(&IdxArg);
2853  Args.push_back(&ReduceListArg);
2854 
2855  const CGFunctionInfo &CGFI =
2856  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2857  auto *Fn = llvm::Function::Create(
2858  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2859  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2860  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2861  Fn->setDoesNotRecurse();
2862  CodeGenFunction CGF(CGM);
2863  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2864 
2865  CGBuilderTy &Bld = CGF.Builder;
2866 
2867  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2868  QualType StaticTy = C.getRecordType(TeamReductionRec);
2869  llvm::Type *LLVMReductionsBufferTy =
2870  CGM.getTypes().ConvertTypeForMem(StaticTy);
2871  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2872  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2873  LLVMReductionsBufferTy->getPointerTo());
2874 
2875  // 1. Build a list of reduction variables.
2876  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2877  Address ReductionList =
2878  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2879  auto IPriv = Privates.begin();
2880  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2881  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2882  /*Volatile=*/false, C.IntTy,
2883  Loc)};
2884  unsigned Idx = 0;
2885  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2886  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2887  // Global = Buffer.VD[Idx];
2888  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2889  const FieldDecl *FD = VarFieldMap.lookup(VD);
2890  LValue GlobLVal = CGF.EmitLValueForField(
2891  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2892  Address GlobAddr = GlobLVal.getAddress(CGF);
2893  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2894  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2895  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2896  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2897  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2898  // Store array size.
2899  ++Idx;
2900  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2901  llvm::Value *Size = CGF.Builder.CreateIntCast(
2902  CGF.getVLASize(
2903  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2904  .NumElts,
2905  CGF.SizeTy, /*isSigned=*/false);
2906  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2907  Elem);
2908  }
2909  }
2910 
2911  // Call reduce_function(ReduceList, GlobalReduceList)
2912  llvm::Value *GlobalReduceList =
2913  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2914  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2915  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2916  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2917  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2918  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2919  CGF.FinishFunction();
2920  return Fn;
2921 }
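// Taken together, the four list_to_global/global_to_list helpers above are
// handed to the teams-reduction entry point (__kmpc_nvptx_teams_reduce_nowait
// in the design notes below) so the runtime can copy and combine ReduceData
// through the global buffer without knowing its layout.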
2922 
2923 ///
2924 /// Design of OpenMP reductions on the GPU
2925 ///
2926 /// Consider a typical OpenMP program with one or more reduction
2927 /// clauses:
2928 ///
2929 ///   float foo;
2930 ///   double bar;
2931 ///   #pragma omp target teams distribute parallel for \
2932 ///               reduction(+:foo) reduction(*:bar)
2933 ///   for (int i = 0; i < N; i++) {
2934 ///     foo += A[i]; bar *= B[i];
2935 ///   }
2936 ///
2937 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2938 /// all teams. In our OpenMP implementation on the NVPTX device an
2939 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2940 /// within a team are mapped to CUDA threads within a threadblock.
2941 /// Our goal is to efficiently aggregate values across all OpenMP
2942 /// threads such that:
2943 ///
2944 /// - the compiler and runtime are logically concise, and
2945 /// - the reduction is performed efficiently in a hierarchical
2946 /// manner as follows: within OpenMP threads in the same warp,
2947 /// across warps in a threadblock, and finally across teams on
2948 /// the NVPTX device.
2949 ///
2950 /// Introduction to Decoupling
2951 ///
2952 /// We would like to decouple the compiler and the runtime so that the
2953 /// latter is ignorant of the reduction variables (number, data types)
2954 /// and the reduction operators. This allows a simpler interface
2955 /// and implementation while still attaining good performance.
2956 ///
2957 /// Pseudocode for the aforementioned OpenMP program generated by the
2958 /// compiler is as follows:
2959 ///
2960 /// 1. Create private copies of reduction variables on each OpenMP
2961 /// thread: 'foo_private', 'bar_private'
2962 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2963 /// to it and writes the result in 'foo_private' and 'bar_private'
2964 /// respectively.
2965 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2966 /// and store the result on the team master:
2967 ///
2968 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2969 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2970 ///
2971 /// where:
2972 /// struct ReduceData {
2973 /// double *foo;
2974 /// double *bar;
2975 /// } reduceData
2976 /// reduceData.foo = &foo_private
2977 /// reduceData.bar = &bar_private
2978 ///
2979 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2980 /// auxiliary functions generated by the compiler that operate on
2981 /// variables of type 'ReduceData'. They help the runtime perform the
2982 /// algorithmic steps in a data-agnostic manner.
2983 ///
2984 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2985 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2986 /// same warp. It takes the following arguments as input:
2987 ///
2988 /// a. variable of type 'ReduceData' on the calling lane,
2989 /// b. its lane_id,
2990 /// c. an offset relative to the current lane_id to generate a
2991 /// remote_lane_id. The remote lane contains the second
2992 /// variable of type 'ReduceData' that is to be reduced.
2993 /// d. an algorithm version parameter determining which reduction
2994 /// algorithm to use.
2995 ///
2996 /// 'shuffleReduceFn' retrieves data from the remote lane using
2997 /// efficient GPU shuffle intrinsics and reduces, using the
2998 /// algorithm specified by the 4th parameter, the two operands
2999 /// element-wise. The result is written to the first operand.
3000 ///
3001 /// Different reduction algorithms are implemented in different
3002 /// runtime functions, all calling 'shuffleReduceFn' to perform
3003 /// the essential reduction step. Therefore, based on the 4th
3004 /// parameter, this function behaves slightly differently to
3005 /// cooperate with the runtime to ensure correctness under
3006 /// different circumstances.
3007 ///
3008 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3009 /// reduced variables across warps. It tunnels, through CUDA
3010 /// shared memory, the thread-private data of type 'ReduceData'
3011 /// from lane 0 of each warp to a lane in the first warp.
3012 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3013 /// The last team writes the global reduced value to memory.
3014 ///
3015 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3016 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3017 /// scratchpadCopyFn, loadAndReduceFn)
3018 ///
3019 /// 'scratchpadCopyFn' is a helper that stores reduced
3020 /// data from the team master to a scratchpad array in
3021 /// global memory.
3022 ///
3023 /// 'loadAndReduceFn' is a helper that loads data from
3024 /// the scratchpad array and reduces it with the input
3025 /// operand.
3026 ///
3027 /// These compiler generated functions hide address
3028 /// calculation and alignment information from the runtime.
3029 /// 5. if ret == 1:
3030 /// The team master of the last team stores the reduced
3031 /// result to the globals in memory.
3032 /// foo += reduceData.foo; bar *= reduceData.bar
3033 ///
3034 ///
3035 /// Warp Reduction Algorithms
3036 ///
3037 /// On the warp level, we have three algorithms implemented in the
3038 /// OpenMP runtime depending on the number of active lanes:
3039 ///
3040 /// Full Warp Reduction
3041 ///
3042 /// The reduce algorithm within a warp where all lanes are active
3043 /// is implemented in the runtime as follows:
3044 ///
3045 /// full_warp_reduce(void *reduce_data,
3046 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3047 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3048 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3049 /// }
3050 ///
3051 /// The algorithm completes in log(2, WARPSIZE) steps.
3052 ///
3053 /// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id
3054 /// is unused by this algorithm, which saves the instructions needed to
3055 /// retrieve it from the corresponding special registers. The 4th parameter, which
3056 /// represents the version of the algorithm being used, is set to 0 to
3057 /// signify full warp reduction.
3058 ///
3059 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3060 ///
3061 /// #reduce_elem refers to an element in the local lane's data structure
3062 /// #remote_elem is retrieved from a remote lane
3063 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3064 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3065 ///
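/// As a minimal standalone sketch of the same pattern for a single float and
/// WARPSIZE == 32, written with the CUDA __shfl_down_sync builtin (the
/// actual runtime goes through the kmpc shuffle entry points instead):
///
///   __device__ float warp_reduce_sum(float val) {
///     for (int offset = 32 / 2; offset > 0; offset /= 2)
///       val += __shfl_down_sync(0xffffffffu, val, offset);
///     return val; // lane 0 now holds the sum over the whole warp
///   }
///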
3066 /// Contiguous Partial Warp Reduction
3067 ///
3068 /// This reduce algorithm is used within a warp where only the first
3069 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3070 /// number of OpenMP threads in a parallel region is not a multiple of
3071 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3072 ///
3073 /// void
3074 /// contiguous_partial_reduce(void *reduce_data,
3075 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3076 /// int size, int lane_id) {
3077 /// int curr_size;
3078 /// int offset;
3079 /// curr_size = size;
3080 /// offset = curr_size/2;
3081 /// while (offset>0) {
3082 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3083 /// curr_size = (curr_size+1)/2;
3084 /// offset = curr_size/2;
3085 /// }
3086 /// }
3087 ///
3088 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3089 ///
3090 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3091 /// if (lane_id < offset)
3092 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3093 /// else
3094 /// reduce_elem = remote_elem
3095 ///
3096 /// This algorithm assumes that the data to be reduced are located in a
3097 /// contiguous subset of lanes starting from the first. When there is
3098 /// an odd number of active lanes, the data in the last lane is not
3099 /// aggregated with any other lane's data but is instead copied over.
3100 ///
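/// As a worked example, take size == 5 with per-lane values a0..a4. The loop
/// issues offsets 2, 1, 1 (curr_size shrinks 5 -> 3 -> 2 -> 1):
///
///   offset 2: lane 0 <- a0 OP a2, lane 1 <- a1 OP a3, lane 2 <- a4 (copy)
///   offset 1: lane 0 <- (a0 OP a2) OP (a1 OP a3), lane 1 <- a4 (copy)
///   offset 1: lane 0 <- a0 OP a1 OP a2 OP a3 OP a4
///
/// so the fully reduced value ends up in lane 0.
///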
3101 /// Dispersed Partial Warp Reduction
3102 ///
3103 /// This algorithm is used within a warp when any discontiguous subset of
3104 /// lanes are active. It is used to implement the reduction operation
3105 /// across lanes in an OpenMP simd region or in a nested parallel region.
3106 ///
3107 /// void
3108 /// dispersed_partial_reduce(void *reduce_data,
3109 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3110 /// int size, remote_id;
3111 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3112 /// do {
3113 /// remote_id = next_active_lane_id_right_after_me();
3114 /// # the above function returns 0 if no active lane
3115 /// # is present right after the current lane.
3116 /// size = number_of_active_lanes_in_this_warp();
3117 /// logical_lane_id /= 2;
3118 /// ShuffleReduceFn(reduce_data, logical_lane_id,
3119 /// remote_id-1-threadIdx.x, 2);
3120 /// } while (logical_lane_id % 2 == 0 && size > 1);
3121 /// }
3122 ///
3123 /// There is no assumption made about the initial state of the reduction.
3124 /// Any number of lanes (>=1) could be active at any position. The reduction
3125 /// result is returned in the first active lane.
3126 ///
3127 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3128 ///
3129 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3130 /// if (lane_id % 2 == 0 && offset > 0)
3131 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3132 /// else
3133 /// reduce_elem = remote_elem
3134 ///
3135 ///
3136 /// Intra-Team Reduction
3137 ///
3138 /// This function, as implemented in the runtime call
3139 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3140 /// threads in a team. It first reduces within a warp using the
3141 /// aforementioned algorithms. We then proceed to gather all such
3142 /// reduced values at the first warp.
3143 ///
3144 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3145 /// data from each "warp master" (the zeroth lane of each warp, where
3146 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3147 /// a mathematical sense) the problem of reduction across warp masters in
3148 /// a block to the problem of warp reduction.
3149 ///
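/// As a minimal standalone sketch of this two-level scheme (reusing the
/// hypothetical warp_reduce_sum helper sketched above, for a float sum and
/// up to 1024 threads per block):
///
///   __device__ float block_reduce_sum(float val) {
///     __shared__ float warp_results[32];         // one slot per warp master
///     int lane = threadIdx.x % 32, warp = threadIdx.x / 32;
///     val = warp_reduce_sum(val);                // reduce within each warp
///     if (lane == 0) warp_results[warp] = val;   // warp masters publish
///     __syncthreads();                           // the inter-warp copy step
///     int nwarps = (blockDim.x + 31) / 32;
///     val = (threadIdx.x < nwarps) ? warp_results[lane] : 0.0f;
///     if (warp == 0) val = warp_reduce_sum(val); // reduce across warps
///     return val;                                // thread 0 holds the result
///   }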
3150 ///
3151 /// Inter-Team Reduction
3152 ///
3153 /// Once a team has reduced its data to a single value, it is stored in
3154 /// a global scratchpad array. Since each team has a distinct slot, this
3155 /// can be done without locking.
3156 ///
3157 /// The last team to write to the scratchpad array proceeds to reduce the
3158 /// scratchpad array. One or more workers in the last team use the helper
3159 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3160 /// the k'th worker reduces every k'th element.
3161 ///
3162 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3163 /// reduce across workers and compute a globally reduced value.
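///
/// As an illustrative sketch of the inter-team hand-off (the names and the
/// NUM_TEAMS bound are hypothetical; the real runtime keeps its own
/// bookkeeping), the "last team detects it is last" step looks like:
///
///   __device__ float scratchpad[NUM_TEAMS];  // one slot per team
///   __device__ unsigned teams_done = 0;
///
///   if (threadIdx.x == 0) {
///     scratchpad[blockIdx.x] = team_result;   // distinct slot, no lock
///     __threadfence();                        // make the slot visible
///     unsigned prev = atomicAdd(&teams_done, 1u);
///     is_last_team = (prev == gridDim.x - 1); // last writer reduces all
///   }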
3164 ///
3165 void CGOpenMPRuntimeGPU::emitReduction(
3166  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3167  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3168  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3169  if (!CGF.HaveInsertPoint())
3170  return;
3171 
3172  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3173 #ifndef NDEBUG
3174  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3175 #endif
3176 
3177  if (Options.SimpleReduction) {
3178  assert(!TeamsReduction && !ParallelReduction &&
3179  "Invalid reduction selection in emitReduction.");
3180  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3181  ReductionOps, Options);
3182  return;
3183  }
3184 
3185  assert((TeamsReduction || ParallelReduction) &&
3186  "Invalid reduction selection in emitReduction.");
3187 
3188  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3189  // RedList, shuffle_reduce_func, interwarp_copy_func);
3190  // or
3191  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3192  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3193  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3194 
3195  llvm::Value *Res;
3196  ASTContext &C = CGM.getContext();
3197  // 1. Build a list of reduction variables.
3198  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3199  auto Size = RHSExprs.size();
3200  for (const Expr *E : Privates) {
3201  if (E->getType()->isVariablyModifiedType())
3202  // Reserve place for array size.
3203  ++Size;
3204  }
3205  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3206  QualType ReductionArrayTy =
3207  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3208  /*IndexTypeQuals=*/0);
3209  Address ReductionList =
3210  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3211  auto IPriv = Privates.begin();
3212  unsigned Idx = 0;
3213  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3214  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3215  CGF.Builder.CreateStore(
3216  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3217  CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3218  Elem);
3219  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3220  // Store array size.
3221  ++Idx;
3222  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3223  llvm::Value *Size = CGF.Builder.CreateIntCast(
3224  CGF.getVLASize(
3225  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3226  .NumElts,
3227  CGF.SizeTy, /*isSigned=*/false);
3228  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3229  Elem);
3230  }
3231  }
3232 
3233  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3234  ReductionList.getPointer(), CGF.VoidPtrTy);
3235  llvm::Function *ReductionFn = emitReductionFunction(
3236  Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3237  LHSExprs, RHSExprs, ReductionOps);
3238  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3239  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3240  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3241  llvm::Value *InterWarpCopyFn =
3242  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3243 
3244  if (ParallelReduction) {
3245  llvm::Value *Args[] = {RTLoc,
3246  ThreadId,
3247  CGF.Builder.getInt32(RHSExprs.size()),
3248  ReductionArrayTySize,
3249  RL,
3250  ShuffleAndReduceFn,
3251  InterWarpCopyFn};
3252 
3253  Res = CGF.EmitRuntimeCall(
3254  OMPBuilder.getOrCreateRuntimeFunction(
3255  CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3256  Args);
3257  } else {
3258  assert(TeamsReduction && "expected teams reduction.");
3259  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3260  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3261  int Cnt = 0;
3262  for (const Expr *DRE : Privates) {
3263  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3264  ++Cnt;
3265  }
3266  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3267  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3268  C.getLangOpts().OpenMPCUDAReductionBufNum);
3269  TeamsReductions.push_back(TeamReductionRec);
3270  if (!KernelTeamsReductionPtr) {
3271  KernelTeamsReductionPtr = new llvm::GlobalVariable(
3272  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3274  "_openmp_teams_reductions_buffer_$_$ptr");
3275  }
3276  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3277  Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3278  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3279  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3280  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3281  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3282  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3283  ReductionFn);
3284  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3285  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3286  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3287  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3288  ReductionFn);
3289 
3290  llvm::Value *Args[] = {
3291  RTLoc,
3292  ThreadId,
3293  GlobalBufferPtr,
3294  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3295  RL,
3296  ShuffleAndReduceFn,
3297  InterWarpCopyFn,
3298  GlobalToBufferCpyFn,
3299  GlobalToBufferRedFn,
3300  BufferToGlobalCpyFn,
3301  BufferToGlobalRedFn};
3302 
3303  Res = CGF.EmitRuntimeCall(
3304  OMPBuilder.getOrCreateRuntimeFunction(
3305  CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3306  Args);
3307  }
3308 
3309  // 5. Build if (res == 1)
3310  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3311  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3312  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3313  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3314  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3315 
3316  // 6. Build then branch: where we have reduced values in the master
3317  // thread in each team.
3318  // __kmpc_end_reduce{_nowait}(<gtid>);
3319  // break;
3320  CGF.EmitBlock(ThenBB);
3321 
3322  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3323  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3324  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3325  auto IPriv = Privates.begin();
3326  auto ILHS = LHSExprs.begin();
3327  auto IRHS = RHSExprs.begin();
3328  for (const Expr *E : ReductionOps) {
3329  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3330  cast<DeclRefExpr>(*IRHS));
3331  ++IPriv;
3332  ++ILHS;
3333  ++IRHS;
3334  }
3335  };
3336  llvm::Value *EndArgs[] = {ThreadId};
3337  RegionCodeGenTy RCG(CodeGen);
3338  NVPTXActionTy Action(
3339  nullptr, llvm::None,
3340  OMPBuilder.getOrCreateRuntimeFunction(
3341  CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3342  EndArgs);
3343  RCG.setAction(Action);
3344  RCG(CGF);
3345  // There is no need to emit line number for unconditional branch.
3346  (void)ApplyDebugLocation::CreateEmpty(CGF);
3347  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3348 }
3349 
3350 const VarDecl *
3351 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3352  const VarDecl *NativeParam) const {
3353  if (!NativeParam->getType()->isReferenceType())
3354  return NativeParam;
3355  QualType ArgType = NativeParam->getType();
3356  QualifierCollector QC;
3357  const Type *NonQualTy = QC.strip(ArgType);
3358  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3359  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3360  if (Attr->getCaptureKind() == OMPC_map) {
3361  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3362  LangAS::opencl_global);
3363  }
3364  }
3365  ArgType = CGM.getContext().getPointerType(PointeeTy);
3366  QC.addRestrict();
3367  enum { NVPTX_local_addr = 5 };
3368  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3369  ArgType = QC.apply(CGM.getContext(), ArgType);
3370  if (isa<ImplicitParamDecl>(NativeParam))
3371  return ImplicitParamDecl::Create(
3372  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3373  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3374  return ParmVarDecl::Create(
3375  CGM.getContext(),
3376  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3377  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3378  NativeParam->getIdentifier(), ArgType,
3379  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3380 }
3381 
3382 Address
3383 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3384  const VarDecl *NativeParam,
3385  const VarDecl *TargetParam) const {
3386  assert(NativeParam != TargetParam &&
3387  NativeParam->getType()->isReferenceType() &&
3388  "Native arg must not be the same as target arg.");
3389  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3390  QualType NativeParamType = NativeParam->getType();
3391  QualifierCollector QC;
3392  const Type *NonQualTy = QC.strip(NativeParamType);
3393  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3394  unsigned NativePointeeAddrSpace =
3395  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3396  QualType TargetTy = TargetParam->getType();
3397  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3398  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3399  // First cast to generic.
3400  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3401  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3402  /*AddrSpace=*/0));
3403  // Cast from generic to native address space.
3404  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3405  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3406  NativePointeeAddrSpace));
3407  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3408  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3409  NativeParamType);
3410  return NativeParamAddr;
3411 }
3412 
3413 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3414  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3415  ArrayRef<llvm::Value *> Args) const {
3416  SmallVector<llvm::Value *, 4> TargetArgs;
3417  TargetArgs.reserve(Args.size());
3418  auto *FnType = OutlinedFn.getFunctionType();
3419  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3420  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3421  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3422  break;
3423  }
3424  llvm::Type *TargetType = FnType->getParamType(I);
3425  llvm::Value *NativeArg = Args[I];
3426  if (!TargetType->isPointerTy()) {
3427  TargetArgs.emplace_back(NativeArg);
3428  continue;
3429  }
3430  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3431  NativeArg,
3432  NativeArg->getType()->getPointerElementType()->getPointerTo());
3433  TargetArgs.emplace_back(
3434  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3435  }
3436  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3437 }
3438 
3439 /// Emit function which wraps the outline parallel region
3440 /// and controls the arguments which are passed to this function.
3441 /// The wrapper ensures that the outlined function is called
3442 /// with the correct arguments when data is shared.
3443 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3444  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3445  ASTContext &Ctx = CGM.getContext();
3446  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3447 
3448  // Create a function that takes as argument the source thread.
3449  FunctionArgList WrapperArgs;
3450  QualType Int16QTy =
3451  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3452  QualType Int32QTy =
3453  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3454  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3455  /*Id=*/nullptr, Int16QTy,
3456  ImplicitParamDecl::Other);
3457  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3458  /*Id=*/nullptr, Int32QTy,
3459  ImplicitParamDecl::Other);
3460  WrapperArgs.emplace_back(&ParallelLevelArg);
3461  WrapperArgs.emplace_back(&WrapperArg);
3462 
3463  const CGFunctionInfo &CGFI =
3464  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3465
3466  auto *Fn = llvm::Function::Create(
3467  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3468  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3469 
3470  // Ensure we do not inline the function. This is trivially true for the ones
3471  // passed to __kmpc_fork_call but the ones called in serialized regions
3472  // could be inlined. This is not perfect but it is closer to the invariant
3473  // we want, namely, every data environment starts with a new function.
3474  // TODO: We should pass the if condition to the runtime function and do the
3475  // handling there. Much cleaner code.
3476  Fn->addFnAttr(llvm::Attribute::NoInline);
3477 
3478  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3479  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3480  Fn->setDoesNotRecurse();
3481 
3482  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3483  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3484  D.getBeginLoc(), D.getBeginLoc());
3485 
3486  const auto *RD = CS.getCapturedRecordDecl();
3487  auto CurField = RD->field_begin();
3488 
3489  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3490  /*Name=*/".zero.addr");
3491  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3492  // Get the array of arguments.
3493  SmallVector<llvm::Value *, 8> Args;
3494
3495  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3496  Args.emplace_back(ZeroAddr.getPointer());
3497 
3498  CGBuilderTy &Bld = CGF.Builder;
3499  auto CI = CS.capture_begin();
3500 
3501  // Use global memory for data sharing.
3502  // Handle passing of global args to workers.
3503  Address GlobalArgs =
3504  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3505  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3506  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3507  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3508  CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3509  DataSharingArgs);
3510 
3511  // Retrieve the shared variables from the list of references returned
3512  // by the runtime. Pass the variables to the outlined function.
3513  Address SharedArgListAddress = Address::invalid();
3514  if (CS.capture_size() > 0 ||
3515  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3516  SharedArgListAddress = CGF.EmitLoadOfPointer(
3517  GlobalArgs, CGF.getContext()
3518  .getPointerType(CGF.getContext().getPointerType(
3519  CGF.getContext().VoidPtrTy))
3520  .castAs<PointerType>());
3521  }
3522  unsigned Idx = 0;
3523  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3524  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3525  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3526  Src, CGF.SizeTy->getPointerTo());
3527  llvm::Value *LB = CGF.EmitLoadOfScalar(
3528  TypedAddress,
3529  /*Volatile=*/false,
3530  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3531  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3532  Args.emplace_back(LB);
3533  ++Idx;
3534  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3535  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3536  Src, CGF.SizeTy->getPointerTo());
3537  llvm::Value *UB = CGF.EmitLoadOfScalar(
3538  TypedAddress,
3539  /*Volatile=*/false,
3540  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3541  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3542  Args.emplace_back(UB);
3543  ++Idx;
3544  }
3545  if (CS.capture_size() > 0) {
3546  ASTContext &CGFContext = CGF.getContext();
3547  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3548  QualType ElemTy = CurField->getType();
3549  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3550  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3551  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3552  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3553  /*Volatile=*/false,
3554  CGFContext.getPointerType(ElemTy),
3555  CI->getLocation());
3556  if (CI->capturesVariableByCopy() &&
3557  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3558  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3559  CI->getLocation());
3560  }
3561  Args.emplace_back(Arg);
3562  }
3563  }
3564 
3565  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3566  CGF.FinishFunction();
3567  return Fn;
3568 }
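
// Illustrative sketch only (the wrapper below is hypothetical pseudo-C, not
// the IR clang emits): for an outlined region 'outlined(gtid, btid,
// capture0, ..., captureN)', the wrapper built above conceptually does
//
//   void outlined_wrapper(uint16_t ParallelLevel, uint32_t SourceThreadID) {
//     int32_t Zero = 0;
//     void **SharedArgs;
//     __kmpc_get_shared_variables(&SharedArgs); // published by primary thread
//     outlined(&SourceThreadID, &Zero,
//              /* captures loaded from SharedArgs[0..N] */ ...);
//   }
//
// i.e. it rebuilds the argument list from the shared-variables array so the
// outlined function sees the data environment set up by the primary thread.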
3569 
3570 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3571  const Decl *D) {
3572  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3573  return;
3574 
3575  assert(D && "Expected function or captured|block decl.");
3576  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3577  "Function is registered already.");
3578  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3579  "Team is set but not processed.");
3580  const Stmt *Body = nullptr;
3581  bool NeedToDelayGlobalization = false;
3582  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3583  Body = FD->getBody();
3584  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3585  Body = BD->getBody();
3586  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3587  Body = CD->getBody();
3588  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3589  if (NeedToDelayGlobalization &&
3590  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3591  return;
3592  }
3593  if (!Body)
3594  return;
3595  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3596  VarChecker.Visit(Body);
3597  const RecordDecl *GlobalizedVarsRecord =
3598  VarChecker.getGlobalizedRecord(IsInTTDRegion);
3599  TeamAndReductions.first = nullptr;
3600  TeamAndReductions.second.clear();
3601  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3602  VarChecker.getEscapedVariableLengthDecls();
3603  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3604  return;
3605  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3606  I->getSecond().MappedParams =
3607  std::make_unique<CodeGenFunction::OMPMapVars>();
3608  I->getSecond().EscapedParameters.insert(
3609  VarChecker.getEscapedParameters().begin(),
3610  VarChecker.getEscapedParameters().end());
3611  I->getSecond().EscapedVariableLengthDecls.append(
3612  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3613  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3614  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3615  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3616  Data.insert(std::make_pair(VD, MappedVarData()));
3617  }
3618  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3619  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3620  VarChecker.Visit(Body);
3621  I->getSecond().SecondaryLocalVarData.emplace();
3622  DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
3623  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3624  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3625  Data.insert(std::make_pair(VD, MappedVarData()));
3626  }
3627  }
3628  if (!NeedToDelayGlobalization) {
3629  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3630  struct GlobalizationScope final : EHScopeStack::Cleanup {
3631  GlobalizationScope() = default;
3632 
3633  void Emit(CodeGenFunction &CGF, Flags flags) override {
3634  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3635  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3636  }
3637  };
3638  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3639  }
3640 }
3641 
3642 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3643  const VarDecl *VD) {
3644  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3645  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3646  auto AS = LangAS::Default;
3647  switch (A->getAllocatorType()) {
3648  // Use the default allocator here as by default local vars are
3649  // threadlocal.
3650  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3651  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3652  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3653  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3654  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3655  // Follow the user decision - use default allocation.
3656  return Address::invalid();
3657  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3658  // TODO: implement support for user-defined allocators.
3659  return Address::invalid();
3660  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3661  AS = LangAS::cuda_constant;
3662  break;
3663  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3664  AS = LangAS::cuda_shared;
3665  break;
3666  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3667  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3668  break;
3669  }
3670  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3671  auto *GV = new llvm::GlobalVariable(
3672  CGM.getModule(), VarTy, /*isConstant=*/false,
3673  llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3674  VD->getName(),
3675  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3676  CGM.getContext().getTargetAddressSpace(AS));
3677  CharUnits Align = CGM.getContext().getDeclAlign(VD);
3678  GV->setAlignment(Align.getAsAlign());
3679  return Address(
3680  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3681  GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3682  VD->getType().getAddressSpace()))),
3683  Align);
3684  }
3685 
3686  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3687  return Address::invalid();
3688 
3689  VD = VD->getCanonicalDecl();
3690  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3691  if (I == FunctionGlobalizedDecls.end())
3692  return Address::invalid();
3693  auto VDI = I->getSecond().LocalVarData.find(VD);
3694  if (VDI != I->getSecond().LocalVarData.end())
3695  return VDI->second.PrivateAddr;
3696  if (VD->hasAttrs()) {
3697  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3698  E(VD->attr_end());
3699  IT != E; ++IT) {
3700  auto VDI = I->getSecond().LocalVarData.find(
3701  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3702  ->getCanonicalDecl());
3703  if (VDI != I->getSecond().LocalVarData.end())
3704  return VDI->second.PrivateAddr;
3705  }
3706  }
3707 
3708  return Address::invalid();
3709 }
3710 
3711 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3712  FunctionGlobalizedDecls.erase(CGF.CurFn);
3713  CGOpenMPRuntime::functionFinished(CGF);
3714 }
3715 
3716 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3717  CodeGenFunction &CGF, const OMPLoopDirective &S,
3718  OpenMPDistScheduleClauseKind &ScheduleKind,
3719  llvm::Value *&Chunk) const {
3720  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3721  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3722  ScheduleKind = OMPC_DIST_SCHEDULE_static;
3723  Chunk = CGF.EmitScalarConversion(
3724  RT.getGPUNumThreads(CGF),
3725  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3726  S.getIterationVariable()->getType(), S.getBeginLoc());
3727  return;
3728  }
3729  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3730  CGF, S, ScheduleKind, Chunk);
3731 }
3732 
3733 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3734  CodeGenFunction &CGF, const OMPLoopDirective &S,
3735  OpenMPScheduleClauseKind &ScheduleKind,
3736  const Expr *&ChunkExpr) const {
3737  ScheduleKind = OMPC_SCHEDULE_static;
3738  // Chunk size is 1 in this case.
3739  llvm::APInt ChunkSize(32, 1);
3740  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3741  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3742  SourceLocation());
3743 }
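
// Editorial note: schedule(static, 1) deals iterations out cyclically, e.g.
// with 4 threads, thread 0 runs i = 0, 4, 8, ... and thread 1 runs
// i = 1, 5, 9, ..., so adjacent GPU threads touch adjacent elements, which
// is the access pattern that coalesces best on the device.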
3744 
3745 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3746  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3747  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3748  " Expected target-based directive.");
3749  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3750  for (const CapturedStmt::Capture &C : CS->captures()) {
3751  // Capture variables captured by reference in lambdas for target-based
3752  // directives.
3753  if (!C.capturesVariable())
3754  continue;
3755  const VarDecl *VD = C.getCapturedVar();
3756  const auto *RD = VD->getType()
3757  .getCanonicalType()
3758  .getNonReferenceType()
3759  ->getAsCXXRecordDecl();
3760  if (!RD || !RD->isLambda())
3761  continue;
3762  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3763  LValue VDLVal;
3764  if (VD->getType().getCanonicalType()->isReferenceType())
3765  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3766  else
3767  VDLVal = CGF.MakeAddrLValue(
3768  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3769  llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
3770  FieldDecl *ThisCapture = nullptr;
3771  RD->getCaptureFields(Captures, ThisCapture);
3772  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3773  LValue ThisLVal =
3774  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3775  llvm::Value *CXXThis = CGF.LoadCXXThis();
3776  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3777  }
3778  for (const LambdaCapture &LC : RD->captures()) {
3779  if (LC.getCaptureKind() != LCK_ByRef)
3780  continue;
3781  const VarDecl *VD = LC.getCapturedVar();
3782  if (!CS->capturesVariable(VD))
3783  continue;
3784  auto It = Captures.find(VD);
3785  assert(It != Captures.end() && "Found lambda capture without field.");
3786  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3787  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3788  if (VD->getType().getCanonicalType()->isReferenceType())
3789  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3790  VD->getType().getCanonicalType())
3791  .getAddress(CGF);
3792  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3793  }
3794  }
3795 }
3796 
3797 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3798  LangAS &AS) {
3799  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3800  return false;
3801  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3802  switch(A->getAllocatorType()) {
3803  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3804  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3805  // Not supported, fallback to the default mem space.
3806  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3807  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3808  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3809  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3810  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3811  AS = LangAS::Default;
3812  return true;
3813  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3814  AS = LangAS::cuda_constant;
3815  return true;
3816  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3817  AS = LangAS::cuda_shared;
3818  return true;
3819  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3820  llvm_unreachable("Expected predefined allocator for the variables with the "
3821  "static storage.");
3822  }
3823  return false;
3824 }
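
// For reference, a declaration that takes the LangAS::cuda_shared path in
// the switch above would be written as (allocator name per OpenMP 5.x):
//
//   static int team_scratch;
//   #pragma omp allocate(team_scratch) allocator(omp_pteam_mem_alloc)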
3825 
3826 // Get current CudaArch and ignore any unknown values
3827 static CudaArch getCudaArch(CodeGenModule &CGM) {
3828  if (!CGM.getTarget().hasFeature("ptx"))
3829  return CudaArch::UNKNOWN;
3830  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3831  if (Feature.getValue()) {
3832  CudaArch Arch = StringToCudaArch(Feature.getKey());
3833  if (Arch != CudaArch::UNKNOWN)
3834  return Arch;
3835  }
3836  }
3837  return CudaArch::UNKNOWN;
3838 }
3839 
3840 /// Check to see if target architecture supports unified addressing which is
3841 /// a restriction for OpenMP requires clause "unified_shared_memory".
3842 void CGOpenMPRuntimeGPU::processRequiresDirective(
3843  const OMPRequiresDecl *D) {
3844  for (const OMPClause *Clause : D->clauselists()) {
3845  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3846  CudaArch Arch = getCudaArch(CGM);
3847  switch (Arch) {
3848  case CudaArch::SM_20:
3849  case CudaArch::SM_21:
3850  case CudaArch::SM_30:
3851  case CudaArch::SM_32:
3852  case CudaArch::SM_35:
3853  case CudaArch::SM_37:
3854  case CudaArch::SM_50:
3855  case CudaArch::SM_52:
3856  case CudaArch::SM_53: {
3857  SmallString<256> Buffer;
3858  llvm::raw_svector_ostream Out(Buffer);
3859  Out << "Target architecture " << CudaArchToString(Arch)
3860  << " does not support unified addressing";
3861  CGM.Error(Clause->getBeginLoc(), Out.str());
3862  return;
3863  }
3864  case CudaArch::SM_60:
3865  case CudaArch::SM_61:
3866  case CudaArch::SM_62:
3867  case CudaArch::SM_70:
3868  case CudaArch::SM_72:
3869  case CudaArch::SM_75:
3870  case CudaArch::SM_80:
3871  case CudaArch::SM_86:
3872  case CudaArch::GFX600:
3873  case CudaArch::GFX601:
3874  case CudaArch::GFX602:
3875  case CudaArch::GFX700:
3876  case CudaArch::GFX701:
3877  case CudaArch::GFX702:
3878  case CudaArch::GFX703:
3879  case CudaArch::GFX704:
3880  case CudaArch::GFX705:
3881  case CudaArch::GFX801:
3882  case CudaArch::GFX802:
3883  case CudaArch::GFX803:
3884  case CudaArch::GFX805:
3885  case CudaArch::GFX810:
3886  case CudaArch::GFX900:
3887  case CudaArch::GFX902:
3888  case CudaArch::GFX904:
3889  case CudaArch::GFX906:
3890  case CudaArch::GFX908:
3891  case CudaArch::GFX909:
3892  case CudaArch::GFX90a:
3893  case CudaArch::GFX90c:
3894  case CudaArch::GFX1010:
3895  case CudaArch::GFX1011:
3896  case CudaArch::GFX1012:
3897  case CudaArch::GFX1013:
3898  case CudaArch::GFX1030:
3899  case CudaArch::GFX1031:
3900  case CudaArch::GFX1032:
3901  case CudaArch::GFX1033:
3902  case CudaArch::GFX1034:
3903  case CudaArch::GFX1035:
3904  case CudaArch::UNUSED:
3905  case CudaArch::UNKNOWN:
3906  break;
3907  case CudaArch::LAST:
3908  llvm_unreachable("Unexpected Cuda arch.");
3909  }
3910  }
3911  }
3912  CGOpenMPRuntime::processRequiresDirective(D);
3913 }
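
// For reference, the directive validated above is
//
//   #pragma omp requires unified_shared_memory
//
// which is diagnosed as unsupported when the target is older than sm_60,
// since pre-Pascal NVPTX devices cannot provide unified addressing.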
3914 
3915 void CGOpenMPRuntimeGPU::clear() {
3916
3917  if (!TeamsReductions.empty()) {
3918  ASTContext &C = CGM.getContext();
3919  RecordDecl *StaticRD = C.buildImplicitRecord(
3920  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3921  StaticRD->startDefinition();
3922  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3923  QualType RecTy = C.getRecordType(TeamReductionRec);
3924  auto *Field = FieldDecl::Create(
3925  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3926  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3927  /*BW=*/nullptr, /*Mutable=*/false,
3928  /*InitStyle=*/ICIS_NoInit);
3929  Field->setAccess(AS_public);
3930  StaticRD->addDecl(Field);
3931  }
3932  StaticRD->completeDefinition();
3933  QualType StaticTy = C.getRecordType(StaticRD);
3934  llvm::Type *LLVMReductionsBufferTy =
3935  CGM.getTypes().ConvertTypeForMem(StaticTy);
3936  // FIXME: nvlink does not handle weak linkage correctly (object with the
3937  // different size are reported as erroneous).
3938  // Restore CommonLinkage as soon as nvlink is fixed.
3939  auto *GV = new llvm::GlobalVariable(
3940  CGM.getModule(), LLVMReductionsBufferTy,
3941  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3942  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3943  "_openmp_teams_reductions_buffer_$_");
3944  KernelTeamsReductionPtr->setInitializer(
3945  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3946  CGM.VoidPtrTy));
3947  }
3948  CGOpenMPRuntime::clear();
3949 }
3950 
3951 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3952  CGBuilderTy &Bld = CGF.Builder;
3953  llvm::Module *M = &CGF.CGM.getModule();
3954  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3955  llvm::Function *F = M->getFunction(LocSize);
3956  if (!F) {
3957  F = llvm::Function::Create(
3958  llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
3959  llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3960  }
3961  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
3962 }
clang::QualifierCollector::strip
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6371
clang::CodeGen::CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU
CGOpenMPRuntimeGPU(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:1197
clang::CudaArch::SM_35
@ SM_35
clang::InternalLinkage
@ InternalLinkage
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:31
clang::CudaArch::GFX805
@ GFX805
clang::LangAS::cuda_shared
@ cuda_shared
clang::CodeGen::CGOpenMPRuntimeGPU::emitOutlinedFunctionCall
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntimeGPU.cpp:3413
clang::CodeGen::CodeGenFunction::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T)
Definition: CodeGenFunction.cpp:207
clang::ASTContext::getTypeSizeInChars
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
Definition: ASTContext.cpp:2450
supportsSPMDExecutionMode
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Definition: CGOpenMPRuntimeGPU.cpp:671
clang::CudaArch::SM_70
@ SM_70
clang::CudaArch::GFX704
@ GFX704
clang::prec::Conditional
@ Conditional
Definition: OperatorPrecedence.h:30
clang::CodeGen::CodeGenTypeCache::SizeTy
llvm::IntegerType * SizeTy
Definition: CodeGenTypeCache.h:50
max
__DEVICE__ int max(int __a, int __b)
Definition: __clang_cuda_math.h:196
clang::CodeGen::CodeGenFunction::getTypeSize
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
Definition: CGStmtOpenMP.cpp:303
clang::LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
clang::CodeGen::LValueBaseInfo
Definition: CGValue.h:150
clang::OMPLastprivateClause
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:2611
clang::CodeGen::CodeGenFunction::EmitScalarConversion
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
Definition: CGExprScalar.cpp:4858
clang::CodeGen::CodeGenTypeCache::Int8PtrTy
llvm::PointerType * Int8PtrTy
Definition: CodeGenTypeCache.h:57
clang::Decl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:422
clang::CodeGen::CodeGenFunction::LoadCXXThis
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
Definition: CodeGenFunction.h:2763
clang::CharUnits::getAsAlign
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:183
clang::CudaArch::GFX1035
@ GFX1035
clang::CodeGen::CGOpenMPRuntime::getThreadID
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
Definition: CGOpenMPRuntime.cpp:1459
clang::interp::APInt
llvm::APInt APInt
Definition: Integral.h:27
setPropertyExecutionMode
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
Definition: CGOpenMPRuntimeGPU.cpp:1113
clang::CodeGen::CodeGenTypeCache::IntTy
llvm::IntegerType * IntTy
int
Definition: CodeGenTypeCache.h:42
clang::CodeGen::TEK_Aggregate
@ TEK_Aggregate
Definition: CodeGenFunction.h:115
clang::CodeGen::CGOpenMPRuntime
Definition: CGOpenMPRuntime.h:225
clang::CudaArch::GFX601
@ GFX601
clang::DeclaratorDecl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:780
clang::Expr::isLValue
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:270
clang::CodeGen::CGOpenMPRuntime::emitSingleReductionCombiner
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Definition: CGOpenMPRuntime.cpp:5542
clang::Qualifiers::addRestrict
void addRestrict()
Definition: Type.h:275
CodeGenFunction.h
clang::DeclContext
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1347
clang::CapturedStmt::captures
capture_range captures()
Definition: Stmt.h:3635
clang::CodeGen::CGOpenMPRuntimeGPU::EM_SPMD
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
Definition: CGOpenMPRuntimeGPU.h:29
clang::LambdaCapture
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
clang::OMPExecutableDirective::getClausesOfKind
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:445
clang::Decl::hasAttr
bool hasAttr() const
Definition: DeclBase.h:547
emitShuffleAndReduceFunction
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
Definition: CGOpenMPRuntimeGPU.cpp:2361
clang::CudaArch::GFX908
@ GFX908
clang::CodeGen::CGOpenMPRuntimeGPU::emitBarrierCall
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntimeGPU.cpp:1598
emitListToGlobalCopyFunction
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2518
clang::VarDecl::getCanonicalDecl
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2141
clang::QualifierCollector::apply
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3701
clang::CodeGen::CGOpenMPRuntime::emitParallelOutlinedFunction
virtual llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
Definition: CGOpenMPRuntime.cpp:1290
clang::CodeGen::Address::getAlignment
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:66
clang::CodeGen::CGOpenMPRuntime::emitProcBindClause
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntime.cpp:2941
clang::isOpenMPParallelDirective
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
Definition: OpenMPKinds.cpp:502
clang::CodeGen::CGOpenMPRuntime::clearLocThreadIdInsertPt
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
Definition: CGOpenMPRuntime.cpp:1414
clang::ASTContext::VoidTy
CanQualType VoidTy
Definition: ASTContext.h:1075
llvm::SmallVector
Definition: LLVM.h:38
CopyOptionsTy
Definition: CGOpenMPRuntimeGPU.cpp:1844
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:88
CopyOptionsTy::ScratchpadWidth
llvm::Value * ScratchpadWidth
Definition: CGOpenMPRuntimeGPU.cpp:1847
clang::CodeGen::CodeGenModule::SetInternalFunctionAttributes
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
Definition: CodeGenModule.cpp:2058
clang::VarDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1488
clang::CodeGen::LValue::getAddress
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:329
emitReductionListCopy
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
Definition: CGOpenMPRuntimeGPU.cpp:1852
CGOpenMPRuntimeNVPTX.h
clang::CodeGen::CodeGenFunction::EmitLoadOfComplex
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
Definition: CGExprComplex.cpp:1145
clang::CudaArch::GFX906
@ GFX906
clang::QualType::getNonReferenceType
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6620
clang::CastExpr::getSubExpr
Expr * getSubExpr()
Definition: Expr.h:3524
clang::CodeGen::CGOpenMPRuntime::setLocThreadIdInsertPt
void setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint=false)
Definition: CGOpenMPRuntime.cpp:1398
clang::CharUnits::QuantityType
int64_t QuantityType
Definition: CharUnits.h:40
clang::CodeGen::CodeGenTypeCache::getPointerAlign
CharUnits getPointerAlign() const
Definition: CodeGenTypeCache.h:117
clang::QualType
A (possibly-)qualified type.
Definition: Type.h:673
clang::CudaArch::GFX802
@ GFX802
Attr.h
AttributeLangSupport::C
@ C
Definition: SemaDeclAttr.cpp:54
clang::CudaArch::SM_53
@ SM_53
clang::CodeGen::CodeGenModule::getContext
ASTContext & getContext() const
Definition: CodeGenModule.h:702
clang::QualType::getCanonicalType
QualType getCanonicalType() const
Definition: Type.h:6463
clang::FieldDecl
Represents a member of a struct/union/class.
Definition: Decl.h:2835
clang::CodeGen::CGOpenMPRuntime::OMPBuilder
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
Definition: CGOpenMPRuntime.h:309
clang::CodeGen::CGOpenMPRuntime::emitCriticalRegion
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
Definition: CGOpenMPRuntime.cpp:2243
clang::CodeGen::CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
Definition: CGOpenMPRuntimeGPU.cpp:3745
clang::isOpenMPLoopDirective
bool isOpenMPLoopDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a directive with an associated loop construct.
Definition: OpenMPKinds.cpp:457
clang::CudaArch::GFX1034
@ GFX1034
clang::CodeGen::CodeGenFunction::createBasicBlock
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
Definition: CodeGenFunction.h:2394
clang::CodeGen::CGOpenMPRuntimeGPU::Generic
@ Generic
Generic data-sharing mode.
Definition: CGOpenMPRuntimeGPU.h:349
clang::CodeGen::CGOpenMPRuntimeGPU::functionFinished
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntimeGPU.cpp:3711
DeclOpenMP.h
clang::CodeGen::CGBuilderTy::CreateStore
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:95
clang::CodeGen::LValue::setAddress
void setAddress(Address address)
Definition: CGValue.h:332
clang::StringToCudaArch
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:147
createRuntimeShuffleFunction
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
Definition: CGOpenMPRuntimeGPU.cpp:1714
clang::CodeGen::CodeGenTypes::arrangeBuiltinFunctionDeclaration
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:657
clang::TargetInfo::hasFeature
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1305
llvm::SmallPtrSet
Definition: ASTContext.h:82
clang::UnaryOperator
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2157
clang::CodeGen::CodeGenModule::getLangOpts
const LangOptions & getLangOpts() const
Definition: CodeGenModule.h:703
clang::OMPReductionClause
This represents clause 'reduction' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:2922
clang::CodeGen::CodeGenFunction::EmitLoadOfPointer
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Definition: CGExpr.cpp:2494
clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
Definition: CGOpenMPRuntime.cpp:2161
clang::CudaArch::GFX1010
@ GFX1010
clang::CodeGen::CodeGenFunction::HaveInsertPoint
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
Definition: CodeGenFunction.h:2435
clang::CodeGen::CodeGenFunction::EmitLValueForField
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4275
clang::Decl::getAttr
T * getAttr() const
Definition: DeclBase.h:543
clang::CodeGen::CodeGenFunction::EmitStoreOfScalar
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3802
clang::CudaArch::SM_60
@ SM_60
clang::CodeGen::CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
Definition: CGOpenMPRuntimeGPU.cpp:3797
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelOutlinedFunction
llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Definition: CGOpenMPRuntimeGPU.cpp:1238
getTeamsReductionVars
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
Definition: CGOpenMPRuntimeGPU.cpp:1305
clang::CodeGen::CGOpenMPRuntime::emitBarrierCall
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntime.cpp:2578
clang::CodeGen::CGBuilderTy
Definition: CGBuilder.h:43
clang::ASTContext::toBits
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
Definition: ASTContext.cpp:2444
clang::AttributeCommonInfo::AS_GNU
@ AS_GNU
attribute((...))
Definition: AttributeCommonInfo.h:27
clang::CodeGen::CGOpenMPRuntimeGPU::CUDA
@ CUDA
CUDA data sharing mode.
Definition: CGOpenMPRuntimeGPU.h:347
clang::ArrayType::Normal
@ Normal
Definition: Type.h:2890
clang::CodeGen::CGBuilderTy::CreateConstGEP
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:242
emitGlobalToListReduceFunction
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2832
clang::CodeGen::CodeGenFunction::Builder
CGBuilderTy Builder
Definition: CodeGenFunction.h:274
clang::CodeGen::CGBuilderTy::CreateBitCast
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:151
clang::OMPScheduleClause
This represents 'schedule' clause in the '#pragma omp ...' directive.
Definition: OpenMPClause.h:1497
clang::Type
The base class of the type hierarchy.
Definition: Type.h:1490
clang::FieldDecl::Create
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4171
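A minimal sketch of the usual build-a-record pattern around FieldDecl::Create (Ctx and FieldTy are an assumed ASTContext and QualType; the record name is hypothetical; buildImplicitRecord, getTrivialTypeSourceInfo and setAccess are ASTContext/Decl helpers not listed on this page; startDefinition, addDecl and completeDefinition appear further down in this index):
  // Build an implicit record with a single public, uninitialized field.
  RecordDecl *RD = Ctx.buildImplicitRecord("hypothetical_rec");
  RD->startDefinition();
  FieldDecl *FD = FieldDecl::Create(
      Ctx, RD, SourceLocation(), SourceLocation(), /*Id=*/nullptr, FieldTy,
      Ctx.getTrivialTypeSourceInfo(FieldTy, SourceLocation()), /*BW=*/nullptr,
      /*Mutable=*/false, ICIS_NoInit);
  FD->setAccess(AS_public);
  RD->addDecl(FD);
  RD->completeDefinition();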
clang::CodeGen::CodeGenTypeCache::VoidPtrTy
llvm::PointerType * VoidPtrTy
Definition: CodeGenTypeCache.h:56
clang::CodeGen::CGOpenMPRuntimeGPU
Definition: CGOpenMPRuntimeGPU.h:24
clang::Decl::attr_end
attr_iterator attr_end() const
Definition: DeclBase.h:513
clang::Type::hasSignedIntegerRepresentation
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g....
Definition: Type.cpp:2050
clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Definition: CodeGenFunction.h:2679
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumThreadsClause
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Definition: CGOpenMPRuntimeGPU.cpp:1223
Offset
unsigned Offset
Definition: Format.cpp:2335
clang::CodeGen::CodeGenModule::getOpenMPRuntime
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
Definition: CodeGenModule.h:616
clang::CudaArch::GFX1011
@ GFX1011
clang::GlobalDecl
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:55
clang::CodeGen::AlignmentSource::Decl
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
clang::BlockDecl::captures
ArrayRef< Capture > captures() const
Definition: Decl.h:4284
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::isCXXThisExprCaptured
bool isCXXThisExprCaptured() const
Definition: CodeGenFunction.h:418
clang::OMPOrderedClause
This represents 'ordered' clause in the '#pragma omp ...' directive.
Definition: OpenMPClause.h:1690
clang::CodeGen::CodeGenFunction::EmitLoadOfScalar
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3780
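A minimal sketch pairing this with EmitStoreOfScalar above (CGF, SrcAddr, DstAddr, Ty and Loc are assumed to be in scope):
  // Load a scalar of AST type Ty and store it back, letting CodeGen handle the
  // LLVM <-> AST representation conversions on both sides.
  llvm::Value *Val = CGF.EmitLoadOfScalar(SrcAddr, /*Volatile=*/false, Ty, Loc);
  CGF.EmitStoreOfScalar(Val, DstAddr, /*Volatile=*/false, Ty);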
clang::CodeGen::CodeGenFunction::ComplexPairTy
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
Definition: CodeGenFunction.h:272
clang::CodeGen::CodeGenTypes::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Definition: CodeGenTypes.cpp:90
clang::CudaArch::SM_20
@ SM_20
clang::isOpenMPDistributeDirective
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
Definition: OpenMPKinds.cpp:569
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUNumThreads
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3951
clang::CodeGen::CodeGenFunction::CreateDefaultAlignTempAlloca
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:123
clang::CapturedStmt::capturesVariable
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1417
clang::ASTContext::getUIntPtrType
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Definition: ASTContext.cpp:5747
clang::Type::isReferenceType
bool isReferenceType() const
Definition: Type.h:6684
clang::CapturedStmt::getCapturedStmt
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3601
clang::CallExpr::getCallee
Expr * getCallee()
Definition: Expr.h:2945
V
#define V(N, I)
Definition: ASTContext.h:3121
clang::CodeGen::CGOpenMPRuntime::getOMPBuilder
llvm::OpenMPIRBuilder & getOMPBuilder()
Definition: CGOpenMPRuntime.h:302
clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:226
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3733
clang::CodeGen::TBAAAccessInfo
Definition: CodeGenTBAA.h:42
getDistributeLastprivateVars
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ...
Definition: CGOpenMPRuntimeGPU.cpp:1280
clang::CodeGen::CGOpenMPRuntimeGPU::DataSharingMode
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Definition: CGOpenMPRuntimeGPU.h:345
clang::CudaArch::SM_86
@ SM_86
clang::CudaArch::GFX702
@ GFX702
clang::CodeGen::EHScopeStack::Cleanup
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
StmtOpenMP.h
getDataSharingMode
static CGOpenMPRuntimeGPU::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:563
clang::QualType::getAddressSpace
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:6545
clang::OMPClause
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:54
clang::CodeGen::CGOpenMPRuntimeGPU::emitReduction
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit code for the reduction clause.
Definition: CGOpenMPRuntimeGPU.cpp:3165
clang::LambdaExpr
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1865
clang::OMPLoopDirective
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1002
clang::CudaArch::SM_72
@ SM_72
clang::CharUnits::fromQuantity
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
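A minimal sketch of the CharUnits helpers referenced on this page (Ctx is an assumed ASTContext):
  // Eight characters expressed as CharUnits, queried and converted to bits.
  CharUnits Sz = CharUnits::fromQuantity(8);
  bool Empty = Sz.isZero();                        // false
  int64_t Bits = Ctx.toBits(Sz);                   // 64 on 8-bit-char targets
  CharUnits::QuantityType Raw = Sz.getQuantity();  // 8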
clang::CodeGen::CodeGenFunction::EmitCastToVoidPtr
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
Definition: CGExpr.cpp:54
clang::AS_public
@ AS_public
Definition: Specifiers.h:109
clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:173
clang::BlockExpr::getBlockDecl
const BlockDecl * getBlockDecl() const
Definition: Expr.h:5977
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumTeamsClause
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to the __kmpc_push_num_teams OpenMP runtime entry point.
Definition: CGOpenMPRuntimeGPU.cpp:1233
clang::OMPRequiresDecl::clauselists
clauselist_range clauselists()
Definition: DeclOpenMP.h:441
clang::CodeGen::Address::getType
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:43
clang::getOpenMPCaptureRegions
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
Definition: OpenMPKinds.cpp:608
clang::ASTContext
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:212
clang::ASTContext::getSizeType
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
Definition: ASTContext.cpp:5709
clang::CodeGen::CodeGenFunction::OMPPrivateScope
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Definition: CodeGenFunction.h:1032
clang::CudaArch::GFX90c
@ GFX90c
clang::CodeGen::CGOpenMPRuntimeGPU::ExecutionMode
ExecutionMode
Defines the execution mode.
Definition: CGOpenMPRuntimeGPU.h:27
clang::CodeGen::CodeGenFunction::EmitRuntimeCall
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
clang::CodeGen::TEK_Complex
@ TEK_Complex
Definition: CodeGenFunction.h:114
clang::CodeGen::CodeGenTypeCache::Int16Ty
llvm::IntegerType * Int16Ty
Definition: CodeGenTypeCache.h:37
clang::CudaArch::GFX1012
@ GFX1012
clang::CodeGen::CodeGenFunction::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenFunction.h:1989
clang::CodeGen::CodeGenFunction::getDebugInfo
CGDebugInfo * getDebugInfo()
Definition: CodeGenFunction.h:1948
clang::ASTContext::getAsVariableArrayType
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2632
supportsLightweightRuntime
static bool supportsLightweightRuntime(ASTContext &Ctx, const OMPExecutableDirective &D)
Checks if the construct supports lightweight runtime.
Definition: CGOpenMPRuntimeGPU.cpp:928
clang::CudaArch::GFX1033
@ GFX1033
clang::CudaArch::GFX1031
@ GFX1031
clang::CudaArch::GFX90a
@ GFX90a
clang::CudaArch::SM_21
@ SM_21
clang::Decl::getCanonicalDecl
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:901
clang::OMPExecutableDirective::getInnermostCapturedStmt
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:542
clang::Stmt::children
child_range children()
Definition: Stmt.cpp:285
clang::IntegerLiteral::Create
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:883
clang::CodeGen::CodeGenModule::getModule
llvm::Module & getModule() const
Definition: CodeGenModule.h:709
clang::Type::isLValueReferenceType
bool isLValueReferenceType() const
Definition: Type.h:6688
clang::index::SymbolKind::Field
@ Field
clang::DeclRefExpr::getDecl
ValueDecl * getDecl()
Definition: Expr.h:1289
clang::ImplicitParamDecl
Definition: Decl.h:1601
clang::CodeGen::CGOpenMPRuntime::getDefaultFlagsForBarriers
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to ...
Definition: CGOpenMPRuntime.cpp:2545
clang::CR_OpenMP
@ CR_OpenMP
Definition: CapturedStmt.h:19
clang::CudaArch::GFX1032
@ GFX1032
clang::CodeGen::LValue::getType
QualType getType() const
Definition: CGValue.h:266
llvm::SmallString
Definition: LLVM.h:37
clang::getLangASFromTargetAS
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:80
clang::ASTContext::getIntTypeForBitwidth
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
Definition: ASTContext.cpp:11259
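A minimal sketch (Ctx is an assumed ASTContext) combining this with getPointerType from further down in this index:
  // A signed 32-bit integer type and a pointer to it, built from the context.
  QualType I32Ty = Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1);
  QualType I32PtrTy = Ctx.getPointerType(I32Ty);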
clang::VarDecl
Represents a variable declaration or definition.
Definition: Decl.h:876
hasNestedSPMDDirective
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Definition: CGOpenMPRuntimeGPU.cpp:569
clang::Type::getPointeeType
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:625
clang::CodeGen::CodeGenModule::getTypes
CodeGenTypes & getTypes()
Definition: CodeGenModule.h:726
clang::CudaArch::GFX904
@ GFX904
clang::CapturedStmt
This captures a statement into a function.
Definition: Stmt.h:3500
clang::Type::getAsCXXRecordDecl
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1760
clang::TTK_Union
@ TTK_Union
The "union" keyword.
Definition: Type.h:5316
clang::CodeGen::CodeGenFunction::getVLASize
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
Definition: CodeGenFunction.cpp:2106
clang::Type::isVariablyModifiedType
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2192
clang::CapturedStmt::Capture
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3513
clang::CudaArch::SM_52
@ SM_52
clang::CodeGen::CGOpenMPRuntime::getDefaultDistScheduleAndChunk
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
Definition: CGOpenMPRuntime.h:1780
clang::CodeGen::CGOpenMPRuntime::getIdentTyPointerTy
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
Definition: CGOpenMPRuntime.cpp:1548
clang::CodeGen::CGOpenMPRuntime::ReductionOptionsTy
Definition: CGOpenMPRuntime.h:1429
clang::CodeGen::CodeGenFunction::getContext
ASTContext & getContext() const
Definition: CodeGenFunction.h:1947
clang::QualifierCollector
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6364
clang::CodeGen::CGOpenMPRuntime::getSingleCompoundChild
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
Definition: CGOpenMPRuntime.cpp:6605
clang::CodeGen::CodeGenFunction::FinishFunction
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
Definition: CodeGenFunction.cpp:324
clang::CodeGen::Address
An aligned address.
Definition: Address.h:24
castValueToType
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
Definition: CGOpenMPRuntimeGPU.cpp:1685
Base
clang::CodeGen::CGOpenMPRuntime::emitReductionFunction
llvm::Function * emitReductionFunction(SourceLocation Loc, llvm::Type *ArgsType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
Definition: CGOpenMPRuntime.cpp:5450
clang::CodeGen::CodeGenFunction::getEvaluationKind
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
Definition: CodeGenFunction.cpp:215
clang::OMPExecutableDirective::hasClausesOfKind
bool hasClausesOfKind() const
Returns true if the current directive has one or more clauses of a specific kind.
Definition: StmtOpenMP.h:482
clang::isOpenMPPrivate
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of the private clauses like 'private', 'firstprivate',...
Definition: OpenMPKinds.cpp:580
clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
Definition: CGOpenMPRuntime.cpp:1436
clang::CodeGen::Address::getAddressSpace
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:56
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelCall
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
Definition: CGOpenMPRuntimeGPU.cpp:1511
clang::CastExpr::getCastKind
CastKind getCastKind() const
Definition: Expr.h:3518
clang::CudaArch::SM_61
@ SM_61
clang::isOpenMPTeamsDirective
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
Definition: OpenMPKinds.cpp:540
clang::CodeGen::CodeGenModule::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenModule.h:714
clang::CudaArch::GFX803
@ GFX803
getNVPTXWarpID
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Definition: CGOpenMPRuntimeGPU.cpp:536
clang::CodeGen::CodeGenTypeCache::Int32Ty
llvm::IntegerType * Int32Ty
Definition: CodeGenTypeCache.h:37
clang::CodeGen::CGOpenMPRuntimeGPU::translateParameter
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of the outlined function if this is required for the target.
Definition: CGOpenMPRuntimeGPU.cpp:3351
clang::ASTContext::getTargetAddressSpace
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.h:2720
clang::LangAS
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
clang::CodeGen::CodeGenTypes::GetFunctionType
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1595
clang::CodeGen::RegionCodeGenTy
Class that provides a way to call a simple version of codegen for an OpenMP region, or an advanced one with possibl...
Definition: CGOpenMPRuntime.h:70
clang::CodeGen::CodeGenFunction::StartFunction
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
Definition: CodeGenFunction.cpp:700
clang::CudaArch::LAST
@ LAST
clang::CodeGen::CodeGenFunction::VlaSizePair::NumElts
llvm::Value * NumElts
Definition: CodeGenFunction.h:2743
clang::CodeGen::LValue
LValue - This represents an lvalue reference.
Definition: CGValue.h:167
clang::CodeGen::CodeGenFunction::EHStack
EHScopeStack EHStack
Definition: CodeGenFunction.h:585
clang::TargetOptions::FeatureMap
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
clang::TargetInfo::getTargetOpts
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:258
clang::CodeGen::CodeGenFunction::MakeNaturalAlignAddrLValue
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
Definition: CodeGenFunction.cpp:187
clang::OMPExecutableDirective::getDirectiveKind
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:555
clang::LambdaExpr::isInitCapture
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1209
clang::CodeGen::CGOpenMPRuntimeGPU::processRequiresDirective
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntimeGPU.cpp:3842
clang::TagDecl::startDefinition
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4319
clang::ASTContext::VoidPtrTy
CanQualType VoidPtrTy
Definition: ASTContext.h:1102
clang::Decl::hasAttrs
bool hasAttrs() const
Definition: DeclBase.h:489
clang::CudaArch::SM_80
@ SM_80
clang::syntax::NodeRole::Size
@ Size
clang::CodeGen::Address::getPointer
llvm::Value * getPointer() const
Definition: Address.h:37
clang::CodeGen::CodeGenFunction
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
Definition: CodeGenFunction.h:235
clang::Stmt::IgnoreContainers
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top,...
Definition: Stmt.cpp:195
clang::ASTContext::getDeclAlign
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
Definition: ASTContext.cpp:1727
clang::CudaArch::GFX909
@ GFX909
clang::CudaArch::SM_30
@ SM_30
clang::LCK_ByRef
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
clang::NamedDecl::getIdentifier
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:270
clang::ValueDecl
Represent the declaration of a variable (in which case it is an lvalue) or a function (in which case it ...
Definition: Decl.h:676
clang::RecordDecl::completeDefinition
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:4625
clang::CodeGen::CGOpenMPRuntime::processRequiresDirective
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntime.cpp:10965
clang::CharUnits::isZero
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
clang::DeclContext::addDecl
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1559
clang::CodeGen::CodeGenModule
This class organizes the cross-function state that is used while generating LLVM code.
Definition: CodeGenModule.h:284
clang::LambdaExpr::captures
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1222
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUThreadID
virtual llvm::Value * getGPUThreadID(CodeGenFunction &CGF)=0
Get the id of the current thread on the GPU.
clang::BlockExpr
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:5965
llvm::ArrayRef
Definition: LLVM.h:34
clang::LangAS::opencl_global
@ opencl_global
clang::Decl::attr_begin
attr_iterator attr_begin() const
Definition: DeclBase.h:510
Value
Value
Definition: UninitializedValues.cpp:102
clang::Decl
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:89
clang::CudaArch::GFX600
@ GFX600
clang::CodeGen::CGOpenMPRuntime::emitNumThreadsClause
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Definition: CGOpenMPRuntime.cpp:2927
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::getKind
CapturedRegionKind getKind() const
Definition: CodeGenFunction.h:407
clang::Expr::IgnoreParenImpCasts
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:2919
clang::OMPExecutableDirective::getAssociatedStmt
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:520
CopyOptionsTy::ScratchpadIndex
llvm::Value * ScratchpadIndex
Definition: CGOpenMPRuntimeGPU.cpp:1846
emitListToGlobalReduceFunction
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2626
clang::CodeGen::Address::invalid
static Address invalid()
Definition: Address.h:34
clang::CudaArchToString
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:129
clang::CodeGen::FunctionArgList
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:358
StmtVisitor.h
clang::CudaArch::UNKNOWN
@ UNKNOWN
clang::CudaArch::SM_32
@ SM_32
clang::OMPExecutableDirective::hasAssociatedStmt
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:517
clang::CodeGen::CGOpenMPRuntime::emitOutlinedFunctionCall
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntime.cpp:12191
clang::CodeGen::NormalAndEHCleanup
@ NormalAndEHCleanup
Definition: EHScopeStack.h:86
hasNestedLightweightDirective
static bool hasNestedLightweightDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) lightweight runtime construct, if any.
Definition: CGOpenMPRuntimeGPU.cpp:766
clang::CudaArch::GFX900
@ GFX900
getNVPTXLaneID
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
Definition: CGOpenMPRuntimeGPU.cpp:547
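The warp/lane decomposition these two helpers emit IR for, shown as plain arithmetic (a 32-lane warp is assumed here; the real code takes the warp size from the target's grid values):
  // Thread 70 of a block with 32-lane warps sits in warp 2, lane 6.
  unsigned WarpSize = 32;
  unsigned ThreadID = 70;
  unsigned WarpID = ThreadID / WarpSize;        // 2
  unsigned LaneID = ThreadID & (WarpSize - 1);  // 6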
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3716
clang::CodeGen::Address::getElementType
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:51
clang::CodeGen::CodeGenFunction::MakeAddrLValue
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
Definition: CodeGenFunction.h:2456
clang::CudaArch::GFX801
@ GFX801
clang::ConstStmtVisitor
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:193
clang::CodeGen::LValue::getPointer
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:325
clang::CodeGen::CGBuilderTy::CreateElementBitCast
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:166
clang::CodeGen::CGOpenMPRuntimeGPU::EM_NonSPMD
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
Definition: CGOpenMPRuntimeGPU.h:31
getPrivateItem
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false)
Definition: SemaOpenMP.cpp:5012
CopyOptionsTy::RemoteLaneOffset
llvm::Value * RemoteLaneOffset
Definition: CGOpenMPRuntimeGPU.cpp:1845
clang::OMPExecutableDirective
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
clang::CudaArch::GFX703
@ GFX703
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::CodeGen::AlignmentSource::Type
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
clang::DeclStmt
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1292
clang::CodeGen::CGFunctionInfo
CGFunctionInfo - Class to encapsulate the information about a function definition.
Definition: CGFunctionInfo.h:546
clang::isOpenMPLoopBoundSharingDirective
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
Definition: OpenMPKinds.cpp:595
clang::PointerType
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2640
clang::CodeGen::CodeGenFunction::EmitStoreOfComplex
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
Definition: CGExprComplex.cpp:1139
clang::CodeGen::CGOpenMPRuntimeGPU::clear
void clear() override
Definition: CGOpenMPRuntimeGPU.cpp:3915
clang::isOpenMPWorksharingDirective
bool isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a worksharing directive.
Definition: OpenMPKinds.cpp:480
clang::LangAS::cuda_constant
@ cuda_constant
clang::CudaArch::GFX705
@ GFX705
clang::LangAS::Default
@ Default
Cuda.h
clang::CodeGen::CGBuilderTy::CreateConstArrayGEP
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:208
clang::CodeGen::CodeGenFunction::CGM
CodeGenModule & CGM
Definition: CodeGenFunction.h:266
clang::TargetInfo::getGridValue
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1429
clang::ImplicitParamDecl::Create
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4880
clang::CodeGen::TEK_Scalar
@ TEK_Scalar
Definition: CodeGenFunction.h:113
clang::ASTContext::getAddrSpaceQualType
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
Definition: ASTContext.cpp:3002
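A minimal sketch (Ctx and Ty assumed in scope) that qualifies a type with the CUDA constant address space and reads the qualifier back via QualType::getAddressSpace from this index:
  // Attach the qualifier, then query it from the resulting QualType.
  QualType ConstTy = Ctx.getAddrSpaceQualType(Ty, LangAS::cuda_constant);
  LangAS AS = ConstTy.getAddressSpace();  // LangAS::cuda_constant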
clang::CodeGen::CodeGenFunction::CurFn
llvm::Function * CurFn
Definition: CodeGenFunction.h:330
clang::Builtin::ID
ID
Definition: Builtins.h:48
clang::OMPExecutableDirective::getBeginLoc
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:488
clang::UnaryOperator::getSubExpr
Expr * getSubExpr() const
Definition: Expr.h:2204
clang::Expr::IgnoreParens
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:2915
clang
Definition: CalledOnceCheck.h:17
clang::CodeGen::CGOpenMPRuntimeGPU::getParameterAddress
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument based on the address of the target-specific parameter.
Definition: CGOpenMPRuntimeGPU.cpp:3383
emitGlobalToListCopyFunction
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2723
clang::CodeGen::CodeGenModule::addCompilerUsedGlobal
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
Definition: CodeGenModule.cpp:2211
clang::CodeGen::CodeGenFunction::EmitAggregateCopy
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:2052
clang::OpenMPDistScheduleClauseKind
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
clang::CudaArch::GFX902
@ GFX902
clang::ExternalLinkage
@ ExternalLinkage
External linkage, which indicates that the entity can be referred to from other translation units.
Definition: Linkage.h:59
clang::Stmt
Stmt - This represents one statement.
Definition: Stmt.h:69
clang::UnaryOperator::getOpcode
Opcode getOpcode() const
Definition: Expr.h:2199
clang::CodeGen::CGOpenMPRuntime::clear
virtual void clear()
Definition: CGOpenMPRuntime.cpp:1090
clang::CodeGen::CGOpenMPRuntimeGPU::emitCriticalRegion
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
Definition: CGOpenMPRuntimeGPU.cpp:1615
clang::CodeGen::CGOpenMPRuntimeGPU::emitProcBindClause
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntimeGPU.cpp:1213
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction
llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits an inlined function for the specified OpenMP teams directive.
Definition: CGOpenMPRuntimeGPU.cpp:1315
clang::CodeGen::CodeGenFunction::CapturedStmtInfo
CGCapturedStmtInfo * CapturedStmtInfo
Definition: CodeGenFunction.h:448
clang::CodeGen::CGOpenMPRuntime::emitTeamsOutlinedFunction
virtual llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
Definition: CGOpenMPRuntime.cpp:1298
clang::CodeGen::CodeGenFunction::EmitLoadOfReferenceLValue
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2485
clang::CudaArch::SM_37
@ SM_37
clang::CudaArch::GFX602
@ GFX602
clang::CudaArch::SM_50
@ SM_50
clang::Expr::getType
QualType getType() const
Definition: Expr.h:141
clang::Attr
Attr - This represents one attribute.
Definition: Attr.h:46
clang::Qualifiers::addAddressSpace
void addAddressSpace(LangAS space)
Definition: Type.h:387
clang::CudaArch::GFX1013
@ GFX1013
clang::CodeGen::RegionCodeGenTy::setAction
void setAction(PrePostActionTy &Action) const
Definition: CGOpenMPRuntime.h:91
clang::OMPRequiresDecl
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:416
clang::CudaArch
CudaArch
Definition: Cuda.h:45
clang::CodeGen::ApplyDebugLocation::CreateEmpty
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:827
clang::Decl::getAttrs
AttrVec & getAttrs()
Definition: DeclBase.h:495
clang::ImplicitCastExpr
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3618
clang::CudaArch::GFX810
@ GFX810
clang::CodeGen::CodeGenFunction::EmitLValue
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
Definition: CGExpr.cpp:1273
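A minimal sketch (E is an assumed const Expr * denoting a struct lvalue, FD an assumed FieldDecl of that struct, CGF a CodeGenFunction) pairing this with EmitLValueForField from the top of this index:
  // Compute the l-value of the base object, then of one of its fields.
  LValue BaseLV = CGF.EmitLValue(E);
  LValue FieldLV = CGF.EmitLValueForField(BaseLV, FD);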
hasStaticScheduling
static bool hasStaticScheduling(const OMPExecutableDirective &D)
Check if the directive is loops based and has schedule clause at all or has static scheduling.
Definition: CGOpenMPRuntimeGPU.cpp:753
clang::CudaArch::UNUSED
@ UNUSED
clang::OpenMPDirectiveKind
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
clang::ASTContext::getPointerType
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
Definition: ASTContext.cpp:3249
clang::OMPExecutableDirective::clauses
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:572
clang::CharUnits
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
clang::isOpenMPTargetExecutionDirective
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
Definition: OpenMPKinds.cpp:518
clang::CodeGen::CGOpenMPRuntimeGPU::emitFunctionProlog
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
Definition: CGOpenMPRuntimeGPU.cpp:3570
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUWarpSize
virtual llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)=0
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
clang::CudaArch::SM_62
@ SM_62
llvm::SmallVectorImpl
Definition: LLVM.h:39
emitInterWarpCopyFunction
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
Definition: CGOpenMPRuntimeGPU.cpp:2060
clang::CallExpr::arguments
arg_range arguments()
Definition: Expr.h:3034
clang::SC_None
@ SC_None
Definition: Specifiers.h:235
clang::ValueDecl::getType
QualType getType() const
Definition: Decl.h:687
clang::Type::isIntegerType
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:6987
clang::CodeGen::CodeGenFunction::CreateMemTemp
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
Definition: CGExpr.cpp:135
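A minimal sketch (CGF and Ty assumed in scope; the temporary's name is hypothetical):
  // Materialize a temporary for a value of AST type Ty and wrap it as an
  // LValue so the scalar/aggregate store helpers above can target it.
  Address Tmp = CGF.CreateMemTemp(Ty, "hypothetical.tmp");
  LValue TmpLV = CGF.MakeAddrLValue(Tmp, Ty);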
clang::Expr
This represents one expression.
Definition: Expr.h:109
clang::OpenMPScheduleClauseKind
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
clang::CodeGen::CGOpenMPRuntime::CGM
CodeGenModule & CGM
Definition: CGOpenMPRuntime.h:305
clang::CapturedStmt::getCapturedDecl
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1393
clang::ParmVarDecl::Create
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2753
clang::CodeGen::CodeGenTypeCache::Int8Ty
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Definition: CodeGenTypeCache.h:37
clang::BlockDecl::Capture
A class which contains all the information about a particular captured value.
Definition: Decl.h:4163
shuffleAndStore
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
Definition: CGOpenMPRuntimeGPU.cpp:1747
clang::OMPExecutableDirective::getCapturedStmt
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:533
clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy
llvm::PointerType * VoidPtrPtrTy
Definition: CodeGenTypeCache.h:62
getCudaArch
static CudaArch getCudaArch(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:3827
clang::Decl::getLocation
SourceLocation getLocation() const
Definition: DeclBase.h:430
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsCall
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
Definition: CGOpenMPRuntimeGPU.cpp:1493
clang::Decl::isCanonicalDecl
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:907
clang::DeclRefExpr
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1217
clang::CudaArch::SM_75
@ SM_75
clang::RecordDecl
Represents a struct/union/class.
Definition: Decl.h:3863
clang::CodeGen::CGOpenMPRuntime::emitReduction
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit code for the reduction clause.
Definition: CGOpenMPRuntime.cpp:5562
clang::CallExpr
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2795
clang::CodeGen::CodeGenFunction::EmitBlock
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:529
clang::CudaArch::GFX701
@ GFX701
clang::CodeGen::CodeGenFunction::EmitLValueForFieldInitialization
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
Definition: CGExpr.cpp:4445
clang::CodeGen::CodeGenModule::Error
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
Definition: CodeGenModule.cpp:957
clang::CharUnits::getQuantity
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
clang::CudaArch::GFX1030
@ GFX1030
clang::CodeGen::PrePostActionTy
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
Definition: CGOpenMPRuntime.h:60
clang::ICIS_NoInit
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:257
clang::specific_attr_iterator
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:34
CGOpenMPRuntimeGPU.h
clang::CudaArch::GFX700
@ GFX700
clang::CodeGen::CGOpenMPRuntimeGPU::EM_Unknown
@ EM_Unknown
Unknown execution mode (orphaned directive).
Definition: CGOpenMPRuntimeGPU.h:33
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags
unsigned getDefaultLocationReserved2Flags() const override
Returns additional flags that can be stored in reserved_2 field of the default location.
Definition: CGOpenMPRuntimeGPU.cpp:1182
clang::CodeGen::CGOpenMPRuntime::functionFinished
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntime.cpp:1527
clang::ImplicitParamDecl::Other
@ Other
Other implicit parameter.
Definition: Decl.h:1625
clang::CodeGen::CGOpenMPRuntimeGPU::getAddressOfLocalVariable
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Definition: CGOpenMPRuntimeGPU.cpp:3642
clang::CodeGen::CodeGenFunction::EmitBranch
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Definition: CGStmt.cpp:549
clang::Decl::getDeclContext
DeclContext * getDeclContext()
Definition: DeclBase.h:439
clang::CodeGen::AggValueSlot::DoesNotOverlap
@ DoesNotOverlap
Definition: CGValue.h:527
clang::NamedDecl::getName
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:276