//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
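
// Illustrative usage sketch (assumed callees, not from this file): a
// conditional action brackets a region with enter/exit runtime calls and a
// branch on the result of the enter call:
//
//   llvm::Value *Args[] = {RTLoc, ThreadID};   // hypothetical arguments
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args, /*Conditional=*/true);
//   Action.Enter(CGF); // %res = call EnterFn(...); br i1 %res, omp_if.then...
//   // ... emit the guarded region body in omp_if.then ...
//   Action.Done(CGF);  // branch to and emit omp_if.end
//   Action.Exit(CGF);  // call ExitFn(...)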

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};
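
// Minimal usage sketch (illustrative): the RAII object installs the entry
// mode for the duration of a kernel's codegen and restores the saved mode
// when it goes out of scope:
//
//   {
//     ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode,
//                                        CGOpenMPRuntimeGPU::EM_SPMD);
//     // CurrentExecutionMode == EM_SPMD while emitting the target region.
//   } // destructor restores the previous mode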

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
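
// Example (illustrative): for clause items such as 'a[3][i]' or the array
// section 'b[0:n]', getPrivateItem() peels off the subscript/section
// expressions, parentheses, and implicit casts, returning the canonical
// ValueDecl for 'a' or 'b' respectively.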

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
                                    0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
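
// Illustrative result (assumed inputs): globalizing a 'double D' escaping
// from a parallel region (BufSize == warp size == 32) plus an 'int T'
// escaping from a teams region would produce roughly:
//
//   struct _globalized_locals_ty {
//     double D[32] __attribute__((aligned(128))); // one slot per lane
//     int T;                                      // single slot (teams)
//   };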

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (VD->isInitCapture())
      VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
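
// Example (illustrative): in the device code below, 'x' escapes its
// declaration context because the inner parallel region captures it by
// reference, so CheckVarsEscapingDeclContext reports it via getEscapedDecls()
// and it gets globalized:
//
//   #pragma omp target teams
//   {
//     int x = 0;
//     #pragma omp parallel
//     x += 1; // captured by reference -> markAsEscaped(x)
//   }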

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
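
// Example: with GV_Warp_Size == 32, LaneIDBits == 5, so a thread with id 70
// computes warp id 70 >> 5 == 2.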

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
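
// Example: with LaneIDBits == 5 the mask is 0x1f, so a thread with id 70
// computes lane id 70 & 0x1f == 6 (i.e. 70 mod 32).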

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
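
// Example (illustrative): this nest makes hasNestedSPMDDirective() return
// true for the 'target' directive, because its single 'teams' child wraps a
// parallel directive:
//
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
//   for (int i = 0; i < N; ++i)
//     ...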

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
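
// Example (illustrative): combined constructs such as
// '#pragma omp target teams distribute parallel for' always return true
// (SPMD), 'target teams distribute' always returns false (generic mode),
// and a bare 'target' or 'target teams' is SPMD only if
// hasNestedSPMDDirective() finds a nested parallel construct.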

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, IsSPMD);
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : RT(RT), EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic', or 'spmd' depending
// on the target directive. This variable is picked up by the offload library
// to setup the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master, otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode = new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
                                              : OMP_TGT_EXEC_MODE_GENERIC),
      Twine(Name, "_exec_mode"));
  GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
  CGM.addCompilerUsedGlobal(GVMode);
}
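
// Illustrative IR (hypothetical kernel name): for an SPMD region this emits a
// constant i8 global that the offload plugin reads before launch, where
// OMP_TGT_EXEC_MODE_GENERIC == 1 and OMP_TGT_EXEC_MODE_SPMD == 2:
//
//   @__omp_offloading_<id>_foo_l1_exec_mode = weak protected constant i8 2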

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
                                     hasRequiresUnifiedSharedMemory(),
                                     CGM.getLangOpts().OpenMPOffloadMandatory);
  OMPBuilder.setConfig(Config);
  OffloadEntriesInfoManager.setConfig(Config);

  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
                              "__omp_rtl_assume_no_nested_parallelism");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), std::nullopt, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Use actual memory size of the VLA object including the padding
    // for alignment purposes.
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());

    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);

    // Allocate space for this VLA object to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    VoidPtr->addRetAttr(
        llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
                             CGM.getContext().getTargetInfo().getNewAlign()));

    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
        std::pair<llvm::Value *, llvm::Value *>(
            {VoidPtr, CGF.getTypeSize(VD->getType())}));
    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}
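
// Illustrative emitted sequence (assumed i32 variable 'x'; the alignment
// value is target-dependent): each globalized variable becomes a
// runtime-managed allocation paired with a free in emitGenericVarsEpilog():
//
//   %x = call align 8 i8* @__kmpc_alloc_shared(i64 4)
//   %x_on_stack = bitcast i8* %x to i32*
//   ; ... uses of %x_on_stack ...
//   call void @__kmpc_free_shared(i8* %x, i64 4)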

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object.
    for (auto AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value.
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
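
// Illustrative call (assuming two captured variables): the parallel region
// lowers to a single __kmpc_parallel_51 invocation:
//
//   call void @__kmpc_parallel_51(%struct.ident_t* %loc, i32 %tid,
//       i32 %if_cond, i32 %num_threads, i32 -1 /*proc_bind*/,
//       i8* %outlined_fn, i8* %wrapper_fn, i8** %captured_vars_addrs, i64 2)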

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}

void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
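
// The emitted control flow is equivalent to this sketch (illustrative):
//
//   for (counter = 0; counter < team_width; ++counter) { // omp.critical.loop
//     if (thread_id == counter)                          // omp.critical.test
//       <critical body>;                                 // omp.critical.body
//     __kmpc_syncwarp(mask); // omp.critical.sync, then increment and loop
//   }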

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
      Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
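
// Example (illustrative): shuffling a 'float' from the lane 16 positions
// away casts the value to i32, emits
//
//   %r = call i32 @__kmpc_shuffle_int32(i32 %v, i16 16, i16 %warp_size)
//
// and casts the result back to 'float'; types wider than 4 bytes use
// __kmpc_shuffle_int64 instead.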

static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
                                                  IntTy);
    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, IntTy->getPointerTo(), IntTy);
    if (Size.getQuantity() / IntSize > 1) {
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
      ElemPtr =
          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          CGF.Int8Ty, PtrEnd.getPointer(),
          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
                                                  CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    Size = Size % IntSize;
  }
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};
1518 
1519 /// Emit instructions to copy a Reduce list, which contains partially
1520 /// aggregated values, in the specified direction.
1522  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1523  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1524  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1525 
1526  CodeGenModule &CGM = CGF.CGM;
1527  ASTContext &C = CGM.getContext();
1528  CGBuilderTy &Bld = CGF.Builder;
1529 
1530  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1531  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1532  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1533 
1534  // Iterates, element-by-element, through the source Reduce list and
1535  // make a copy.
1536  unsigned Idx = 0;
1537  unsigned Size = Privates.size();
1538  for (const Expr *Private : Privates) {
1539  Address SrcElementAddr = Address::invalid();
1540  Address DestElementAddr = Address::invalid();
1541  Address DestElementPtrAddr = Address::invalid();
1542  // Should we shuffle in an element from a remote lane?
1543  bool ShuffleInElement = false;
1544  // Set to true to update the pointer in the dest Reduce list to a
1545  // newly created element.
1546  bool UpdateDestListPtr = false;
1547  // Increment the src or dest pointer to the scratchpad, for each
1548  // new element.
1549  bool IncrScratchpadSrc = false;
1550  bool IncrScratchpadDest = false;
1551  QualType PrivatePtrType = C.getPointerType(Private->getType());
1552  llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1553 
1554  switch (Action) {
1555  case RemoteLaneToThread: {
1556  // Step 1.1: Get the address for the src element in the Reduce list.
1557  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1558  SrcElementAddr =
1559  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1560  SrcElementPtrAddr, PrivateLlvmPtrType),
1561  PrivatePtrType->castAs<PointerType>());
1562 
1563  // Step 1.2: Create a temporary to store the element in the destination
1564  // Reduce list.
1565  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1566  DestElementAddr =
1567  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1568  ShuffleInElement = true;
1569  UpdateDestListPtr = true;
1570  break;
1571  }
1572  case ThreadCopy: {
1573  // Step 1.1: Get the address for the src element in the Reduce list.
1574  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1575  SrcElementAddr =
1576  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1577  SrcElementPtrAddr, PrivateLlvmPtrType),
1578  PrivatePtrType->castAs<PointerType>());
1579 
1580  // Step 1.2: Get the address for dest element. The destination
1581  // element has already been created on the thread's stack.
1582  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1583  DestElementAddr =
1584  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1585  DestElementPtrAddr, PrivateLlvmPtrType),
1586  PrivatePtrType->castAs<PointerType>());
1587  break;
1588  }
1589  case ThreadToScratchpad: {
1590  // Step 1.1: Get the address for the src element in the Reduce list.
1591  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1592  SrcElementAddr =
1593  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1594  SrcElementPtrAddr, PrivateLlvmPtrType),
1595  PrivatePtrType->castAs<PointerType>());
1596 
1597  // Step 1.2: Get the address for dest element:
1598  // address = base + index * ElementSizeInChars.
1599  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1600  llvm::Value *CurrentOffset =
1601  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1602  llvm::Value *ScratchPadElemAbsolutePtrVal =
1603  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1604  ScratchPadElemAbsolutePtrVal =
1605  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1606  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1607  C.getTypeAlignInChars(Private->getType()));
1608  IncrScratchpadDest = true;
1609  break;
1610  }
1611  case ScratchpadToThread: {
1612  // Step 1.1: Get the address for the src element in the scratchpad.
1613  // address = base + index * ElementSizeInChars.
1614  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1615  llvm::Value *CurrentOffset =
1616  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1617  llvm::Value *ScratchPadElemAbsolutePtrVal =
1618  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1619  ScratchPadElemAbsolutePtrVal =
1620  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1621  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1622  C.getTypeAlignInChars(Private->getType()));
1623  IncrScratchpadSrc = true;
1624 
1625  // Step 1.2: Create a temporary to store the element in the destination
1626  // Reduce list.
1627  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1628  DestElementAddr =
1629  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1630  UpdateDestListPtr = true;
1631  break;
1632  }
1633  }
1634 
1635  // Regardless of the direction of the copy, we emit the load of the src
1636  // element, as this is required in all directions.
1637  SrcElementAddr = Bld.CreateElementBitCast(
1638  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1639  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1640  SrcElementAddr.getElementType());
1641 
1642  // Now that all active lanes have read the element in the
1643  // Reduce list, shuffle over the value from the remote lane.
1644  if (ShuffleInElement) {
1645  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1646  RemoteLaneOffset, Private->getExprLoc());
1647  } else {
1648  switch (CGF.getEvaluationKind(Private->getType())) {
1649  case TEK_Scalar: {
1650  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1651  SrcElementAddr, /*Volatile=*/false, Private->getType(),
1652  Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1653  TBAAAccessInfo());
1654  // Store the source element value to the dest element address.
1655  CGF.EmitStoreOfScalar(
1656  Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1657  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1658  break;
1659  }
1660  case TEK_Complex: {
1661  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1662  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1663  Private->getExprLoc());
1664  CGF.EmitStoreOfComplex(
1665  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1666  /*isInit=*/false);
1667  break;
1668  }
1669  case TEK_Aggregate:
1670  CGF.EmitAggregateCopy(
1671  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1672  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1673  Private->getType(), AggValueSlot::DoesNotOverlap);
1674  break;
1675  }
1676  }
1677 
1678  // Step 3.1: Modify reference in dest Reduce list as needed.
1679  // Modifying the reference in Reduce list to point to the newly
1680  // created element. The element is live in the current function
1681  // scope and that of functions it invokes (i.e., reduce_function).
1682  // RemoteReduceData[i] = (void*)&RemoteElem
1683  if (UpdateDestListPtr) {
1684  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1685  DestElementAddr.getPointer(), CGF.VoidPtrTy),
1686  DestElementPtrAddr, /*Volatile=*/false,
1687  C.VoidPtrTy);
1688  }
1689 
1690  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1691  // address of the next element in scratchpad memory, unless we're currently
1692  // processing the last one. Memory alignment is also taken care of here.
1693  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1694  // FIXME: This code doesn't make any sense, it's trying to perform
1695  // integer arithmetic on pointers.
1696  llvm::Value *ScratchpadBasePtr =
1697  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1698  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1699  ScratchpadBasePtr = Bld.CreateNUWAdd(
1700  ScratchpadBasePtr,
1701  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1702 
1703  // Take care of global memory alignment for performance
1704  ScratchpadBasePtr = Bld.CreateNUWSub(
1705  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1706  ScratchpadBasePtr = Bld.CreateUDiv(
1707  ScratchpadBasePtr,
1708  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1709  ScratchpadBasePtr = Bld.CreateNUWAdd(
1710  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1711  ScratchpadBasePtr = Bld.CreateNUWMul(
1712  ScratchpadBasePtr,
1713  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
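// The Sub/UDiv/Add/Mul sequence above computes
//   ScratchpadBasePtr = ((ScratchpadBasePtr - 1) / GlobalMemoryAlignment + 1)
//                       * GlobalMemoryAlignment;
// i.e. it rounds the integer-typed pointer value up to the next multiple of
// GlobalMemoryAlignment.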
1714 
1715  if (IncrScratchpadDest)
1716  DestBase =
1717  Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1718  else /* IncrScratchpadSrc = true */
1719  SrcBase =
1720  Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1721  }
1722 
1723  ++Idx;
1724  }
1725 }
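// For Action == RemoteLaneToThread, the code emitted above behaves, per
// reduce element, like this sketch (illustrative names only):
//
//   void *SrcPtr = SrcReduceList[i];           // Step 1.1
//   T RemoteElem;                              // Step 1.2: stack temporary
//   shuffleAndStore(SrcPtr, &RemoteElem, ...); // pull value from remote lane
//   DestReduceList[i] = (void *)&RemoteElem;   // Step 3.1
//
// ThreadCopy reads both element addresses out of the lists instead and
// replaces the shuffle with a plain scalar/complex/aggregate copy.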
1726 
1727 /// This function emits a helper that gathers Reduce lists from the first
1728 /// lane of every active warp to lanes in the first warp.
1729 ///
1730  /// void inter_warp_copy_func(void *reduce_data, int num_warps)
1731 /// shared smem[warp_size];
1732 /// For all data entries D in reduce_data:
1733 /// sync
1734 /// If (I am the first lane in each warp)
1735 /// Copy my local D to smem[warp_id]
1736 /// sync
1737 /// if (I am the first warp)
1738 /// Copy smem[thread_id] to my local D
1739 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1740  ArrayRef<const Expr *> Privates,
1741  QualType ReductionArrayTy,
1742  SourceLocation Loc) {
1743  ASTContext &C = CGM.getContext();
1744  llvm::Module &M = CGM.getModule();
1745 
1746  // ReduceList: thread local Reduce list.
1747  // At the stage of the computation when this function is called, partially
1748  // aggregated values reside in the first lane of every active warp.
1749  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1750  C.VoidPtrTy, ImplicitParamDecl::Other);
1751  // NumWarps: number of warps active in the parallel region. This could
1752  // be smaller than 32 (max warps in a CTA) for partial block reduction.
1753  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1754  C.getIntTypeForBitwidth(32, /* Signed */ true),
1755  ImplicitParamDecl::Other);
1756  FunctionArgList Args;
1757  Args.push_back(&ReduceListArg);
1758  Args.push_back(&NumWarpsArg);
1759 
1760  const CGFunctionInfo &CGFI =
1761  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1762  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1763  llvm::GlobalValue::InternalLinkage,
1764  "_omp_reduction_inter_warp_copy_func", &M);
1765  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1766  Fn->setDoesNotRecurse();
1767  CodeGenFunction CGF(CGM);
1768  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1769 
1770  CGBuilderTy &Bld = CGF.Builder;
1771 
1772  // This array is used as a medium to transfer, one reduce element at a time,
1773  // the data from the first lane of every warp to lanes in the first warp
1774  // in order to perform the final step of a reduction in a parallel region
1775  // (reduction across warps). The array is placed in NVPTX __shared__ memory
1776  // for reduced latency, as well as to have a distinct copy for concurrently
1777  // executing target regions. The array is declared with weak linkage so
1778  // as to be shared across compilation units.
1779  StringRef TransferMediumName =
1780  "__openmp_nvptx_data_transfer_temporary_storage";
1781  llvm::GlobalVariable *TransferMedium =
1782  M.getGlobalVariable(TransferMediumName);
1783  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1784  if (!TransferMedium) {
1785  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1786  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1787  TransferMedium = new llvm::GlobalVariable(
1788  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1789  llvm::UndefValue::get(Ty), TransferMediumName,
1790  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1791  SharedAddressSpace);
1792  CGM.addCompilerUsedGlobal(TransferMedium);
1793  }
1794 
1795  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1796  // Get the CUDA thread id of the current OpenMP thread on the GPU.
1797  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1798  // nvptx_lane_id = nvptx_id % warpsize
1799  llvm::Value *LaneID = getNVPTXLaneID(CGF);
1800  // nvptx_warp_id = nvptx_id / warpsize
1801  llvm::Value *WarpID = getNVPTXWarpID(CGF);
1802 
1803  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1804  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1805  Address LocalReduceList(
1806  Bld.CreatePointerBitCastOrAddrSpaceCast(
1807  CGF.EmitLoadOfScalar(
1808  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1809  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1810  ElemTy->getPointerTo()),
1811  ElemTy, CGF.getPointerAlign());
1812 
1813  unsigned Idx = 0;
1814  for (const Expr *Private : Privates) {
1815  //
1816  // Warp master copies reduce element to transfer medium in __shared__
1817  // memory.
1818  //
1819  unsigned RealTySize =
1820  C.getTypeSizeInChars(Private->getType())
1821  .alignTo(C.getTypeAlignInChars(Private->getType()))
1822  .getQuantity();
1823  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1824  unsigned NumIters = RealTySize / TySize;
1825  if (NumIters == 0)
1826  continue;
1827  QualType CType = C.getIntTypeForBitwidth(
1828  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1829  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1830  CharUnits Align = CharUnits::fromQuantity(TySize);
1831  llvm::Value *Cnt = nullptr;
1832  Address CntAddr = Address::invalid();
1833  llvm::BasicBlock *PrecondBB = nullptr;
1834  llvm::BasicBlock *ExitBB = nullptr;
1835  if (NumIters > 1) {
1836  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1837  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1838  /*Volatile=*/false, C.IntTy);
1839  PrecondBB = CGF.createBasicBlock("precond");
1840  ExitBB = CGF.createBasicBlock("exit");
1841  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1842  // There is no need to emit line number for unconditional branch.
1843  (void)ApplyDebugLocation::CreateEmpty(CGF);
1844  CGF.EmitBlock(PrecondBB);
1845  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1846  llvm::Value *Cmp =
1847  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1848  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1849  CGF.EmitBlock(BodyBB);
1850  }
1851  // kmpc_barrier.
1852  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1853  /*EmitChecks=*/false,
1854  /*ForceSimpleCall=*/true);
1855  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1856  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1857  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1858 
1859  // if (lane_id == 0)
1860  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1861  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1862  CGF.EmitBlock(ThenBB);
1863 
1864  // Reduce element = LocalReduceList[i]
1865  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1866  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1867  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1868  // elemptr = ((CopyType*)(elemptrptr)) + I
1869  Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1870  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1871  if (NumIters > 1)
1872  ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1873 
1874  // Get pointer to location in transfer medium.
1875  // MediumPtr = &medium[warp_id]
1876  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1877  TransferMedium->getValueType(), TransferMedium,
1878  {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1879  // Casting to actual data type.
1880  // MediumPtr = (CopyType*)MediumPtrAddr;
1881  Address MediumPtr(
1882  Bld.CreateBitCast(
1883  MediumPtrVal,
1884  CopyType->getPointerTo(
1885  MediumPtrVal->getType()->getPointerAddressSpace())),
1886  CopyType, Align);
1887 
1888  // elem = *elemptr
1889  //*MediumPtr = elem
1890  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1891  ElemPtr, /*Volatile=*/false, CType, Loc,
1892  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1893  // Store the source element value to the dest element address.
1894  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1895  LValueBaseInfo(AlignmentSource::Type),
1896  TBAAAccessInfo());
1897 
1898  Bld.CreateBr(MergeBB);
1899 
1900  CGF.EmitBlock(ElseBB);
1901  Bld.CreateBr(MergeBB);
1902 
1903  CGF.EmitBlock(MergeBB);
1904 
1905  // kmpc_barrier.
1906  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1907  /*EmitChecks=*/false,
1908  /*ForceSimpleCall=*/true);
1909 
1910  //
1911  // Warp 0 copies reduce element from transfer medium.
1912  //
1913  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1914  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1915  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1916 
1917  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1918  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1919  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1920 
1921  // Up to 32 threads in warp 0 are active.
1922  llvm::Value *IsActiveThread =
1923  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1924  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1925 
1926  CGF.EmitBlock(W0ThenBB);
1927 
1928  // SrcMediumPtr = &medium[tid]
1929  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1930  TransferMedium->getValueType(), TransferMedium,
1931  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1932  // SrcMediumVal = *SrcMediumPtr;
1933  Address SrcMediumPtr(
1934  Bld.CreateBitCast(
1935  SrcMediumPtrVal,
1936  CopyType->getPointerTo(
1937  SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1938  CopyType, Align);
1939 
1940  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1941  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1942  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1943  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1944  Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
1945  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
1946  if (NumIters > 1)
1947  TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1948 
1949  // *TargetElemPtr = SrcMediumVal;
1950  llvm::Value *SrcMediumValue =
1951  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1952  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1953  CType);
1954  Bld.CreateBr(W0MergeBB);
1955 
1956  CGF.EmitBlock(W0ElseBB);
1957  Bld.CreateBr(W0MergeBB);
1958 
1959  CGF.EmitBlock(W0MergeBB);
1960 
1961  if (NumIters > 1) {
1962  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1963  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1964  CGF.EmitBranch(PrecondBB);
1965  (void)ApplyDebugLocation::CreateEmpty(CGF);
1966  CGF.EmitBlock(ExitBB);
1967  }
1968  RealTySize %= TySize;
1969  }
1970  ++Idx;
1971  }
1972 
1973  CGF.FinishFunction();
1974  return Fn;
1975 }
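// A CUDA-flavored sketch of the helper emitted above, shown for one 32-bit
// chunk of each reduce element (smem stands for the transfer medium declared
// earlier; names are illustrative, not the generated IR):
//
//   __device__ void inter_warp_copy(void **reduce_data, int num_warps) {
//     __shared__ int32_t smem[WARP_SIZE];
//     // for each 32-bit chunk C of each entry D in reduce_data:
//     barrier();
//     if (lane_id == 0)            // warp master publishes its value
//       smem[warp_id] = D[C];
//     barrier();
//     if (thread_id < num_warps)   // lanes of warp 0 collect the values
//       D[C] = smem[thread_id];
//   }
//
// Elements wider than four bytes are moved in several such 32-bit rounds
// (NumIters above); 16- and 8-bit tails reuse the same shape with narrower
// loads and stores.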
1976 
1977 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1978 /// in the same warp. It uses shuffle instructions to copy over data from
1979 /// a remote lane's stack. The reduction algorithm performed is specified
1980 /// by the fourth parameter.
1981 ///
1982 /// Algorithm Versions.
1983 /// Full Warp Reduce (argument value 0):
1984 /// This algorithm assumes that all 32 lanes are active and gathers
1985 /// data from these 32 lanes, producing a single resultant value.
1986 /// Contiguous Partial Warp Reduce (argument value 1):
1987 /// This algorithm assumes that only a *contiguous* subset of lanes
1988 /// are active. This happens for the last warp in a parallel region
1989  /// when the user-specified num_threads is not an integer multiple of
1990 /// 32. This contiguous subset always starts with the zeroth lane.
1991 /// Partial Warp Reduce (argument value 2):
1992 /// This algorithm gathers data from any number of lanes at any position.
1993 /// All reduced values are stored in the lowest possible lane. The set
1994  /// of problems every algorithm addresses is a superset of those
1995 /// addressable by algorithms with a lower version number. Overhead
1996 /// increases as algorithm version increases.
1997 ///
1998 /// Terminology
1999 /// Reduce element:
2000 /// Reduce element refers to the individual data field with primitive
2001 /// data types to be combined and reduced across threads.
2002 /// Reduce list:
2003 /// Reduce list refers to a collection of local, thread-private
2004 /// reduce elements.
2005 /// Remote Reduce list:
2006 /// Remote Reduce list refers to a collection of remote (relative to
2007 /// the current thread) reduce elements.
2008 ///
2009 /// We distinguish between three states of threads that are important to
2010 /// the implementation of this function.
2011 /// Alive threads:
2012 /// Threads in a warp executing the SIMT instruction, as distinguished from
2013 /// threads that are inactive due to divergent control flow.
2014 /// Active threads:
2015 /// The minimal set of threads that has to be alive upon entry to this
2016 /// function. The computation is correct iff active threads are alive.
2017 /// Some threads are alive but they are not active because they do not
2018 /// contribute to the computation in any useful manner. Turning them off
2019 /// may introduce control flow overheads without any tangible benefits.
2020 /// Effective threads:
2021 /// In order to comply with the argument requirements of the shuffle
2022 /// function, we must keep all lanes holding data alive. But at most
2023 /// half of them perform value aggregation; we refer to this half of
2024 /// threads as effective. The other half is simply handing off their
2025 /// data.
2026 ///
2027 /// Procedure
2028 /// Value shuffle:
2029 /// In this step active threads transfer data from higher lane positions
2030 /// in the warp to lower lane positions, creating Remote Reduce list.
2031 /// Value aggregation:
2032 /// In this step, effective threads combine their thread local Reduce list
2033 /// with Remote Reduce list and store the result in the thread local
2034 /// Reduce list.
2035 /// Value copy:
2036 /// In this step, we deal with the assumption made by algorithm 2
2037 /// (i.e. contiguity assumption). When we have an odd number of lanes
2038 /// active, say 2k+1, only k threads will be effective and therefore k
2039 /// new values will be produced. However, the Reduce list owned by the
2040 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2041 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2042 /// that the contiguity assumption still holds.
2043 static llvm::Function *emitShuffleAndReduceFunction(
2044  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2045  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2046  ASTContext &C = CGM.getContext();
2047 
2048  // Thread local Reduce list used to host the values of data to be reduced.
2049  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2050  C.VoidPtrTy, ImplicitParamDecl::Other);
2051  // Current lane id; could be logical.
2052  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2053  ImplicitParamDecl::Other);
2054  // Offset of the remote source lane relative to the current lane.
2055  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2056  C.ShortTy, ImplicitParamDecl::Other);
2057  // Algorithm version. This is expected to be known at compile time.
2058  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2059  C.ShortTy, ImplicitParamDecl::Other);
2060  FunctionArgList Args;
2061  Args.push_back(&ReduceListArg);
2062  Args.push_back(&LaneIDArg);
2063  Args.push_back(&RemoteLaneOffsetArg);
2064  Args.push_back(&AlgoVerArg);
2065 
2066  const CGFunctionInfo &CGFI =
2067  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2068  auto *Fn = llvm::Function::Create(
2069  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2070  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2071  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2072  Fn->setDoesNotRecurse();
2073 
2074  CodeGenFunction CGF(CGM);
2075  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2076 
2077  CGBuilderTy &Bld = CGF.Builder;
2078 
2079  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2080  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2081  Address LocalReduceList(
2082  Bld.CreatePointerBitCastOrAddrSpaceCast(
2083  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2084  C.VoidPtrTy, SourceLocation()),
2085  ElemTy->getPointerTo()),
2086  ElemTy, CGF.getPointerAlign());
2087 
2088  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2089  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2090  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2091 
2092  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2093  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2094  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2095 
2096  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2097  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2098  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2099 
2100  // Create a local thread-private variable to host the Reduce list
2101  // from a remote lane.
2102  Address RemoteReduceList =
2103  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2104 
2105  // This loop iterates through the list of reduce elements and copies,
2106  // element by element, from a remote lane in the warp to RemoteReduceList,
2107  // hosted on the thread's stack.
2108  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2109  LocalReduceList, RemoteReduceList,
2110  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2111  /*ScratchpadIndex=*/nullptr,
2112  /*ScratchpadWidth=*/nullptr});
2113 
2114  // The actions to be performed on the Remote Reduce list depend
2115  // on the algorithm version.
2116  //
2117  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2118  // LaneId % 2 == 0 && Offset > 0):
2119  // do the reduction value aggregation
2120  //
2121  // The thread local variable Reduce list is mutated in place to host the
2122  // reduced data, which is the aggregated value produced from local and
2123  // remote lanes.
2124  //
2125  // Note that AlgoVer is expected to be a constant integer known at compile
2126  // time.
2127  // When AlgoVer==0, the first conjunction evaluates to true, making
2128  // the entire predicate true at compile time.
2129  // When AlgoVer==1, only the second part of the second conjunction needs
2130  // to be evaluated at runtime; the other conjunctions evaluate to false
2131  // at compile time.
2132  // When AlgoVer==2, only the second part of the third conjunction needs
2133  // to be evaluated at runtime; the other conjunctions evaluate to false
2134  // at compile time.
2135  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2136 
2137  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2138  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2139  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2140 
2141  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2142  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2143  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2144  CondAlgo2 = Bld.CreateAnd(
2145  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2146 
2147  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2148  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2149 
2150  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2151  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2152  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2153  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2154 
2155  CGF.EmitBlock(ThenBB);
2156  // reduce_function(LocalReduceList, RemoteReduceList)
2157  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2158  LocalReduceList.getPointer(), CGF.VoidPtrTy);
2159  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2160  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2161  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2162  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2163  Bld.CreateBr(MergeBB);
2164 
2165  CGF.EmitBlock(ElseBB);
2166  Bld.CreateBr(MergeBB);
2167 
2168  CGF.EmitBlock(MergeBB);
2169 
2170  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2171  // Reduce list.
2172  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2173  llvm::Value *CondCopy = Bld.CreateAnd(
2174  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2175 
2176  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2177  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2178  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2179  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2180 
2181  CGF.EmitBlock(CpyThenBB);
2182  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2183  RemoteReduceList, LocalReduceList);
2184  Bld.CreateBr(CpyMergeBB);
2185 
2186  CGF.EmitBlock(CpyElseBB);
2187  Bld.CreateBr(CpyMergeBB);
2188 
2189  CGF.EmitBlock(CpyMergeBB);
2190 
2191  CGF.FinishFunction();
2192  return Fn;
2193 }
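// Taken together, the helper emitted above behaves like this sketch
// (illustrative names; reduce_function is the outlined reduction ReduceFn):
//
//   void shuffle_and_reduce(void *reduce_list, short lane_id,
//                           short remote_lane_offset, short algo_ver) {
//     ReduceList remote;               // stack copy of the remote list
//     copy_from_remote_lane(reduce_list, &remote, remote_lane_offset);
//     if (algo_ver == 0 ||
//         (algo_ver == 1 && lane_id < remote_lane_offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && remote_lane_offset > 0))
//       reduce_function(reduce_list, &remote);
//     if (algo_ver == 1 && lane_id >= remote_lane_offset)
//       copy(&remote, reduce_list);    // preserve the contiguity assumption
//   }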
2194 
2195 /// This function emits a helper that copies all the reduction variables from
2196 /// the team into the provided global buffer for the reduction variables.
2197 ///
2198 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2199 /// For all data entries D in reduce_data:
2200 /// Copy local D to buffer.D[Idx]
2201 static llvm::Value *emitListToGlobalCopyFunction(
2202  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2203  QualType ReductionArrayTy, SourceLocation Loc,
2204  const RecordDecl *TeamReductionRec,
2205  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2206  &VarFieldMap) {
2207  ASTContext &C = CGM.getContext();
2208 
2209  // Buffer: global reduction buffer.
2210  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2211  C.VoidPtrTy, ImplicitParamDecl::Other);
2212  // Idx: index of the buffer.
2213  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2214  ImplicitParamDecl::Other);
2215  // ReduceList: thread local Reduce list.
2216  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2217  C.VoidPtrTy, ImplicitParamDecl::Other);
2218  FunctionArgList Args;
2219  Args.push_back(&BufferArg);
2220  Args.push_back(&IdxArg);
2221  Args.push_back(&ReduceListArg);
2222 
2223  const CGFunctionInfo &CGFI =
2224  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2225  auto *Fn = llvm::Function::Create(
2226  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2227  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2228  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2229  Fn->setDoesNotRecurse();
2230  CodeGenFunction CGF(CGM);
2231  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2232 
2233  CGBuilderTy &Bld = CGF.Builder;
2234 
2235  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2236  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2237  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2238  Address LocalReduceList(
2239  Bld.CreatePointerBitCastOrAddrSpaceCast(
2240  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2241  C.VoidPtrTy, Loc),
2242  ElemTy->getPointerTo()),
2243  ElemTy, CGF.getPointerAlign());
2244  QualType StaticTy = C.getRecordType(TeamReductionRec);
2245  llvm::Type *LLVMReductionsBufferTy =
2246  CGM.getTypes().ConvertTypeForMem(StaticTy);
2247  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2248  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2249  LLVMReductionsBufferTy->getPointerTo());
2250  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2251  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2252  /*Volatile=*/false, C.IntTy,
2253  Loc)};
2254  unsigned Idx = 0;
2255  for (const Expr *Private : Privates) {
2256  // Reduce element = LocalReduceList[i]
2257  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2258  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2259  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2260  // elemptr = ((CopyType*)(elemptrptr)) + I
2261  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2262  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2263  ElemPtrPtr, ElemTy->getPointerTo());
2264  Address ElemPtr =
2265  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2266  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2267  // Global = Buffer.VD[Idx];
2268  const FieldDecl *FD = VarFieldMap.lookup(VD);
2269  LValue GlobLVal = CGF.EmitLValueForField(
2270  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2271  Address GlobAddr = GlobLVal.getAddress(CGF);
2272  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2273  GlobAddr.getPointer(), Idxs);
2274  GlobLVal.setAddress(Address(BufferPtr,
2275  CGF.ConvertTypeForMem(Private->getType()),
2276  GlobAddr.getAlignment()));
2277  switch (CGF.getEvaluationKind(Private->getType())) {
2278  case TEK_Scalar: {
2279  llvm::Value *V = CGF.EmitLoadOfScalar(
2280  ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2281  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2282  CGF.EmitStoreOfScalar(V, GlobLVal);
2283  break;
2284  }
2285  case TEK_Complex: {
2286  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2287  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2288  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2289  break;
2290  }
2291  case TEK_Aggregate:
2292  CGF.EmitAggregateCopy(GlobLVal,
2293  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2294  Private->getType(), AggValueSlot::DoesNotOverlap);
2295  break;
2296  }
2297  ++Idx;
2298  }
2299 
2300  CGF.FinishFunction();
2301  return Fn;
2302 }
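// Using the foo/bar example from the design note below, and given that each
// field of the team-reduction record is an array of length
// OpenMPCUDAReductionBufNum, the generated helper amounts to this sketch
// (Buffer is an illustrative name for that record):
//
//   void list_to_global_copy(Buffer *buffer, int idx, void **reduce_data) {
//     buffer->foo[idx] = *(float *)reduce_data[0];
//     buffer->bar[idx] = *(double *)reduce_data[1];
//   }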
2303 
2304 /// This function emits a helper that reduces all the reduction variables from
2305 /// the team into the provided global buffer for the reduction variables.
2306 ///
2307 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2308 /// void *GlobPtrs[];
2309 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2310 /// ...
2311 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2312  /// reduce_function(GlobPtrs, reduce_data);
2313 static llvm::Value *emitListToGlobalReduceFunction(
2314  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2315  QualType ReductionArrayTy, SourceLocation Loc,
2316  const RecordDecl *TeamReductionRec,
2317  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2318  &VarFieldMap,
2319  llvm::Function *ReduceFn) {
2320  ASTContext &C = CGM.getContext();
2321 
2322  // Buffer: global reduction buffer.
2323  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2324  C.VoidPtrTy, ImplicitParamDecl::Other);
2325  // Idx: index of the buffer.
2326  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2327  ImplicitParamDecl::Other);
2328  // ReduceList: thread local Reduce list.
2329  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2330  C.VoidPtrTy, ImplicitParamDecl::Other);
2331  FunctionArgList Args;
2332  Args.push_back(&BufferArg);
2333  Args.push_back(&IdxArg);
2334  Args.push_back(&ReduceListArg);
2335 
2336  const CGFunctionInfo &CGFI =
2337  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2338  auto *Fn = llvm::Function::Create(
2339  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2340  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2341  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2342  Fn->setDoesNotRecurse();
2343  CodeGenFunction CGF(CGM);
2344  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2345 
2346  CGBuilderTy &Bld = CGF.Builder;
2347 
2348  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2349  QualType StaticTy = C.getRecordType(TeamReductionRec);
2350  llvm::Type *LLVMReductionsBufferTy =
2351  CGM.getTypes().ConvertTypeForMem(StaticTy);
2352  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2353  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2354  LLVMReductionsBufferTy->getPointerTo());
2355 
2356  // 1. Build a list of reduction variables.
2357  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2358  Address ReductionList =
2359  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2360  auto IPriv = Privates.begin();
2361  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2362  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2363  /*Volatile=*/false, C.IntTy,
2364  Loc)};
2365  unsigned Idx = 0;
2366  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2367  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2368  // Global = Buffer.VD[Idx];
2369  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2370  const FieldDecl *FD = VarFieldMap.lookup(VD);
2371  LValue GlobLVal = CGF.EmitLValueForField(
2372  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2373  Address GlobAddr = GlobLVal.getAddress(CGF);
2374  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2375  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2376  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2377  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2378  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2379  // Store array size.
2380  ++Idx;
2381  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2382  llvm::Value *Size = CGF.Builder.CreateIntCast(
2383  CGF.getVLASize(
2384  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2385  .NumElts,
2386  CGF.SizeTy, /*isSigned=*/false);
2387  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2388  Elem);
2389  }
2390  }
2391 
2392  // Call reduce_function(GlobalReduceList, ReduceList)
2393  llvm::Value *GlobalReduceList =
2394  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2395  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2396  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2397  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2398  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2399  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2400  CGF.FinishFunction();
2401  return Fn;
2402 }
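// Continuing the same illustrative sketch, the reduce variant gathers
// pointers into the buffer slots and hands them to the outlined reduction,
// which writes its result through the first argument (the global buffer):
//
//   void list_to_global_reduce(Buffer *buffer, int idx, void **reduce_data) {
//     void *glob_ptrs[] = {&buffer->foo[idx], &buffer->bar[idx]};
//     reduce_function(glob_ptrs, reduce_data);
//   }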
2403 
2404  /// This function emits a helper that copies all the reduction variables from
2405  /// the provided global buffer into the team's thread-local reduce list.
2406  ///
2407  /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2408 /// For all data entries D in reduce_data:
2409 /// Copy buffer.D[Idx] to local D;
2410 static llvm::Value *emitGlobalToListCopyFunction(
2411  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2412  QualType ReductionArrayTy, SourceLocation Loc,
2413  const RecordDecl *TeamReductionRec,
2414  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2415  &VarFieldMap) {
2416  ASTContext &C = CGM.getContext();
2417 
2418  // Buffer: global reduction buffer.
2419  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2420  C.VoidPtrTy, ImplicitParamDecl::Other);
2421  // Idx: index of the buffer.
2422  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2423  ImplicitParamDecl::Other);
2424  // ReduceList: thread local Reduce list.
2425  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2426  C.VoidPtrTy, ImplicitParamDecl::Other);
2427  FunctionArgList Args;
2428  Args.push_back(&BufferArg);
2429  Args.push_back(&IdxArg);
2430  Args.push_back(&ReduceListArg);
2431 
2432  const CGFunctionInfo &CGFI =
2433  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2434  auto *Fn = llvm::Function::Create(
2435  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2436  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2437  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2438  Fn->setDoesNotRecurse();
2439  CodeGenFunction CGF(CGM);
2440  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2441 
2442  CGBuilderTy &Bld = CGF.Builder;
2443 
2444  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2445  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2446  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2447  Address LocalReduceList(
2448  Bld.CreatePointerBitCastOrAddrSpaceCast(
2449  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2450  C.VoidPtrTy, Loc),
2451  ElemTy->getPointerTo()),
2452  ElemTy, CGF.getPointerAlign());
2453  QualType StaticTy = C.getRecordType(TeamReductionRec);
2454  llvm::Type *LLVMReductionsBufferTy =
2455  CGM.getTypes().ConvertTypeForMem(StaticTy);
2456  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2457  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2458  LLVMReductionsBufferTy->getPointerTo());
2459 
2460  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2461  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2462  /*Volatile=*/false, C.IntTy,
2463  Loc)};
2464  unsigned Idx = 0;
2465  for (const Expr *Private : Privates) {
2466  // Reduce element = LocalReduceList[i]
2467  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2468  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2469  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2470  // elemptr = ((CopyType*)(elemptrptr)) + I
2471  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2472  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2473  ElemPtrPtr, ElemTy->getPointerTo());
2474  Address ElemPtr =
2475  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2476  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2477  // Global = Buffer.VD[Idx];
2478  const FieldDecl *FD = VarFieldMap.lookup(VD);
2479  LValue GlobLVal = CGF.EmitLValueForField(
2480  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2481  Address GlobAddr = GlobLVal.getAddress(CGF);
2482  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2483  GlobAddr.getPointer(), Idxs);
2484  GlobLVal.setAddress(Address(BufferPtr,
2485  CGF.ConvertTypeForMem(Private->getType()),
2486  GlobAddr.getAlignment()));
2487  switch (CGF.getEvaluationKind(Private->getType())) {
2488  case TEK_Scalar: {
2489  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2490  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2491  LValueBaseInfo(AlignmentSource::Type),
2492  TBAAAccessInfo());
2493  break;
2494  }
2495  case TEK_Complex: {
2496  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2497  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2498  /*isInit=*/false);
2499  break;
2500  }
2501  case TEK_Aggregate:
2502  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2503  GlobLVal, Private->getType(),
2504  AggValueSlot::DoesNotOverlap);
2505  break;
2506  }
2507  ++Idx;
2508  }
2509 
2510  CGF.FinishFunction();
2511  return Fn;
2512 }
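// The mirror image of list_to_global_copy, moving data in the opposite
// direction (same illustrative Buffer layout):
//
//   void global_to_list_copy(Buffer *buffer, int idx, void **reduce_data) {
//     *(float *)reduce_data[0] = buffer->foo[idx];
//     *(double *)reduce_data[1] = buffer->bar[idx];
//   }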
2513 
2514  /// This function emits a helper that reduces the reduction variables in the
2515  /// provided global buffer into the team's thread-local reduce list.
2516 ///
2517 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2518 /// void *GlobPtrs[];
2519 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2520 /// ...
2521 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2522  /// reduce_function(reduce_data, GlobPtrs);
2523 static llvm::Value *emitGlobalToListReduceFunction(
2524  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2525  QualType ReductionArrayTy, SourceLocation Loc,
2526  const RecordDecl *TeamReductionRec,
2527  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2528  &VarFieldMap,
2529  llvm::Function *ReduceFn) {
2530  ASTContext &C = CGM.getContext();
2531 
2532  // Buffer: global reduction buffer.
2533  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2534  C.VoidPtrTy, ImplicitParamDecl::Other);
2535  // Idx: index of the buffer.
2536  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2537  ImplicitParamDecl::Other);
2538  // ReduceList: thread local Reduce list.
2539  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2540  C.VoidPtrTy, ImplicitParamDecl::Other);
2541  FunctionArgList Args;
2542  Args.push_back(&BufferArg);
2543  Args.push_back(&IdxArg);
2544  Args.push_back(&ReduceListArg);
2545 
2546  const CGFunctionInfo &CGFI =
2547  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2548  auto *Fn = llvm::Function::Create(
2549  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2550  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2551  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2552  Fn->setDoesNotRecurse();
2553  CodeGenFunction CGF(CGM);
2554  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2555 
2556  CGBuilderTy &Bld = CGF.Builder;
2557 
2558  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2559  QualType StaticTy = C.getRecordType(TeamReductionRec);
2560  llvm::Type *LLVMReductionsBufferTy =
2561  CGM.getTypes().ConvertTypeForMem(StaticTy);
2562  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2563  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2564  LLVMReductionsBufferTy->getPointerTo());
2565 
2566  // 1. Build a list of reduction variables.
2567  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2568  Address ReductionList =
2569  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2570  auto IPriv = Privates.begin();
2571  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2572  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2573  /*Volatile=*/false, C.IntTy,
2574  Loc)};
2575  unsigned Idx = 0;
2576  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2577  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2578  // Global = Buffer.VD[Idx];
2579  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2580  const FieldDecl *FD = VarFieldMap.lookup(VD);
2581  LValue GlobLVal = CGF.EmitLValueForField(
2582  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2583  Address GlobAddr = GlobLVal.getAddress(CGF);
2584  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2585  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2586  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2587  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2588  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2589  // Store array size.
2590  ++Idx;
2591  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2592  llvm::Value *Size = CGF.Builder.CreateIntCast(
2593  CGF.getVLASize(
2594  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2595  .NumElts,
2596  CGF.SizeTy, /*isSigned=*/false);
2597  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2598  Elem);
2599  }
2600  }
2601 
2602  // Call reduce_function(ReduceList, GlobalReduceList)
2603  llvm::Value *GlobalReduceList =
2604  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2605  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2606  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2607  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2608  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2609  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2610  CGF.FinishFunction();
2611  return Fn;
2612 }
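// And its reduce variant. Note the argument order is swapped relative to
// list_to_global_reduce, so the reduced result lands in reduce_data rather
// than in the global buffer (same illustrative names):
//
//   void global_to_list_reduce(Buffer *buffer, int idx, void **reduce_data) {
//     void *glob_ptrs[] = {&buffer->foo[idx], &buffer->bar[idx]};
//     reduce_function(reduce_data, glob_ptrs);
//   }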
2613 
2614 ///
2615 /// Design of OpenMP reductions on the GPU
2616 ///
2617 /// Consider a typical OpenMP program with one or more reduction
2618 /// clauses:
2619 ///
2620 /// float foo;
2621 /// double bar;
2622 /// #pragma omp target teams distribute parallel for \
2623 /// reduction(+:foo) reduction(*:bar)
2624 /// for (int i = 0; i < N; i++) {
2625 /// foo += A[i]; bar *= B[i];
2626 /// }
2627 ///
2628 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2629 /// all teams. In our OpenMP implementation on the NVPTX device an
2630 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2631 /// within a team are mapped to CUDA threads within a threadblock.
2632 /// Our goal is to efficiently aggregate values across all OpenMP
2633 /// threads such that:
2634 ///
2635 /// - the compiler and runtime are logically concise, and
2636 /// - the reduction is performed efficiently in a hierarchical
2637 /// manner as follows: within OpenMP threads in the same warp,
2638 /// across warps in a threadblock, and finally across teams on
2639 /// the NVPTX device.
2640 ///
2641 /// Introduction to Decoupling
2642 ///
2643 /// We would like to decouple the compiler and the runtime so that the
2644 /// latter is ignorant of the reduction variables (number, data types)
2645 /// and the reduction operators. This allows a simpler interface
2646 /// and implementation while still attaining good performance.
2647 ///
2648 /// Pseudocode for the aforementioned OpenMP program generated by the
2649 /// compiler is as follows:
2650 ///
2651 /// 1. Create private copies of reduction variables on each OpenMP
2652 /// thread: 'foo_private', 'bar_private'
2653 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2654 /// to it and writes the result in 'foo_private' and 'bar_private'
2655 /// respectively.
2656 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2657 /// and store the result on the team master:
2658 ///
2659 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2660 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2661 ///
2662 /// where:
2663 /// struct ReduceData {
2664 /// double *foo;
2665 /// double *bar;
2666 /// } reduceData
2667 /// reduceData.foo = &foo_private
2668 /// reduceData.bar = &bar_private
2669 ///
2670 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2671 /// auxiliary functions generated by the compiler that operate on
2672  /// variables of type 'ReduceData'. They help the runtime perform
2673  /// algorithmic steps in a data-agnostic manner.
2674 ///
2675 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2676 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2677 /// same warp. It takes the following arguments as input:
2678 ///
2679 /// a. variable of type 'ReduceData' on the calling lane,
2680 /// b. its lane_id,
2681 /// c. an offset relative to the current lane_id to generate a
2682 /// remote_lane_id. The remote lane contains the second
2683 /// variable of type 'ReduceData' that is to be reduced.
2684 /// d. an algorithm version parameter determining which reduction
2685 /// algorithm to use.
2686 ///
2687 /// 'shuffleReduceFn' retrieves data from the remote lane using
2688 /// efficient GPU shuffle intrinsics and reduces, using the
2689 /// algorithm specified by the 4th parameter, the two operands
2690 /// element-wise. The result is written to the first operand.
2691 ///
2692 /// Different reduction algorithms are implemented in different
2693 /// runtime functions, all calling 'shuffleReduceFn' to perform
2694 /// the essential reduction step. Therefore, based on the 4th
2695 /// parameter, this function behaves slightly differently to
2696 /// cooperate with the runtime to ensure correctness under
2697 /// different circumstances.
2698 ///
2699 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2700 /// reduced variables across warps. It tunnels, through CUDA
2701 /// shared memory, the thread-private data of type 'ReduceData'
2702 /// from lane 0 of each warp to a lane in the first warp.
2703 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2704 /// The last team writes the global reduced value to memory.
2705 ///
2706 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2707 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2708 /// scratchpadCopyFn, loadAndReduceFn)
2709 ///
2710 /// 'scratchpadCopyFn' is a helper that stores reduced
2711 /// data from the team master to a scratchpad array in
2712 /// global memory.
2713 ///
2714 /// 'loadAndReduceFn' is a helper that loads data from
2715 /// the scratchpad array and reduces it with the input
2716 /// operand.
2717 ///
2718 /// These compiler generated functions hide address
2719 /// calculation and alignment information from the runtime.
2720 /// 5. if ret == 1:
2721 /// The team master of the last team stores the reduced
2722 /// result to the globals in memory.
2723 /// foo += reduceData.foo; bar *= reduceData.bar
2724 ///
2725 ///
2726 /// Warp Reduction Algorithms
2727 ///
2728 /// On the warp level, we have three algorithms implemented in the
2729 /// OpenMP runtime depending on the number of active lanes:
2730 ///
2731 /// Full Warp Reduction
2732 ///
2733 /// The reduce algorithm within a warp where all lanes are active
2734 /// is implemented in the runtime as follows:
2735 ///
2736 /// full_warp_reduce(void *reduce_data,
2737 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2738 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2739 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2740 /// }
2741 ///
2742 /// The algorithm completes in log(2, WARPSIZE) steps.
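///
/// For example, with WARPSIZE == 32 the loop above unrolls into five calls
/// with offsets 16, 8, 4, 2 and 1:
///
///   ShuffleReduceFn(reduce_data, 0, 16, 0);
///   ShuffleReduceFn(reduce_data, 0, 8, 0);
///   ShuffleReduceFn(reduce_data, 0, 4, 0);
///   ShuffleReduceFn(reduce_data, 0, 2, 0);
///   ShuffleReduceFn(reduce_data, 0, 1, 0);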
2743 ///
2744  /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2745  /// not used; we therefore save instructions by not retrieving lane_id
2746  /// from the corresponding special registers.
2747 /// represents the version of the algorithm being used, is set to 0 to
2748 /// signify full warp reduction.
2749 ///
2750 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2751 ///
2752 /// #reduce_elem refers to an element in the local lane's data structure
2753 /// #remote_elem is retrieved from a remote lane
2754 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2755 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2756 ///
2757 /// Contiguous Partial Warp Reduction
2758 ///
2759 /// This reduce algorithm is used within a warp where only the first
2760 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2761 /// number of OpenMP threads in a parallel region is not a multiple of
2762 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2763 ///
2764 /// void
2765 /// contiguous_partial_reduce(void *reduce_data,
2766 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2767 /// int size, int lane_id) {
2768 /// int curr_size;
2769 /// int offset;
2770 /// curr_size = size;
2771  /// offset = curr_size/2;
2772 /// while (offset>0) {
2773 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2774 /// curr_size = (curr_size+1)/2;
2775 /// offset = curr_size/2;
2776 /// }
2777 /// }
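///
/// For example, with size == 5 the initial offset is 2 and the loop makes
/// three calls with offsets 2, 1, 1: lanes 0-1 first fold in lanes 2-3
/// while lane 2 receives lane 4's copy, and two further steps fold the
/// remaining partial values into lane 0.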
2778 ///
2779 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2780 ///
2781 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2782 /// if (lane_id < offset)
2783 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2784 /// else
2785 /// reduce_elem = remote_elem
2786 ///
2787 /// This algorithm assumes that the data to be reduced are located in a
2788 /// contiguous subset of lanes starting from the first. When there is
2789 /// an odd number of active lanes, the data in the last lane is not
2790  /// aggregated with any other lane's data but is instead copied over.
2791 ///
2792 /// Dispersed Partial Warp Reduction
2793 ///
2794 /// This algorithm is used within a warp when any discontiguous subset of
2795  /// lanes is active. It is used to implement the reduction operation
2796 /// across lanes in an OpenMP simd region or in a nested parallel region.
2797 ///
2798 /// void
2799 /// dispersed_partial_reduce(void *reduce_data,
2800 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2801 /// int size, remote_id;
2802 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2803 /// do {
2804 /// remote_id = next_active_lane_id_right_after_me();
2805  /// # the above function returns 0 if no active lane
2806 /// # is present right after the current lane.
2807 /// size = number_of_active_lanes_in_this_warp();
2808 /// logical_lane_id /= 2;
2809 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2810 /// remote_id-1-threadIdx.x, 2);
2811 /// } while (logical_lane_id % 2 == 0 && size > 1);
2812 /// }
2813 ///
2814 /// There is no assumption made about the initial state of the reduction.
2815 /// Any number of lanes (>=1) could be active at any position. The reduction
2816 /// result is returned in the first active lane.
2817 ///
2818 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2819 ///
2820 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2821 /// if (lane_id % 2 == 0 && offset > 0)
2822 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2823 /// else
2824 /// reduce_elem = remote_elem
2825 ///
2826 ///
2827 /// Intra-Team Reduction
2828 ///
2829 /// This function, as implemented in the runtime call
2830 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2831 /// threads in a team. It first reduces within a warp using the
2832 /// aforementioned algorithms. We then proceed to gather all such
2833 /// reduced values at the first warp.
2834 ///
2835 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2836  /// data from each "warp master" (the zeroth lane of each warp, where
2837 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2838 /// a mathematical sense) the problem of reduction across warp masters in
2839 /// a block to the problem of warp reduction.
2840 ///
2841 ///
2842 /// Inter-Team Reduction
2843 ///
2844 /// Once a team has reduced its data to a single value, it is stored in
2845 /// a global scratchpad array. Since each team has a distinct slot, this
2846 /// can be done without locking.
2847 ///
2848 /// The last team to write to the scratchpad array proceeds to reduce the
2849 /// scratchpad array. One or more workers in the last team use the helper
2850  /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2851 /// the k'th worker reduces every k'th element.
2852 ///
2853 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2854 /// reduce across workers and compute a globally reduced value.
2855 ///
2856 void CGOpenMPRuntimeGPU::emitReduction(
2857  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2858  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2859  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2860  if (!CGF.HaveInsertPoint())
2861  return;
2862 
2863  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2864 #ifndef NDEBUG
2865  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2866 #endif
2867 
2868  if (Options.SimpleReduction) {
2869  assert(!TeamsReduction && !ParallelReduction &&
2870  "Invalid reduction selection in emitReduction.");
2871  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2872  ReductionOps, Options);
2873  return;
2874  }
2875 
2876  assert((TeamsReduction || ParallelReduction) &&
2877  "Invalid reduction selection in emitReduction.");
2878 
2879  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2880  // RedList, shuffle_reduce_func, interwarp_copy_func);
2881  // or
2882  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2883  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2884  llvm::Value *ThreadId = getThreadID(CGF, Loc);
2885 
2886  llvm::Value *Res;
2887  ASTContext &C = CGM.getContext();
2888  // 1. Build a list of reduction variables.
2889  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2890  auto Size = RHSExprs.size();
2891  for (const Expr *E : Privates) {
2892  if (E->getType()->isVariablyModifiedType())
2893  // Reserve place for array size.
2894  ++Size;
2895  }
2896  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2897  QualType ReductionArrayTy =
2898  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2899  /*IndexTypeQuals=*/0);
2900  Address ReductionList =
2901  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2902  auto IPriv = Privates.begin();
2903  unsigned Idx = 0;
2904  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2905  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2906  CGF.Builder.CreateStore(
2907  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2908  CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2909  Elem);
2910  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2911  // Store array size.
2912  ++Idx;
2913  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2914  llvm::Value *Size = CGF.Builder.CreateIntCast(
2915  CGF.getVLASize(
2916  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2917  .NumElts,
2918  CGF.SizeTy, /*isSigned=*/false);
2919  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2920  Elem);
2921  }
2922  }
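 // Sketch of the resulting layout (illustrative; assumes a reduction over a
 // scalar 'a' and a VLA 'v' of N elements):
 //   void *RedList[3] = { &a_rhs, &v_rhs[0], (void *)N };
 // A variably modified type thus contributes its base pointer plus its
 // element count, smuggled through an extra pointer-sized slot.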
2923 
2924  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2925  ReductionList.getPointer(), CGF.VoidPtrTy);
2926  llvm::Function *ReductionFn =
2927  emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2928  Privates, LHSExprs, RHSExprs, ReductionOps);
2929  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2930  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2931  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2932  llvm::Value *InterWarpCopyFn =
2933  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2934 
2935  if (ParallelReduction) {
2936  llvm::Value *Args[] = {RTLoc,
2937  ThreadId,
2938  CGF.Builder.getInt32(RHSExprs.size()),
2939  ReductionArrayTySize,
2940  RL,
2941  ShuffleAndReduceFn,
2942  InterWarpCopyFn};
2943 
2944  Res = CGF.EmitRuntimeCall(
2945  OMPBuilder.getOrCreateRuntimeFunction(
2946  CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2947  Args);
2948  } else {
2949  assert(TeamsReduction && "expected teams reduction.");
2950  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2951  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2952  int Cnt = 0;
2953  for (const Expr *DRE : Privates) {
2954  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2955  ++Cnt;
2956  }
2957  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2958  CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2959  C.getLangOpts().OpenMPCUDAReductionBufNum);
2960  TeamsReductions.push_back(TeamReductionRec);
2961  if (!KernelTeamsReductionPtr) {
2962  KernelTeamsReductionPtr = new llvm::GlobalVariable(
2963  CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
2964  llvm::GlobalValue::InternalLinkage, nullptr,
2965  "_openmp_teams_reductions_buffer_$_$ptr");
2966  }
2967  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
2968  Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
2969  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
2970  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2971  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2972  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2973  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2974  ReductionFn);
2975  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2976  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2977  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2978  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2979  ReductionFn);
2980 
2981  llvm::Value *Args[] = {
2982  RTLoc,
2983  ThreadId,
2984  GlobalBufferPtr,
2985  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2986  RL,
2987  ShuffleAndReduceFn,
2988  InterWarpCopyFn,
2989  GlobalToBufferCpyFn,
2990  GlobalToBufferRedFn,
2991  BufferToGlobalCpyFn,
2992  BufferToGlobalRedFn};
2993 
2994  Res = CGF.EmitRuntimeCall(
2995  OMPBuilder.getOrCreateRuntimeFunction(
2996  CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2997  Args);
2998  }
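 // The two calls above assume device runtime entry points shaped roughly as
 // follows (a sketch reconstructed from the argument lists built here, not
 // quoted from the runtime headers):
 //
 //   int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
 //       ident_t *Loc, int32_t GlobalTid, int32_t NumVars, size_t ReduceSize,
 //       void *ReduceData, ShuffleAndReduceFn, InterWarpCopyFn);
 //
 //   int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 //       ident_t *Loc, int32_t GlobalTid, void *GlobalBuffer,
 //       int32_t NumOfRecords, void *ReduceData, ShuffleAndReduceFn,
 //       InterWarpCopyFn, GlobalToBufferCpyFn, GlobalToBufferRedFn,
 //       BufferToGlobalCpyFn, BufferToGlobalRedFn);
 //
 // Both return 1 in the thread that ends up owning the fully reduced value,
 // which is what the 'res == 1' check below keys on.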
2999 
3000  // 5. Build if (res == 1)
3001  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3002  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3003  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3004  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3005  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3006 
3007  // 6. Build then branch: where we have reduced values in the master
3008  // thread in each team.
3009  // __kmpc_end_reduce{_nowait}(<gtid>);
3010  // break;
3011  CGF.EmitBlock(ThenBB);
3012 
3013  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3014  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3015  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3016  auto IPriv = Privates.begin();
3017  auto ILHS = LHSExprs.begin();
3018  auto IRHS = RHSExprs.begin();
3019  for (const Expr *E : ReductionOps) {
3020  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3021  cast<DeclRefExpr>(*IRHS));
3022  ++IPriv;
3023  ++ILHS;
3024  ++IRHS;
3025  }
3026  };
3027  llvm::Value *EndArgs[] = {ThreadId};
3028  RegionCodeGenTy RCG(CodeGen);
3029  NVPTXActionTy Action(
3030  nullptr, std::nullopt,
3031  OMPBuilder.getOrCreateRuntimeFunction(
3032  CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3033  EndArgs);
3034  RCG.setAction(Action);
3035  RCG(CGF);
3036  // There is no need to emit line number for unconditional branch.
3037  (void)ApplyDebugLocation::CreateEmpty(CGF);
3038  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3039 }
3040 
3041 const VarDecl *
3042 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3043  const VarDecl *NativeParam) const {
3044  if (!NativeParam->getType()->isReferenceType())
3045  return NativeParam;
3046  QualType ArgType = NativeParam->getType();
3047  QualifierCollector QC;
3048  const Type *NonQualTy = QC.strip(ArgType);
3049  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3050  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3051  if (Attr->getCaptureKind() == OMPC_map) {
3052  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3053  LangAS::opencl_global);
3054  }
3055  }
3056  ArgType = CGM.getContext().getPointerType(PointeeTy);
3057  QC.addRestrict();
3058  enum { NVPTX_local_addr = 5 };
3059  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3060  ArgType = QC.apply(CGM.getContext(), ArgType);
3061  if (isa<ImplicitParamDecl>(NativeParam))
3062  return ImplicitParamDecl::Create(
3063  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3064  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3065  return ParmVarDecl::Create(
3066  CGM.getContext(),
3067  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3068  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3069  NativeParam->getIdentifier(), ArgType,
3070  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3071 }
3072 
3073 Address
3074 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3075  const VarDecl *NativeParam,
3076  const VarDecl *TargetParam) const {
3077  assert(NativeParam != TargetParam &&
3078  NativeParam->getType()->isReferenceType() &&
3079  "Native arg must not be the same as target arg.");
3080  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3081  QualType NativeParamType = NativeParam->getType();
3082  QualifierCollector QC;
3083  const Type *NonQualTy = QC.strip(NativeParamType);
3084  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3085  unsigned NativePointeeAddrSpace =
3086  CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3087  QualType TargetTy = TargetParam->getType();
3088  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3089  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3090  // First cast to generic.
3091  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3092  TargetAddr, llvm::PointerType::getWithSamePointeeType(
3093  cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3094  // Cast from generic to native address space.
3095  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3096  TargetAddr, llvm::PointerType::getWithSamePointeeType(
3097  cast<llvm::PointerType>(TargetAddr->getType()),
3098  NativePointeeAddrSpace));
3099  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3100  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3101  NativeParamType);
3102  return NativeParamAddr;
3103 }
3104 
3105 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3106  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3107  ArrayRef<llvm::Value *> Args) const {
3108  SmallVector<llvm::Value *, 4> TargetArgs;
3109  TargetArgs.reserve(Args.size());
3110  auto *FnType = OutlinedFn.getFunctionType();
3111  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3112  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3113  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3114  break;
3115  }
3116  llvm::Type *TargetType = FnType->getParamType(I);
3117  llvm::Value *NativeArg = Args[I];
3118  if (!TargetType->isPointerTy()) {
3119  TargetArgs.emplace_back(NativeArg);
3120  continue;
3121  }
3122  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3123  NativeArg, llvm::PointerType::getWithSamePointeeType(
3124  cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3125  TargetArgs.emplace_back(
3126  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3127  }
3128  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3129 }
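 // Illustration of the normalization above (an assumed typical case, not
 // verbatim IR): an argument living in a non-generic address space, e.g.
 // 'i32 addrspace(5)*', is first addrspacecast to the generic 'i32*' and
 // only then cast to the exact parameter type, since a direct cast between
 // two distinct non-generic address spaces is not generally expressible.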
3130 
3131 /// Emit function which wraps the outline parallel region
3132 /// and controls the arguments which are passed to this function.
3133 /// The wrapper ensures that the outlined function is called
3134 /// with the correct arguments when data is shared.
3135 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3136  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3137  ASTContext &Ctx = CGM.getContext();
3138  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3139 
3140  // Create a function that takes as argument the source thread.
3141  FunctionArgList WrapperArgs;
3142  QualType Int16QTy =
3143  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3144  QualType Int32QTy =
3145  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3146  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3147  /*Id=*/nullptr, Int16QTy,
3148  ImplicitParamDecl::Other);
3149  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3150  /*Id=*/nullptr, Int32QTy,
3151  ImplicitParamDecl::Other);
3152  WrapperArgs.emplace_back(&ParallelLevelArg);
3153  WrapperArgs.emplace_back(&WrapperArg);
3154 
3155  const CGFunctionInfo &CGFI =
3156  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3157 
3158  auto *Fn = llvm::Function::Create(
3159  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3160  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3161 
3162  // Ensure we do not inline the function. This is trivially true for the ones
3163  // passed to __kmpc_fork_call but the ones called in serialized regions
3164  // could be inlined. This is not perfect but it is closer to the invariant
3165  // we want, namely, every data environment starts with a new function.
3166  // TODO: We should pass the if condition to the runtime function and do the
3167  // handling there. Much cleaner code.
3168  Fn->addFnAttr(llvm::Attribute::NoInline);
3169 
3170  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3171  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3172  Fn->setDoesNotRecurse();
3173 
3174  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3175  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3176  D.getBeginLoc(), D.getBeginLoc());
3177 
3178  const auto *RD = CS.getCapturedRecordDecl();
3179  auto CurField = RD->field_begin();
3180 
3181  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3182  /*Name=*/".zero.addr");
3183  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3184  // Get the array of arguments.
3185  SmallVector<llvm::Value *, 8> Args;
3186 
3187  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3188  Args.emplace_back(ZeroAddr.getPointer());
3189 
3190  CGBuilderTy &Bld = CGF.Builder;
3191  auto CI = CS.capture_begin();
3192 
3193  // Use global memory for data sharing.
3194  // Handle passing of global args to workers.
3195  Address GlobalArgs =
3196  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3197  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3198  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3199  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3200  CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3201  DataSharingArgs);
3202 
3203  // Retrieve the shared variables from the list of references returned
3204  // by the runtime. Pass the variables to the outlined function.
3205  Address SharedArgListAddress = Address::invalid();
3206  if (CS.capture_size() > 0 ||
3207  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3208  SharedArgListAddress = CGF.EmitLoadOfPointer(
3209  GlobalArgs, CGF.getContext()
3210  .getPointerType(CGF.getContext().VoidPtrTy)
3211  .castAs<PointerType>());
3212  }
3213  unsigned Idx = 0;
3214  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3215  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3216  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3217  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3218  llvm::Value *LB = CGF.EmitLoadOfScalar(
3219  TypedAddress,
3220  /*Volatile=*/false,
3221  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3222  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3223  Args.emplace_back(LB);
3224  ++Idx;
3225  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3226  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3227  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3228  llvm::Value *UB = CGF.EmitLoadOfScalar(
3229  TypedAddress,
3230  /*Volatile=*/false,
3231  CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3232  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3233  Args.emplace_back(UB);
3234  ++Idx;
3235  }
3236  if (CS.capture_size() > 0) {
3237  ASTContext &CGFContext = CGF.getContext();
3238  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3239  QualType ElemTy = CurField->getType();
3240  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3241  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3242  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3243  CGF.ConvertTypeForMem(ElemTy));
3244  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3245  /*Volatile=*/false,
3246  CGFContext.getPointerType(ElemTy),
3247  CI->getLocation());
3248  if (CI->capturesVariableByCopy() &&
3249  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3250  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3251  CI->getLocation());
3252  }
3253  Args.emplace_back(Arg);
3254  }
3255  }
3256 
3257  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3258  CGF.FinishFunction();
3259  return Fn;
3260 }
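 // Roughly, the wrapper built above behaves like the following pseudo-C
 // (a sketch; the names are illustrative, not the emitted symbols):
 //
 //   void outlined_wrapper(uint16_t ParallelLevel, uint32_t ThreadId) {
 //     int32_t Zero = 0;
 //     void **GlobalArgs;
 //     __kmpc_get_shared_variables(&GlobalArgs);
 //     outlined(&ThreadId, &Zero, GlobalArgs[0], ..., GlobalArgs[n-1]);
 //   }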
3261 
3262 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3263  const Decl *D) {
3264  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3265  return;
3266 
3267  assert(D && "Expected function or captured|block decl.");
3268  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3269  "Function is registered already.");
3270  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3271  "Team is set but not processed.");
3272  const Stmt *Body = nullptr;
3273  bool NeedToDelayGlobalization = false;
3274  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3275  Body = FD->getBody();
3276  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3277  Body = BD->getBody();
3278  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3279  Body = CD->getBody();
3280  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3281  if (NeedToDelayGlobalization &&
3282  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3283  return;
3284  }
3285  if (!Body)
3286  return;
3287  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3288  VarChecker.Visit(Body);
3289  const RecordDecl *GlobalizedVarsRecord =
3290  VarChecker.getGlobalizedRecord(IsInTTDRegion);
3291  TeamAndReductions.first = nullptr;
3292  TeamAndReductions.second.clear();
3293  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3294  VarChecker.getEscapedVariableLengthDecls();
3295  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3296  return;
3297  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3298  I->getSecond().MappedParams =
3299  std::make_unique<CodeGenFunction::OMPMapVars>();
3300  I->getSecond().EscapedParameters.insert(
3301  VarChecker.getEscapedParameters().begin(),
3302  VarChecker.getEscapedParameters().end());
3303  I->getSecond().EscapedVariableLengthDecls.append(
3304  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3305  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3306  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3307  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3308  Data.insert(std::make_pair(VD, MappedVarData()));
3309  }
3310  if (!NeedToDelayGlobalization) {
3311  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3312  struct GlobalizationScope final : EHScopeStack::Cleanup {
3313  GlobalizationScope() = default;
3314 
3315  void Emit(CodeGenFunction &CGF, Flags flags) override {
3316  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3317  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3318  }
3319  };
3320  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3321  }
3322 }
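 // Illustration (hypothetical user code): for a target region containing
 //   int x; #pragma omp parallel shared(x) { x = 1; }
 // 'x' escapes the enclosing thread's private stack, so the prolog emitted
 // above moves it into globalized storage and the cleanup pops that storage
 // when the function exits.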
3323 
3324 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3325  const VarDecl *VD) {
3326  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3327  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3328  auto AS = LangAS::Default;
3329  switch (A->getAllocatorType()) {
3330  // Use the default allocator here as by default local vars are
3331  // threadlocal.
3332  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3333  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3334  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3335  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3336  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3337  // Follow the user decision - use default allocation.
3338  return Address::invalid();
3339  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3340  // TODO: implement support for user-defined allocators.
3341  return Address::invalid();
3342  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3343  AS = LangAS::cuda_constant;
3344  break;
3345  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3346  AS = LangAS::cuda_shared;
3347  break;
3348  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3349  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3350  break;
3351  }
3352  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3353  auto *GV = new llvm::GlobalVariable(
3354  CGM.getModule(), VarTy, /*isConstant=*/false,
3355  llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3356  VD->getName(),
3357  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3358  CGM.getContext().getTargetAddressSpace(AS));
3359  CharUnits Align = CGM.getContext().getDeclAlign(VD);
3360  GV->setAlignment(Align.getAsAlign());
3361  return Address(
3362  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3363  GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3364  VD->getType().getAddressSpace()))),
3365  VarTy, Align);
3366  }
3367 
3368  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3369  return Address::invalid();
3370 
3371  VD = VD->getCanonicalDecl();
3372  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3373  if (I == FunctionGlobalizedDecls.end())
3374  return Address::invalid();
3375  auto VDI = I->getSecond().LocalVarData.find(VD);
3376  if (VDI != I->getSecond().LocalVarData.end())
3377  return VDI->second.PrivateAddr;
3378  if (VD->hasAttrs()) {
3379  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3380  E(VD->attr_end());
3381  IT != E; ++IT) {
3382  auto VDI = I->getSecond().LocalVarData.find(
3383  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3384  ->getCanonicalDecl());
3385  if (VDI != I->getSecond().LocalVarData.end())
3386  return VDI->second.PrivateAddr;
3387  }
3388  }
3389 
3390  return Address::invalid();
3391 }
3392 
3393 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3394  FunctionGlobalizedDecls.erase(CGF.CurFn);
3395  CGOpenMPRuntime::functionFinished(CGF);
3396 }
3397 
3398 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3399  CodeGenFunction &CGF, const OMPLoopDirective &S,
3400  OpenMPDistScheduleClauseKind &ScheduleKind,
3401  llvm::Value *&Chunk) const {
3402  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3403  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3404  ScheduleKind = OMPC_DIST_SCHEDULE_static;
3405  Chunk = CGF.EmitScalarConversion(
3406  RT.getGPUNumThreads(CGF),
3407  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3408  S.getIterationVariable()->getType(), S.getBeginLoc());
3409  return;
3410  }
3411  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3412  CGF, S, ScheduleKind, Chunk);
3413 }
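 // In SPMD mode the choice above amounts to (an illustrative equivalence,
 // with nthreads being the number of GPU threads per team):
 //   #pragma omp distribute dist_schedule(static, nthreads)
 // so every team takes one thread-sized chunk per round and the 'distribute'
 // and 'parallel for' iteration spaces stay aligned.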
3414 
3415 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3416  CodeGenFunction &CGF, const OMPLoopDirective &S,
3417  OpenMPScheduleClauseKind &ScheduleKind,
3418  const Expr *&ChunkExpr) const {
3419  ScheduleKind = OMPC_SCHEDULE_static;
3420  // Chunk size is 1 in this case.
3421  llvm::APInt ChunkSize(32, 1);
3422  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3423  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3424  SourceLocation());
3425 }
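 // The default chosen here behaves like (an illustrative equivalence):
 //   #pragma omp for schedule(static, 1)
 // Chunk size 1 places adjacent iterations on adjacent threads, which favors
 // coalesced global-memory accesses on GPUs.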
3426 
3427 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3428  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3429  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3430  " Expected target-based directive.");
3431  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3432  for (const CapturedStmt::Capture &C : CS->captures()) {
3433  // Capture variables captured by reference in lambdas for target-based
3434  // directives.
3435  if (!C.capturesVariable())
3436  continue;
3437  const VarDecl *VD = C.getCapturedVar();
3438  const auto *RD = VD->getType()
3439  .getCanonicalType()
3440  .getNonReferenceType()
3441  ->getAsCXXRecordDecl();
3442  if (!RD || !RD->isLambda())
3443  continue;
3444  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3445  LValue VDLVal;
3446  if (VD->getType().getCanonicalType()->isReferenceType())
3447  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3448  else
3449  VDLVal = CGF.MakeAddrLValue(
3450  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3451  llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3452  FieldDecl *ThisCapture = nullptr;
3453  RD->getCaptureFields(Captures, ThisCapture);
3454  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3455  LValue ThisLVal =
3456  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3457  llvm::Value *CXXThis = CGF.LoadCXXThis();
3458  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3459  }
3460  for (const LambdaCapture &LC : RD->captures()) {
3461  if (LC.getCaptureKind() != LCK_ByRef)
3462  continue;
3463  const ValueDecl *VD = LC.getCapturedVar();
3464  // FIXME: For now VD is always a VarDecl because OpenMP does not support
3465  // capturing structured bindings in lambdas yet.
3466  if (!CS->capturesVariable(cast<VarDecl>(VD)))
3467  continue;
3468  auto It = Captures.find(VD);
3469  assert(It != Captures.end() && "Found lambda capture without field.");
3470  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3471  Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3472  if (VD->getType().getCanonicalType()->isReferenceType())
3473  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3474  VD->getType().getCanonicalType())
3475  .getAddress(CGF);
3476  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3477  }
3478  }
3479 }
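 // Illustration (hypothetical user code):
 //   int x = 0;
 //   auto l = [&x]() { return x; };
 //   #pragma omp target
 //   l();
 // On the device the by-reference capture field of 'l' still holds a host
 // address, so the loop above overwrites it with the device address of 'x'.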
3480 
3481 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3482  LangAS &AS) {
3483  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3484  return false;
3485  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3486  switch (A->getAllocatorType()) {
3487  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3488  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3489  // Not supported, fallback to the default mem space.
3490  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3491  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3492  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3493  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3494  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3495  AS = LangAS::Default;
3496  return true;
3497  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3498  AS = LangAS::cuda_constant;
3499  return true;
3500  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3501  AS = LangAS::cuda_shared;
3502  return true;
3503  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3504  llvm_unreachable("Expected predefined allocator for the variables with the "
3505  "static storage.");
3506  }
3507  return false;
3508 }
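 // Illustration (hypothetical user code): given
 //   static int counter;
 //   #pragma omp allocate(counter) allocator(omp_pteam_mem_alloc)
 // the variable lands in LangAS::cuda_shared (__shared__ on NVPTX), and
 // omp_const_mem_alloc similarly selects LangAS::cuda_constant.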
3509 
3510 // Get the current CudaArch and ignore any unknown values
3511 static CudaArch getCudaArch(CodeGenModule &CGM) {
3512  if (!CGM.getTarget().hasFeature("ptx"))
3513  return CudaArch::UNKNOWN;
3514  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3515  if (Feature.getValue()) {
3516  CudaArch Arch = StringToCudaArch(Feature.getKey());
3517  if (Arch != CudaArch::UNKNOWN)
3518  return Arch;
3519  }
3520  }
3521  return CudaArch::UNKNOWN;
3522 }
3523 
3524 /// Check to see if target architecture supports unified addressing which is
3525 /// a restriction for OpenMP requires clause "unified_shared_memory".
3526 void CGOpenMPRuntimeGPU::processRequiresDirective(
3527  const OMPRequiresDecl *D) {
3528  for (const OMPClause *Clause : D->clauselists()) {
3529  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3530  CudaArch Arch = getCudaArch(CGM);
3531  switch (Arch) {
3532  case CudaArch::SM_20:
3533  case CudaArch::SM_21:
3534  case CudaArch::SM_30:
3535  case CudaArch::SM_32:
3536  case CudaArch::SM_35:
3537  case CudaArch::SM_37:
3538  case CudaArch::SM_50:
3539  case CudaArch::SM_52:
3540  case CudaArch::SM_53: {
3541  SmallString<256> Buffer;
3542  llvm::raw_svector_ostream Out(Buffer);
3543  Out << "Target architecture " << CudaArchToString(Arch)
3544  << " does not support unified addressing";
3545  CGM.Error(Clause->getBeginLoc(), Out.str());
3546  return;
3547  }
3548  case CudaArch::SM_60:
3549  case CudaArch::SM_61:
3550  case CudaArch::SM_62:
3551  case CudaArch::SM_70:
3552  case CudaArch::SM_72:
3553  case CudaArch::SM_75:
3554  case CudaArch::SM_80:
3555  case CudaArch::SM_86:
3556  case CudaArch::SM_87:
3557  case CudaArch::SM_89:
3558  case CudaArch::SM_90:
3559  case CudaArch::GFX600:
3560  case CudaArch::GFX601:
3561  case CudaArch::GFX602:
3562  case CudaArch::GFX700:
3563  case CudaArch::GFX701:
3564  case CudaArch::GFX702:
3565  case CudaArch::GFX703:
3566  case CudaArch::GFX704:
3567  case CudaArch::GFX705:
3568  case CudaArch::GFX801:
3569  case CudaArch::GFX802:
3570  case CudaArch::GFX803:
3571  case CudaArch::GFX805:
3572  case CudaArch::GFX810:
3573  case CudaArch::GFX900:
3574  case CudaArch::GFX902:
3575  case CudaArch::GFX904:
3576  case CudaArch::GFX906:
3577  case CudaArch::GFX908:
3578  case CudaArch::GFX909:
3579  case CudaArch::GFX90a:
3580  case CudaArch::GFX90c:
3581  case CudaArch::GFX940:
3582  case CudaArch::GFX1010:
3583  case CudaArch::GFX1011:
3584  case CudaArch::GFX1012:
3585  case CudaArch::GFX1013:
3586  case CudaArch::GFX1030:
3587  case CudaArch::GFX1031:
3588  case CudaArch::GFX1032:
3589  case CudaArch::GFX1033:
3590  case CudaArch::GFX1034:
3591  case CudaArch::GFX1035:
3592  case CudaArch::GFX1036:
3593  case CudaArch::GFX1100:
3594  case CudaArch::GFX1101:
3595  case CudaArch::GFX1102:
3596  case CudaArch::GFX1103:
3597  case CudaArch::Generic:
3598  case CudaArch::UNUSED:
3599  case CudaArch::UNKNOWN:
3600  break;
3601  case CudaArch::LAST:
3602  llvm_unreachable("Unexpected Cuda arch.");
3603  }
3604  }
3605  }
3606  CGOpenMPRuntime::processRequiresDirective(D);
3607 }
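 // For example (illustrative source): a translation unit containing
 //   #pragma omp requires unified_shared_memory
 // is only accepted for sm_60 and newer NVPTX targets; the older
 // architectures listed in the first group hit the diagnostic above.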
3608 
3609 void CGOpenMPRuntimeGPU::clear() {
3610 
3611  if (!TeamsReductions.empty()) {
3612  ASTContext &C = CGM.getContext();
3613  RecordDecl *StaticRD = C.buildImplicitRecord(
3614  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3615  StaticRD->startDefinition();
3616  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3617  QualType RecTy = C.getRecordType(TeamReductionRec);
3618  auto *Field = FieldDecl::Create(
3619  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3620  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3621  /*BW=*/nullptr, /*Mutable=*/false,
3622  /*InitStyle=*/ICIS_NoInit);
3623  Field->setAccess(AS_public);
3624  StaticRD->addDecl(Field);
3625  }
3626  StaticRD->completeDefinition();
3627  QualType StaticTy = C.getRecordType(StaticRD);
3628  llvm::Type *LLVMReductionsBufferTy =
3629  CGM.getTypes().ConvertTypeForMem(StaticTy);
3630  // FIXME: nvlink does not handle weak linkage correctly (object with the
3631  // different size are reported as erroneous).
3632  // Restore CommonLinkage as soon as nvlink is fixed.
3633  auto *GV = new llvm::GlobalVariable(
3634  CGM.getModule(), LLVMReductionsBufferTy,
3635  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3636  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3637  "_openmp_teams_reductions_buffer_$_");
3638  KernelTeamsReductionPtr->setInitializer(
3639  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3640  CGM.VoidPtrTy));
3641  }
3642  CGOpenMPRuntime::clear();
3643 }
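 // The global emitted above is, in effect (a sketch of the layout):
 //   union _openmp_teams_reduction_type_$_ {
 //     <reduction record of kernel 1>;
 //     <reduction record of kernel 2>;
 //     ...
 //   } _openmp_teams_reductions_buffer_$_;
 // Sizing one union for the largest per-kernel record lets a single
 // zero-initialized global serve every target kernel in the module.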
3644 
3646  CGBuilderTy &Bld = CGF.Builder;
3647  llvm::Module *M = &CGF.CGM.getModule();
3648  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3649  llvm::Function *F = M->getFunction(LocSize);
3650  if (!F) {
3651  F = llvm::Function::Create(
3652  llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3653  llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3654  }
3655  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3656 }
3657 
3658 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3659  ArrayRef<llvm::Value *> Args{};
3660  return CGF.EmitRuntimeCall(
3661  OMPBuilder.getOrCreateRuntimeFunction(
3662  CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3663  Args);
3664 }
3665 
3666 llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3667  ArrayRef<llvm::Value *> Args{};
3668  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3669  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3670  Args);
3671 }
clang::QualifierCollector::strip
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6580
clang::CodeGen::CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU
CGOpenMPRuntimeGPU(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:860
clang::CudaArch::SM_35
@ SM_35
clang::InternalLinkage
@ InternalLinkage
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:31
clang::CudaArch::GFX805
@ GFX805
clang::LangAS::cuda_shared
@ cuda_shared
clang::CodeGen::CodeGenFunction::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T)
Definition: CodeGenFunction.cpp:211
clang::ASTContext::getTypeSizeInChars
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
Definition: ASTContext.cpp:2563
supportsSPMDExecutionMode
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Definition: CGOpenMPRuntimeGPU.cpp:651
clang::CudaArch::SM_70
@ SM_70
clang::CudaArch::GFX704
@ GFX704
clang::prec::Conditional
@ Conditional
Definition: OperatorPrecedence.h:30
clang::CodeGen::CodeGenTypeCache::SizeTy
llvm::IntegerType * SizeTy
Definition: CodeGenTypeCache.h:50
max
__DEVICE__ int max(int __a, int __b)
Definition: __clang_cuda_math.h:196
clang::CodeGen::CodeGenFunction::getTypeSize
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
Definition: CGStmtOpenMP.cpp:306
clang::CodeGen::LValueBaseInfo
Definition: CGValue.h:154
clang::OMPLastprivateClause
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:2770
clang::CodeGen::CGOpenMPRuntime::emitOutlinedFunctionCall
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntime.cpp:11705
clang::CodeGen::CodeGenFunction::EmitScalarConversion
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
Definition: CGExprScalar.cpp:5049
clang::CodeGen::CodeGenTypeCache::Int8PtrTy
llvm::PointerType * Int8PtrTy
Definition: CodeGenTypeCache.h:57
clang::Decl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:424
clang::CodeGen::CodeGenFunction::LoadCXXThis
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
Definition: CodeGenFunction.h:2803
clang::CharUnits::getAsAlign
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
clang::CudaArch::GFX1035
@ GFX1035
clang::CodeGen::CGOpenMPRuntime::getThreadID
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
Definition: CGOpenMPRuntime.cpp:1391
clang::interp::APInt
llvm::APInt APInt
Definition: Integral.h:29
setPropertyExecutionMode
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
Definition: CGOpenMPRuntimeGPU.cpp:828
clang::CodeGen::CodeGenTypeCache::IntTy
llvm::IntegerType * IntTy
int
Definition: CodeGenTypeCache.h:42
clang::CodeGen::TEK_Aggregate
@ TEK_Aggregate
Definition: CodeGenFunction.h:110
clang::CodeGen::CodeGenTypes::getTargetAddressSpace
unsigned getTargetAddressSpace(QualType T) const
Definition: CodeGenTypes.cpp:962
clang::CodeGen::CGOpenMPRuntime
Definition: CGOpenMPRuntime.h:229
clang::CudaArch::GFX601
@ GFX601
clang::DeclaratorDecl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:819
clang::Expr::isLValue
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:271
clang::CodeGen::CGOpenMPRuntime::emitSingleReductionCombiner
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Definition: CGOpenMPRuntime.cpp:5097
clang::Qualifiers::addRestrict
void addRestrict()
Definition: Type.h:286
CodeGenFunction.h
clang::DeclContext
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1389
clang::CapturedStmt::captures
capture_range captures()
Definition: Stmt.h:3679
clang::CodeGen::CGOpenMPRuntimeGPU::EM_SPMD
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
Definition: CGOpenMPRuntimeGPU.h:29
clang::LambdaCapture
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
clang::OMPExecutableDirective::getClausesOfKind
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:446
clang::Decl::hasAttr
bool hasAttr() const
Definition: DeclBase.h:560
emitShuffleAndReduceFunction
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
Definition: CGOpenMPRuntimeGPU.cpp:2043
clang::CudaArch::GFX908
@ GFX908
clang::CodeGen::CGOpenMPRuntimeGPU::emitBarrierCall
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntimeGPU.cpp:1263
emitListToGlobalCopyFunction
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2201
clang::VarDecl::getCanonicalDecl
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2194
clang::QualifierCollector::apply
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3846
clang::CodeGen::CGOpenMPRuntime::emitParallelOutlinedFunction
virtual llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
Definition: CGOpenMPRuntime.cpp:1265
clang::CodeGen::Address::getAlignment
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:120
clang::CodeGen::CGOpenMPRuntime::emitProcBindClause
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntime.cpp:2892
clang::isOpenMPParallelDirective
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
Definition: OpenMPKinds.cpp:604
clang::CodeGen::CGOpenMPRuntime::clearLocThreadIdInsertPt
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
Definition: CGOpenMPRuntime.cpp:1344
clang::ASTContext::VoidTy
CanQualType VoidTy
Definition: ASTContext.h:1077
clang::TargetInfo::getNewAlign
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is g...
Definition: TargetInfo.h:686
llvm::SmallVector
Definition: LLVM.h:38
CopyOptionsTy
Definition: CGOpenMPRuntimeGPU.cpp:1513
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:86
CopyOptionsTy::ScratchpadWidth
llvm::Value * ScratchpadWidth
Definition: CGOpenMPRuntimeGPU.cpp:1516
clang::CodeGen::CodeGenModule::SetInternalFunctionAttributes
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
Definition: CodeGenModule.cpp:2311
clang::VarDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1530
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUWarpSize
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Definition: CGOpenMPRuntimeGPU.cpp:3666
clang::CodeGen::LValue::getAddress
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:341
emitReductionListCopy
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
Definition: CGOpenMPRuntimeGPU.cpp:1521
clang::CodeGen::CodeGenFunction::EmitLoadOfComplex
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
Definition: CGExprComplex.cpp:1318
clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
Definition: CGOpenMPRuntime.cpp:1366
clang::CudaArch::GFX906
@ GFX906
clang::QualType::getNonReferenceType
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6842
clang::CastExpr::getSubExpr
Expr * getSubExpr()
Definition: Expr.h:3532
clang::CudaArch::SM_90
@ SM_90
clang::CodeGen::CGOpenMPRuntime::setLocThreadIdInsertPt
void setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint=false)
Definition: CGOpenMPRuntime.cpp:1328
clang::CharUnits::QuantityType
int64_t QuantityType
Definition: CharUnits.h:40
clang::CodeGen::CodeGenTypeCache::getPointerAlign
CharUnits getPointerAlign() const
Definition: CodeGenTypeCache.h:120
clang::QualType
A (possibly-)qualified type.
Definition: Type.h:736
clang::CudaArch::GFX802
@ GFX802
clang::CudaArch::GFX940
@ GFX940
Attr.h
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelCall
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
Definition: CGOpenMPRuntimeGPU.cpp:1168
AttributeLangSupport::C
@ C
Definition: SemaDeclAttr.cpp:56
clang::CudaArch::SM_53
@ SM_53
clang::CodeGen::CodeGenModule::getContext
ASTContext & getContext() const
Definition: CodeGenModule.h:729
clang::QualType::getCanonicalType
QualType getCanonicalType() const
Definition: Type.h:6685
clang::FieldDecl
Represents a member of a struct/union/class.
Definition: Decl.h:2943
clang::CodeGen::CGOpenMPRuntime::OMPBuilder
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
Definition: CGOpenMPRuntime.h:312
clang::CodeGen::CGOpenMPRuntime::emitCriticalRegion
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
Definition: CGOpenMPRuntime.cpp:2176
clang::CodeGen::CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
Definition: CGOpenMPRuntimeGPU.cpp:3427
clang::CudaArch::GFX1034
@ GFX1034
clang::CodeGen::CodeGenFunction::createBasicBlock
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
Definition: CodeGenFunction.h:2429
clang::CudaArch::GFX1102
@ GFX1102
clang::CodeGen::CGOpenMPRuntimeGPU::Generic
@ Generic
Generic data-sharing mode.
Definition: CGOpenMPRuntimeGPU.h:339
clang::CodeGen::CGOpenMPRuntimeGPU::functionFinished
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntimeGPU.cpp:3393
DeclOpenMP.h
clang::CodeGen::CGBuilderTy::CreateStore
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:99
clang::CodeGen::LValue::setAddress
void setAddress(Address address)
Definition: CGValue.h:344
clang::StringToCudaArch
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:157
createRuntimeShuffleFunction
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
Definition: CGOpenMPRuntimeGPU.cpp:1380
clang::CodeGen::CodeGenTypes::arrangeBuiltinFunctionDeclaration
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:670
clang::TargetInfo::hasFeature
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1382
llvm::SmallPtrSet
Definition: ASTContext.h:55
clang::UnaryOperator
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2174
clang::CodeGen::CodeGenModule::getLangOpts
const LangOptions & getLangOpts() const
Definition: CodeGenModule.h:730
clang::OMPReductionClause
This represents clause 'reduction' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:3081
clang::CodeGen::CodeGenFunction::EmitLoadOfPointer
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
Definition: CGExpr.cpp:2540
clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
Definition: CGOpenMPRuntime.cpp:2114
clang::CudaArch::GFX1010
@ GFX1010
clang::CodeGen::CodeGenFunction::HaveInsertPoint
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
Definition: CodeGenFunction.h:2470
clang::CodeGen::CodeGenFunction::EmitLValueForField
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4343
clang::Decl::getAttr
T * getAttr() const
Definition: DeclBase.h:556
clang::CodeGen::CodeGenFunction::EmitStoreOfScalar
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3852
clang::CudaArch::SM_60
@ SM_60
clang::CodeGen::CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
Definition: CGOpenMPRuntimeGPU.cpp:3481
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelOutlinedFunction
llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Definition: CGOpenMPRuntimeGPU.cpp:908
clang::CudaArch::GFX1100
@ GFX1100
getTeamsReductionVars
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
Definition: CGOpenMPRuntimeGPU.cpp:955
clang::CodeGen::CGOpenMPRuntime::emitBarrierCall
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntime.cpp:2513
clang::CodeGen::CGBuilderTy
Definition: CGBuilder.h:45
clang::ASTContext::toBits
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
Definition: ASTContext.cpp:2557
clang::AttributeCommonInfo::AS_GNU
@ AS_GNU
attribute((...))
Definition: AttributeCommonInfo.h:27
clang::CodeGen::CGOpenMPRuntimeGPU::CUDA
@ CUDA
CUDA data sharing mode.
Definition: CGOpenMPRuntimeGPU.h:337
clang::ArrayType::Normal
@ Normal
Definition: Type.h:3024
clang::CodeGen::CGBuilderTy::CreateConstGEP
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:249
emitGlobalToListReduceFunction
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2523
clang::CodeGen::CodeGenFunction::Builder
CGBuilderTy Builder
Definition: CodeGenFunction.h:270
clang::ProtectedVisibility
@ ProtectedVisibility
Objects with "protected" visibility are seen by the dynamic linker but always dynamically resolve to ...
Definition: Visibility.h:41
clang::Type
The base class of the type hierarchy.
Definition: Type.h:1566
clang::FieldDecl::Create
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4290
clang::CodeGen::CodeGenTypeCache::VoidPtrTy
llvm::PointerType * VoidPtrTy
Definition: CodeGenTypeCache.h:56
clang::CodeGen::CGOpenMPRuntimeGPU
Definition: CGOpenMPRuntimeGPU.h:24
clang::Decl::attr_end
attr_iterator attr_end() const
Definition: DeclBase.h:526
clang::Type::hasSignedIntegerRepresentation
bool hasSignedIntegerRepresentation() const
Determine whether this type has an signed integer representation of some sort, e.g....
Definition: Type.cpp:2082
clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Definition: CodeGenFunction.h:2719
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumThreadsClause
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Definition: CGOpenMPRuntimeGPU.cpp:897
Offset
unsigned Offset
Definition: Format.cpp:2774
clang::CodeGen::CodeGenModule::getOpenMPRuntime
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
Definition: CodeGenModule.h:636
clang::CudaArch::GFX1011
@ GFX1011
clang::GlobalDecl
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
clang::CodeGen::AlignmentSource::Decl
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
clang::BlockDecl::captures
ArrayRef< Capture > captures() const
Definition: Decl.h:4459
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::isCXXThisExprCaptured
bool isCXXThisExprCaptured() const
Definition: CodeGenFunction.h:442
clang::CodeGen::CodeGenFunction::EmitLoadOfScalar
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3830
clang::CodeGen::CodeGenFunction::ComplexPairTy
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
Definition: CodeGenFunction.h:268
clang::CodeGen::CodeGenTypes::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Definition: CodeGenTypes.cpp:91
clang::CudaArch::SM_20
@ SM_20
clang::isOpenMPDistributeDirective
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
Definition: OpenMPKinds.cpp:679
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUNumThreads
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3645
clang::CodeGen::CodeGenFunction::CreateDefaultAlignTempAlloca
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:124
clang::CapturedStmt::capturesVariable
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1424
clang::ASTContext::getUIntPtrType
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Definition: ASTContext.cpp:6018
clang::Type::isReferenceType
bool isReferenceType() const
Definition: Type.h:6906
clang::CapturedStmt::getCapturedStmt
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3645
clang::CallExpr::getCallee
Expr * getCallee()
Definition: Expr.h:2962
V
#define V(N, I)
Definition: ASTContext.h:3211
clang::CodeGen::CGOpenMPRuntime::getOMPBuilder
llvm::OpenMPIRBuilder & getOMPBuilder()
Definition: CGOpenMPRuntime.h:306
clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:232
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3415
clang::CodeGen::TBAAAccessInfo
Definition: CodeGenTBAA.h:41
clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:176
getDistributeLastprivateVars
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ...
Definition: CGOpenMPRuntimeGPU.cpp:930
clang::CodeGen::CGOpenMPRuntimeGPU::DataSharingMode
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Definition: CGOpenMPRuntimeGPU.h:335
clang::CudaArch::SM_86
@ SM_86
clang::CudaArch::GFX702
@ GFX702
clang::CodeGen::EHScopeStack::Cleanup
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
StmtOpenMP.h
getDataSharingMode
static CGOpenMPRuntimeGPU::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:543
clang::QualType::getAddressSpace
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:6767
clang::OMPClause
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
clang::CodeGen::CGOpenMPRuntimeGPU::emitReduction
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
Definition: CGOpenMPRuntimeGPU.cpp:2856
clang::LambdaExpr
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1923
clang::OMPLoopDirective
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1003
clang::CudaArch::SM_72
@ SM_72
clang::CharUnits::fromQuantity
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
clang::CodeGen::CodeGenFunction::EmitCastToVoidPtr
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
Definition: CGExpr.cpp:55
clang::CudaArch::SM_87
@ SM_87
clang::AS_public
@ AS_public
Definition: Specifiers.h:112
clang::BlockExpr::getBlockDecl
const BlockDecl * getBlockDecl() const
Definition: Expr.h:5985
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumTeamsClause
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
Definition: CGOpenMPRuntimeGPU.cpp:903
clang::OMPRequiresDecl::clauselists
clauselist_range clauselists()
Definition: DeclOpenMP.h:441
clang::CodeGen::CGOpenMPRuntime::hasRequiresUnifiedSharedMemory
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
Definition: CGOpenMPRuntime.cpp:10510
clang::CodeGen::Address::getType
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:99
clang::getOpenMPCaptureRegions
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
Definition: OpenMPKinds.cpp:732
clang::ASTContext
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic analysis of a file.
Definition: ASTContext.h:182
clang::ASTContext::getSizeType
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
Definition: ASTContext.cpp:5980
clang::CodeGen::CodeGenFunction::OMPPrivateScope
The scope used to remap some variables as private in the OpenMP loop body (or other captured region emitted without outlining), and to restore old vars back on exit.
Definition: CodeGenFunction.h:1067
clang::CudaArch::GFX90c
@ GFX90c
clang::CodeGen::CGOpenMPRuntimeGPU::ExecutionMode
ExecutionMode
Defines the execution mode.
Definition: CGOpenMPRuntimeGPU.h:27
clang::CodeGen::CodeGenFunction::EmitRuntimeCall
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
clang::CodeGen::TEK_Complex
@ TEK_Complex
Definition: CodeGenFunction.h:109
clang::CodeGen::CodeGenTypeCache::Int16Ty
llvm::IntegerType * Int16Ty
Definition: CodeGenTypeCache.h:37
clang::CudaArch::GFX1012
@ GFX1012
clang::CodeGen::CodeGenFunction::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenFunction.h:2027
clang::CudaArch::GFX1103
@ GFX1103
clang::CodeGen::CodeGenFunction::getDebugInfo
CGDebugInfo * getDebugInfo()
Definition: CodeGenFunction.h:1986
clang::ASTContext::getAsVariableArrayType
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2700
clang::CudaArch::GFX1033
@ GFX1033
clang::CudaArch::GFX1031
@ GFX1031
clang::CudaArch::GFX90a
@ GFX90a
clang::CudaArch::SM_21
@ SM_21
clang::Decl::getCanonicalDecl
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:943
clang::OMPExecutableDirective::getInnermostCapturedStmt
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:543
clang::Stmt::children
child_range children()
Definition: Stmt.cpp:286
clang::IntegerLiteral::Create
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:983
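A sketch of typical use (makeOne is a hypothetical helper; this is the pattern the GPU runtime uses to build its default chunk expression of 1):
// Hypothetical helper: build the integer literal `1` of type int.
clang::IntegerLiteral *makeOne(clang::ASTContext &Ctx) {
  llvm::APInt One(Ctx.getTypeSize(Ctx.IntTy), 1);
  return clang::IntegerLiteral::Create(Ctx, One, Ctx.IntTy,
                                       clang::SourceLocation());
}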
clang::CodeGen::CodeGenModule::getModule
llvm::Module & getModule() const
Definition: CodeGenModule.h:739
clang::Type::isLValueReferenceType
bool isLValueReferenceType() const
Definition: Type.h:6910
clang::index::SymbolKind::Field
@ Field
clang::DeclRefExpr::getDecl
ValueDecl * getDecl()
Definition: Expr.h:1306
clang::ImplicitParamDecl
Definition: Decl.h:1657
clang::CodeGen::CGOpenMPRuntime::getDefaultFlagsForBarriers
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to be emitted.
Definition: CGOpenMPRuntime.cpp:2480
clang::CR_OpenMP
@ CR_OpenMP
Definition: CapturedStmt.h:19
clang::CudaArch::GFX1032
@ GFX1032
clang::CodeGen::LValue::getType
QualType getType() const
Definition: CGValue.h:278
llvm::SmallString
Definition: LLVM.h:37
clang::getLangASFromTargetAS
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:83
clang::ASTContext::getIntTypeForBitwidth
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth, signed/unsigned.
Definition: ASTContext.cpp:12042
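For example (makeUInt32 is a hypothetical helper):
// Hypothetical helper: an unsigned 32-bit integer QualType.
clang::QualType makeUInt32(clang::ASTContext &Ctx) {
  return Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
}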
clang::VarDecl
Represents a variable declaration or definition.
Definition: Decl.h:915
hasNestedSPMDDirective
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Definition: CGOpenMPRuntimeGPU.cpp:549
clang::Type::getPointeeType
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:629
clang::CodeGen::CGOpenMPRuntimeGPU::emitOutlinedFunctionCall
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to correct target-specific arguments.
Definition: CGOpenMPRuntimeGPU.cpp:3105
clang::CodeGen::CodeGenModule::getTypes
CodeGenTypes & getTypes()
Definition: CodeGenModule.h:756
clang::CudaArch::GFX904
@ GFX904
clang::CapturedStmt
This captures a statement into a function.
Definition: Stmt.h:3544
clang::Type::getAsCXXRecordDecl
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or because it is the injected-class-name type of a class template or class template partial specialization.
Definition: Type.cpp:1783
clang::TTK_Union
@ TTK_Union
The "union" keyword.
Definition: Type.h:5553
clang::CodeGen::CodeGenFunction::getVLASize
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements, of a variable length array type, plus that largest non-variably-sized element type.
Definition: CodeGenFunction.cpp:2172
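The common pattern, sketched as a hypothetical helper:
// Sketch: element count of a variable length array type, or null if
// Ty is not variably modified.
llvm::Value *getNumElts(clang::CodeGen::CodeGenFunction &CGF,
                        clang::QualType Ty) {
  if (const clang::VariableArrayType *VAT =
          CGF.getContext().getAsVariableArrayType(Ty))
    return CGF.getVLASize(VAT).NumElts;
  return nullptr;
}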
clang::Type::isVariablyModifiedType
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2326
clang::CapturedStmt::Capture
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3557
clang::CudaArch::SM_52
@ SM_52
clang::CodeGen::CGOpenMPRuntime::getDefaultDistScheduleAndChunk
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
Definition: CGOpenMPRuntime.h:1511
clang::CodeGen::CGOpenMPRuntime::getIdentTyPointerTy
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
Definition: CGOpenMPRuntime.cpp:1481
clang::CodeGen::CGOpenMPRuntime::ReductionOptionsTy
Definition: CGOpenMPRuntime.h:1208
clang::CodeGen::CodeGenFunction::getContext
ASTContext & getContext() const
Definition: CodeGenFunction.h:1985
clang::QualifierCollector
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6573
clang::CodeGen::CGOpenMPRuntime::getSingleCompoundChild
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is not evaluatable at the compile time.
Definition: CGOpenMPRuntime.cpp:6136
clang::CodeGen::CodeGenFunction::FinishFunction
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
Definition: CodeGenFunction.cpp:330
clang::CodeGen::Address
An aligned address.
Definition: Address.h:74
castValueToType
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
Definition: CGOpenMPRuntimeGPU.cpp:1350
Base
clang::CodeGen::CGBuilderTy::CreateGEP
Address CreateGEP(Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:264
clang::CodeGen::CodeGenFunction::getEvaluationKind
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
Definition: CodeGenFunction.cpp:219
clang::isOpenMPPrivate
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate', 'reduction' etc.
Definition: OpenMPKinds.cpp:696
clang::CodeGen::Address::getAddressSpace
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:110
clang::CastExpr::getCastKind
CastKind getCastKind() const
Definition: Expr.h:3526
clang::CudaArch::SM_61
@ SM_61
clang::isOpenMPTeamsDirective
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
Definition: OpenMPKinds.cpp:647
clang::CodeGen::CodeGenModule::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenModule.h:744
clang::ASTContext::getTargetAddressSpace
unsigned getTargetAddressSpace(LangAS AS) const
Definition: ASTContext.cpp:12314
clang::CudaArch::GFX803
@ GFX803
getNVPTXWarpID
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Definition: CGOpenMPRuntimeGPU.cpp:516
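Conceptually, the warp ID is the GPU thread ID shifted right by log2(warp size). A sketch consistent with that brief, approximating the definition cited above:
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  // Drop the lane bits; what remains is the warp index within the block.
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}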
clang::CodeGen::CodeGenTypeCache::Int32Ty
llvm::IntegerType * Int32Ty
Definition: CodeGenTypeCache.h:37
clang::CodeGen::CGOpenMPRuntimeGPU::translateParameter
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of the outlined function if this is required for the target.
Definition: CGOpenMPRuntimeGPU.cpp:3042
clang::LangAS
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
clang::CodeGen::CodeGenTypes::GetFunctionType
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for the given function info.
Definition: CGCall.cpp:1617
clang::Type::castAs
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:7464
clang::CodeGen::RegionCodeGenTy
Provides a way to call the simple version of codegen for an OpenMP region, or an advanced version with possible pre|post-actions in codegen.
Definition: CGOpenMPRuntime.h:68
clang::CodeGen::CodeGenFunction::StartFunction
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
Definition: CodeGenFunction.cpp:695
clang::CudaArch::LAST
@ LAST
clang::CodeGen::CodeGenFunction::VlaSizePair::NumElts
llvm::Value * NumElts
Definition: CodeGenFunction.h:2783
clang::CodeGen::LValue
LValue - This represents an lvalue reference.
Definition: CGValue.h:171
clang::CodeGen::CodeGenFunction::EHStack
EHScopeStack EHStack
Definition: CodeGenFunction.h:620
clang::TargetOptions::FeatureMap
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
clang::dependency_directives_scan::DirectiveKind
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner in its token output.
Definition: DependencyDirectivesScanner.h:59
clang::TargetInfo::getTargetOpts
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:280
clang::CodeGen::CodeGenFunction::MakeNaturalAlignAddrLValue
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
Definition: CodeGenFunction.cpp:190
clang::OMPExecutableDirective::getDirectiveKind
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:556
clang::LambdaExpr::isInitCapture
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1280
clang::CodeGen::CGOpenMPRuntimeGPU::processRequiresDirective
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntimeGPU.cpp:3526
clang::TagDecl::startDefinition
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4439
clang::ASTContext::VoidPtrTy
CanQualType VoidPtrTy
Definition: ASTContext.h:1104
clang::Decl::hasAttrs
bool hasAttrs() const
Definition: DeclBase.h:502
clang::CudaArch::SM_80
@ SM_80
clang::syntax::NodeRole::Size
@ Size
clang::CodeGen::Address::getPointer
llvm::Value * getPointer() const
Definition: Address.h:93
clang::CodeGen::CodeGenFunction
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code.
Definition: CodeGenFunction.h:232
clang::Stmt::IgnoreContainers
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts, and skip captured stmt at the top if IgnoreCaptured is true.
Definition: Stmt.cpp:196
clang::ASTContext::getDeclAlign
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
Definition: ASTContext.cpp:1783
clang::CudaArch::GFX909
@ GFX909
clang::CudaArch::SM_30
@ SM_30
clang::LCK_ByRef
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
clang::NamedDecl::getIdentifier
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:268
clang::ValueDecl
Represent the declaration of a variable (in which case it is an lvalue), a function (in which case it is a function designator), or an enum constant.
Definition: Decl.h:703
clang::RecordDecl::completeDefinition
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:4779
clang::CodeGen::CGOpenMPRuntime::processRequiresDirective
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntime.cpp:10457
clang::CharUnits::isZero
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
clang::DeclContext::addDecl
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1607
clang::CodeGen::CodeGenModule
This class organizes the cross-function state that is used while generating LLVM code.
Definition: CodeGenModule.h:280
clang::LambdaExpr::captures
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1293
clang::BlockExpr
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:5973
llvm::ArrayRef
Definition: LLVM.h:34
clang::LangAS::opencl_global
@ opencl_global
clang::Decl::attr_begin
attr_iterator attr_begin() const
Definition: DeclBase.h:523
clang::Decl
Decl - This represents one declaration (or definition), e.g. a variable, typedef, function, struct, etc.
Definition: DeclBase.h:83
clang::CudaArch::GFX600
@ GFX600
OpenMPClause.h
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::getKind
CapturedRegionKind getKind() const
Definition: CodeGenFunction.h:431
clang::Expr::IgnoreParenImpCasts
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3036
clang::OMPExecutableDirective::getAssociatedStmt
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:521
CopyOptionsTy::ScratchpadIndex
llvm::Value * ScratchpadIndex
Definition: CGOpenMPRuntimeGPU.cpp:1515
emitListToGlobalReduceFunction
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided global buffer for the reduction variables.
Definition: CGOpenMPRuntimeGPU.cpp:2313
clang::CodeGen::Address::invalid
static Address invalid()
Definition: Address.h:90
clang::ValueDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5051
clang::CudaArchToString
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:139
clang::CodeGen::FunctionArgList
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:353
StmtVisitor.h
clang::CudaArch::UNKNOWN
@ UNKNOWN
clang::CodeGen::CodeGenFunction::getTypes
CodeGenTypes & getTypes() const
Definition: CodeGenFunction.h:1984
clang::CodeGen::CodeGenFunction::ConvertType
llvm::Type * ConvertType(QualType T)
Definition: CodeGenFunction.cpp:215
clang::CudaArch::SM_32
@ SM_32
clang::OMPExecutableDirective::hasAssociatedStmt
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:518
clang::CodeGen::NormalAndEHCleanup
@ NormalAndEHCleanup
Definition: EHScopeStack.h:86
clang::CudaArch::GFX900
@ GFX900
getNVPTXLaneID
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
Definition: CGOpenMPRuntimeGPU.cpp:527
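The complementary computation: the lane ID is the GPU thread ID masked by (warp size - 1). A sketch consistent with that brief, approximating the definition cited above:
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  // Keep only the low lane bits, i.e. thread id % warp size.
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}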
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3398
clang::CodeGen::Address::getElementType
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:104
clang::CodeGen::CodeGenFunction::MakeAddrLValue
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
Definition: CodeGenFunction.h:2491
clang::CudaArch::GFX801
@ GFX801
clang::ConstStmtVisitor
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:193
clang::CodeGen::CGOpenMPRuntime::emitReductionFunction
llvm::Function * emitReductionFunction(SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
Definition: CGOpenMPRuntime.cpp:5007
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUThreadID
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3658
clang::ASTContext::getTargetInfo
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:743
clang::CodeGen::LValue::getPointer
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:337
clang::CodeGen::CGBuilderTy::CreateElementBitCast
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the alignment.
Definition: CGBuilder.h:168
CopyOptionsTy::RemoteLaneOffset
llvm::Value * RemoteLaneOffset
Definition: CGOpenMPRuntimeGPU.cpp:1514
clang::OMPExecutableDirective
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
clang::CudaArch::GFX703
@ GFX703
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::CodeGen::AlignmentSource::Type
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
clang::DeclStmt
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1311
clang::CodeGen::CGFunctionInfo
CGFunctionInfo - Class to encapsulate the information about a function definition.
Definition: CGFunctionInfo.h:546
clang::isOpenMPLoopBoundSharingDirective
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop bound sharing.
Definition: OpenMPKinds.cpp:711
clang::PointerType
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2774
clang::CodeGen::CodeGenFunction::EmitStoreOfComplex
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
Definition: CGExprComplex.cpp:1312
clang::CodeGen::CGOpenMPRuntimeGPU::clear
void clear() override
Definition: CGOpenMPRuntimeGPU.cpp:3609
clang::LangAS::cuda_constant
@ cuda_constant
clang::CudaArch::GFX705
@ GFX705
clang::LangAS::Default
@ Default
Cuda.h
clang::CodeGen::CGBuilderTy::CreateConstArrayGEP
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]*, produce name = getelementptr inbounds addr, i64 0, i64 Index.
Definition: CGBuilder.h:213
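Usage sketch (assumes a CodeGenFunction &CGF and an Address ArrayAddr holding an [n x T] array in scope):
// Sketch: address of element 0 of an in-memory [n x T] array.
clang::CodeGen::Address Elt =
    CGF.Builder.CreateConstArrayGEP(ArrayAddr, /*Index=*/0, "elt0");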
clang::CodeGen::CodeGenFunction::CGM
CodeGenModule & CGM
Definition: CodeGenFunction.h:262
clang::TargetInfo::getGridValue
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1528
clang::ImplicitParamDecl::Create
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5059
clang::CodeGen::TEK_Scalar
@ TEK_Scalar
Definition: CodeGenFunction.h:108
clang::ASTContext::getAddrSpaceQualType
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type and address space.
Definition: ASTContext.cpp:3142
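A sketch combining this with getPointerType (makeASPtr is a hypothetical helper; the address space is the caller's choice):
// Hypothetical helper: pointer to T qualified with address space AS.
clang::QualType makeASPtr(clang::ASTContext &Ctx, clang::QualType T,
                          clang::LangAS AS) {
  return Ctx.getPointerType(Ctx.getAddrSpaceQualType(T, AS));
}
// e.g. makeASPtr(Ctx, Ctx.VoidPtrTy, clang::LangAS::cuda_constant)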
clang::Builtin::ID
ID
Definition: Builtins.h:64
clang::CodeGen::CodeGenFunction::CurFn
llvm::Function * CurFn
Definition: CodeGenFunction.h:326
clang::OMPExecutableDirective::getBeginLoc
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:489
clang::UnaryOperator::getSubExpr
Expr * getSubExpr() const
Definition: Expr.h:2221
clang::Expr::IgnoreParens
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3032
clang
Definition: CalledOnceCheck.h:17
clang::LangOptions::OMPHostIRFile
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:460
clang::CodeGen::CGOpenMPRuntimeGPU::getParameterAddress
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument based on the address of the target-specific parameter.
Definition: CGOpenMPRuntimeGPU.cpp:3074
emitGlobalToListCopyFunction
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided global buffer for the reduction variables.
Definition: CGOpenMPRuntimeGPU.cpp:2410
clang::CodeGen::CodeGenModule::addCompilerUsedGlobal
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
Definition: CodeGenModule.cpp:2521
clang::CudaArch::GFX1036
@ GFX1036
clang::CodeGen::CodeGenFunction::EmitAggregateCopy
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:2092
clang::OpenMPDistScheduleClauseKind
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
clang::CudaArch::GFX902
@ GFX902
clang::ExternalLinkage
@ ExternalLinkage
External linkage, which indicates that the entity can be referred to from other translation units.
Definition: Linkage.h:59
clang::Stmt
Stmt - This represents one statement.
Definition: Stmt.h:72
clang::UnaryOperator::getOpcode
Opcode getOpcode() const
Definition: Expr.h:2216
clang::CodeGen::CGOpenMPRuntime::OffloadEntriesInfoManager
llvm::OffloadEntriesInfoManager OffloadEntriesInfoManager
Entity that registers the offloading constants that were emitted so far.
Definition: CGOpenMPRuntime.h:513
clang::CodeGen::CGOpenMPRuntime::clear
virtual void clear()
Definition: CGOpenMPRuntime.cpp:1074
clang::CodeGen::CGOpenMPRuntimeGPU::emitCriticalRegion
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
Definition: CGOpenMPRuntimeGPU.cpp:1280
clang::CodeGen::CGOpenMPRuntimeGPU::emitProcBindClause
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generate code for the 'proc_bind' clause.
Definition: CGOpenMPRuntimeGPU.cpp:887
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction
llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams directive.
Definition: CGOpenMPRuntimeGPU.cpp:965
clang::CudaArch::GFX1101
@ GFX1101
clang::CodeGen::CodeGenFunction::CapturedStmtInfo
CGCapturedStmtInfo * CapturedStmtInfo
Definition: CodeGenFunction.h:477
clang::CodeGen::CGOpenMPRuntime::emitTeamsOutlinedFunction
virtual llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
Definition: CGOpenMPRuntime.cpp:1273
clang::CodeGen::CodeGenFunction::EmitLoadOfReferenceLValue
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2531
clang::CudaArch::SM_37
@ SM_37
clang::CudaArch::GFX602
@ GFX602
clang::CudaArch::SM_50
@ SM_50
clang::Expr::getType
QualType getType() const
Definition: Expr.h:142
clang::Attr
Attr - This represents one attribute.
Definition: Attr.h:40
clang::Qualifiers::addAddressSpace
void addAddressSpace(LangAS space)
Definition: Type.h:403
clang::CudaArch::GFX1013
@ GFX1013
clang::CodeGen::RegionCodeGenTy::setAction
void setAction(PrePostActionTy &Action) const
Definition: CGOpenMPRuntime.h:89
clang::OMPRequiresDecl
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:416
clang::CudaArch
CudaArch
Definition: Cuda.h:49
clang::CodeGen::ApplyDebugLocation::CreateEmpty
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:846
clang::Decl::getAttrs
AttrVec & getAttrs()
Definition: DeclBase.h:508
clang::CudaArch::Generic
@ Generic
clang::ImplicitCastExpr
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct representation in the original source code.
Definition: Expr.h:3626
clang::CudaArch::GFX810
@ GFX810
clang::CodeGen::CodeGenFunction::EmitLValue
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
Definition: CGExpr.cpp:1273
clang::CudaArch::UNUSED
@ UNUSED
clang::OpenMPDirectiveKind
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
clang::ASTContext::getPointerType
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
Definition: ASTContext.cpp:3389
clang::OMPExecutableDirective::clauses
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:573
clang::CharUnits
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
clang::isOpenMPTargetExecutionDirective
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
Definition: OpenMPKinds.cpp:623
clang::CodeGen::CGOpenMPRuntimeGPU::emitFunctionProlog
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
Definition: CGOpenMPRuntimeGPU.cpp:3262
clang::CudaArch::SM_62
@ SM_62
llvm::SmallVectorImpl
Definition: Randstruct.h:18
emitInterWarpCopyFunction
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to lanes in the first warp.
Definition: CGOpenMPRuntimeGPU.cpp:1739
clang::CallExpr::arguments
arg_range arguments()
Definition: Expr.h:3051
clang::SC_None
@ SC_None
Definition: Specifiers.h:238
clang::ValueDecl::getType
QualType getType() const
Definition: Decl.h:714
clang::Type::isIntegerType
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7223
clang::CodeGen::CodeGenFunction::CreateMemTemp
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment, and cast it to the default address space.
Definition: CGExpr.cpp:136
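Typical use, assuming a CodeGenFunction &CGF in scope (the temporary's name is illustrative):
// Sketch: a stack temporary for a void* value; getPointer() yields the
// underlying alloca, already cast as the brief above describes.
clang::CodeGen::Address Tmp =
    CGF.CreateMemTemp(CGF.getContext().VoidPtrTy, "red_list");
llvm::Value *Raw = Tmp.getPointer();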
clang::Expr
This represents one expression.
Definition: Expr.h:110
clang::OpenMPScheduleClauseKind
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
clang::CodeGen::CGOpenMPRuntime::CGM
CodeGenModule & CGM
Definition: CGOpenMPRuntime.h:309
clang::CapturedStmt::getCapturedDecl
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1400
clang::ParmVarDecl::Create
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2842
clang::CodeGen::CodeGenTypeCache::Int8Ty
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Definition: CodeGenTypeCache.h:37
clang::BlockDecl::Capture
A class which contains all the information about a particular captured value.
Definition: Decl.h:4338
shuffleAndStore
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
Definition: CGOpenMPRuntimeGPU.cpp:1413
clang::OMPExecutableDirective::getCapturedStmt
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:534
getPrivateItem
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
Definition: SemaOpenMP.cpp:5344
clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy
llvm::PointerType * VoidPtrPtrTy
Definition: CodeGenTypeCache.h:62
getCudaArch
static CudaArch getCudaArch(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:3511
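This helper resolves the architecture from the enabled target features. A sketch consistent with that use, approximating the definition cited above:
static CudaArch getCudaArch(CodeGenModule &CGM) {
  // Without the "ptx" feature there is no sm_xx feature to map back.
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      CudaArch Arch = StringToCudaArch(Feature.getKey());
      if (Arch != CudaArch::UNKNOWN)
        return Arch;
    }
  }
  return CudaArch::UNKNOWN;
}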
clang::Decl::getLocation
SourceLocation getLocation() const
Definition: DeclBase.h:432
clang::format::Base
Base
Definition: IntegerLiteralSeparatorFixer.cpp:20
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsCall
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record whose address is stored in CapturedStruct.
Definition: CGOpenMPRuntimeGPU.cpp:1150
clang::Decl::isCanonicalDecl
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:949
clang::DeclRefExpr
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1234
clang::CudaArch::SM_75
@ SM_75
clang::RecordDecl
Represents a struct/union/class.
Definition: Decl.h:3996
clang::CodeGen::CGOpenMPRuntime::emitReduction
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit code for the reduction clause.
Definition: CGOpenMPRuntime.cpp:5117
clang::CallExpr
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2812
clang::CodeGen::CodeGenFunction::EmitBlock
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:564
clang::CudaArch::GFX701
@ GFX701
clang::CodeGen::CodeGenFunction::EmitLValueForFieldInitialization
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference, this will return the address of the reference and not the address of the value stored in the reference.
Definition: CGExpr.cpp:4512
clang::CodeGen::CodeGenModule::Error
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
Definition: CodeGenModule.cpp:1082
clang::CodeGen::CodeGenModule::getLLVMContext
llvm::LLVMContext & getLLVMContext()
Definition: CodeGenModule.h:750
clang::CharUnits::getQuantity
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
clang::CudaArch::GFX1030
@ GFX1030
clang::CodeGen::PrePostActionTy
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
Definition: CGOpenMPRuntime.h:58
clang::ICIS_NoInit
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:260
clang::specific_attr_iterator
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are of a specific type.
Definition: AttrIterator.h:33
CGOpenMPRuntimeGPU.h
clang::CudaArch::GFX700
@ GFX700
clang::CodeGen::CGOpenMPRuntimeGPU::EM_Unknown
@ EM_Unknown
Unknown execution mode (orphaned directive).
Definition: CGOpenMPRuntimeGPU.h:33
clang::CodeGen::CGOpenMPRuntime::functionFinished
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntime.cpp:1460
clang::ImplicitParamDecl::Other
@ Other
Other implicit parameter.
Definition: Decl.h:1684
clang::CodeGen::CGOpenMPRuntimeGPU::getAddressOfLocalVariable
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Definition: CGOpenMPRuntimeGPU.cpp:3324
clang::CodeGen::CodeGenFunction::EmitBranch
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:584
clang::Decl::getDeclContext
DeclContext * getDeclContext()
Definition: DeclBase.h:441
clang::CodeGen::AggValueSlot::DoesNotOverlap
@ DoesNotOverlap
Definition: CGValue.h:553
clang::CudaArch::SM_89
@ SM_89
clang::NamedDecl::getName
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:274