clang  16.0.0git
CGOpenMPRuntimeGPU.cpp
1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/StmtOpenMP.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "clang/Basic/Cuda.h"
21 #include "llvm/ADT/SmallPtrSet.h"
22 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
23 #include "llvm/Support/MathExtras.h"
24 
25 using namespace clang;
26 using namespace CodeGen;
27 using namespace llvm::omp;
28 
29 namespace {
30 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31 class NVPTXActionTy final : public PrePostActionTy {
32  llvm::FunctionCallee EnterCallee = nullptr;
33  ArrayRef<llvm::Value *> EnterArgs;
34  llvm::FunctionCallee ExitCallee = nullptr;
35  ArrayRef<llvm::Value *> ExitArgs;
36  bool Conditional = false;
37  llvm::BasicBlock *ContBlock = nullptr;
38 
39 public:
40  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
41  ArrayRef<llvm::Value *> EnterArgs,
42  llvm::FunctionCallee ExitCallee,
43  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45  ExitArgs(ExitArgs), Conditional(Conditional) {}
46  void Enter(CodeGenFunction &CGF) override {
47  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48  if (Conditional) {
49  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51  ContBlock = CGF.createBasicBlock("omp_if.end");
52  // Generate the branch (If-stmt)
53  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54  CGF.EmitBlock(ThenBlock);
55  }
56  }
57  void Done(CodeGenFunction &CGF) {
58  // Emit the rest of blocks/branches
59  CGF.EmitBranch(ContBlock);
60  CGF.EmitBlock(ContBlock, true);
61  }
62  void Exit(CodeGenFunction &CGF) override {
63  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
64  }
65 };
66 
67 /// A class to track the execution mode when codegening directives within
68 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69 /// to the target region and used by containing directives such as 'parallel'
70 /// to emit optimized code.
71 class ExecutionRuntimeModesRAII {
72 private:
73  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
74  CGOpenMPRuntimeGPU::EM_Unknown;
75  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
76 
77 public:
78  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
79  CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
80  : ExecMode(ExecMode) {
81  SavedExecMode = ExecMode;
82  ExecMode = EntryMode;
83  }
84  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
85 };
86 
87 /// GPU Configuration: This information can be derived from CUDA registers;
88 /// however, providing compile-time constants helps generate more efficient
89 /// code. For all practical purposes this is fine because the configuration
90 /// is the same for all known NVPTX architectures.
91 enum MachineConfiguration : unsigned {
92  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
93  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
94 
95  /// Global memory alignment for performance.
96  GlobalMemoryAlignment = 128,
97 
98  /// Maximal size of the shared memory buffer.
99  SharedMemorySize = 128,
100 };
101 
102 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
103  RefExpr = RefExpr->IgnoreParens();
104  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
105  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
106  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
107  Base = TempASE->getBase()->IgnoreParenImpCasts();
108  RefExpr = Base;
109  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
110  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
111  while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
112  Base = TempOASE->getBase()->IgnoreParenImpCasts();
113  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
114  Base = TempASE->getBase()->IgnoreParenImpCasts();
115  RefExpr = Base;
116  }
117  RefExpr = RefExpr->IgnoreParenImpCasts();
118  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
119  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
120  const auto *ME = cast<MemberExpr>(RefExpr);
121  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
122 }
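// Illustrative note (editorial, not part of the upstream file): given a
// clause such as `reduction(+ : a[3], b[1:4], s.x)`, getPrivateItem() above
// strips the subscript/array-section syntax and returns the canonical
// declarations of `a`, `b`, and the member `x`, respectively.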
123 
124 
125 static RecordDecl *buildRecordForGlobalizedVars(
126  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
127  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
128  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
129  &MappedDeclsFields, int BufSize) {
130  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
131  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
132  return nullptr;
133  SmallVector<VarsDataTy, 4> GlobalizedVars;
134  for (const ValueDecl *D : EscapedDecls)
135  GlobalizedVars.emplace_back(
136  CharUnits::fromQuantity(std::max(
137  C.getDeclAlign(D).getQuantity(),
138  static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
139  D);
140  for (const ValueDecl *D : EscapedDeclsForTeams)
141  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
142  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
143  return L.first > R.first;
144  });
145 
146  // Build struct _globalized_locals_ty {
147  // /* globalized vars */[WarpSize] align (max(decl_align,
148  // GlobalMemoryAlignment))
149  // /* globalized vars */ for EscapedDeclsForTeams
150  // };
151  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
152  GlobalizedRD->startDefinition();
153  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
154  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
155  for (const auto &Pair : GlobalizedVars) {
156  const ValueDecl *VD = Pair.second;
157  QualType Type = VD->getType();
158  if (VD->getType()->isVariablyModifiedType())
159  Type = C.getPointerType(Type.getNonReferenceType());
160  else
161  Type = Type.getNonReferenceType();
162  SourceLocation Loc = VD->getLocation();
163  FieldDecl *Field;
164  if (SingleEscaped.count(VD)) {
165  Field = FieldDecl::Create(
166  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
167  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
168  /*BW=*/nullptr, /*Mutable=*/false,
169  /*InitStyle=*/ICIS_NoInit);
170  Field->setAccess(AS_public);
171  if (VD->hasAttrs()) {
172  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
173  E(VD->getAttrs().end());
174  I != E; ++I)
175  Field->addAttr(*I);
176  }
177  } else {
178  llvm::APInt ArraySize(32, BufSize);
179  Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
180  0);
181  Field = FieldDecl::Create(
182  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
183  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
184  /*BW=*/nullptr, /*Mutable=*/false,
185  /*InitStyle=*/ICIS_NoInit);
186  Field->setAccess(AS_public);
187  llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
188  static_cast<CharUnits::QuantityType>(
189  GlobalMemoryAlignment)));
190  Field->addAttr(AlignedAttr::CreateImplicit(
191  C, /*IsAlignmentExpr=*/true,
192  IntegerLiteral::Create(C, Align,
193  C.getIntTypeForBitwidth(32, /*Signed=*/0),
194  SourceLocation()),
195  {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
196  }
197  GlobalizedRD->addDecl(Field);
198  MappedDeclsFields.try_emplace(VD, Field);
199  }
200  GlobalizedRD->completeDefinition();
201  return GlobalizedRD;
202 }
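// A sketch of the record built above, assuming a single escaped
// parallel-region `int x` and BufSize == WarpSize == 32: EscapedDecls entries
// get one slot per lane, while EscapedDeclsForTeams entries keep their scalar
// form.
//
//   struct _globalized_locals_ty {
//     int x[32] __attribute__((aligned(128))); // GlobalMemoryAlignment
//   };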
203 
204 /// Get the list of variables that can escape their declaration context.
205 class CheckVarsEscapingDeclContext final
206  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
207  CodeGenFunction &CGF;
208  llvm::SetVector<const ValueDecl *> EscapedDecls;
209  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
210  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
211  RecordDecl *GlobalizedRD = nullptr;
212  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
213  bool AllEscaped = false;
214  bool IsForCombinedParallelRegion = false;
215 
216  void markAsEscaped(const ValueDecl *VD) {
217  // Do not globalize declare target variables.
218  if (!isa<VarDecl>(VD) ||
219  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
220  return;
221  VD = cast<ValueDecl>(VD->getCanonicalDecl());
222  // Use user-specified allocation.
223  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
224  return;
225  // Variables captured by value must be globalized.
226  if (auto *CSI = CGF.CapturedStmtInfo) {
227  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
228  // Check if we need to capture the variable that was already captured by
229  // value in the outer region.
230  if (!IsForCombinedParallelRegion) {
231  if (!FD->hasAttrs())
232  return;
233  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
234  if (!Attr)
235  return;
236  if (((Attr->getCaptureKind() != OMPC_map) &&
237  !isOpenMPPrivate(Attr->getCaptureKind())) ||
238  ((Attr->getCaptureKind() == OMPC_map) &&
239  !FD->getType()->isAnyPointerType()))
240  return;
241  }
242  if (!FD->getType()->isReferenceType()) {
243  assert(!VD->getType()->isVariablyModifiedType() &&
244  "Parameter captured by value with variably modified type");
245  EscapedParameters.insert(VD);
246  } else if (!IsForCombinedParallelRegion) {
247  return;
248  }
249  }
250  }
251  if ((!CGF.CapturedStmtInfo ||
252  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
253  VD->getType()->isReferenceType())
254  // Do not globalize variables with reference type.
255  return;
256  if (VD->getType()->isVariablyModifiedType())
257  EscapedVariableLengthDecls.insert(VD);
258  else
259  EscapedDecls.insert(VD);
260  }
261 
262  void VisitValueDecl(const ValueDecl *VD) {
263  if (VD->getType()->isLValueReferenceType())
264  markAsEscaped(VD);
265  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
266  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
267  const bool SavedAllEscaped = AllEscaped;
268  AllEscaped = VD->getType()->isLValueReferenceType();
269  Visit(VarD->getInit());
270  AllEscaped = SavedAllEscaped;
271  }
272  }
273  }
274  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
275  ArrayRef<OMPClause *> Clauses,
276  bool IsCombinedParallelRegion) {
277  if (!S)
278  return;
279  for (const CapturedStmt::Capture &C : S->captures()) {
280  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
281  const ValueDecl *VD = C.getCapturedVar();
282  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
283  if (IsCombinedParallelRegion) {
284  // Check if the variable is privatized in the combined construct and
285  // those private copies must be shared in the inner parallel
286  // directive.
287  IsForCombinedParallelRegion = false;
288  for (const OMPClause *C : Clauses) {
289  if (!isOpenMPPrivate(C->getClauseKind()) ||
290  C->getClauseKind() == OMPC_reduction ||
291  C->getClauseKind() == OMPC_linear ||
292  C->getClauseKind() == OMPC_private)
293  continue;
294  ArrayRef<const Expr *> Vars;
295  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
296  Vars = PC->getVarRefs();
297  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
298  Vars = PC->getVarRefs();
299  else
300  llvm_unreachable("Unexpected clause.");
301  for (const auto *E : Vars) {
302  const Decl *D =
303  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
304  if (D == VD->getCanonicalDecl()) {
305  IsForCombinedParallelRegion = true;
306  break;
307  }
308  }
309  if (IsForCombinedParallelRegion)
310  break;
311  }
312  }
313  markAsEscaped(VD);
314  if (isa<OMPCapturedExprDecl>(VD))
315  VisitValueDecl(VD);
316  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
317  }
318  }
319  }
320 
321  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
322  assert(!GlobalizedRD &&
323  "Record for globalized variables is built already.");
324  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
325  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
326  if (IsInTTDRegion)
327  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
328  else
329  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
330  GlobalizedRD = ::buildRecordForGlobalizedVars(
331  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
332  MappedDeclsFields, WarpSize);
333  }
334 
335 public:
336  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
337  ArrayRef<const ValueDecl *> TeamsReductions)
338  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
339  }
340  virtual ~CheckVarsEscapingDeclContext() = default;
341  void VisitDeclStmt(const DeclStmt *S) {
342  if (!S)
343  return;
344  for (const Decl *D : S->decls())
345  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
346  VisitValueDecl(VD);
347  }
348  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
349  if (!D)
350  return;
351  if (!D->hasAssociatedStmt())
352  return;
353  if (const auto *S =
354  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
355  // Do not analyze directives that do not actually require capturing,
356  // like `omp for` or `omp simd` directives.
357  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
358  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
359  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
360  VisitStmt(S->getCapturedStmt());
361  return;
362  }
363  VisitOpenMPCapturedStmt(
364  S, D->clauses(),
365  CaptureRegions.back() == OMPD_parallel &&
366  isOpenMPDistributeDirective(D->getDirectiveKind()));
367  }
368  }
369  void VisitCapturedStmt(const CapturedStmt *S) {
370  if (!S)
371  return;
372  for (const CapturedStmt::Capture &C : S->captures()) {
373  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
374  const ValueDecl *VD = C.getCapturedVar();
375  markAsEscaped(VD);
376  if (isa<OMPCapturedExprDecl>(VD))
377  VisitValueDecl(VD);
378  }
379  }
380  }
381  void VisitLambdaExpr(const LambdaExpr *E) {
382  if (!E)
383  return;
384  for (const LambdaCapture &C : E->captures()) {
385  if (C.capturesVariable()) {
386  if (C.getCaptureKind() == LCK_ByRef) {
387  const ValueDecl *VD = C.getCapturedVar();
388  markAsEscaped(VD);
389  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
390  VisitValueDecl(VD);
391  }
392  }
393  }
394  }
395  void VisitBlockExpr(const BlockExpr *E) {
396  if (!E)
397  return;
398  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
399  if (C.isByRef()) {
400  const VarDecl *VD = C.getVariable();
401  markAsEscaped(VD);
402  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
403  VisitValueDecl(VD);
404  }
405  }
406  }
407  void VisitCallExpr(const CallExpr *E) {
408  if (!E)
409  return;
410  for (const Expr *Arg : E->arguments()) {
411  if (!Arg)
412  continue;
413  if (Arg->isLValue()) {
414  const bool SavedAllEscaped = AllEscaped;
415  AllEscaped = true;
416  Visit(Arg);
417  AllEscaped = SavedAllEscaped;
418  } else {
419  Visit(Arg);
420  }
421  }
422  Visit(E->getCallee());
423  }
424  void VisitDeclRefExpr(const DeclRefExpr *E) {
425  if (!E)
426  return;
427  const ValueDecl *VD = E->getDecl();
428  if (AllEscaped)
429  markAsEscaped(VD);
430  if (isa<OMPCapturedExprDecl>(VD))
431  VisitValueDecl(VD);
432  else if (VD->isInitCapture())
433  VisitValueDecl(VD);
434  }
435  void VisitUnaryOperator(const UnaryOperator *E) {
436  if (!E)
437  return;
438  if (E->getOpcode() == UO_AddrOf) {
439  const bool SavedAllEscaped = AllEscaped;
440  AllEscaped = true;
441  Visit(E->getSubExpr());
442  AllEscaped = SavedAllEscaped;
443  } else {
444  Visit(E->getSubExpr());
445  }
446  }
447  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
448  if (!E)
449  return;
450  if (E->getCastKind() == CK_ArrayToPointerDecay) {
451  const bool SavedAllEscaped = AllEscaped;
452  AllEscaped = true;
453  Visit(E->getSubExpr());
454  AllEscaped = SavedAllEscaped;
455  } else {
456  Visit(E->getSubExpr());
457  }
458  }
459  void VisitExpr(const Expr *E) {
460  if (!E)
461  return;
462  bool SavedAllEscaped = AllEscaped;
463  if (!E->isLValue())
464  AllEscaped = false;
465  for (const Stmt *Child : E->children())
466  if (Child)
467  Visit(Child);
468  AllEscaped = SavedAllEscaped;
469  }
470  void VisitStmt(const Stmt *S) {
471  if (!S)
472  return;
473  for (const Stmt *Child : S->children())
474  if (Child)
475  Visit(Child);
476  }
477 
478  /// Returns the record that handles all the escaped local variables and is
479  /// used instead of their original storage.
480  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
481  if (!GlobalizedRD)
482  buildRecordForGlobalizedVars(IsInTTDRegion);
483  return GlobalizedRD;
484  }
485 
486  /// Returns the field in the globalized record for the escaped variable.
487  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
488  assert(GlobalizedRD &&
489  "Record for globalized variables must be generated already.");
490  auto I = MappedDeclsFields.find(VD);
491  if (I == MappedDeclsFields.end())
492  return nullptr;
493  return I->getSecond();
494  }
495 
496  /// Returns the list of the escaped local variables/parameters.
497  ArrayRef<const ValueDecl *> getEscapedDecls() const {
498  return EscapedDecls.getArrayRef();
499  }
500 
501  /// Returns the set of escaped local variables that are actually parameters
502  /// passed by value.
503  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
504  return EscapedParameters;
505  }
506 
507  /// Returns the list of the escaped variables with the variably modified
508  /// types.
509  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
510  return EscapedVariableLengthDecls.getArrayRef();
511  }
512 };
513 } // anonymous namespace
514 
515 /// Get the id of the warp in the block.
516 /// We assume that the warp size is 32, which is always the case
517 /// on the NVPTX device, to generate more efficient code.
518 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
519  CGBuilderTy &Bld = CGF.Builder;
520  unsigned LaneIDBits =
521  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
522  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
523  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
524 }
525 
526 /// Get the id of the current lane in the Warp.
527 /// We assume that the warp size is 32, which is always the case
528 /// on the NVPTX device, to generate more efficient code.
529 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
530  CGBuilderTy &Bld = CGF.Builder;
531  unsigned LaneIDBits =
532  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
533  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
534  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
535  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
536  "nvptx_lane_id");
537 }
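// Worked example (editorial): with GV_Warp_Size == 32, LaneIDBits == 5 and
// LaneIDMask == 0x1f, so a thread with id 37 computes
//   warp id = 37 >> 5 == 1   and   lane id = 37 & 0x1f == 5.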
538 
539 CGOpenMPRuntimeGPU::ExecutionMode
540 CGOpenMPRuntimeGPU::getExecutionMode() const {
541  return CurrentExecutionMode;
542 }
543 
544 CGOpenMPRuntimeGPU::DataSharingMode
545 CGOpenMPRuntimeGPU::getDataSharingMode(CodeGenModule &CGM) {
546  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
547  : CGOpenMPRuntimeGPU::Generic;
548 }
549 
550 /// Check for inner (nested) SPMD construct, if any
551 static bool hasNestedSPMDDirective(ASTContext &Ctx,
552  const OMPExecutableDirective &D) {
553  const auto *CS = D.getInnermostCapturedStmt();
554  const auto *Body =
555  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
556  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
557 
558  if (const auto *NestedDir =
559  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
560  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
561  switch (D.getDirectiveKind()) {
562  case OMPD_target:
563  if (isOpenMPParallelDirective(DKind))
564  return true;
565  if (DKind == OMPD_teams) {
566  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
567  /*IgnoreCaptured=*/true);
568  if (!Body)
569  return false;
570  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
571  if (const auto *NND =
572  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
573  DKind = NND->getDirectiveKind();
574  if (isOpenMPParallelDirective(DKind))
575  return true;
576  }
577  }
578  return false;
579  case OMPD_target_teams:
580  return isOpenMPParallelDirective(DKind);
581  case OMPD_target_simd:
582  case OMPD_target_parallel:
583  case OMPD_target_parallel_for:
584  case OMPD_target_parallel_for_simd:
585  case OMPD_target_teams_distribute:
586  case OMPD_target_teams_distribute_simd:
587  case OMPD_target_teams_distribute_parallel_for:
588  case OMPD_target_teams_distribute_parallel_for_simd:
589  case OMPD_parallel:
590  case OMPD_for:
591  case OMPD_parallel_for:
592  case OMPD_parallel_master:
593  case OMPD_parallel_sections:
594  case OMPD_for_simd:
595  case OMPD_parallel_for_simd:
596  case OMPD_cancel:
597  case OMPD_cancellation_point:
598  case OMPD_ordered:
599  case OMPD_threadprivate:
600  case OMPD_allocate:
601  case OMPD_task:
602  case OMPD_simd:
603  case OMPD_sections:
604  case OMPD_section:
605  case OMPD_single:
606  case OMPD_master:
607  case OMPD_critical:
608  case OMPD_taskyield:
609  case OMPD_barrier:
610  case OMPD_taskwait:
611  case OMPD_taskgroup:
612  case OMPD_atomic:
613  case OMPD_flush:
614  case OMPD_depobj:
615  case OMPD_scan:
616  case OMPD_teams:
617  case OMPD_target_data:
618  case OMPD_target_exit_data:
619  case OMPD_target_enter_data:
620  case OMPD_distribute:
621  case OMPD_distribute_simd:
622  case OMPD_distribute_parallel_for:
623  case OMPD_distribute_parallel_for_simd:
624  case OMPD_teams_distribute:
625  case OMPD_teams_distribute_simd:
626  case OMPD_teams_distribute_parallel_for:
627  case OMPD_teams_distribute_parallel_for_simd:
628  case OMPD_target_update:
629  case OMPD_declare_simd:
630  case OMPD_declare_variant:
631  case OMPD_begin_declare_variant:
632  case OMPD_end_declare_variant:
633  case OMPD_declare_target:
634  case OMPD_end_declare_target:
635  case OMPD_declare_reduction:
636  case OMPD_declare_mapper:
637  case OMPD_taskloop:
638  case OMPD_taskloop_simd:
639  case OMPD_master_taskloop:
640  case OMPD_master_taskloop_simd:
641  case OMPD_parallel_master_taskloop:
642  case OMPD_parallel_master_taskloop_simd:
643  case OMPD_requires:
644  case OMPD_unknown:
645  default:
646  llvm_unreachable("Unexpected directive.");
647  }
648  }
649 
650  return false;
651 }
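// For illustration (editorial), a directive nest this analysis classifies as
// SPMD, because the teams-wrapped inner directive is a parallel one:
//
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
//   for (int i = 0; i < N; ++i) { /* ... */ }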
652 
653 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
654  const OMPExecutableDirective &D) {
655  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
656  switch (DirectiveKind) {
657  case OMPD_target:
658  case OMPD_target_teams:
659  return hasNestedSPMDDirective(Ctx, D);
660  case OMPD_target_parallel:
661  case OMPD_target_parallel_for:
662  case OMPD_target_parallel_for_simd:
663  case OMPD_target_teams_distribute_parallel_for:
664  case OMPD_target_teams_distribute_parallel_for_simd:
665  case OMPD_target_simd:
666  case OMPD_target_teams_distribute_simd:
667  return true;
668  case OMPD_target_teams_distribute:
669  return false;
670  case OMPD_parallel:
671  case OMPD_for:
672  case OMPD_parallel_for:
673  case OMPD_parallel_master:
674  case OMPD_parallel_sections:
675  case OMPD_for_simd:
676  case OMPD_parallel_for_simd:
677  case OMPD_cancel:
678  case OMPD_cancellation_point:
679  case OMPD_ordered:
680  case OMPD_threadprivate:
681  case OMPD_allocate:
682  case OMPD_task:
683  case OMPD_simd:
684  case OMPD_sections:
685  case OMPD_section:
686  case OMPD_single:
687  case OMPD_master:
688  case OMPD_critical:
689  case OMPD_taskyield:
690  case OMPD_barrier:
691  case OMPD_taskwait:
692  case OMPD_taskgroup:
693  case OMPD_atomic:
694  case OMPD_flush:
695  case OMPD_depobj:
696  case OMPD_scan:
697  case OMPD_teams:
698  case OMPD_target_data:
699  case OMPD_target_exit_data:
700  case OMPD_target_enter_data:
701  case OMPD_distribute:
702  case OMPD_distribute_simd:
703  case OMPD_distribute_parallel_for:
704  case OMPD_distribute_parallel_for_simd:
705  case OMPD_teams_distribute:
706  case OMPD_teams_distribute_simd:
707  case OMPD_teams_distribute_parallel_for:
708  case OMPD_teams_distribute_parallel_for_simd:
709  case OMPD_target_update:
710  case OMPD_declare_simd:
711  case OMPD_declare_variant:
712  case OMPD_begin_declare_variant:
713  case OMPD_end_declare_variant:
714  case OMPD_declare_target:
715  case OMPD_end_declare_target:
716  case OMPD_declare_reduction:
717  case OMPD_declare_mapper:
718  case OMPD_taskloop:
719  case OMPD_taskloop_simd:
720  case OMPD_master_taskloop:
721  case OMPD_master_taskloop_simd:
722  case OMPD_parallel_master_taskloop:
723  case OMPD_parallel_master_taskloop_simd:
724  case OMPD_requires:
725  case OMPD_unknown:
726  default:
727  break;
728  }
729  llvm_unreachable(
730  "Unknown programming model for OpenMP directive on NVPTX target.");
731 }
732 
733 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
734  StringRef ParentName,
735  llvm::Function *&OutlinedFn,
736  llvm::Constant *&OutlinedFnID,
737  bool IsOffloadEntry,
738  const RegionCodeGenTy &CodeGen) {
739  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
740  EntryFunctionState EST;
741  WrapperFunctionsMap.clear();
742 
743  // Emit target region as a standalone region.
744  class NVPTXPrePostActionTy : public PrePostActionTy {
745  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
746 
747  public:
748  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
749  : EST(EST) {}
750  void Enter(CodeGenFunction &CGF) override {
751  auto &RT =
752  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
753  RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
754  // Skip target region initialization.
755  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
756  }
757  void Exit(CodeGenFunction &CGF) override {
758  auto &RT =
759  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
760  RT.clearLocThreadIdInsertPt(CGF);
761  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
762  }
763  } Action(EST);
764  CodeGen.setAction(Action);
765  IsInTTDRegion = true;
766  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
767  IsOffloadEntry, CodeGen);
768  IsInTTDRegion = false;
769 }
770 
771 void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
772  EntryFunctionState &EST, bool IsSPMD) {
773  CGBuilderTy &Bld = CGF.Builder;
774  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, true));
775  IsInTargetMasterThreadRegion = IsSPMD;
776  if (!IsSPMD)
777  emitGenericVarsProlog(CGF, EST.Loc);
778 }
779 
780 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
781  EntryFunctionState &EST,
782  bool IsSPMD) {
783  if (!IsSPMD)
784  emitGenericVarsEpilog(CGF);
785 
786  CGBuilderTy &Bld = CGF.Builder;
787  OMPBuilder.createTargetDeinit(Bld, IsSPMD, true);
788 }
789 
790 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
791  StringRef ParentName,
792  llvm::Function *&OutlinedFn,
793  llvm::Constant *&OutlinedFnID,
794  bool IsOffloadEntry,
795  const RegionCodeGenTy &CodeGen) {
796  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
797  EntryFunctionState EST;
798 
799  // Emit target region as a standalone region.
800  class NVPTXPrePostActionTy : public PrePostActionTy {
801  CGOpenMPRuntimeGPU &RT;
802  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
803 
804  public:
805  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
806  CGOpenMPRuntimeGPU::EntryFunctionState &EST)
807  : RT(RT), EST(EST) {}
808  void Enter(CodeGenFunction &CGF) override {
809  RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
810  // Skip target region initialization.
811  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
812  }
813  void Exit(CodeGenFunction &CGF) override {
814  RT.clearLocThreadIdInsertPt(CGF);
815  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
816  }
817  } Action(*this, EST);
818  CodeGen.setAction(Action);
819  IsInTTDRegion = true;
820  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
821  IsOffloadEntry, CodeGen);
822  IsInTTDRegion = false;
823 }
824 
825 // Create a unique global variable to indicate the execution mode of this target
826 // region. The execution mode is either 'generic', or 'spmd' depending on the
827 // target directive. This variable is picked up by the offload library to set up
828 // the device appropriately before kernel launch. If the execution mode is
829 // 'generic', the runtime reserves one warp for the master, otherwise, all
830 // warps participate in parallel work.
831 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
832  bool Mode) {
833  auto *GVMode = new llvm::GlobalVariable(
834  CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
835  llvm::GlobalValue::WeakAnyLinkage,
836  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
837  : OMP_TGT_EXEC_MODE_GENERIC),
838  Twine(Name, "_exec_mode"));
839  CGM.addCompilerUsedGlobal(GVMode);
840 }
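// Schematically (editorial sketch; kernel name shortened for illustration),
// an SPMD target region named `foo` yields a device global the offload
// plugin inspects before launch:
//
//   @foo_exec_mode = weak constant i8 2
//
// where 2 is OMP_TGT_EXEC_MODE_SPMD and 1 is OMP_TGT_EXEC_MODE_GENERIC in
// the device runtime's OMPTgtExecModeFlags enum.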
841 
842 void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
843  llvm::Constant *Addr,
844  uint64_t Size, int32_t,
845  llvm::GlobalValue::LinkageTypes) {
846  // TODO: Add support for global variables on the device after declare target
847  // support.
848  llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
849  if (!Fn)
850  return;
851 
852  llvm::Module &M = CGM.getModule();
853  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
854 
855  // Get "nvvm.annotations" metadata node.
856  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
857 
858  llvm::Metadata *MDVals[] = {
859  llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
860  llvm::ConstantAsMetadata::get(
861  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
862  // Append metadata to nvvm.annotations.
863  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
864 
865  // Add a function attribute for the kernel.
866  Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
867 }
868 
869 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
870  const OMPExecutableDirective &D, StringRef ParentName,
871  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
872  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
873  if (!IsOffloadEntry) // Nothing to do.
874  return;
875 
876  assert(!ParentName.empty() && "Invalid target region parent name!");
877 
878  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
879  if (Mode)
880  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
881  CodeGen);
882  else
883  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
884  CodeGen);
885 
886  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
887 }
888 
889 namespace {
890 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
891 /// Enum for accessing the reserved_2 field of the ident_t struct.
892 enum ModeFlagsTy : unsigned {
893  /// Bit set to 1 when in SPMD mode.
894  KMP_IDENT_SPMD_MODE = 0x01,
895  /// Bit set to 1 when a simplified runtime is used.
896  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
897  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
898 };
899 
900 /// Special mode Undefined: the combination of Non-SPMD mode + SimpleRuntime.
901 static const ModeFlagsTy UndefinedMode =
902  (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
903 } // anonymous namespace
904 
905 unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
906  switch (getExecutionMode()) {
907  case EM_SPMD:
908  return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
909  case EM_NonSPMD:
910  return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
911  case EM_Unknown:
912  return UndefinedMode;
913  }
914  llvm_unreachable("Unknown flags are requested.");
915 }
916 
917 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
918  : CGOpenMPRuntime(CGM, "_", "$") {
919  if (!CGM.getLangOpts().OpenMPIsDevice)
920  llvm_unreachable("OpenMP can only handle device code.");
921 
922  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
923  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
924  return;
925 
926  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
927  "__omp_rtl_debug_kind");
928  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
929  "__omp_rtl_assume_teams_oversubscription");
930  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
931  "__omp_rtl_assume_threads_oversubscription");
932  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
933  "__omp_rtl_assume_no_thread_state");
934  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
935  "__omp_rtl_assume_no_nested_parallelism");
936 }
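// As a sketch (editorial): each createGlobalFlag call above materializes an
// i32 device global that the DeviceRTL reads instead of querying state at
// runtime; e.g. compiling with -fopenmp-assume-teams-oversubscription would
// produce something like
//
//   @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1
//
// (exact linkage/visibility are OpenMPIRBuilder details, shown here only for
// illustration).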
937 
938 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
939  ProcBindKind ProcBind,
940  SourceLocation Loc) {
941  // Do nothing in case of SPMD mode and L0 parallel.
942  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
943  return;
944 
945  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
946 }
947 
948 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
949  llvm::Value *NumThreads,
950  SourceLocation Loc) {
951  // Nothing to do.
952 }
953 
954 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
955  const Expr *NumTeams,
956  const Expr *ThreadLimit,
957  SourceLocation Loc) {}
958 
959 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
960  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
961  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
962  // Emit target region as a standalone region.
963  class NVPTXPrePostActionTy : public PrePostActionTy {
964  bool &IsInParallelRegion;
965  bool PrevIsInParallelRegion;
966 
967  public:
968  NVPTXPrePostActionTy(bool &IsInParallelRegion)
969  : IsInParallelRegion(IsInParallelRegion) {}
970  void Enter(CodeGenFunction &CGF) override {
971  PrevIsInParallelRegion = IsInParallelRegion;
972  IsInParallelRegion = true;
973  }
974  void Exit(CodeGenFunction &CGF) override {
975  IsInParallelRegion = PrevIsInParallelRegion;
976  }
977  } Action(IsInParallelRegion);
978  CodeGen.setAction(Action);
979  bool PrevIsInTTDRegion = IsInTTDRegion;
980  IsInTTDRegion = false;
981  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
982  IsInTargetMasterThreadRegion = false;
983  auto *OutlinedFun =
984  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
985  D, ThreadIDVar, InnermostKind, CodeGen));
986  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
987  IsInTTDRegion = PrevIsInTTDRegion;
988  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
989  !IsInParallelRegion) {
990  llvm::Function *WrapperFun =
991  createParallelDataSharingWrapper(OutlinedFun, D);
992  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
993  }
994 
995  return OutlinedFun;
996 }
997 
998 /// Get list of lastprivate variables from the teams distribute ... or
999 /// teams {distribute ...} directives.
1000 static void
1001 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1002  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1003  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1004  "expected teams directive.");
1005  const OMPExecutableDirective *Dir = &D;
1006  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1007  if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1008  Ctx,
1009  D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1010  /*IgnoreCaptured=*/true))) {
1011  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1012  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1013  Dir = nullptr;
1014  }
1015  }
1016  if (!Dir)
1017  return;
1018  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1019  for (const Expr *E : C->getVarRefs())
1020  Vars.push_back(getPrivateItem(E));
1021  }
1022 }
1023 
1024 /// Get list of reduction variables from the teams ... directives.
1025 static void
1026 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1027  llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1028  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1029  "expected teams directive.");
1030  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1031  for (const Expr *E : C->privates())
1032  Vars.push_back(getPrivateItem(E));
1033  }
1034 }
1035 
1036 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1037  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1038  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1039  SourceLocation Loc = D.getBeginLoc();
1040 
1041  const RecordDecl *GlobalizedRD = nullptr;
1042  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1043  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1044  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1045  // Globalize team reductions variable unconditionally in all modes.
1046  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1047  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1048  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1049  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1050  if (!LastPrivatesReductions.empty()) {
1051  GlobalizedRD = ::buildRecordForGlobalizedVars(
1052  CGM.getContext(), llvm::None, LastPrivatesReductions,
1053  MappedDeclsFields, WarpSize);
1054  }
1055  } else if (!LastPrivatesReductions.empty()) {
1056  assert(!TeamAndReductions.first &&
1057  "Previous team declaration is not expected.");
1058  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1059  std::swap(TeamAndReductions.second, LastPrivatesReductions);
1060  }
1061 
1062  // Emit target region as a standalone region.
1063  class NVPTXPrePostActionTy : public PrePostActionTy {
1064  SourceLocation &Loc;
1065  const RecordDecl *GlobalizedRD;
1066  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1067  &MappedDeclsFields;
1068 
1069  public:
1070  NVPTXPrePostActionTy(
1071  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1072  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1073  &MappedDeclsFields)
1074  : Loc(Loc), GlobalizedRD(GlobalizedRD),
1075  MappedDeclsFields(MappedDeclsFields) {}
1076  void Enter(CodeGenFunction &CGF) override {
1077  auto &Rt =
1078  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1079  if (GlobalizedRD) {
1080  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1081  I->getSecond().MappedParams =
1082  std::make_unique<CodeGenFunction::OMPMapVars>();
1083  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1084  for (const auto &Pair : MappedDeclsFields) {
1085  assert(Pair.getFirst()->isCanonicalDecl() &&
1086  "Expected canonical declaration");
1087  Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1088  }
1089  }
1090  Rt.emitGenericVarsProlog(CGF, Loc);
1091  }
1092  void Exit(CodeGenFunction &CGF) override {
1093  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1094  .emitGenericVarsEpilog(CGF);
1095  }
1096  } Action(Loc, GlobalizedRD, MappedDeclsFields);
1097  CodeGen.setAction(Action);
1098  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1099  D, ThreadIDVar, InnermostKind, CodeGen);
1100 
1101  return OutlinedFun;
1102 }
1103 
1104 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1105  SourceLocation Loc,
1106  bool WithSPMDCheck) {
1107  if (getDataSharingMode(CGM) == CGOpenMPRuntimeGPU::CUDA &&
1108  getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1109  return;
1110 
1111  CGBuilderTy &Bld = CGF.Builder;
1112 
1113  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1114  if (I == FunctionGlobalizedDecls.end())
1115  return;
1116 
1117  for (auto &Rec : I->getSecond().LocalVarData) {
1118  const auto *VD = cast<VarDecl>(Rec.first);
1119  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1120  QualType VarTy = VD->getType();
1121 
1122  // Get the local allocation of a firstprivate variable before sharing
1123  llvm::Value *ParValue;
1124  if (EscapedParam) {
1125  LValue ParLVal =
1126  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1127  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1128  }
1129 
1130  // Allocate space for the variable to be globalized
1131  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1132  llvm::CallBase *VoidPtr =
1133  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1134  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1135  AllocArgs, VD->getName());
1136  // FIXME: We should use the variables actual alignment as an argument.
1137  VoidPtr->addRetAttr(llvm::Attribute::get(
1138  CGM.getLLVMContext(), llvm::Attribute::Alignment,
1139  CGM.getContext().getTargetInfo().getNewAlign() / 8));
1140 
1141  // Cast the void pointer and get the address of the globalized variable.
1142  llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1143  llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1144  VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1145  LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1146  Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1147  Rec.second.GlobalizedVal = VoidPtr;
1148 
1149  // Assign the local allocation to the newly globalized location.
1150  if (EscapedParam) {
1151  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1152  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1153  }
1154  if (auto *DI = CGF.getDebugInfo())
1155  VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1156  }
1157  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1158  // Use actual memory size of the VLA object including the padding
1159  // for alignment purposes.
1160  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1161  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1162  Size = Bld.CreateNUWAdd(
1163  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1164  llvm::Value *AlignVal =
1165  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1166 
1167  Size = Bld.CreateUDiv(Size, AlignVal);
1168  Size = Bld.CreateNUWMul(Size, AlignVal);
1169 
1170  // Allocate space for this VLA object to be globalized.
1171  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1172  llvm::CallBase *VoidPtr =
1173  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1174  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1175  AllocArgs, VD->getName());
1176  VoidPtr->addRetAttr(
1177  llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1178  CGM.getContext().getTargetInfo().getNewAlign() / 8));
1179 
1180  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1181  std::pair<llvm::Value *, llvm::Value *>(
1182  {VoidPtr, CGF.getTypeSize(VD->getType())}));
1183  LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1184  CGM.getContext().getDeclAlign(VD),
1185  AlignmentSource::Decl);
1186  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1187  Base.getAddress(CGF));
1188  }
1189  I->getSecond().MappedParams->apply(CGF);
1190 }
1191 
1192 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1193  bool WithSPMDCheck) {
1194  if (getDataSharingMode(CGM) == CGOpenMPRuntimeGPU::CUDA &&
1195  getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1196  return;
1197 
1198  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1199  if (I != FunctionGlobalizedDecls.end()) {
1200  // Deallocate the memory for each globalized VLA object
1201  for (auto AddrSizePair :
1202  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1203  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1204  CGM.getModule(), OMPRTL___kmpc_free_shared),
1205  {AddrSizePair.first, AddrSizePair.second});
1206  }
1207  // Deallocate the memory for each globalized value
1208  for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1209  const auto *VD = cast<VarDecl>(Rec.first);
1210  I->getSecond().MappedParams->restore(CGF);
1211 
1212  llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1213  CGF.getTypeSize(VD->getType())};
1214  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1215  CGM.getModule(), OMPRTL___kmpc_free_shared),
1216  FreeArgs);
1217  }
1218  }
1219 }
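// Taken together, emitGenericVarsProlog/Epilog bracket every globalized
// variable with a matching runtime pair; schematically (editorial sketch),
// for an escaped `int x`:
//
//   %x = call ptr @__kmpc_alloc_shared(i64 4)    ; prolog
//   ; ... region body uses %x ...
//   call void @__kmpc_free_shared(ptr %x, i64 4) ; epilog, reverse order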
1220 
1221 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1222  const OMPExecutableDirective &D,
1223  SourceLocation Loc,
1224  llvm::Function *OutlinedFn,
1225  ArrayRef<llvm::Value *> CapturedVars) {
1226  if (!CGF.HaveInsertPoint())
1227  return;
1228 
1229  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1230  /*Name=*/".zero.addr");
1231  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1232  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1233  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1234  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1235  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1236  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1237 }
1238 
1239 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1240  SourceLocation Loc,
1241  llvm::Function *OutlinedFn,
1242  ArrayRef<llvm::Value *> CapturedVars,
1243  const Expr *IfCond,
1244  llvm::Value *NumThreads) {
1245  if (!CGF.HaveInsertPoint())
1246  return;
1247 
1248  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1249  NumThreads](CodeGenFunction &CGF,
1250  PrePostActionTy &Action) {
1251  CGBuilderTy &Bld = CGF.Builder;
1252  llvm::Value *NumThreadsVal = NumThreads;
1253  llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1254  llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1255  if (WFn)
1256  ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1257  llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1258 
1259  // Create a private scope that will globalize the arguments
1260  // passed from the outside of the target region.
1261  // TODO: Is that needed?
1262  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1263 
1264  Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1265  llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1266  "captured_vars_addrs");
1267  // There's something to share.
1268  if (!CapturedVars.empty()) {
1269  // Prepare for parallel region. Indicate the outlined function.
1270  ASTContext &Ctx = CGF.getContext();
1271  unsigned Idx = 0;
1272  for (llvm::Value *V : CapturedVars) {
1273  Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1274  llvm::Value *PtrV;
1275  if (V->getType()->isIntegerTy())
1276  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1277  else
1278  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1279  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1280  Ctx.getPointerType(Ctx.VoidPtrTy));
1281  ++Idx;
1282  }
1283  }
1284 
1285  llvm::Value *IfCondVal = nullptr;
1286  if (IfCond)
1287  IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1288  /* isSigned */ false);
1289  else
1290  IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1291 
1292  if (!NumThreadsVal)
1293  NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1294  else
1295  NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1296 
1297  assert(IfCondVal && "Expected a value");
1298  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1299  llvm::Value *Args[] = {
1300  RTLoc,
1301  getThreadID(CGF, Loc),
1302  IfCondVal,
1303  NumThreadsVal,
1304  llvm::ConstantInt::get(CGF.Int32Ty, -1),
1305  FnPtr,
1306  ID,
1307  Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1308  CGF.VoidPtrPtrTy),
1309  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1310  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1311  CGM.getModule(), OMPRTL___kmpc_parallel_51),
1312  Args);
1313  };
1314 
1315  RegionCodeGenTy RCG(ParallelGen);
1316  RCG(CGF);
1317 }
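// For reference, the Args array above lines up with the device runtime entry
// point, summarized here as a sketch (see the DeviceRTL declaration for the
// authoritative signature):
//
//   void __kmpc_parallel_51(ident_t *loc, int32_t gtid, int32_t if_expr,
//                           int32_t num_threads, int32_t proc_bind,
//                           void *fn, void *wrapper_fn, void **args,
//                           int64_t nargs);
//
// proc_bind is passed as -1 (unspecified) in this call.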
1318 
1319 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1320  // Always emit simple barriers!
1321  if (!CGF.HaveInsertPoint())
1322  return;
1323  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1324  // This function does not use parameters, so we can emit just default values.
1325  llvm::Value *Args[] = {
1326  llvm::ConstantPointerNull::get(
1327  cast<llvm::PointerType>(getIdentTyPointerTy())),
1328  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1329  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1330  CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1331  Args);
1332 }
1333 
1334 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1335  SourceLocation Loc,
1336  OpenMPDirectiveKind Kind, bool,
1337  bool) {
1338  // Always emit simple barriers!
1339  if (!CGF.HaveInsertPoint())
1340  return;
1341  // Build call __kmpc_cancel_barrier(loc, thread_id);
1342  unsigned Flags = getDefaultFlagsForBarriers(Kind);
1343  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1344  getThreadID(CGF, Loc)};
1345 
1346  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1347  CGM.getModule(), OMPRTL___kmpc_barrier),
1348  Args);
1349 }
1350 
1351 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1352  CodeGenFunction &CGF, StringRef CriticalName,
1353  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1354  const Expr *Hint) {
1355  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1356  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1357  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1358  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1359  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1360 
1361  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1362 
1363  // Get the mask of active threads in the warp.
1364  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1365  CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1366  // Fetch team-local id of the thread.
1367  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1368 
1369  // Get the width of the team.
1370  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1371 
1372  // Initialize the counter variable for the loop.
1373  QualType Int32Ty =
1374  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1375  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1376  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1377  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1378  /*isInit=*/true);
1379 
1380  // Block checks if loop counter exceeds upper bound.
1381  CGF.EmitBlock(LoopBB);
1382  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1383  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1384  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1385 
1386  // Block that tests which single thread should execute the region, and which
1387  // threads should go straight to the synchronisation point.
1388  CGF.EmitBlock(TestBB);
1389  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1390  llvm::Value *CmpThreadToCounter =
1391  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1392  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1393 
1394  // Block emits the body of the critical region.
1395  CGF.EmitBlock(BodyBB);
1396 
1397  // Output the critical statement.
1398  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1399  Hint);
1400 
1401  // After the body surrounded by the critical region, the single executing
1402  // thread will jump to the synchronisation point.
1403  // Block waits for all threads in current team to finish then increments the
1404  // counter variable and returns to the loop.
1405  CGF.EmitBlock(SyncBB);
1406  // Reconverge active threads in the warp.
1407  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1408  CGM.getModule(), OMPRTL___kmpc_syncwarp),
1409  Mask);
1410 
1411  llvm::Value *IncCounterVal =
1412  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1413  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1414  CGF.EmitBranch(LoopBB);
1415 
1416  // Block that is reached when all threads in the team complete the region.
1417  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1418 }
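// The control flow built above amounts to this per-team pseudocode
// (editorial sketch; __kmpc_syncwarp reconverges the threads recorded in
// Mask):
//
//   for (counter = 0; counter < team_width; ++counter) { // LoopBB
//     if (thread_id == counter)                          // TestBB
//       critical_body();                                 // BodyBB
//     __kmpc_syncwarp(mask);                             // SyncBB
//   }                                                    // then ExitBB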
1419 
1420 /// Cast value to the specified type.
1421 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1422  QualType ValTy, QualType CastTy,
1423  SourceLocation Loc) {
1424  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1425  "Cast type must be sized.");
1426  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1427  "Val type must be sized.");
1428  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1429  if (ValTy == CastTy)
1430  return Val;
1431  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1432  CGF.getContext().getTypeSizeInChars(CastTy))
1433  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1434  if (CastTy->isIntegerType() && ValTy->isIntegerType())
1435  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1436  CastTy->hasSignedIntegerRepresentation());
1437  Address CastItem = CGF.CreateMemTemp(CastTy);
1438  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1439  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1440  Val->getType());
1441  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1442  LValueBaseInfo(AlignmentSource::Type),
1443  TBAAAccessInfo());
1444  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1445  LValueBaseInfo(AlignmentSource::Type),
1446  TBAAAccessInfo());
1447 }
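// Illustrative cases (editorial) for castValueToType: a `float` headed into
// the 32-bit shuffle path is bit-cast to i32 (equal size), a `short` widens
// through CreateIntCast, and an odd-sized aggregate round-trips through the
// CastItem temporary via store-then-load.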
1448 
1449 /// This function creates calls to one of two shuffle functions to copy
1450 /// variables between lanes in a warp.
1451 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1452  llvm::Value *Elem,
1453  QualType ElemType,
1454  llvm::Value *Offset,
1455  SourceLocation Loc) {
1456  CodeGenModule &CGM = CGF.CGM;
1457  CGBuilderTy &Bld = CGF.Builder;
1458  CGOpenMPRuntimeGPU &RT =
1459  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1460  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1461 
1462  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1463  assert(Size.getQuantity() <= 8 &&
1464  "Unsupported bitwidth in shuffle instruction.");
1465 
1466  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1467  ? OMPRTL___kmpc_shuffle_int32
1468  : OMPRTL___kmpc_shuffle_int64;
1469 
1470  // Cast all types to 32- or 64-bit values before calling shuffle routines.
1471  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1472  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1473  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1474  llvm::Value *WarpSize =
1475  Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1476 
1477  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1478  OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1479  {ElemCast, Offset, WarpSize});
1480 
1481  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1482 }
1483 
1484 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1485  Address DestAddr, QualType ElemType,
1486  llvm::Value *Offset, SourceLocation Loc) {
1487  CGBuilderTy &Bld = CGF.Builder;
1488 
1489  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1490  // Create the loop over the big sized data.
1491  // ptr = (void*)Elem;
1492  // ptrEnd = (void*) Elem + 1;
1493  // Step = 8;
1494  // while (ptr + Step < ptrEnd)
1495  // shuffle((int64_t)*ptr);
1496  // Step = 4;
1497  // while (ptr + Step < ptrEnd)
1498  // shuffle((int32_t)*ptr);
1499  // ...
1500  Address ElemPtr = DestAddr;
1501  Address Ptr = SrcAddr;
1502  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1503  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1504  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1505  if (Size < CharUnits::fromQuantity(IntSize))
1506  continue;
1507  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1508  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1509  /*Signed=*/1);
1510  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1511  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1512  IntTy);
1513  ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1514  ElemPtr, IntTy->getPointerTo(), IntTy);
1515  if (Size.getQuantity() / IntSize > 1) {
1516  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1517  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1518  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1519  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1520  CGF.EmitBlock(PreCondBB);
1521  llvm::PHINode *PhiSrc =
1522  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1523  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1524  llvm::PHINode *PhiDest =
1525  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1526  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1527  Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1528  ElemPtr =
1529  Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1530  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1531  CGF.Int8Ty, PtrEnd.getPointer(),
1532  Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1533  CGF.VoidPtrTy));
1534  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1535  ThenBB, ExitBB);
1536  CGF.EmitBlock(ThenBB);
1537  llvm::Value *Res = createRuntimeShuffleFunction(
1538  CGF,
1539  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1540  LValueBaseInfo(AlignmentSource::Type),
1541  TBAAAccessInfo()),
1542  IntType, Offset, Loc);
1543  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1544  LValueBaseInfo(AlignmentSource::Type),
1545  TBAAAccessInfo());
1546  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1547  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1548  PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1549  PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1550  CGF.EmitBranch(PreCondBB);
1551  CGF.EmitBlock(ExitBB);
1552  } else {
1553  llvm::Value *Res = createRuntimeShuffleFunction(
1554  CGF,
1555  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1556  LValueBaseInfo(AlignmentSource::Type),
1557  TBAAAccessInfo()),
1558  IntType, Offset, Loc);
1559  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1560  LValueBaseInfo(AlignmentSource::Type),
1561  TBAAAccessInfo());
1562  Ptr = Bld.CreateConstGEP(Ptr, 1);
1563  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1564  }
1565  Size = Size % IntSize;
1566  }
1567 }
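
// A minimal host-side sketch of the decomposition above (illustrative only;
// std::memcpy stands in for the emitted shuffle calls, and the function name
// emulateShuffleAndStore is hypothetical). It mirrors the descending
// 8/4/2/1-byte loop and the `Size = Size % IntSize` update:
//
//   #include <cstddef>
//   #include <cstring>
//
//   void emulateShuffleAndStore(const char *Src, char *Dest, size_t Size) {
//     for (size_t IntSize = 8; IntSize >= 1; IntSize /= 2) {
//       while (Size >= IntSize) {          // a loop is emitted when
//                                          // Size / IntSize > 1
//         std::memcpy(Dest, Src, IntSize); // one IntSize-wide shuffled chunk
//         Src += IntSize;
//         Dest += IntSize;
//         Size -= IntSize;
//       }
//     }
//   }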
1568 
1569 namespace {
1570 enum CopyAction : unsigned {
1571  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1572  // the warp using shuffle instructions.
1573  RemoteLaneToThread,
1574  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1575  ThreadCopy,
1576  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1577  ThreadToScratchpad,
1578  // ScratchpadToThread: Copy from a scratchpad array in global memory
1579  // containing team-reduced data to a thread's stack.
1580  ScratchpadToThread,
1581 };
1582 } // namespace
1583 
1584 struct CopyOptionsTy {
1585  llvm::Value *RemoteLaneOffset;
1586  llvm::Value *ScratchpadIndex;
1587  llvm::Value *ScratchpadWidth;
1588 };
1589 
1590 /// Emit instructions to copy a Reduce list, which contains partially
1591 /// aggregated values, in the specified direction.
1592 static void emitReductionListCopy(
1593  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1594  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1595  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1596 
1597  CodeGenModule &CGM = CGF.CGM;
1598  ASTContext &C = CGM.getContext();
1599  CGBuilderTy &Bld = CGF.Builder;
1600 
1601  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1602  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1603  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1604 
1605  // Iterate, element-by-element, through the source Reduce list and
1606  // make a copy.
1607  unsigned Idx = 0;
1608  unsigned Size = Privates.size();
1609  for (const Expr *Private : Privates) {
1610  Address SrcElementAddr = Address::invalid();
1611  Address DestElementAddr = Address::invalid();
1612  Address DestElementPtrAddr = Address::invalid();
1613  // Should we shuffle in an element from a remote lane?
1614  bool ShuffleInElement = false;
1615  // Set to true to update the pointer in the dest Reduce list to a
1616  // newly created element.
1617  bool UpdateDestListPtr = false;
1618  // Increment the src or dest pointer to the scratchpad, for each
1619  // new element.
1620  bool IncrScratchpadSrc = false;
1621  bool IncrScratchpadDest = false;
1622  QualType PrivatePtrType = C.getPointerType(Private->getType());
1623  llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1624 
1625  switch (Action) {
1626  case RemoteLaneToThread: {
1627  // Step 1.1: Get the address for the src element in the Reduce list.
1628  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1629  SrcElementAddr =
1630  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1631  SrcElementPtrAddr, PrivateLlvmPtrType),
1632  PrivatePtrType->castAs<PointerType>());
1633 
1634  // Step 1.2: Create a temporary to store the element in the destination
1635  // Reduce list.
1636  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1637  DestElementAddr =
1638  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1639  ShuffleInElement = true;
1640  UpdateDestListPtr = true;
1641  break;
1642  }
1643  case ThreadCopy: {
1644  // Step 1.1: Get the address for the src element in the Reduce list.
1645  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1646  SrcElementAddr =
1647  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1648  SrcElementPtrAddr, PrivateLlvmPtrType),
1649  PrivatePtrType->castAs<PointerType>());
1650 
1651  // Step 1.2: Get the address for dest element. The destination
1652  // element has already been created on the thread's stack.
1653  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1654  DestElementAddr =
1655  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1656  DestElementPtrAddr, PrivateLlvmPtrType),
1657  PrivatePtrType->castAs<PointerType>());
1658  break;
1659  }
1660  case ThreadToScratchpad: {
1661  // Step 1.1: Get the address for the src element in the Reduce list.
1662  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1663  SrcElementAddr =
1664  CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1665  SrcElementPtrAddr, PrivateLlvmPtrType),
1666  PrivatePtrType->castAs<PointerType>());
1667 
1668  // Step 1.2: Get the address for dest element:
1669  // address = base + index * ElementSizeInChars.
1670  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1671  llvm::Value *CurrentOffset =
1672  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1673  llvm::Value *ScratchPadElemAbsolutePtrVal =
1674  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1675  ScratchPadElemAbsolutePtrVal =
1676  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1677  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1678  C.getTypeAlignInChars(Private->getType()));
1679  IncrScratchpadDest = true;
1680  break;
1681  }
1682  case ScratchpadToThread: {
1683  // Step 1.1: Get the address for the src element in the scratchpad.
1684  // address = base + index * ElementSizeInChars.
1685  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1686  llvm::Value *CurrentOffset =
1687  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1688  llvm::Value *ScratchPadElemAbsolutePtrVal =
1689  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1690  ScratchPadElemAbsolutePtrVal =
1691  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1692  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1693  C.getTypeAlignInChars(Private->getType()));
1694  IncrScratchpadSrc = true;
1695 
1696  // Step 1.2: Create a temporary to store the element in the destination
1697  // Reduce list.
1698  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1699  DestElementAddr =
1700  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1701  UpdateDestListPtr = true;
1702  break;
1703  }
1704  }
1705 
1706  // Regardless of src and dest of copy, we emit the load of src
1707  // element as this is required in all directions
1708  SrcElementAddr = Bld.CreateElementBitCast(
1709  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1710  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1711  SrcElementAddr.getElementType());
1712 
1713  // Now that all active lanes have read the element in the
1714  // Reduce list, shuffle over the value from the remote lane.
1715  if (ShuffleInElement) {
1716  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1717  RemoteLaneOffset, Private->getExprLoc());
1718  } else {
1719  switch (CGF.getEvaluationKind(Private->getType())) {
1720  case TEK_Scalar: {
1721  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1722  SrcElementAddr, /*Volatile=*/false, Private->getType(),
1723  Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1724  TBAAAccessInfo());
1725  // Store the source element value to the dest element address.
1726  CGF.EmitStoreOfScalar(
1727  Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1728  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1729  break;
1730  }
1731  case TEK_Complex: {
1732  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1733  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1734  Private->getExprLoc());
1735  CGF.EmitStoreOfComplex(
1736  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1737  /*isInit=*/false);
1738  break;
1739  }
1740  case TEK_Aggregate:
1741  CGF.EmitAggregateCopy(
1742  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1743  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1744  Private->getType(), AggValueSlot::DoesNotOverlap);
1745  break;
1746  }
1747  }
1748 
1749  // Step 3.1: Modify reference in dest Reduce list as needed.
1750  // Modifying the reference in Reduce list to point to the newly
1751  // created element. The element is live in the current function
1752  // scope and that of functions it invokes (i.e., reduce_function).
1753  // RemoteReduceData[i] = (void*)&RemoteElem
1754  if (UpdateDestListPtr) {
1755  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1756  DestElementAddr.getPointer(), CGF.VoidPtrTy),
1757  DestElementPtrAddr, /*Volatile=*/false,
1758  C.VoidPtrTy);
1759  }
1760 
1761  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1762  // address of the next element in scratchpad memory, unless we're currently
1763  // processing the last one. Memory alignment is also taken care of here.
1764  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1765  // FIXME: This code doesn't make any sense, it's trying to perform
1766  // integer arithmetic on pointers.
1767  llvm::Value *ScratchpadBasePtr =
1768  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1769  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1770  ScratchpadBasePtr = Bld.CreateNUWAdd(
1771  ScratchpadBasePtr,
1772  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1773 
1774  // Take care of global memory alignment for performance
1775  ScratchpadBasePtr = Bld.CreateNUWSub(
1776  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1777  ScratchpadBasePtr = Bld.CreateUDiv(
1778  ScratchpadBasePtr,
1779  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1780  ScratchpadBasePtr = Bld.CreateNUWAdd(
1781  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1782  ScratchpadBasePtr = Bld.CreateNUWMul(
1783  ScratchpadBasePtr,
1784  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1785 
1786  if (IncrScratchpadDest)
1787  DestBase =
1788  Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1789  else /* IncrScratchpadSrc = true */
1790  SrcBase =
1791  Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1792  }
1793 
1794  ++Idx;
1795  }
1796 }
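
// A minimal sketch of the scratchpad addressing used by the
// ThreadToScratchpad/ScratchpadToThread cases above (the flat layout and the
// helper names are assumptions for illustration). Element i of team TeamIdx
// lives at base_i + TeamIdx * ElemSize, and the next element's region starts
// at the current base advanced by Width slots, rounded up to the global
// memory alignment:
//
//   #include <cstdint>
//
//   char *scratchpadElemAddr(char *Base, uint64_t TeamIdx, uint64_t ElemSize) {
//     return Base + TeamIdx * ElemSize; // address = base + index * size
//   }
//
//   char *nextScratchpadBase(char *Base, uint64_t Width, uint64_t ElemSize,
//                            uint64_t Align) {
//     uintptr_t P = reinterpret_cast<uintptr_t>(Base) + Width * ElemSize;
//     return reinterpret_cast<char *>((P - 1) / Align * Align + Align);
//   }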
1797 
1798 /// This function emits a helper that gathers Reduce lists from the first
1799 /// lane of every active warp to lanes in the first warp.
1800 ///
1801 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1802 /// shared smem[warp_size];
1803 /// For all data entries D in reduce_data:
1804 /// sync
1805 /// If (I am the first lane in each warp)
1806 /// Copy my local D to smem[warp_id]
1807 /// sync
1808 /// if (I am the first warp)
1809 /// Copy smem[thread_id] to my local D
1810 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1811  ArrayRef<const Expr *> Privates,
1812  QualType ReductionArrayTy,
1813  SourceLocation Loc) {
1814  ASTContext &C = CGM.getContext();
1815  llvm::Module &M = CGM.getModule();
1816 
1817  // ReduceList: thread local Reduce list.
1818  // At the stage of the computation when this function is called, partially
1819  // aggregated values reside in the first lane of every active warp.
1820  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1821  C.VoidPtrTy, ImplicitParamDecl::Other);
1822  // NumWarps: number of warps active in the parallel region. This could
1823  // be smaller than 32 (max warps in a CTA) for partial block reduction.
1824  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1825  C.getIntTypeForBitwidth(32, /* Signed */ true),
1826  ImplicitParamDecl::Other);
1827  FunctionArgList Args;
1828  Args.push_back(&ReduceListArg);
1829  Args.push_back(&NumWarpsArg);
1830 
1831  const CGFunctionInfo &CGFI =
1832  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1833  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1835  "_omp_reduction_inter_warp_copy_func", &M);
1836  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1837  Fn->setDoesNotRecurse();
1838  CodeGenFunction CGF(CGM);
1839  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1840 
1841  CGBuilderTy &Bld = CGF.Builder;
1842 
1843  // This array is used as a medium to transfer, one reduce element at a time,
1844  // the data from the first lane of every warp to lanes in the first warp
1845  // in order to perform the final step of a reduction in a parallel region
1846  // (reduction across warps). The array is placed in NVPTX __shared__ memory
1847  // for reduced latency, as well as to have a distinct copy for concurrently
1848  // executing target regions. The array is declared with common linkage so
1849  // as to be shared across compilation units.
1850  StringRef TransferMediumName =
1851  "__openmp_nvptx_data_transfer_temporary_storage";
1852  llvm::GlobalVariable *TransferMedium =
1853  M.getGlobalVariable(TransferMediumName);
1854  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1855  if (!TransferMedium) {
1856  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1857  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1858  TransferMedium = new llvm::GlobalVariable(
1859  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1860  llvm::UndefValue::get(Ty), TransferMediumName,
1861  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1862  SharedAddressSpace);
1863  CGM.addCompilerUsedGlobal(TransferMedium);
1864  }
1865 
1866  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1867  // Get the CUDA thread id of the current OpenMP thread on the GPU.
1868  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1869  // nvptx_lane_id = nvptx_id % warpsize
1870  llvm::Value *LaneID = getNVPTXLaneID(CGF);
1871  // nvptx_warp_id = nvptx_id / warpsize
1872  llvm::Value *WarpID = getNVPTXWarpID(CGF);
1873 
1874  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1875  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1876  Address LocalReduceList(
1877  Bld.CreatePointerBitCastOrAddrSpaceCast(
1878  CGF.EmitLoadOfScalar(
1879  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1880  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1881  ElemTy->getPointerTo()),
1882  ElemTy, CGF.getPointerAlign());
1883 
1884  unsigned Idx = 0;
1885  for (const Expr *Private : Privates) {
1886  //
1887  // Warp master copies reduce element to transfer medium in __shared__
1888  // memory.
1889  //
1890  unsigned RealTySize =
1891  C.getTypeSizeInChars(Private->getType())
1892  .alignTo(C.getTypeAlignInChars(Private->getType()))
1893  .getQuantity();
1894  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1895  unsigned NumIters = RealTySize / TySize;
1896  if (NumIters == 0)
1897  continue;
1898  QualType CType = C.getIntTypeForBitwidth(
1899  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1900  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1901  CharUnits Align = CharUnits::fromQuantity(TySize);
1902  llvm::Value *Cnt = nullptr;
1903  Address CntAddr = Address::invalid();
1904  llvm::BasicBlock *PrecondBB = nullptr;
1905  llvm::BasicBlock *ExitBB = nullptr;
1906  if (NumIters > 1) {
1907  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1908  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1909  /*Volatile=*/false, C.IntTy);
1910  PrecondBB = CGF.createBasicBlock("precond");
1911  ExitBB = CGF.createBasicBlock("exit");
1912  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1913  // There is no need to emit line number for unconditional branch.
1914  (void)ApplyDebugLocation::CreateEmpty(CGF);
1915  CGF.EmitBlock(PrecondBB);
1916  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1917  llvm::Value *Cmp =
1918  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1919  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1920  CGF.EmitBlock(BodyBB);
1921  }
1922  // kmpc_barrier.
1923  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1924  /*EmitChecks=*/false,
1925  /*ForceSimpleCall=*/true);
1926  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1927  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1928  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1929 
1930  // if (lane_id == 0)
1931  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1932  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1933  CGF.EmitBlock(ThenBB);
1934 
1935  // Reduce element = LocalReduceList[i]
1936  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1937  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1938  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1939  // elemptr = ((CopyType*)(elemptrptr)) + I
1940  Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1941  ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1942  if (NumIters > 1)
1943  ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1944 
1945  // Get pointer to location in transfer medium.
1946  // MediumPtr = &medium[warp_id]
1947  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1948  TransferMedium->getValueType(), TransferMedium,
1949  {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1950  // Casting to actual data type.
1951  // MediumPtr = (CopyType*)MediumPtrAddr;
1952  Address MediumPtr(
1953  Bld.CreateBitCast(
1954  MediumPtrVal,
1955  CopyType->getPointerTo(
1956  MediumPtrVal->getType()->getPointerAddressSpace())),
1957  CopyType, Align);
1958 
1959  // elem = *elemptr
1960  //*MediumPtr = elem
1961  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1962  ElemPtr, /*Volatile=*/false, CType, Loc,
1963  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1964  // Store the source element value to the dest element address.
1965  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1966  LValueBaseInfo(AlignmentSource::Type),
1967  TBAAAccessInfo());
1968 
1969  Bld.CreateBr(MergeBB);
1970 
1971  CGF.EmitBlock(ElseBB);
1972  Bld.CreateBr(MergeBB);
1973 
1974  CGF.EmitBlock(MergeBB);
1975 
1976  // kmpc_barrier.
1977  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1978  /*EmitChecks=*/false,
1979  /*ForceSimpleCall=*/true);
1980 
1981  //
1982  // Warp 0 copies reduce element from transfer medium.
1983  //
1984  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1985  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1986  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1987 
1988  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1989  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1990  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1991 
1992  // Up to 32 threads in warp 0 are active.
1993  llvm::Value *IsActiveThread =
1994  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1995  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1996 
1997  CGF.EmitBlock(W0ThenBB);
1998 
1999  // SrcMediumPtr = &medium[tid]
2000  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2001  TransferMedium->getValueType(), TransferMedium,
2002  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2003  // SrcMediumVal = *SrcMediumPtr;
2004  Address SrcMediumPtr(
2005  Bld.CreateBitCast(
2006  SrcMediumPtrVal,
2007  CopyType->getPointerTo(
2008  SrcMediumPtrVal->getType()->getPointerAddressSpace())),
2009  CopyType, Align);
2010 
2011  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2012  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2013  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2014  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2015  Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
2016  TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2017  if (NumIters > 1)
2018  TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
2019 
2020  // *TargetElemPtr = SrcMediumVal;
2021  llvm::Value *SrcMediumValue =
2022  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2023  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2024  CType);
2025  Bld.CreateBr(W0MergeBB);
2026 
2027  CGF.EmitBlock(W0ElseBB);
2028  Bld.CreateBr(W0MergeBB);
2029 
2030  CGF.EmitBlock(W0MergeBB);
2031 
2032  if (NumIters > 1) {
2033  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2034  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2035  CGF.EmitBranch(PrecondBB);
2036  (void)ApplyDebugLocation::CreateEmpty(CGF);
2037  CGF.EmitBlock(ExitBB);
2038  }
2039  RealTySize %= TySize;
2040  }
2041  ++Idx;
2042  }
2043 
2044  CGF.FinishFunction();
2045  return Fn;
2046 }
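
// A minimal host-side emulation of the transfer this helper emits for one
// 32-bit chunk (illustrative only: plain arrays model the __shared__ medium
// and the lane/warp decomposition; names are hypothetical; assumes
// NumWarps <= WarpSize, which holds for a CTA):
//
//   #include <cstdint>
//   #include <vector>
//
//   void emulateInterWarpCopy(std::vector<uint32_t> &ChunkPerThread,
//                             unsigned NumWarps, unsigned WarpSize = 32) {
//     std::vector<uint32_t> Medium(WarpSize); // stands in for __shared__
//     // Phase 1 (after a barrier): lane 0 of each warp publishes its chunk.
//     for (unsigned W = 0; W < NumWarps; ++W)
//       Medium[W] = ChunkPerThread[W * WarpSize]; // lane_id == 0
//     // Phase 2 (after a barrier): threads 0..NumWarps-1 of warp 0 gather.
//     for (unsigned T = 0; T < NumWarps; ++T)
//       ChunkPerThread[T] = Medium[T];
//   }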
2047 
2048 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2049 /// in the same warp. It uses shuffle instructions to copy over data from
2050 /// a remote lane's stack. The reduction algorithm performed is specified
2051 /// by the fourth parameter.
2052 ///
2053 /// Algorithm Versions.
2054 /// Full Warp Reduce (argument value 0):
2055 /// This algorithm assumes that all 32 lanes are active and gathers
2056 /// data from these 32 lanes, producing a single resultant value.
2057 /// Contiguous Partial Warp Reduce (argument value 1):
2058 /// This algorithm assumes that only a *contiguous* subset of lanes
2059 /// are active. This happens for the last warp in a parallel region
2060 /// when the user-specified num_threads is not an integer multiple of
2061 /// 32. This contiguous subset always starts with the zeroth lane.
2062 /// Partial Warp Reduce (argument value 2):
2063 /// This algorithm gathers data from any number of lanes at any position.
2064 /// All reduced values are stored in the lowest possible lane. The set
2065 /// of problems every algorithm addresses is a superset of those
2066 /// addressable by algorithms with a lower version number. Overhead
2067 /// increases as algorithm version increases.
2068 ///
2069 /// Terminology
2070 /// Reduce element:
2071 /// Reduce element refers to the individual data field with primitive
2072 /// data types to be combined and reduced across threads.
2073 /// Reduce list:
2074 /// Reduce list refers to a collection of local, thread-private
2075 /// reduce elements.
2076 /// Remote Reduce list:
2077 /// Remote Reduce list refers to a collection of remote (relative to
2078 /// the current thread) reduce elements.
2079 ///
2080 /// We distinguish between three states of threads that are important to
2081 /// the implementation of this function.
2082 /// Alive threads:
2083 /// Threads in a warp executing the SIMT instruction, as distinguished from
2084 /// threads that are inactive due to divergent control flow.
2085 /// Active threads:
2086 /// The minimal set of threads that has to be alive upon entry to this
2087 /// function. The computation is correct iff active threads are alive.
2088 /// Some threads are alive but they are not active because they do not
2089 /// contribute to the computation in any useful manner. Turning them off
2090 /// may introduce control flow overheads without any tangible benefits.
2091 /// Effective threads:
2092 /// In order to comply with the argument requirements of the shuffle
2093 /// function, we must keep all lanes holding data alive. But at most
2094 /// half of them perform value aggregation; we refer to this half of
2095 /// threads as effective. The other half is simply handing off their
2096 /// data.
2097 ///
2098 /// Procedure
2099 /// Value shuffle:
2100 /// In this step active threads transfer data from higher lane positions
2101 /// in the warp to lower lane positions, creating Remote Reduce list.
2102 /// Value aggregation:
2103 /// In this step, effective threads combine their thread local Reduce list
2104 /// with Remote Reduce list and store the result in the thread local
2105 /// Reduce list.
2106 /// Value copy:
2107 /// In this step, we deal with the assumption made by algorithm 2
2108 /// (i.e. contiguity assumption). When we have an odd number of lanes
2109 /// active, say 2k+1, only k threads will be effective and therefore k
2110 /// new values will be produced. However, the Reduce list owned by the
2111 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2112 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2113 /// that the contiguity assumption still holds.
2114 static llvm::Function *emitShuffleAndReduceFunction(
2115  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2116  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2117  ASTContext &C = CGM.getContext();
2118 
2119  // Thread local Reduce list used to host the values of data to be reduced.
2120  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2121  C.VoidPtrTy, ImplicitParamDecl::Other);
2122  // Current lane id; could be logical.
2123  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2124  ImplicitParamDecl::Other);
2125  // Offset of the remote source lane relative to the current lane.
2126  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2127  C.ShortTy, ImplicitParamDecl::Other);
2128  // Algorithm version. This is expected to be known at compile time.
2129  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2130  C.ShortTy, ImplicitParamDecl::Other);
2131  FunctionArgList Args;
2132  Args.push_back(&ReduceListArg);
2133  Args.push_back(&LaneIDArg);
2134  Args.push_back(&RemoteLaneOffsetArg);
2135  Args.push_back(&AlgoVerArg);
2136 
2137  const CGFunctionInfo &CGFI =
2138  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2139  auto *Fn = llvm::Function::Create(
2141  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2142  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2143  Fn->setDoesNotRecurse();
2144 
2145  CodeGenFunction CGF(CGM);
2146  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2147 
2148  CGBuilderTy &Bld = CGF.Builder;
2149 
2150  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2151  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2152  Address LocalReduceList(
2153  Bld.CreatePointerBitCastOrAddrSpaceCast(
2154  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2155  C.VoidPtrTy, SourceLocation()),
2156  ElemTy->getPointerTo()),
2157  ElemTy, CGF.getPointerAlign());
2158 
2159  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2160  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2161  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2162 
2163  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2164  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2165  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2166 
2167  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2168  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2169  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2170 
2171  // Create a local thread-private variable to host the Reduce list
2172  // from a remote lane.
2173  Address RemoteReduceList =
2174  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2175 
2176  // This loop iterates through the list of reduce elements and copies,
2177  // element by element, from a remote lane in the warp to RemoteReduceList,
2178  // hosted on the thread's stack.
2179  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2180  LocalReduceList, RemoteReduceList,
2181  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2182  /*ScratchpadIndex=*/nullptr,
2183  /*ScratchpadWidth=*/nullptr});
2184 
2185  // The actions to be performed on the Remote Reduce list depend
2186  // on the algorithm version.
2187  //
2188  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2189  // LaneId % 2 == 0 && Offset > 0):
2190  // do the reduction value aggregation
2191  //
2192  // The thread local variable Reduce list is mutated in place to host the
2193  // reduced data, which is the aggregated value produced from local and
2194  // remote lanes.
2195  //
2196  // Note that AlgoVer is expected to be a constant integer known at compile
2197  // time.
2198  // When AlgoVer==0, the first conjunction evaluates to true, making
2199  // the entire predicate true during compile time.
2200  // When AlgoVer==1, the second conjunction has only the second part to be
2201  // evaluated during runtime. Other conjunctions evaluate to false
2202  // during compile time.
2203  // When AlgoVer==2, the third conjunction has only the second part to be
2204  // evaluated during runtime. Other conjunctions evaluate to false
2205  // during compile time.
2206  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2207 
2208  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2209  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2210  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2211 
2212  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2213  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2214  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2215  CondAlgo2 = Bld.CreateAnd(
2216  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2217 
2218  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2219  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2220 
2221  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2222  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2223  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2224  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2225 
2226  CGF.EmitBlock(ThenBB);
2227  // reduce_function(LocalReduceList, RemoteReduceList)
2228  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2229  LocalReduceList.getPointer(), CGF.VoidPtrTy);
2230  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2231  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2232  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2233  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2234  Bld.CreateBr(MergeBB);
2235 
2236  CGF.EmitBlock(ElseBB);
2237  Bld.CreateBr(MergeBB);
2238 
2239  CGF.EmitBlock(MergeBB);
2240 
2241  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2242  // Reduce list.
2243  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2244  llvm::Value *CondCopy = Bld.CreateAnd(
2245  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2246 
2247  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2248  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2249  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2250  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2251 
2252  CGF.EmitBlock(CpyThenBB);
2253  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2254  RemoteReduceList, LocalReduceList);
2255  Bld.CreateBr(CpyMergeBB);
2256 
2257  CGF.EmitBlock(CpyElseBB);
2258  Bld.CreateBr(CpyMergeBB);
2259 
2260  CGF.EmitBlock(CpyMergeBB);
2261 
2262  CGF.FinishFunction();
2263  return Fn;
2264 }
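
// The reduce and copy predicates emitted above, restated as a host-side
// sketch (hypothetical helper names; AlgoVer, LaneId and Offset as plain
// integers):
//
//   bool shouldReduce(int AlgoVer, int LaneId, int Offset) {
//     return AlgoVer == 0 ||                                  // full warp
//            (AlgoVer == 1 && LaneId < Offset) ||             // contiguous
//            (AlgoVer == 2 && LaneId % 2 == 0 && Offset > 0); // dispersed
//   }
//
//   bool shouldCopyRemote(int AlgoVer, int LaneId, int Offset) {
//     // Algorithm 1 only: upper lanes adopt the remote Reduce list so the
//     // contiguity assumption keeps holding.
//     return AlgoVer == 1 && LaneId >= Offset;
//   }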
2265 
2266 /// This function emits a helper that copies all the reduction variables from
2267 /// the team into the provided global buffer for the reduction variables.
2268 ///
2269 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2270 /// For all data entries D in reduce_data:
2271 /// Copy local D to buffer.D[Idx]
2272 static llvm::Value *emitListToGlobalCopyFunction(
2273  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2274  QualType ReductionArrayTy, SourceLocation Loc,
2275  const RecordDecl *TeamReductionRec,
2276  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2277  &VarFieldMap) {
2278  ASTContext &C = CGM.getContext();
2279 
2280  // Buffer: global reduction buffer.
2281  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2282  C.VoidPtrTy, ImplicitParamDecl::Other);
2283  // Idx: index of the buffer.
2284  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2285  ImplicitParamDecl::Other);
2286  // ReduceList: thread local Reduce list.
2287  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2288  C.VoidPtrTy, ImplicitParamDecl::Other);
2289  FunctionArgList Args;
2290  Args.push_back(&BufferArg);
2291  Args.push_back(&IdxArg);
2292  Args.push_back(&ReduceListArg);
2293 
2294  const CGFunctionInfo &CGFI =
2295  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2296  auto *Fn = llvm::Function::Create(
2298  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2299  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2300  Fn->setDoesNotRecurse();
2301  CodeGenFunction CGF(CGM);
2302  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2303 
2304  CGBuilderTy &Bld = CGF.Builder;
2305 
2306  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2307  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2308  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2309  Address LocalReduceList(
2310  Bld.CreatePointerBitCastOrAddrSpaceCast(
2311  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2312  C.VoidPtrTy, Loc),
2313  ElemTy->getPointerTo()),
2314  ElemTy, CGF.getPointerAlign());
2315  QualType StaticTy = C.getRecordType(TeamReductionRec);
2316  llvm::Type *LLVMReductionsBufferTy =
2317  CGM.getTypes().ConvertTypeForMem(StaticTy);
2318  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2319  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2320  LLVMReductionsBufferTy->getPointerTo());
2321  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2322  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2323  /*Volatile=*/false, C.IntTy,
2324  Loc)};
2325  unsigned Idx = 0;
2326  for (const Expr *Private : Privates) {
2327  // Reduce element = LocalReduceList[i]
2328  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2329  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2330  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2331  // elemptr = ((CopyType*)(elemptrptr)) + I
2332  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2333  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2334  ElemPtrPtr, ElemTy->getPointerTo());
2335  Address ElemPtr =
2336  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2337  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2338  // Global = Buffer.VD[Idx];
2339  const FieldDecl *FD = VarFieldMap.lookup(VD);
2340  LValue GlobLVal = CGF.EmitLValueForField(
2341  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2342  Address GlobAddr = GlobLVal.getAddress(CGF);
2343  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2344  GlobAddr.getPointer(), Idxs);
2345  GlobLVal.setAddress(Address(BufferPtr,
2346  CGF.ConvertTypeForMem(Private->getType()),
2347  GlobAddr.getAlignment()));
2348  switch (CGF.getEvaluationKind(Private->getType())) {
2349  case TEK_Scalar: {
2350  llvm::Value *V = CGF.EmitLoadOfScalar(
2351  ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2352  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2353  CGF.EmitStoreOfScalar(V, GlobLVal);
2354  break;
2355  }
2356  case TEK_Complex: {
2357  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2358  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2359  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2360  break;
2361  }
2362  case TEK_Aggregate:
2363  CGF.EmitAggregateCopy(GlobLVal,
2364  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2365  Private->getType(), AggValueSlot::DoesNotOverlap);
2366  break;
2367  }
2368  ++Idx;
2369  }
2370 
2371  CGF.FinishFunction();
2372  return Fn;
2373 }
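
// The global buffer addressed above is a struct-of-arrays: one field per
// reduction variable, indexed by team. A sketch for the foo/bar example in
// the design comment further below (ReductionsBuffer and NumTeamsMax are
// hypothetical names):
//
//   constexpr unsigned NumTeamsMax = 1024; // assumed capacity
//   struct ReductionsBuffer {
//     float foo[NumTeamsMax];  // buffer.foo[Idx] holds team Idx's foo
//     double bar[NumTeamsMax]; // buffer.bar[Idx] holds team Idx's bar
//   };
//   // list_to_global_copy: buffer.foo[Idx] = *reduce_data.foo; likewise bar.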
2374 
2375 /// This function emits a helper that reduces all the reduction variables from
2376 /// the team into the provided global buffer for the reduction variables.
2377 ///
2378 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2379 /// void *GlobPtrs[];
2380 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2381 /// ...
2382 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2383 /// reduce_function(GlobPtrs, reduce_data);
2385  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2386  QualType ReductionArrayTy, SourceLocation Loc,
2387  const RecordDecl *TeamReductionRec,
2388  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2389  &VarFieldMap,
2390  llvm::Function *ReduceFn) {
2391  ASTContext &C = CGM.getContext();
2392 
2393  // Buffer: global reduction buffer.
2394  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2395  C.VoidPtrTy, ImplicitParamDecl::Other);
2396  // Idx: index of the buffer.
2397  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2398  ImplicitParamDecl::Other);
2399  // ReduceList: thread local Reduce list.
2400  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2401  C.VoidPtrTy, ImplicitParamDecl::Other);
2402  FunctionArgList Args;
2403  Args.push_back(&BufferArg);
2404  Args.push_back(&IdxArg);
2405  Args.push_back(&ReduceListArg);
2406 
2407  const CGFunctionInfo &CGFI =
2408  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2409  auto *Fn = llvm::Function::Create(
2411  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2412  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2413  Fn->setDoesNotRecurse();
2414  CodeGenFunction CGF(CGM);
2415  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2416 
2417  CGBuilderTy &Bld = CGF.Builder;
2418 
2419  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2420  QualType StaticTy = C.getRecordType(TeamReductionRec);
2421  llvm::Type *LLVMReductionsBufferTy =
2422  CGM.getTypes().ConvertTypeForMem(StaticTy);
2423  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2424  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2425  LLVMReductionsBufferTy->getPointerTo());
2426 
2427  // 1. Build a list of reduction variables.
2428  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2429  Address ReductionList =
2430  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2431  auto IPriv = Privates.begin();
2432  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2433  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2434  /*Volatile=*/false, C.IntTy,
2435  Loc)};
2436  unsigned Idx = 0;
2437  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2438  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2439  // Global = Buffer.VD[Idx];
2440  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2441  const FieldDecl *FD = VarFieldMap.lookup(VD);
2442  LValue GlobLVal = CGF.EmitLValueForField(
2443  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2444  Address GlobAddr = GlobLVal.getAddress(CGF);
2445  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2446  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2447  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2448  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2449  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2450  // Store array size.
2451  ++Idx;
2452  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2453  llvm::Value *Size = CGF.Builder.CreateIntCast(
2454  CGF.getVLASize(
2455  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2456  .NumElts,
2457  CGF.SizeTy, /*isSigned=*/false);
2458  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2459  Elem);
2460  }
2461  }
2462 
2463  // Call reduce_function(GlobalReduceList, ReduceList)
2464  llvm::Value *GlobalReduceList =
2465  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2466  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2467  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2468  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2469  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2470  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2471  CGF.FinishFunction();
2472  return Fn;
2473 }
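
// A sketch of the GlobPtrs list this helper materializes before calling
// reduce_function (reusing the hypothetical ReductionsBuffer above):
//
//   void listToGlobalReduce(ReductionsBuffer *Buf, int Idx, void *ReduceData,
//                           void (*ReduceFn)(void *, void *)) {
//     void *GlobPtrs[2] = {&Buf->foo[Idx], &Buf->bar[Idx]};
//     ReduceFn(GlobPtrs, ReduceData); // accumulate the team's data into Buf
//   }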
2474 
2475 /// This function emits a helper that copies all the reduction variables from
2476 /// the provided global buffer back into the team's thread-local Reduce list.
2477 ///
2478 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2479 /// For all data entries D in reduce_data:
2480 /// Copy buffer.D[Idx] to local D;
2481 static llvm::Value *emitGlobalToListCopyFunction(
2482  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2483  QualType ReductionArrayTy, SourceLocation Loc,
2484  const RecordDecl *TeamReductionRec,
2485  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2486  &VarFieldMap) {
2487  ASTContext &C = CGM.getContext();
2488 
2489  // Buffer: global reduction buffer.
2490  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2491  C.VoidPtrTy, ImplicitParamDecl::Other);
2492  // Idx: index of the buffer.
2493  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2494  ImplicitParamDecl::Other);
2495  // ReduceList: thread local Reduce list.
2496  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2497  C.VoidPtrTy, ImplicitParamDecl::Other);
2498  FunctionArgList Args;
2499  Args.push_back(&BufferArg);
2500  Args.push_back(&IdxArg);
2501  Args.push_back(&ReduceListArg);
2502 
2503  const CGFunctionInfo &CGFI =
2504  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2505  auto *Fn = llvm::Function::Create(
2507  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2508  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2509  Fn->setDoesNotRecurse();
2510  CodeGenFunction CGF(CGM);
2511  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2512 
2513  CGBuilderTy &Bld = CGF.Builder;
2514 
2515  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2516  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2517  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2518  Address LocalReduceList(
2519  Bld.CreatePointerBitCastOrAddrSpaceCast(
2520  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2521  C.VoidPtrTy, Loc),
2522  ElemTy->getPointerTo()),
2523  ElemTy, CGF.getPointerAlign());
2524  QualType StaticTy = C.getRecordType(TeamReductionRec);
2525  llvm::Type *LLVMReductionsBufferTy =
2526  CGM.getTypes().ConvertTypeForMem(StaticTy);
2527  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2528  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2529  LLVMReductionsBufferTy->getPointerTo());
2530 
2531  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2532  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2533  /*Volatile=*/false, C.IntTy,
2534  Loc)};
2535  unsigned Idx = 0;
2536  for (const Expr *Private : Privates) {
2537  // Reduce element = LocalReduceList[i]
2538  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2539  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2540  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2541  // elemptr = ((CopyType*)(elemptrptr)) + I
2542  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2543  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2544  ElemPtrPtr, ElemTy->getPointerTo());
2545  Address ElemPtr =
2546  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2547  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2548  // Global = Buffer.VD[Idx];
2549  const FieldDecl *FD = VarFieldMap.lookup(VD);
2550  LValue GlobLVal = CGF.EmitLValueForField(
2551  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2552  Address GlobAddr = GlobLVal.getAddress(CGF);
2553  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2554  GlobAddr.getPointer(), Idxs);
2555  GlobLVal.setAddress(Address(BufferPtr,
2556  CGF.ConvertTypeForMem(Private->getType()),
2557  GlobAddr.getAlignment()));
2558  switch (CGF.getEvaluationKind(Private->getType())) {
2559  case TEK_Scalar: {
2560  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2561  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2562  LValueBaseInfo(AlignmentSource::Type),
2563  TBAAAccessInfo());
2564  break;
2565  }
2566  case TEK_Complex: {
2567  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2568  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2569  /*isInit=*/false);
2570  break;
2571  }
2572  case TEK_Aggregate:
2573  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2574  GlobLVal, Private->getType(),
2575  AggValueSlot::DoesNotOverlap);
2576  break;
2577  }
2578  ++Idx;
2579  }
2580 
2581  CGF.FinishFunction();
2582  return Fn;
2583 }
2584 
2585 /// This function emits a helper that reduces the reduction variables in the
2586 /// provided global buffer into the team's thread-local Reduce list.
2587 ///
2588 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2589 /// void *GlobPtrs[];
2590 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2591 /// ...
2592 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2593 /// reduce_function(reduce_data, GlobPtrs);
2595  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2596  QualType ReductionArrayTy, SourceLocation Loc,
2597  const RecordDecl *TeamReductionRec,
2598  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2599  &VarFieldMap,
2600  llvm::Function *ReduceFn) {
2601  ASTContext &C = CGM.getContext();
2602 
2603  // Buffer: global reduction buffer.
2604  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2605  C.VoidPtrTy, ImplicitParamDecl::Other);
2606  // Idx: index of the buffer.
2607  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2608  ImplicitParamDecl::Other);
2609  // ReduceList: thread local Reduce list.
2610  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2611  C.VoidPtrTy, ImplicitParamDecl::Other);
2612  FunctionArgList Args;
2613  Args.push_back(&BufferArg);
2614  Args.push_back(&IdxArg);
2615  Args.push_back(&ReduceListArg);
2616 
2617  const CGFunctionInfo &CGFI =
2618  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2619  auto *Fn = llvm::Function::Create(
2621  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2622  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2623  Fn->setDoesNotRecurse();
2624  CodeGenFunction CGF(CGM);
2625  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2626 
2627  CGBuilderTy &Bld = CGF.Builder;
2628 
2629  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2630  QualType StaticTy = C.getRecordType(TeamReductionRec);
2631  llvm::Type *LLVMReductionsBufferTy =
2632  CGM.getTypes().ConvertTypeForMem(StaticTy);
2633  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2634  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2635  LLVMReductionsBufferTy->getPointerTo());
2636 
2637  // 1. Build a list of reduction variables.
2638  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2639  Address ReductionList =
2640  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2641  auto IPriv = Privates.begin();
2642  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2643  CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2644  /*Volatile=*/false, C.IntTy,
2645  Loc)};
2646  unsigned Idx = 0;
2647  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2648  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2649  // Global = Buffer.VD[Idx];
2650  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2651  const FieldDecl *FD = VarFieldMap.lookup(VD);
2652  LValue GlobLVal = CGF.EmitLValueForField(
2653  CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2654  Address GlobAddr = GlobLVal.getAddress(CGF);
2655  llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2656  GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2657  llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2658  CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2659  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2660  // Store array size.
2661  ++Idx;
2662  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2663  llvm::Value *Size = CGF.Builder.CreateIntCast(
2664  CGF.getVLASize(
2665  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2666  .NumElts,
2667  CGF.SizeTy, /*isSigned=*/false);
2668  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2669  Elem);
2670  }
2671  }
2672 
2673  // Call reduce_function(ReduceList, GlobalReduceList)
2674  llvm::Value *GlobalReduceList =
2675  CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2676  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2677  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2678  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2679  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2680  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2681  CGF.FinishFunction();
2682  return Fn;
2683 }
2684 
2685 ///
2686 /// Design of OpenMP reductions on the GPU
2687 ///
2688 /// Consider a typical OpenMP program with one or more reduction
2689 /// clauses:
2690 ///
2691 /// float foo;
2692 /// double bar;
2693 /// #pragma omp target teams distribute parallel for \
2694 /// reduction(+:foo) reduction(*:bar)
2695 /// for (int i = 0; i < N; i++) {
2696 /// foo += A[i]; bar *= B[i];
2697 /// }
2698 ///
2699 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2700 /// all teams. In our OpenMP implementation on the NVPTX device an
2701 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2702 /// within a team are mapped to CUDA threads within a threadblock.
2703 /// Our goal is to efficiently aggregate values across all OpenMP
2704 /// threads such that:
2705 ///
2706 /// - the compiler and runtime are logically concise, and
2707 /// - the reduction is performed efficiently in a hierarchical
2708 /// manner as follows: within OpenMP threads in the same warp,
2709 /// across warps in a threadblock, and finally across teams on
2710 /// the NVPTX device.
2711 ///
2712 /// Introduction to Decoupling
2713 ///
2714 /// We would like to decouple the compiler and the runtime so that the
2715 /// latter is ignorant of the reduction variables (number, data types)
2716 /// and the reduction operators. This allows a simpler interface
2717 /// and implementation while still attaining good performance.
2718 ///
2719 /// Pseudocode for the aforementioned OpenMP program generated by the
2720 /// compiler is as follows:
2721 ///
2722 /// 1. Create private copies of reduction variables on each OpenMP
2723 /// thread: 'foo_private', 'bar_private'
2724 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2725 /// to it and writes the result in 'foo_private' and 'bar_private'
2726 /// respectively.
2727 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2728 /// and store the result on the team master:
2729 ///
2730 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2731 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2732 ///
2733 /// where:
2734 /// struct ReduceData {
2735 /// double *foo;
2736 /// double *bar;
2737 /// } reduceData
2738 /// reduceData.foo = &foo_private
2739 /// reduceData.bar = &bar_private
2740 ///
2741 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2742 /// auxiliary functions generated by the compiler that operate on
2743 /// variables of type 'ReduceData'. They help the runtime perform
2744 /// algorithmic steps in a data-agnostic manner.
2745 ///
2746 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2747 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2748 /// same warp. It takes the following arguments as input:
2749 ///
2750 /// a. variable of type 'ReduceData' on the calling lane,
2751 /// b. its lane_id,
2752 /// c. an offset relative to the current lane_id to generate a
2753 /// remote_lane_id. The remote lane contains the second
2754 /// variable of type 'ReduceData' that is to be reduced.
2755 /// d. an algorithm version parameter determining which reduction
2756 /// algorithm to use.
2757 ///
2758 /// 'shuffleReduceFn' retrieves data from the remote lane using
2759 /// efficient GPU shuffle intrinsics and reduces, using the
2760 /// algorithm specified by the 4th parameter, the two operands
2761 /// element-wise. The result is written to the first operand.
2762 ///
2763 /// Different reduction algorithms are implemented in different
2764 /// runtime functions, all calling 'shuffleReduceFn' to perform
2765 /// the essential reduction step. Therefore, based on the 4th
2766 /// parameter, this function behaves slightly differently to
2767 /// cooperate with the runtime to ensure correctness under
2768 /// different circumstances.
2769 ///
2770 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2771 /// reduced variables across warps. It tunnels, through CUDA
2772 /// shared memory, the thread-private data of type 'ReduceData'
2773 /// from lane 0 of each warp to a lane in the first warp.
2774 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2775 /// The last team writes the global reduced value to memory.
2776 ///
2777 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2778 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2779 /// scratchpadCopyFn, loadAndReduceFn)
2780 ///
2781 /// 'scratchpadCopyFn' is a helper that stores reduced
2782 /// data from the team master to a scratchpad array in
2783 /// global memory.
2784 ///
2785 /// 'loadAndReduceFn' is a helper that loads data from
2786 /// the scratchpad array and reduces it with the input
2787 /// operand.
2788 ///
2789 /// These compiler generated functions hide address
2790 /// calculation and alignment information from the runtime.
2791 /// 5. if ret == 1:
2792 /// The team master of the last team stores the reduced
2793 /// result to the globals in memory.
2794 /// foo += reduceData.foo; bar *= reduceData.bar
2795 ///
2796 ///
2797 /// Warp Reduction Algorithms
2798 ///
2799 /// On the warp level, we have three algorithms implemented in the
2800 /// OpenMP runtime depending on the number of active lanes:
2801 ///
2802 /// Full Warp Reduction
2803 ///
2804 /// The reduce algorithm within a warp where all lanes are active
2805 /// is implemented in the runtime as follows:
2806 ///
2807 /// full_warp_reduce(void *reduce_data,
2808 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2809 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2810 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2811 /// }
2812 ///
2813 /// The algorithm completes in log(2, WARPSIZE) steps.
2814 ///
2815 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2816 /// not used; we therefore save instructions by not retrieving lane_id
2817 /// from the corresponding special registers. The 4th parameter, which
2818 /// represents the version of the algorithm being used, is set to 0 to
2819 /// signify full warp reduction.
2820 ///
2821 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2822 ///
2823 /// #reduce_elem refers to an element in the local lane's data structure
2824 /// #remote_elem is retrieved from a remote lane
2825 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2826 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
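///
/// As a worked example (illustrative numbers, not emitted code): with
/// WARPSIZE == 32 and a '+' reduction where lane i starts with value i,
/// offset takes the values 16, 8, 4, 2, 1. After the offset == 16 step
/// lane 0 holds 0+16; after all five steps it holds the full sum
/// 0+1+...+31 == 496.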
2827 ///
2828 /// Contiguous Partial Warp Reduction
2829 ///
2830 /// This reduce algorithm is used within a warp where only the first
2831 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2832 /// number of OpenMP threads in a parallel region is not a multiple of
2833 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2834 ///
2835 /// void
2836 /// contiguous_partial_reduce(void *reduce_data,
2837 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2838 /// int size, int lane_id) {
2839 /// int curr_size;
2840 /// int offset;
2841 /// curr_size = size;
2842 /// offset = curr_size/2;
2843 /// while (offset>0) {
2844 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2845 /// curr_size = (curr_size+1)/2;
2846 /// offset = curr_size/2;
2847 /// }
2848 /// }
2849 ///
2850 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2851 ///
2852 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2853 /// if (lane_id < offset)
2854 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2855 /// else
2856 /// reduce_elem = remote_elem
2857 ///
2858 /// This algorithm assumes that the data to be reduced are located in a
2859 /// contiguous subset of lanes starting from the first. When there is
2860 /// an odd number of active lanes, the data in the last lane is not
2861 /// aggregated with any other lane's data but is instead copied over.
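///
/// As a worked example (illustrative only): with n == 5 active lanes,
/// curr_size goes 5 -> 3 -> 2 -> 1 while offset goes 2 -> 1 -> 1 -> 0; in
/// the first step lanes 0 and 1 aggregate the data of lanes 2 and 3, while
/// lane 4's data is copied down to lane 2 to preserve contiguity.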
2862 ///
2863 /// Dispersed Partial Warp Reduction
2864 ///
2865 /// This algorithm is used within a warp when any discontiguous subset of
2866 /// lanes are active. It is used to implement the reduction operation
2867 /// across lanes in an OpenMP simd region or in a nested parallel region.
2868 ///
2869 /// void
2870 /// dispersed_partial_reduce(void *reduce_data,
2871 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2872 /// int size, remote_id;
2873 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2874 /// do {
2875 /// remote_id = next_active_lane_id_right_after_me();
2876 /// # the above function returns 0 if no active lane
2877 /// # is present right after the current lane.
2878 /// size = number_of_active_lanes_in_this_warp();
2879 /// logical_lane_id /= 2;
2880 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2881 /// remote_id-1-threadIdx.x, 2);
2882 /// } while (logical_lane_id % 2 == 0 && size > 1);
2883 /// }
2884 ///
2885 /// There is no assumption made about the initial state of the reduction.
2886 /// Any number of lanes (>=1) could be active at any position. The reduction
2887 /// result is returned in the first active lane.
2888 ///
2889 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2890 ///
2891 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2892 /// if (lane_id % 2 == 0 && offset > 0)
2893 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2894 /// else
2895 /// reduce_elem = remote_elem
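///
/// As an informal illustration, suppose only lanes 3, 9 and 22 of a warp
/// are active (logical lane ids 0, 1 and 2). In the first round lane 3
/// combines lane 9's value while lane 9 merely forwards lane 22's value;
/// in the second round lane 3 combines the value lane 9 now holds, so the
/// reduction of all three values ends up in lane 3, the first active lane.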
2896 ///
2897 ///
2898 /// Intra-Team Reduction
2899 ///
2900 /// This function, as implemented in the runtime call
2901 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2902 /// threads in a team. It first reduces within a warp using the
2903 /// aforementioned algorithms, and then gathers the warp-reduced
2904 /// values at the first warp.
2905 ///
2906 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2907 /// data from each "warp master" (the zeroth lane of each warp, where the
2908 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2909 /// a mathematical sense) the problem of reduction across warp masters in
2910 /// a block to the problem of warp reduction.
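///
/// Schematically (a sketch of the phases, not the runtime's exact code):
///
///   reduce_within_warp(reduce_data);      // one of the three cases above
///   if (lane_id == 0)
///     InterWarpCpyFn(reduce_data, ...);   // publish to the zeroth warp
///   barrier();
///   if (warp_id == 0)
///     reduce_within_warp(reduce_data);    // fold the per-warp results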
2911 ///
2912 ///
2913 /// Inter-Team Reduction
2914 ///
2915 /// Once a team has reduced its data to a single value, it is stored in
2916 /// a global scratchpad array. Since each team has a distinct slot, this
2917 /// can be done without locking.
2918 ///
2919 /// The last team to write to the scratchpad array proceeds to reduce the
2920 /// scratchpad array. One or more workers in the last team use the helper
2921 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2922 /// the k'th worker reduces every k'th element.
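///
/// For example (illustrative numbers only), with 4 reducing workers and 16
/// scratchpad slots, worker 0 would reduce slots 0, 4, 8 and 12, worker 1
/// slots 1, 5, 9 and 13, and so on.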
2923 ///
2924 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2925 /// reduce across workers and compute a globally reduced value.
2926 ///
2927 void CGOpenMPRuntimeGPU::emitReduction(
2928     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2929     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2930     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2931  if (!CGF.HaveInsertPoint())
2932  return;
2933 
2934  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2935 #ifndef NDEBUG
2936  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2937 #endif
2938 
2939  if (Options.SimpleReduction) {
2940  assert(!TeamsReduction && !ParallelReduction &&
2941  "Invalid reduction selection in emitReduction.");
2942  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2943  ReductionOps, Options);
2944  return;
2945  }
2946 
2947  assert((TeamsReduction || ParallelReduction) &&
2948  "Invalid reduction selection in emitReduction.");
2949 
2950  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2951  // RedList, shuffle_reduce_func, interwarp_copy_func);
2952  // or
2953  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2954  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2955  llvm::Value *ThreadId = getThreadID(CGF, Loc);
2956 
2957  llvm::Value *Res;
2958  ASTContext &C = CGM.getContext();
2959  // 1. Build a list of reduction variables.
2960  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2961  auto Size = RHSExprs.size();
2962  for (const Expr *E : Privates) {
2963  if (E->getType()->isVariablyModifiedType())
2964  // Reserve place for array size.
2965  ++Size;
2966  }
2967  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2968  QualType ReductionArrayTy =
2969  C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2970  /*IndexTypeQuals=*/0);
2971  Address ReductionList =
2972  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2973  auto IPriv = Privates.begin();
2974  unsigned Idx = 0;
2975  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2976  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2977     CGF.Builder.CreateStore(
2978         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2979             CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2980         Elem);
2981  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2982  // Store array size.
2983  ++Idx;
2984  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2985  llvm::Value *Size = CGF.Builder.CreateIntCast(
2986  CGF.getVLASize(
2987  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2988  .NumElts,
2989  CGF.SizeTy, /*isSigned=*/false);
2990  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2991  Elem);
2992  }
2993  }
2994 
2995  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2996  ReductionList.getPointer(), CGF.VoidPtrTy);
2997  llvm::Function *ReductionFn =
2998  emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2999  Privates, LHSExprs, RHSExprs, ReductionOps);
3000  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3001  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3002  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3003  llvm::Value *InterWarpCopyFn =
3004  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3005 
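  // For the parallel case the runtime entry point receives the reduction
  // descriptor (location, thread id, number of variables, byte size of the
  // reduce list, and the list itself) plus the two helpers that drive the
  // intra-warp and inter-warp phases described above.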
3006  if (ParallelReduction) {
3007  llvm::Value *Args[] = {RTLoc,
3008  ThreadId,
3009  CGF.Builder.getInt32(RHSExprs.size()),
3010  ReductionArrayTySize,
3011  RL,
3012  ShuffleAndReduceFn,
3013  InterWarpCopyFn};
3014 
3015  Res = CGF.EmitRuntimeCall(
3016  OMPBuilder.getOrCreateRuntimeFunction(
3017  CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3018  Args);
3019  } else {
3020  assert(TeamsReduction && "expected teams reduction.");
3021  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3022  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3023  int Cnt = 0;
3024  for (const Expr *DRE : Privates) {
3025  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3026  ++Cnt;
3027  }
3028  const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3029  CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3030  C.getLangOpts().OpenMPCUDAReductionBufNum);
3031  TeamsReductions.push_back(TeamReductionRec);
3032  if (!KernelTeamsReductionPtr) {
3033  KernelTeamsReductionPtr = new llvm::GlobalVariable(
3034         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3035         llvm::GlobalValue::InternalLinkage, nullptr,
3036         "_openmp_teams_reductions_buffer_$_$ptr");
3037  }
3038  llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3039  Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3040  /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3041  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3042  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3043  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3044  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3045  ReductionFn);
3046  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3047  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3048  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3049  CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3050  ReductionFn);
3051 
3052  llvm::Value *Args[] = {
3053  RTLoc,
3054  ThreadId,
3055  GlobalBufferPtr,
3056  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3057  RL,
3058  ShuffleAndReduceFn,
3059  InterWarpCopyFn,
3060  GlobalToBufferCpyFn,
3061  GlobalToBufferRedFn,
3062  BufferToGlobalCpyFn,
3063  BufferToGlobalRedFn};
3064 
3065  Res = CGF.EmitRuntimeCall(
3066  OMPBuilder.getOrCreateRuntimeFunction(
3067  CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3068  Args);
3069  }
3070 
3071  // 5. Build if (res == 1)
3072  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3073  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3074  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3075  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3076  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3077 
3078  // 6. Build then branch: where we have reduced values in the master
3079  // thread in each team.
3080  // __kmpc_end_reduce{_nowait}(<gtid>);
3081  // break;
3082  CGF.EmitBlock(ThenBB);
3083 
3084  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3085  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3086  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3087  auto IPriv = Privates.begin();
3088  auto ILHS = LHSExprs.begin();
3089  auto IRHS = RHSExprs.begin();
3090  for (const Expr *E : ReductionOps) {
3091  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3092  cast<DeclRefExpr>(*IRHS));
3093  ++IPriv;
3094  ++ILHS;
3095  ++IRHS;
3096  }
3097  };
3098  llvm::Value *EndArgs[] = {ThreadId};
3099  RegionCodeGenTy RCG(CodeGen);
3100  NVPTXActionTy Action(
3101  nullptr, llvm::None,
3102  OMPBuilder.getOrCreateRuntimeFunction(
3103  CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3104  EndArgs);
3105  RCG.setAction(Action);
3106  RCG(CGF);
3107   // There is no need to emit line number for unconditional branch.
3108   (void)ApplyDebugLocation::CreateEmpty(CGF);
3109   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3110 }
3111 
3112 const VarDecl *
3113 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3114                                        const VarDecl *NativeParam) const {
3115  if (!NativeParam->getType()->isReferenceType())
3116  return NativeParam;
3117  QualType ArgType = NativeParam->getType();
3118  QualifierCollector QC;
3119  const Type *NonQualTy = QC.strip(ArgType);
3120  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3121  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3122  if (Attr->getCaptureKind() == OMPC_map) {
3123       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3124                                                         LangAS::opencl_global);
3125     }
3126  }
3127  ArgType = CGM.getContext().getPointerType(PointeeTy);
3128  QC.addRestrict();
3129  enum { NVPTX_local_addr = 5 };
3130  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3131  ArgType = QC.apply(CGM.getContext(), ArgType);
3132   if (isa<ImplicitParamDecl>(NativeParam))
3133     return ImplicitParamDecl::Create(
3134         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3135         NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3136  return ParmVarDecl::Create(
3137  CGM.getContext(),
3138  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3139  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3140  NativeParam->getIdentifier(), ArgType,
3141  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3142 }
3143 
3144 Address
3145 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3146                                         const VarDecl *NativeParam,
3147                                         const VarDecl *TargetParam) const {
3148  assert(NativeParam != TargetParam &&
3149  NativeParam->getType()->isReferenceType() &&
3150  "Native arg must not be the same as target arg.");
3151  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3152  QualType NativeParamType = NativeParam->getType();
3153  QualifierCollector QC;
3154  const Type *NonQualTy = QC.strip(NativeParamType);
3155  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3156  unsigned NativePointeeAddrSpace =
3157  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3158  QualType TargetTy = TargetParam->getType();
3159  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3160  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3161   // First cast to generic.
3162   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3163       TargetAddr, llvm::PointerType::getWithSamePointeeType(
3164           cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3165   // Cast from generic to native address space.
3166   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3167       TargetAddr, llvm::PointerType::getWithSamePointeeType(
3168           cast<llvm::PointerType>(TargetAddr->getType()),
3169           NativePointeeAddrSpace));
3170  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3171  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3172  NativeParamType);
3173  return NativeParamAddr;
3174 }
3175 
3176 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3177     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3178     ArrayRef<llvm::Value *> Args) const {
3179  SmallVector<llvm::Value *, 4> TargetArgs;
3180  TargetArgs.reserve(Args.size());
3181  auto *FnType = OutlinedFn.getFunctionType();
3182  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3183  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3184  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3185  break;
3186  }
3187  llvm::Type *TargetType = FnType->getParamType(I);
3188  llvm::Value *NativeArg = Args[I];
3189  if (!TargetType->isPointerTy()) {
3190  TargetArgs.emplace_back(NativeArg);
3191  continue;
3192  }
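    // Pointer arguments may live in a target-specific address space while
    // the outlined function expects another; hop through the generic
    // address space (AS 0) first, then cast to the exact parameter type.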
3193  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3194  NativeArg, llvm::PointerType::getWithSamePointeeType(
3195  cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3196  TargetArgs.emplace_back(
3197  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3198  }
3199  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3200 }
3201 
3202 /// Emit function which wraps the outline parallel region
3203 /// and controls the arguments which are passed to this function.
3204 /// The wrapper ensures that the outlined function is called
3205 /// with the correct arguments when data is shared.
3206 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3207  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3208  ASTContext &Ctx = CGM.getContext();
3209  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3210 
3211  // Create a function that takes as argument the source thread.
3212  FunctionArgList WrapperArgs;
3213  QualType Int16QTy =
3214  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3215  QualType Int32QTy =
3216  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3217  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3218                                      /*Id=*/nullptr, Int16QTy,
3219                                      ImplicitParamDecl::Other);
3220   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3221                                /*Id=*/nullptr, Int32QTy,
3222                                ImplicitParamDecl::Other);
3223  WrapperArgs.emplace_back(&ParallelLevelArg);
3224  WrapperArgs.emplace_back(&WrapperArg);
3225 
3226   const CGFunctionInfo &CGFI =
3227       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3228 
3229   auto *Fn = llvm::Function::Create(
3230       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3231       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3232 
3233  // Ensure we do not inline the function. This is trivially true for the ones
3234 // passed to __kmpc_fork_call but the ones called in serialized regions
3235 // could be inlined. This is not perfect but it is closer to the invariant
3236  // we want, namely, every data environment starts with a new function.
3237  // TODO: We should pass the if condition to the runtime function and do the
3238  // handling there. Much cleaner code.
3239  Fn->addFnAttr(llvm::Attribute::NoInline);
3240 
3241   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3242   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3243  Fn->setDoesNotRecurse();
3244 
3245  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3246  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3247  D.getBeginLoc(), D.getBeginLoc());
3248 
3249  const auto *RD = CS.getCapturedRecordDecl();
3250  auto CurField = RD->field_begin();
3251 
3252  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3253  /*Name=*/".zero.addr");
3254  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3255   // Get the array of arguments.
3256   SmallVector<llvm::Value *, 8> Args;
3257 
3258  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3259  Args.emplace_back(ZeroAddr.getPointer());
3260 
3261  CGBuilderTy &Bld = CGF.Builder;
3262  auto CI = CS.capture_begin();
3263 
3264  // Use global memory for data sharing.
3265  // Handle passing of global args to workers.
3266  Address GlobalArgs =
3267  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3268  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3269  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
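  // __kmpc_get_shared_variables fills 'global_args' with the address of the
  // argument array that the parallel region's caller published for its
  // workers.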
3270  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3271  CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3272  DataSharingArgs);
3273 
3274  // Retrieve the shared variables from the list of references returned
3275  // by the runtime. Pass the variables to the outlined function.
3276  Address SharedArgListAddress = Address::invalid();
3277   if (CS.capture_size() > 0 ||
3278       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3279     SharedArgListAddress = CGF.EmitLoadOfPointer(
3280         GlobalArgs, CGF.getContext()
3281                         .getPointerType(CGF.getContext().VoidPtrTy)
3282                         .castAs<PointerType>());
3283  }
3284   unsigned Idx = 0;
3285   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3286     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3287  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3288  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3289  llvm::Value *LB = CGF.EmitLoadOfScalar(
3290  TypedAddress,
3291       /*Volatile=*/false,
3292       CGF.getContext().getSizeType(),
3293       cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3294  Args.emplace_back(LB);
3295  ++Idx;
3296  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3297  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3298  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3299  llvm::Value *UB = CGF.EmitLoadOfScalar(
3300  TypedAddress,
3301       /*Volatile=*/false,
3302       CGF.getContext().getSizeType(),
3303       cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3304  Args.emplace_back(UB);
3305  ++Idx;
3306  }
3307  if (CS.capture_size() > 0) {
3308  ASTContext &CGFContext = CGF.getContext();
3309  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3310  QualType ElemTy = CurField->getType();
3311  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3312  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3313  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3314  CGF.ConvertTypeForMem(ElemTy));
3315  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3316  /*Volatile=*/false,
3317  CGFContext.getPointerType(ElemTy),
3318  CI->getLocation());
3319  if (CI->capturesVariableByCopy() &&
3320  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3321  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3322  CI->getLocation());
3323  }
3324  Args.emplace_back(Arg);
3325  }
3326  }
3327 
3328  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3329  CGF.FinishFunction();
3330  return Fn;
3331 }
3332 
3333 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3334                                             const Decl *D) {
3335   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3336     return;
3337 
3338  assert(D && "Expected function or captured|block decl.");
3339  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3340  "Function is registered already.");
3341  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3342  "Team is set but not processed.");
3343  const Stmt *Body = nullptr;
3344  bool NeedToDelayGlobalization = false;
3345  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3346  Body = FD->getBody();
3347  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3348  Body = BD->getBody();
3349  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3350  Body = CD->getBody();
3351  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3352  if (NeedToDelayGlobalization &&
3353  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3354  return;
3355  }
3356  if (!Body)
3357  return;
3358  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3359  VarChecker.Visit(Body);
3360  const RecordDecl *GlobalizedVarsRecord =
3361  VarChecker.getGlobalizedRecord(IsInTTDRegion);
3362  TeamAndReductions.first = nullptr;
3363  TeamAndReductions.second.clear();
3364  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3365  VarChecker.getEscapedVariableLengthDecls();
3366  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3367  return;
3368  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3369  I->getSecond().MappedParams =
3370  std::make_unique<CodeGenFunction::OMPMapVars>();
3371  I->getSecond().EscapedParameters.insert(
3372  VarChecker.getEscapedParameters().begin(),
3373  VarChecker.getEscapedParameters().end());
3374  I->getSecond().EscapedVariableLengthDecls.append(
3375  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3376  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3377  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3378  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3379  Data.insert(std::make_pair(VD, MappedVarData()));
3380  }
3381  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3382  CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3383  VarChecker.Visit(Body);
3384  I->getSecond().SecondaryLocalVarData.emplace();
3385  DeclToAddrMapTy &Data = *I->getSecond().SecondaryLocalVarData;
3386  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3387  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3388  Data.insert(std::make_pair(VD, MappedVarData()));
3389  }
3390  }
3391  if (!NeedToDelayGlobalization) {
3392  emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3393  struct GlobalizationScope final : EHScopeStack::Cleanup {
3394  GlobalizationScope() = default;
3395 
3396  void Emit(CodeGenFunction &CGF, Flags flags) override {
3397  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3398  .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3399  }
3400  };
3401  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3402  }
3403 }
3404 
3405 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3406                                                       const VarDecl *VD) {
3407  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3408  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3409  auto AS = LangAS::Default;
3410  switch (A->getAllocatorType()) {
3411  // Use the default allocator here as by default local vars are
3412  // threadlocal.
3413  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3414  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3415  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3416  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3417  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3418  // Follow the user decision - use default allocation.
3419  return Address::invalid();
3420  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3421     // TODO: implement support for user-defined allocators.
3422  return Address::invalid();
3423  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3424  AS = LangAS::cuda_constant;
3425  break;
3426  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3427  AS = LangAS::cuda_shared;
3428  break;
3429  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3430  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3431  break;
3432  }
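    // For the constant and shared allocators the variable is emitted as a
    // module-level global in the matching address space and then cast back
    // to the variable's declared address space below.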
3433  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3434  auto *GV = new llvm::GlobalVariable(
3435  CGM.getModule(), VarTy, /*isConstant=*/false,
3436  llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3437  VD->getName(),
3438         /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3439         CGM.getContext().getTargetAddressSpace(AS));
3440     CharUnits Align = CGM.getContext().getDeclAlign(VD);
3441  GV->setAlignment(Align.getAsAlign());
3442     return Address(
3443         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3444             GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3445                     VD->getType().getAddressSpace()))),
3446         VarTy, Align);
3446  VarTy, Align);
3447  }
3448 
3449   if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3450     return Address::invalid();
3451 
3452  VD = VD->getCanonicalDecl();
3453  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3454  if (I == FunctionGlobalizedDecls.end())
3455  return Address::invalid();
3456  auto VDI = I->getSecond().LocalVarData.find(VD);
3457  if (VDI != I->getSecond().LocalVarData.end())
3458  return VDI->second.PrivateAddr;
3459   if (VD->hasAttrs()) {
3460     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3461                                                       E(VD->attr_end());
3462          IT != E; ++IT) {
3463  auto VDI = I->getSecond().LocalVarData.find(
3464  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3465  ->getCanonicalDecl());
3466  if (VDI != I->getSecond().LocalVarData.end())
3467  return VDI->second.PrivateAddr;
3468  }
3469  }
3470 
3471  return Address::invalid();
3472 }
3473 
3474 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3475   FunctionGlobalizedDecls.erase(CGF.CurFn);
3476   CGOpenMPRuntime::functionFinished(CGF);
3477 }
3478 
3479 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3480     CodeGenFunction &CGF, const OMPLoopDirective &S,
3481  OpenMPDistScheduleClauseKind &ScheduleKind,
3482  llvm::Value *&Chunk) const {
3483  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3484  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3485  ScheduleKind = OMPC_DIST_SCHEDULE_static;
3486  Chunk = CGF.EmitScalarConversion(
3487  RT.getGPUNumThreads(CGF),
3488  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3489  S.getIterationVariable()->getType(), S.getBeginLoc());
3490  return;
3491   }
3492   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3493       CGF, S, ScheduleKind, Chunk);
3494 }
3495 
3496 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3497     CodeGenFunction &CGF, const OMPLoopDirective &S,
3498  OpenMPScheduleClauseKind &ScheduleKind,
3499  const Expr *&ChunkExpr) const {
3500  ScheduleKind = OMPC_SCHEDULE_static;
3501  // Chunk size is 1 in this case.
3502  llvm::APInt ChunkSize(32, 1);
3503  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3504  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3505  SourceLocation());
3506 }
3507 
3508 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3509     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3510   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3511          " Expected target-based directive.");
3512  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3513  for (const CapturedStmt::Capture &C : CS->captures()) {
3514  // Capture variables captured by reference in lambdas for target-based
3515  // directives.
3516  if (!C.capturesVariable())
3517  continue;
3518  const VarDecl *VD = C.getCapturedVar();
3519  const auto *RD = VD->getType()
3520                          .getCanonicalType()
3521                          .getNonReferenceType()
3522                          ->getAsCXXRecordDecl();
3523  if (!RD || !RD->isLambda())
3524  continue;
3525  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3526  LValue VDLVal;
3527  if (VD->getType().getCanonicalType()->isReferenceType())
3528  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3529  else
3530  VDLVal = CGF.MakeAddrLValue(
3531  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3532  llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3533  FieldDecl *ThisCapture = nullptr;
3534  RD->getCaptureFields(Captures, ThisCapture);
3535  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3536  LValue ThisLVal =
3537  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3538  llvm::Value *CXXThis = CGF.LoadCXXThis();
3539  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3540  }
3541  for (const LambdaCapture &LC : RD->captures()) {
3542  if (LC.getCaptureKind() != LCK_ByRef)
3543  continue;
3544  const ValueDecl *VD = LC.getCapturedVar();
3545  // FIXME: For now VD is always a VarDecl because OpenMP does not support
3546  // capturing structured bindings in lambdas yet.
3547  if (!CS->capturesVariable(cast<VarDecl>(VD)))
3548  continue;
3549  auto It = Captures.find(VD);
3550  assert(It != Captures.end() && "Found lambda capture without field.");
3551  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3552  Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3553  if (VD->getType().getCanonicalType()->isReferenceType())
3554  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3555  VD->getType().getCanonicalType())
3556  .getAddress(CGF);
3557  CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3558  }
3559  }
3560 }
3561 
3562 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3563                                                           LangAS &AS) {
3564  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3565  return false;
3566  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3567   switch (A->getAllocatorType()) {
3568  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3569  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3570  // Not supported, fallback to the default mem space.
3571  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3572  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3573  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3574  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3575  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3576  AS = LangAS::Default;
3577  return true;
3578  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3579  AS = LangAS::cuda_constant;
3580  return true;
3581  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3582  AS = LangAS::cuda_shared;
3583  return true;
3584  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3585  llvm_unreachable("Expected predefined allocator for the variables with the "
3586  "static storage.");
3587  }
3588  return false;
3589 }
3590 
3591 // Get current CudaArch and ignore any unknown values
3592 static CudaArch getCudaArch(CodeGenModule &CGM) {
3593   if (!CGM.getTarget().hasFeature("ptx"))
3594  return CudaArch::UNKNOWN;
3595  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3596  if (Feature.getValue()) {
3597  CudaArch Arch = StringToCudaArch(Feature.getKey());
3598  if (Arch != CudaArch::UNKNOWN)
3599  return Arch;
3600  }
3601  }
3602  return CudaArch::UNKNOWN;
3603 }
3604 
3605 /// Check to see if target architecture supports unified addressing which is
3606 /// a restriction for OpenMP requires clause "unified_shared_memory".
3607 void CGOpenMPRuntimeGPU::processRequiresDirective(
3608     const OMPRequiresDecl *D) {
3609  for (const OMPClause *Clause : D->clauselists()) {
3610  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3611  CudaArch Arch = getCudaArch(CGM);
3612  switch (Arch) {
3613  case CudaArch::SM_20:
3614  case CudaArch::SM_21:
3615  case CudaArch::SM_30:
3616  case CudaArch::SM_32:
3617  case CudaArch::SM_35:
3618  case CudaArch::SM_37:
3619  case CudaArch::SM_50:
3620  case CudaArch::SM_52:
3621  case CudaArch::SM_53: {
3622  SmallString<256> Buffer;
3623  llvm::raw_svector_ostream Out(Buffer);
3624  Out << "Target architecture " << CudaArchToString(Arch)
3625  << " does not support unified addressing";
3626  CGM.Error(Clause->getBeginLoc(), Out.str());
3627  return;
3628  }
3629  case CudaArch::SM_60:
3630  case CudaArch::SM_61:
3631  case CudaArch::SM_62:
3632  case CudaArch::SM_70:
3633  case CudaArch::SM_72:
3634  case CudaArch::SM_75:
3635  case CudaArch::SM_80:
3636  case CudaArch::SM_86:
3637  case CudaArch::GFX600:
3638  case CudaArch::GFX601:
3639  case CudaArch::GFX602:
3640  case CudaArch::GFX700:
3641  case CudaArch::GFX701:
3642  case CudaArch::GFX702:
3643  case CudaArch::GFX703:
3644  case CudaArch::GFX704:
3645  case CudaArch::GFX705:
3646  case CudaArch::GFX801:
3647  case CudaArch::GFX802:
3648  case CudaArch::GFX803:
3649  case CudaArch::GFX805:
3650  case CudaArch::GFX810:
3651  case CudaArch::GFX900:
3652  case CudaArch::GFX902:
3653  case CudaArch::GFX904:
3654  case CudaArch::GFX906:
3655  case CudaArch::GFX908:
3656  case CudaArch::GFX909:
3657  case CudaArch::GFX90a:
3658  case CudaArch::GFX90c:
3659  case CudaArch::GFX940:
3660  case CudaArch::GFX1010:
3661  case CudaArch::GFX1011:
3662  case CudaArch::GFX1012:
3663  case CudaArch::GFX1013:
3664  case CudaArch::GFX1030:
3665  case CudaArch::GFX1031:
3666  case CudaArch::GFX1032:
3667  case CudaArch::GFX1033:
3668  case CudaArch::GFX1034:
3669  case CudaArch::GFX1035:
3670  case CudaArch::GFX1036:
3671  case CudaArch::GFX1100:
3672  case CudaArch::GFX1101:
3673  case CudaArch::GFX1102:
3674  case CudaArch::GFX1103:
3675  case CudaArch::Generic:
3676  case CudaArch::UNUSED:
3677  case CudaArch::UNKNOWN:
3678  break;
3679  case CudaArch::LAST:
3680  llvm_unreachable("Unexpected Cuda arch.");
3681  }
3682  }
3683   }
3684   CGOpenMPRuntime::processRequiresDirective(D);
3685 }
3686 
3687 void CGOpenMPRuntimeGPU::clear() {
3688 
3689  if (!TeamsReductions.empty()) {
3690  ASTContext &C = CGM.getContext();
3691  RecordDecl *StaticRD = C.buildImplicitRecord(
3692  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3693  StaticRD->startDefinition();
3694  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3695  QualType RecTy = C.getRecordType(TeamReductionRec);
3696  auto *Field = FieldDecl::Create(
3697  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3698  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3699  /*BW=*/nullptr, /*Mutable=*/false,
3700  /*InitStyle=*/ICIS_NoInit);
3701  Field->setAccess(AS_public);
3702  StaticRD->addDecl(Field);
3703  }
3704  StaticRD->completeDefinition();
3705  QualType StaticTy = C.getRecordType(StaticRD);
3706  llvm::Type *LLVMReductionsBufferTy =
3707  CGM.getTypes().ConvertTypeForMem(StaticTy);
3708  // FIXME: nvlink does not handle weak linkage correctly (object with the
3709  // different size are reported as erroneous).
3710  // Restore CommonLinkage as soon as nvlink is fixed.
3711  auto *GV = new llvm::GlobalVariable(
3712  CGM.getModule(), LLVMReductionsBufferTy,
3713  /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3714  llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3715  "_openmp_teams_reductions_buffer_$_");
3716  KernelTeamsReductionPtr->setInitializer(
3717  llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3718  CGM.VoidPtrTy));
3719  }
3721 }
3722 
3723 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3724   CGBuilderTy &Bld = CGF.Builder;
3725  llvm::Module *M = &CGF.CGM.getModule();
3726  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3727  llvm::Function *F = M->getFunction(LocSize);
3728  if (!F) {
3729     F = llvm::Function::Create(
3730         llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
3731         llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3732   }
3733  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
3734 }
3735 
3736 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3737   ArrayRef<llvm::Value *> Args{};
3738  return CGF.EmitRuntimeCall(
3739  OMPBuilder.getOrCreateRuntimeFunction(
3740  CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3741  Args);
3742 }
3743 
3744 llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3745   ArrayRef<llvm::Value *> Args{};
3746  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3747  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3748  Args);
3749 }
clang::QualifierCollector::strip
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:6622
clang::CodeGen::CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU
CGOpenMPRuntimeGPU(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:917
clang::CudaArch::SM_35
@ SM_35
clang::InternalLinkage
@ InternalLinkage
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:31
clang::CudaArch::GFX805
@ GFX805
clang::LangAS::cuda_shared
@ cuda_shared
clang::CodeGen::CGOpenMPRuntimeGPU::emitOutlinedFunctionCall
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntimeGPU.cpp:3176
clang::CodeGen::CodeGenFunction::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T)
Definition: CodeGenFunction.cpp:209
clang::ASTContext::getTypeSizeInChars
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
Definition: ASTContext.cpp:2472
supportsSPMDExecutionMode
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Definition: CGOpenMPRuntimeGPU.cpp:653
clang::CudaArch::SM_70
@ SM_70
clang::CudaArch::GFX704
@ GFX704
clang::prec::Conditional
@ Conditional
Definition: OperatorPrecedence.h:30
clang::CodeGen::CodeGenTypeCache::SizeTy
llvm::IntegerType * SizeTy
Definition: CodeGenTypeCache.h:50
max
__DEVICE__ int max(int __a, int __b)
Definition: __clang_cuda_math.h:196
clang::CodeGen::CodeGenFunction::getTypeSize
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
Definition: CGStmtOpenMP.cpp:305
clang::LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
clang::CodeGen::LValueBaseInfo
Definition: CGValue.h:154
clang::OMPLastprivateClause
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:2728
clang::CodeGen::CodeGenFunction::EmitScalarConversion
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
Definition: CGExprScalar.cpp:5055
clang::CodeGen::CodeGenTypeCache::Int8PtrTy
llvm::PointerType * Int8PtrTy
Definition: CodeGenTypeCache.h:57
clang::Decl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:424
clang::CodeGen::CodeGenFunction::LoadCXXThis
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
Definition: CodeGenFunction.h:2802
clang::CharUnits::getAsAlign
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:183
clang::CudaArch::GFX1035
@ GFX1035
clang::CodeGen::CGOpenMPRuntime::getThreadID
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
Definition: CGOpenMPRuntime.cpp:1394
clang::interp::APInt
llvm::APInt APInt
Definition: Integral.h:27
setPropertyExecutionMode
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
Definition: CGOpenMPRuntimeGPU.cpp:831
clang::CodeGen::CodeGenTypeCache::IntTy
llvm::IntegerType * IntTy
int
Definition: CodeGenTypeCache.h:42
clang::CodeGen::TEK_Aggregate
@ TEK_Aggregate
Definition: CodeGenFunction.h:109
clang::CodeGen::CGOpenMPRuntime
Definition: CGOpenMPRuntime.h:228
clang::CudaArch::GFX601
@ GFX601
clang::DeclaratorDecl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:783
clang::Expr::isLValue
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:270
clang::CodeGen::CGOpenMPRuntime::emitSingleReductionCombiner
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Definition: CGOpenMPRuntime.cpp:5411
clang::Qualifiers::addRestrict
void addRestrict()
Definition: Type.h:287
CodeGenFunction.h
clang::DeclContext
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1389
clang::CapturedStmt::captures
capture_range captures()
Definition: Stmt.h:3667
clang::CodeGen::CGOpenMPRuntimeGPU::EM_SPMD
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
Definition: CGOpenMPRuntimeGPU.h:29
clang::LambdaCapture
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
clang::OMPExecutableDirective::getClausesOfKind
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:445
clang::Decl::hasAttr
bool hasAttr() const
Definition: DeclBase.h:560
emitShuffleAndReduceFunction
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
Definition: CGOpenMPRuntimeGPU.cpp:2114
clang::CudaArch::GFX908
@ GFX908
clang::CodeGen::CGOpenMPRuntimeGPU::emitBarrierCall
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntimeGPU.cpp:1334
emitListToGlobalCopyFunction
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2272
clang::VarDecl::getCanonicalDecl
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2166
clang::QualifierCollector::apply
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3791
clang::CodeGen::CGOpenMPRuntime::emitParallelOutlinedFunction
virtual llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
Definition: CGOpenMPRuntime.cpp:1269
clang::CodeGen::Address::getAlignment
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:120
clang::CodeGen::CGOpenMPRuntime::emitProcBindClause
virtual void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntime.cpp:2899
clang::isOpenMPParallelDirective
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
Definition: OpenMPKinds.cpp:533
clang::CodeGen::CGOpenMPRuntime::clearLocThreadIdInsertPt
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
Definition: CGOpenMPRuntime.cpp:1348
clang::ASTContext::VoidTy
CanQualType VoidTy
Definition: ASTContext.h:1107
clang::TargetInfo::getNewAlign
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is g...
Definition: TargetInfo.h:684
llvm::SmallVector
Definition: LLVM.h:38
CopyOptionsTy
Definition: CGOpenMPRuntimeGPU.cpp:1584
clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:86
CopyOptionsTy::ScratchpadWidth
llvm::Value * ScratchpadWidth
Definition: CGOpenMPRuntimeGPU.cpp:1587
clang::CodeGen::CodeGenModule::SetInternalFunctionAttributes
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
Definition: CodeGenModule.cpp:2264
clang::VarDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1491
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUWarpSize
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Definition: CGOpenMPRuntimeGPU.cpp:3744
clang::CodeGen::LValue::getAddress
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:341
emitReductionListCopy
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
Definition: CGOpenMPRuntimeGPU.cpp:1592
clang::CodeGen::CodeGenFunction::EmitLoadOfComplex
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
Definition: CGExprComplex.cpp:1316
clang::CudaArch::GFX906
@ GFX906
clang::QualType::getNonReferenceType
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6884
clang::CastExpr::getSubExpr
Expr * getSubExpr()
Definition: Expr.h:3529
clang::CodeGen::CGOpenMPRuntime::setLocThreadIdInsertPt
void setLocThreadIdInsertPt(CodeGenFunction &CGF, bool AtCurrentPoint=false)
Definition: CGOpenMPRuntime.cpp:1332
clang::CharUnits::QuantityType
int64_t QuantityType
Definition: CharUnits.h:40
clang::CodeGen::CodeGenTypeCache::getPointerAlign
CharUnits getPointerAlign() const
Definition: CodeGenTypeCache.h:117
clang::QualType
A (possibly-)qualified type.
Definition: Type.h:737
clang::CudaArch::GFX802
@ GFX802
clang::CudaArch::GFX940
@ GFX940
clang::ASTContext::getTargetAddressSpace
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.cpp:12189
Attr.h
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelCall
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
Definition: CGOpenMPRuntimeGPU.cpp:1239
AttributeLangSupport::C
@ C
Definition: SemaDeclAttr.cpp:55
clang::CudaArch::SM_53
@ SM_53
clang::CodeGen::CodeGenModule::getContext
ASTContext & getContext() const
Definition: CodeGenModule.h:722
clang::QualType::getCanonicalType
QualType getCanonicalType() const
Definition: Type.h:6727
clang::FieldDecl
Represents a member of a struct/union/class.
Definition: Decl.h:2903
clang::CodeGen::CGOpenMPRuntime::OMPBuilder
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
Definition: CGOpenMPRuntime.h:312
clang::CodeGen::CGOpenMPRuntime::emitCriticalRegion
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
Definition: CGOpenMPRuntime.cpp:2199
clang::CodeGen::CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
Definition: CGOpenMPRuntimeGPU.cpp:3508
clang::CudaArch::GFX1034
@ GFX1034
clang::CodeGen::CodeGenFunction::createBasicBlock
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
Definition: CodeGenFunction.h:2428
clang::CudaArch::GFX1102
@ GFX1102
clang::CodeGen::CGOpenMPRuntimeGPU::Generic
@ Generic
Generic data-sharing mode.
Definition: CGOpenMPRuntimeGPU.h:349
clang::CodeGen::CGOpenMPRuntimeGPU::functionFinished
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntimeGPU.cpp:3474
DeclOpenMP.h
clang::CodeGen::CGBuilderTy::CreateStore
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:99
clang::CodeGen::LValue::setAddress
void setAddress(Address address)
Definition: CGValue.h:344
clang::StringToCudaArch
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:157
createRuntimeShuffleFunction
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
Definition: CGOpenMPRuntimeGPU.cpp:1451
clang::CodeGen::CodeGenTypes::arrangeBuiltinFunctionDeclaration
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:668
clang::TargetInfo::hasFeature
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1378
llvm::SmallPtrSet
Definition: ASTContext.h:82
clang::UnaryOperator
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2171
clang::CodeGen::CodeGenModule::getLangOpts
const LangOptions & getLangOpts() const
Definition: CodeGenModule.h:723
clang::OMPReductionClause
This represents clause 'reduction' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:3039
clang::CodeGen::CodeGenFunction::EmitLoadOfPointer
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
Definition: CGExpr.cpp:2530
clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
Definition: CGOpenMPRuntime.cpp:2117
clang::CudaArch::GFX1010
@ GFX1010
clang::CodeGen::CodeGenFunction::HaveInsertPoint
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
Definition: CodeGenFunction.h:2469
clang::CodeGen::CodeGenFunction::EmitLValueForField
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4329
clang::Decl::getAttr
T * getAttr() const
Definition: DeclBase.h:556
clang::CodeGen::CodeGenFunction::EmitStoreOfScalar
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3850
clang::CudaArch::SM_60
@ SM_60
clang::CodeGen::CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
Definition: CGOpenMPRuntimeGPU.cpp:3562
clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelOutlinedFunction
llvm::Function * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Definition: CGOpenMPRuntimeGPU.cpp:959
clang::CudaArch::GFX1100
@ GFX1100
getTeamsReductionVars
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
Definition: CGOpenMPRuntimeGPU.cpp:1026
clang::CodeGen::CGOpenMPRuntime::emitBarrierCall
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntime.cpp:2536
clang::CodeGen::CGBuilderTy
Definition: CGBuilder.h:45
clang::ASTContext::toBits
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
Definition: ASTContext.cpp:2466
clang::AttributeCommonInfo::AS_GNU
@ AS_GNU
attribute((...))
Definition: AttributeCommonInfo.h:27
clang::CodeGen::CGOpenMPRuntimeGPU::CUDA
@ CUDA
CUDA data sharing mode.
Definition: CGOpenMPRuntimeGPU.h:347
clang::ArrayType::Normal
@ Normal
Definition: Type.h:3016
clang::CodeGen::CGBuilderTy::CreateConstGEP
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:249
emitGlobalToListReduceFunction
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2594
clang::CodeGen::CodeGenFunction::Builder
CGBuilderTy Builder
Definition: CodeGenFunction.h:269
clang::Type
The base class of the type hierarchy.
Definition: Type.h:1565
clang::FieldDecl::Create
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4258
clang::CodeGen::CodeGenTypeCache::VoidPtrTy
llvm::PointerType * VoidPtrTy
Definition: CodeGenTypeCache.h:56
clang::CodeGen::CGOpenMPRuntimeGPU
Definition: CGOpenMPRuntimeGPU.h:24
clang::Decl::attr_end
attr_iterator attr_end() const
Definition: DeclBase.h:526
clang::Type::hasSignedIntegerRepresentation
bool hasSignedIntegerRepresentation() const
Determine whether this type has an signed integer representation of some sort, e.g....
Definition: Type.cpp:2058
clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Definition: CodeGenFunction.h:2718
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumThreadsClause
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Definition: CGOpenMPRuntimeGPU.cpp:948
Offset
unsigned Offset
Definition: Format.cpp:2590
clang::CodeGen::CodeGenModule::getOpenMPRuntime
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
Definition: CodeGenModule.h:630
clang::CudaArch::GFX1011
@ GFX1011
clang::GlobalDecl
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
uint64_t
unsigned long uint64_t
Definition: hlsl_basic_types.h:24
clang::CodeGen::AlignmentSource::Decl
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
clang::BlockDecl::captures
ArrayRef< Capture > captures() const
Definition: Decl.h:4381
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::isCXXThisExprCaptured
bool isCXXThisExprCaptured() const
Definition: CodeGenFunction.h:441
clang::CodeGen::CodeGenFunction::EmitLoadOfScalar
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:3828
clang::CodeGen::CodeGenFunction::ComplexPairTy
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
Definition: CodeGenFunction.h:267
clang::CodeGen::CodeGenTypes::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Definition: CodeGenTypes.cpp:91
clang::CudaArch::SM_20
@ SM_20
clang::isOpenMPDistributeDirective
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
Definition: OpenMPKinds.cpp:608
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUNumThreads
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3723
clang::CodeGen::CodeGenFunction::CreateDefaultAlignTempAlloca
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:123
clang::CapturedStmt::capturesVariable
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1423
clang::ASTContext::getUIntPtrType
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Definition: ASTContext.cpp:5930
clang::Type::isReferenceType
bool isReferenceType() const
Definition: Type.h:6948
clang::CapturedStmt::getCapturedStmt
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3633
clang::CallExpr::getCallee
Expr * getCallee()
Definition: Expr.h:2959
V
#define V(N, I)
Definition: ASTContext.h:3233
clang::CodeGen::CGOpenMPRuntime::getOMPBuilder
llvm::OpenMPIRBuilder & getOMPBuilder()
Definition: CGOpenMPRuntime.h:305
clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:232
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3496
clang::CodeGen::TBAAAccessInfo
Definition: CodeGenTBAA.h:41
clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:176
getDistributeLastprivateVars
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ...
Definition: CGOpenMPRuntimeGPU.cpp:1001
clang::CodeGen::CGOpenMPRuntimeGPU::DataSharingMode
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Definition: CGOpenMPRuntimeGPU.h:345
clang::CudaArch::SM_86
@ SM_86
clang::CudaArch::GFX702
@ GFX702
clang::CodeGen::EHScopeStack::Cleanup
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
StmtOpenMP.h
getDataSharingMode
static CGOpenMPRuntimeGPU::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:545
clang::QualType::getAddressSpace
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:6809
clang::OMPClause
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
clang::CodeGen::CGOpenMPRuntimeGPU::emitReduction
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit code for the reduction clause.
Definition: CGOpenMPRuntimeGPU.cpp:2927
clang::LambdaExpr
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1865
clang::OMPLoopDirective
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1002
clang::CudaArch::SM_72
@ SM_72
clang::CharUnits::fromQuantity
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
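A small round-trip sketch using only CharUnits members listed on this page; the 8-byte value is arbitrary:

#include "clang/AST/CharUnits.h"

// Illustrative only: build an 8-byte quantity and read it back.
bool charUnitsRoundTrip() {
  clang::CharUnits Eight = clang::CharUnits::fromQuantity(8);
  return !Eight.isZero() && Eight.getQuantity() == 8;
}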
clang::CodeGen::CodeGenFunction::EmitCastToVoidPtr
llvm::Value * EmitCastToVoidPtr(llvm::Value *value)
Emit a cast to void* in the appropriate address space.
Definition: CGExpr.cpp:54
clang::AS_public
@ AS_public
Definition: Specifiers.h:112
clang::BlockExpr::getBlockDecl
const BlockDecl * getBlockDecl() const
Definition: Expr.h:5986
clang::CodeGen::CGOpenMPRuntimeGPU::emitNumTeamsClause
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to ...
Definition: CGOpenMPRuntimeGPU.cpp:954
clang::OMPRequiresDecl::clauselists
clauselist_range clauselists()
Definition: DeclOpenMP.h:441
clang::CodeGen::Address::getType
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:99
clang::getOpenMPCaptureRegions
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
Definition: OpenMPKinds.cpp:653
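A hedged sketch of querying the capture regions of a combined directive; the choice of 'target teams' is only an example:

#include "clang/Basic/OpenMPKinds.h"
#include "llvm/ADT/SmallVector.h"

// Illustrative only: a combined directive expands into one
// OpenMPDirectiveKind entry per nested capture region.
void listCaptureRegions() {
  llvm::SmallVector<clang::OpenMPDirectiveKind, 4> CaptureRegions;
  clang::getOpenMPCaptureRegions(CaptureRegions,
                                 llvm::omp::Directive::OMPD_target_teams);
}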
clang::ASTContext
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:209
clang::ASTContext::getSizeType
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
Definition: ASTContext.cpp:5892
clang::CodeGen::CodeGenFunction::OMPPrivateScope
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Definition: CodeGenFunction.h:1066
clang::CudaArch::GFX90c
@ GFX90c
clang::CodeGen::CGOpenMPRuntimeGPU::ExecutionMode
ExecutionMode
Defines the execution mode.
Definition: CGOpenMPRuntimeGPU.h:27
clang::CodeGen::CodeGenFunction::EmitRuntimeCall
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
clang::CodeGen::TEK_Complex
@ TEK_Complex
Definition: CodeGenFunction.h:108
clang::CodeGen::CodeGenTypeCache::Int16Ty
llvm::IntegerType * Int16Ty
Definition: CodeGenTypeCache.h:37
clang::CudaArch::GFX1012
@ GFX1012
clang::CodeGen::CodeGenFunction::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenFunction.h:2026
clang::CudaArch::GFX1103
@ GFX1103
clang::CodeGen::CodeGenFunction::getDebugInfo
CGDebugInfo * getDebugInfo()
Definition: CodeGenFunction.h:1985
clang::ASTContext::getAsVariableArrayType
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2723
clang::CudaArch::GFX1033
@ GFX1033
clang::CudaArch::GFX1031
@ GFX1031
clang::CudaArch::GFX90a
@ GFX90a
clang::CudaArch::SM_21
@ SM_21
clang::Decl::getCanonicalDecl
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:943
clang::OMPExecutableDirective::getInnermostCapturedStmt
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:542
clang::Stmt::children
child_range children()
Definition: Stmt.cpp:285
clang::IntegerLiteral::Create
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:971
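A sketch of this factory, assuming a caller-provided ASTContext; the value 1 and the type 'int' are arbitrary choices:

#include "clang/AST/Expr.h"

// Illustrative only: materialize the literal '1' of type 'int'.
clang::IntegerLiteral *makeIntOne(clang::ASTContext &Ctx) {
  llvm::APInt One(Ctx.getIntWidth(Ctx.IntTy), 1);
  return clang::IntegerLiteral::Create(Ctx, One, Ctx.IntTy,
                                       clang::SourceLocation());
}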
clang::CodeGen::CodeGenModule::getModule
llvm::Module & getModule() const
Definition: CodeGenModule.h:732
clang::Type::isLValueReferenceType
bool isLValueReferenceType() const
Definition: Type.h:6952
clang::index::SymbolKind::Field
@ Field
clang::DeclRefExpr::getDecl
ValueDecl * getDecl()
Definition: Expr.h:1303
clang::ImplicitParamDecl
Definition: Decl.h:1618
clang::CodeGen::CGOpenMPRuntime::getDefaultFlagsForBarriers
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barrier is going to ...
Definition: CGOpenMPRuntime.cpp:2503
clang::CR_OpenMP
@ CR_OpenMP
Definition: CapturedStmt.h:19
clang::CudaArch::GFX1032
@ GFX1032
clang::CodeGen::LValue::getType
QualType getType() const
Definition: CGValue.h:278
llvm::SmallString
Definition: LLVM.h:37
clang::getLangASFromTargetAS
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:80
clang::ASTContext::getIntTypeForBitwidth
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - Returns the integer QualType for the specified details: bitwidth,...
Definition: ASTContext.cpp:11917
clang::VarDecl
Represents a variable declaration or definition.
Definition: Decl.h:879
hasNestedSPMDDirective
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Definition: CGOpenMPRuntimeGPU.cpp:551
clang::Type::getPointeeType
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:625
clang::CodeGen::CodeGenModule::getTypes
CodeGenTypes & getTypes()
Definition: CodeGenModule.h:749
clang::CudaArch::GFX904
@ GFX904
clang::CapturedStmt
This captures a statement into a function.
Definition: Stmt.h:3532
clang::Type::getAsCXXRecordDecl
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1759
clang::TTK_Union
@ TTK_Union
The "union" keyword.
Definition: Type.h:5570
clang::CodeGen::CodeGenFunction::getVLASize
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
Definition: CodeGenFunction.cpp:2163
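A sketch pairing getVLASize with getAsVariableArrayType (listed elsewhere on this page); CGF and T are assumed inputs:

// Illustrative only: number of elements if T is a variable-length array.
llvm::Value *vlaNumElements(clang::CodeGen::CodeGenFunction &CGF,
                            clang::QualType T) {
  if (const auto *VAT = CGF.getContext().getAsVariableArrayType(T))
    return CGF.getVLASize(VAT).NumElts;
  return nullptr;
}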
clang::Type::isVariablyModifiedType
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2316
clang::CapturedStmt::Capture
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3545
clang::CudaArch::SM_52
@ SM_52
clang::CodeGen::CGOpenMPRuntime::getDefaultDistScheduleAndChunk
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
Definition: CGOpenMPRuntime.h:1780
clang::CodeGen::CGOpenMPRuntime::getIdentTyPointerTy
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
Definition: CGOpenMPRuntime.cpp:1484
clang::CodeGen::CGOpenMPRuntime::ReductionOptionsTy
Definition: CGOpenMPRuntime.h:1428
clang::CodeGen::CodeGenFunction::getContext
ASTContext & getContext() const
Definition: CodeGenFunction.h:1984
clang::QualifierCollector
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:6615
clang::CodeGen::CGOpenMPRuntime::getSingleCompoundChild
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
Definition: CGOpenMPRuntime.cpp:6515
clang::CodeGen::CodeGenFunction::FinishFunction
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
Definition: CodeGenFunction.cpp:326
clang::CodeGen::Address
An aligned address.
Definition: Address.h:74
castValueToType
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
Definition: CGOpenMPRuntimeGPU.cpp:1421
Base
clang::CodeGen::CGBuilderTy::CreateGEP
Address CreateGEP(Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:264
clang::CodeGen::CodeGenFunction::getEvaluationKind
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
Definition: CodeGenFunction.cpp:217
clang::isOpenMPPrivate
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
Definition: OpenMPKinds.cpp:625
clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
Definition: CGOpenMPRuntime.cpp:1370
clang::CodeGen::Address::getAddressSpace
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:110
clang::CastExpr::getCastKind
CastKind getCastKind() const
Definition: Expr.h:3523
clang::CudaArch::SM_61
@ SM_61
clang::isOpenMPTeamsDirective
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
Definition: OpenMPKinds.cpp:576
clang::CodeGen::CodeGenModule::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenModule.h:737
clang::CudaArch::GFX803
@ GFX803
getNVPTXWarpID
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Definition: CGOpenMPRuntimeGPU.cpp:518
clang::CodeGen::CodeGenTypeCache::Int32Ty
llvm::IntegerType * Int32Ty
Definition: CodeGenTypeCache.h:37
clang::CodeGen::CGOpenMPRuntimeGPU::translateParameter
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of the outlined function if this is required for the target.
Definition: CGOpenMPRuntimeGPU.cpp:3113
clang::LangAS
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
clang::CodeGen::CodeGenTypes::GetFunctionType
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for the given function info.
Definition: CGCall.cpp:1615
clang::Type::castAs
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:7498
clang::CodeGen::RegionCodeGenTy
Class provides a way to call a simple version of codegen for an OpenMP region, or an advanced one with possibl...
Definition: CGOpenMPRuntime.h:68
clang::CodeGen::CodeGenFunction::StartFunction
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
Definition: CodeGenFunction.cpp:688
clang::CudaArch::LAST
@ LAST
clang::CodeGen::CodeGenFunction::VlaSizePair::NumElts
llvm::Value * NumElts
Definition: CodeGenFunction.h:2782
clang::CodeGen::LValue
LValue - This represents an lvalue reference.
Definition: CGValue.h:171
clang::CodeGen::CodeGenFunction::EHStack
EHScopeStack EHStack
Definition: CodeGenFunction.h:619
clang::TargetOptions::FeatureMap
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
clang::dependency_directives_scan::DirectiveKind
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner ...
Definition: DependencyDirectivesScanner.h:59
clang::TargetInfo::getTargetOpts
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:282
clang::CodeGen::CodeGenFunction::MakeNaturalAlignAddrLValue
LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T)
Definition: CodeGenFunction.cpp:188
clang::OMPExecutableDirective::getDirectiveKind
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:555
clang::LambdaExpr::isInitCapture
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1213
clang::CodeGen::CGOpenMPRuntimeGPU::processRequiresDirective
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform a check on the requires decl to ensure that the target architecture supports unified addressing.
Definition: CGOpenMPRuntimeGPU.cpp:3607
clang::TagDecl::startDefinition
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4407
clang::ASTContext::VoidPtrTy
CanQualType VoidPtrTy
Definition: ASTContext.h:1134
clang::Decl::hasAttrs
bool hasAttrs() const
Definition: DeclBase.h:502
clang::CudaArch::SM_80
@ SM_80
clang::syntax::NodeRole::Size
@ Size
clang::CodeGen::Address::getPointer
llvm::Value * getPointer() const
Definition: Address.h:93
clang::CodeGen::CodeGenFunction
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
Definition: CodeGenFunction.h:231
clang::Stmt::IgnoreContainers
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top,...
Definition: Stmt.cpp:195
clang::ASTContext::getDeclAlign
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
Definition: ASTContext.cpp:1737
clang::CudaArch::GFX909
@ GFX909
clang::CudaArch::SM_30
@ SM_30
clang::LCK_ByRef
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
clang::NamedDecl::getIdentifier
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:268
clang::ValueDecl
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:674
clang::RecordDecl::completeDefinition
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:4729
clang::CodeGen::CGOpenMPRuntime::processRequiresDirective
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform a check on the requires decl to ensure that the target architecture supports unified addressing.
Definition: CGOpenMPRuntime.cpp:10878
clang::CharUnits::isZero
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
clang::DeclContext::addDecl
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1606
clang::CodeGen::CodeGenModule
This class organizes the cross-function state that is used while generating LLVM code.
Definition: CodeGenModule.h:279
clang::LambdaExpr::captures
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1226
clang::BlockExpr
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:5974
llvm::ArrayRef
Definition: LLVM.h:34
clang::LangAS::opencl_global
@ opencl_global
clang::Decl::attr_begin
attr_iterator attr_begin() const
Definition: DeclBase.h:523
clang::Decl
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:83
clang::CudaArch::GFX600
@ GFX600
clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::getKind
CapturedRegionKind getKind() const
Definition: CodeGenFunction.h:430
clang::Expr::IgnoreParenImpCasts
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3024
clang::RISCV::TypeModifier::LLVM_MARK_AS_BITMASK_ENUM
@ LLVM_MARK_AS_BITMASK_ENUM
clang::OMPExecutableDirective::getAssociatedStmt
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:520
CopyOptionsTy::ScratchpadIndex
llvm::Value * ScratchpadIndex
Definition: CGOpenMPRuntimeGPU.cpp:1586
emitListToGlobalReduceFunction
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2384
clang::CodeGen::Address::invalid
static Address invalid()
Definition: Address.h:90
clang::ValueDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:4988
clang::CudaArchToString
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:139
clang::CodeGen::FunctionArgList
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:353
StmtVisitor.h
clang::CudaArch::UNKNOWN
@ UNKNOWN
clang::CodeGen::CodeGenFunction::ConvertType
llvm::Type * ConvertType(QualType T)
Definition: CodeGenFunction.cpp:213
clang::CudaArch::SM_32
@ SM_32
clang::OMPExecutableDirective::hasAssociatedStmt
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:517
clang::CodeGen::CGOpenMPRuntime::emitOutlinedFunctionCall
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntime.cpp:12131
clang::CodeGen::NormalAndEHCleanup
@ NormalAndEHCleanup
Definition: EHScopeStack.h:86
clang::CudaArch::GFX900
@ GFX900
getNVPTXLaneID
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
Definition: CGOpenMPRuntimeGPU.cpp:529
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3479
clang::CodeGen::Address::getElementType
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:104
clang::CodeGen::CodeGenFunction::MakeAddrLValue
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
Definition: CodeGenFunction.h:2490
clang::CudaArch::GFX801
@ GFX801
clang::ConstStmtVisitor
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:193
clang::CodeGen::CGOpenMPRuntime::emitReductionFunction
llvm::Function * emitReductionFunction(SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
Definition: CGOpenMPRuntime.cpp:5321
clang::CodeGen::CGOpenMPRuntimeGPU::getGPUThreadID
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3736
clang::ASTContext::getTargetInfo
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:773
clang::CodeGen::LValue::getPointer
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:337
clang::CodeGen::CGBuilderTy::CreateElementBitCast
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:168
CopyOptionsTy::RemoteLaneOffset
llvm::Value * RemoteLaneOffset
Definition: CGOpenMPRuntimeGPU.cpp:1585
clang::OMPExecutableDirective
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
clang::CudaArch::GFX703
@ GFX703
clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22
clang::CodeGen::AlignmentSource::Type
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
clang::DeclStmt
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1299
clang::CodeGen::CGFunctionInfo
CGFunctionInfo - Class to encapsulate the information about a function definition.
Definition: CGFunctionInfo.h:546
clang::isOpenMPLoopBoundSharingDirective
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
Definition: OpenMPKinds.cpp:640
clang::PointerType
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2766
clang::CodeGen::CodeGenFunction::EmitStoreOfComplex
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
Definition: CGExprComplex.cpp:1310
clang::CodeGen::CGOpenMPRuntimeGPU::clear
void clear() override
Definition: CGOpenMPRuntimeGPU.cpp:3687
clang::LangAS::cuda_constant
@ cuda_constant
clang::CudaArch::GFX705
@ GFX705
clang::LangAS::Default
@ Default
Cuda.h
clang::CodeGen::CGBuilderTy::CreateConstArrayGEP
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:213
clang::CodeGen::CodeGenFunction::CGM
CodeGenModule & CGM
Definition: CodeGenFunction.h:261
clang::TargetInfo::getGridValue
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1511
clang::ImplicitParamDecl::Create
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4996
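A sketch of this factory using types listed on this page; the parameter name ".global_args." and the null DeclContext are invented for illustration:

#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"

// Illustrative only: an implicit 'void *' parameter with no name location.
clang::ImplicitParamDecl *makeVoidPtrParam(clang::ASTContext &Ctx) {
  return clang::ImplicitParamDecl::Create(
      Ctx, /*DC=*/nullptr, clang::SourceLocation(),
      &Ctx.Idents.get(".global_args."), Ctx.VoidPtrTy,
      clang::ImplicitParamDecl::Other);
}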
clang::CodeGen::TEK_Scalar
@ TEK_Scalar
Definition: CodeGenFunction.h:107
clang::ASTContext::getAddrSpaceQualType
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
Definition: ASTContext.cpp:3039
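A sketch of address-space qualification; picking cuda_constant here is only an example, and Ctx and T are assumed inputs:

#include "clang/AST/ASTContext.h"

// Illustrative only: return T re-qualified with the CUDA constant space.
clang::QualType toCudaConstant(clang::ASTContext &Ctx, clang::QualType T) {
  return Ctx.getAddrSpaceQualType(T, clang::LangAS::cuda_constant);
}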
clang::CodeGen::CodeGenFunction::CurFn
llvm::Function * CurFn
Definition: CodeGenFunction.h:325
clang::Builtin::ID
ID
Definition: Builtins.h:52
clang::OMPExecutableDirective::getBeginLoc
SourceLocation getBeginLoc() const
Returns the starting location of the directive.
Definition: StmtOpenMP.h:488
clang::UnaryOperator::getSubExpr
Expr * getSubExpr() const
Definition: Expr.h:2218
clang::Expr::IgnoreParens
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3020
clang
Definition: CalledOnceCheck.h:17
clang::LangOptions::OMPHostIRFile
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:445
clang::CodeGen::CGOpenMPRuntimeGPU::getParameterAddress
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument based on the address of the target-specific parameter.
Definition: CGOpenMPRuntimeGPU.cpp:3145
emitGlobalToListCopyFunction
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2481
clang::CodeGen::CodeGenModule::addCompilerUsedGlobal
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
Definition: CodeGenModule.cpp:2474
clang::CudaArch::GFX1036
@ GFX1036
clang::CodeGen::CodeGenFunction::EmitAggregateCopy
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:2056
clang::OpenMPDistScheduleClauseKind
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
clang::CudaArch::GFX902
@ GFX902
clang::ExternalLinkage
@ ExternalLinkage
External linkage, which indicates that the entity can be referred to from other translation units.
Definition: Linkage.h:59
clang::Stmt
Stmt - This represents one statement.
Definition: Stmt.h:70
clang::UnaryOperator::getOpcode
Opcode getOpcode() const
Definition: Expr.h:2213
clang::CodeGen::CGOpenMPRuntime::clear
virtual void clear()
Definition: CGOpenMPRuntime.cpp:1071
clang::CodeGen::CGOpenMPRuntimeGPU::emitCriticalRegion
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
Definition: CGOpenMPRuntimeGPU.cpp:1351
clang::CodeGen::CGOpenMPRuntimeGPU::emitProcBindClause
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntimeGPU.cpp:938
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction
llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams directive.
Definition: CGOpenMPRuntimeGPU.cpp:1036
clang::CudaArch::GFX1101
@ GFX1101
clang::CodeGen::CodeGenFunction::CapturedStmtInfo
CGCapturedStmtInfo * CapturedStmtInfo
Definition: CodeGenFunction.h:476
clang::CodeGen::CGOpenMPRuntime::emitTeamsOutlinedFunction
virtual llvm::Function * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
Definition: CGOpenMPRuntime.cpp:1277
clang::CodeGen::CodeGenFunction::EmitLoadOfReferenceLValue
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2521
clang::CudaArch::SM_37
@ SM_37
clang::CudaArch::GFX602
@ GFX602
clang::CudaArch::SM_50
@ SM_50
clang::Expr::getType
QualType getType() const
Definition: Expr.h:141
clang::Attr
Attr - This represents one attribute.
Definition: Attr.h:40
clang::Qualifiers::addAddressSpace
void addAddressSpace(LangAS space)
Definition: Type.h:404
clang::CudaArch::GFX1013
@ GFX1013
clang::CodeGen::RegionCodeGenTy::setAction
void setAction(PrePostActionTy &Action) const
Definition: CGOpenMPRuntime.h:89
clang::OMPRequiresDecl
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:416
clang::CudaArch
CudaArch
Definition: Cuda.h:46
clang::CodeGen::ApplyDebugLocation::CreateEmpty
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:846
clang::Decl::getAttrs
AttrVec & getAttrs()
Definition: DeclBase.h:508
clang::CudaArch::Generic
@ Generic
clang::ImplicitCastExpr
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3623
clang::CudaArch::GFX810
@ GFX810
clang::CodeGen::CodeGenFunction::EmitLValue
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
Definition: CGExpr.cpp:1270
clang::CudaArch::UNUSED
@ UNUSED
clang::OpenMPDirectiveKind
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
clang::ASTContext::getPointerType
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
Definition: ASTContext.cpp:3286
clang::OMPExecutableDirective::clauses
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:572
clang::CharUnits
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
clang::isOpenMPTargetExecutionDirective
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
Definition: OpenMPKinds.cpp:552
clang::CodeGen::CGOpenMPRuntimeGPU::emitFunctionProlog
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
Definition: CGOpenMPRuntimeGPU.cpp:3333
clang::CudaArch::SM_62
@ SM_62
llvm::SmallVectorImpl
Definition: Randstruct.h:18
emitInterWarpCopyFunction
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
Definition: CGOpenMPRuntimeGPU.cpp:1810
clang::CallExpr::arguments
arg_range arguments()
Definition: Expr.h:3048
clang::SC_None
@ SC_None
Definition: Specifiers.h:238
clang::ValueDecl::getType
QualType getType() const
Definition: Decl.h:685
clang::Type::isIntegerType
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7257
clang::CodeGen::CodeGenFunction::CreateMemTemp
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
Definition: CGExpr.cpp:135
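A sketch pairing CreateMemTemp with EmitLoadOfScalar from above; all names are assumed locals, and the store goes through CGF.Builder (CGBuilderTy):

// Illustrative only: spill a scalar V of type Ty to a temporary, reload it.
llvm::Value *spillAndReload(clang::CodeGen::CodeGenFunction &CGF,
                            clang::QualType Ty, llvm::Value *V,
                            clang::SourceLocation Loc) {
  clang::CodeGen::Address Tmp = CGF.CreateMemTemp(Ty, "spill.tmp");
  CGF.Builder.CreateStore(V, Tmp); // store at Tmp's alignment
  return CGF.EmitLoadOfScalar(Tmp, /*Volatile=*/false, Ty, Loc);
}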
clang::Expr
This represents one expression.
Definition: Expr.h:109
clang::OpenMPScheduleClauseKind
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
clang::CodeGen::CGOpenMPRuntime::CGM
CodeGenModule & CGM
Definition: CGOpenMPRuntime.h:308
clang::CapturedStmt::getCapturedDecl
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1399
clang::ParmVarDecl::Create
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2814
clang::CodeGen::CodeGenTypeCache::Int8Ty
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Definition: CodeGenTypeCache.h:37
clang::BlockDecl::Capture
A class which contains all the information about a particular captured value.
Definition: Decl.h:4260
shuffleAndStore
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
Definition: CGOpenMPRuntimeGPU.cpp:1484
clang::OMPExecutableDirective::getCapturedStmt
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:533
getPrivateItem
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
Definition: SemaOpenMP.cpp:5280
clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy
llvm::PointerType * VoidPtrPtrTy
Definition: CodeGenTypeCache.h:62
getCudaArch
static CudaArch getCudaArch(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:3592
clang::Decl::getLocation
SourceLocation getLocation() const
Definition: DeclBase.h:432
clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsCall
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for a teams call of the OutlinedFn with variables captured in a record whose address is stor...
Definition: CGOpenMPRuntimeGPU.cpp:1221
clang::Decl::isCanonicalDecl
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:949
clang::DeclRefExpr
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1231
clang::CudaArch::SM_75
@ SM_75
clang::RecordDecl
Represents a struct/union/class.
Definition: Decl.h:3954
clang::CodeGen::CGOpenMPRuntime::emitReduction
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit code for the reduction clause.
Definition: CGOpenMPRuntime.cpp:5431
clang::CallExpr
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2809
clang::CodeGen::CodeGenFunction::EmitBlock
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:560
clang::CudaArch::GFX701
@ GFX701
clang::CodeGen::CodeGenFunction::EmitLValueForFieldInitialization
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
Definition: CGExpr.cpp:4498
clang::CodeGen::CodeGenModule::Error
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
Definition: CodeGenModule.cpp:1067
clang::CodeGen::CodeGenModule::getLLVMContext
llvm::LLVMContext & getLLVMContext()
Definition: CodeGenModule.h:743
clang::CharUnits::getQuantity
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
clang::CudaArch::GFX1030
@ GFX1030
clang::CodeGen::PrePostActionTy
A basic class for a pre- or post-action in an advanced codegen sequence for an OpenMP region.
Definition: CGOpenMPRuntime.h:58
clang::ICIS_NoInit
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:260
clang::specific_attr_iterator
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:33
CGOpenMPRuntimeGPU.h
clang::CudaArch::GFX700
@ GFX700
clang::CodeGen::CGOpenMPRuntimeGPU::EM_Unknown
@ EM_Unknown
Unknown execution mode (orphaned directive).
Definition: CGOpenMPRuntimeGPU.h:33
clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags
unsigned getDefaultLocationReserved2Flags() const override
Returns additional flags that can be stored in reserved_2 field of the default location.
Definition: CGOpenMPRuntimeGPU.cpp:905
clang::CodeGen::CGOpenMPRuntime::functionFinished
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntime.cpp:1463
clang::ImplicitParamDecl::Other
@ Other
Other implicit parameter.
Definition: Decl.h:1645
clang::CodeGen::CGOpenMPRuntimeGPU::getAddressOfLocalVariable
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Definition: CGOpenMPRuntimeGPU.cpp:3405
clang::CodeGen::CodeGenFunction::EmitBranch
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Definition: CGStmt.cpp:580
clang::Decl::getDeclContext
DeclContext * getDeclContext()
Definition: DeclBase.h:441
clang::CodeGen::AggValueSlot::DoesNotOverlap
@ DoesNotOverlap
Definition: CGValue.h:553
clang::NamedDecl::getName
StringRef getName() const
Get the name of the identifier for this declaration as a StringRef.
Definition: Decl.h:274