doxygen/CGOpenMPRuntimeGPU_8cpp_source.html

//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This provides a generalized class for OpenMP runtime code generation

// specialized by GPU targets NVPTX and AMDGCN.

//

//===----------------------------------------------------------------------===//


#include "CGOpenMPRuntimeGPU.h"

#include "CodeGenFunction.h"

#include "clang/AST/Attr.h"

#include "clang/AST/DeclOpenMP.h"

#include "clang/AST/OpenMPClause.h"

#include "clang/AST/StmtOpenMP.h"

#include "clang/AST/StmtVisitor.h"

#include "clang/Basic/Cuda.h"

#include "llvm/ADT/SmallPtrSet.h"

#include "llvm/Frontend/OpenMP/OMPGridValues.h"

#include "llvm/Support/MathExtras.h"


using namespace clang;

using namespace CodeGen;

using namespace llvm::omp;


namespace {

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.

class NVPTXActionTy final : public PrePostActionTy {

  llvm::FunctionCallee EnterCallee = nullptr;

  ArrayRef<llvm::Value *> EnterArgs;

  llvm::FunctionCallee ExitCallee = nullptr;

  ArrayRef<llvm::Value *> ExitArgs;

  bool Conditional = false;

  llvm::BasicBlock *ContBlock = nullptr;


public:

  NVPTXActionTy(llvm::FunctionCallee EnterCallee,

                ArrayRef<llvm::Value *> EnterArgs,

                llvm::FunctionCallee ExitCallee,

                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)

      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),

        ExitArgs(ExitArgs), Conditional(Conditional) {}

  void Enter(CodeGenFunction &CGF) override {

    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);

    if (Conditional) {

      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);

      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");

      ContBlock = CGF.createBasicBlock("omp_if.end");

      // Generate the branch (If-stmt)

      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);

      CGF.EmitBlock(ThenBlock);

    }

  }

  void Done(CodeGenFunction &CGF) {

    // Emit the rest of blocks/branches

    CGF.EmitBranch(ContBlock);

    CGF.EmitBlock(ContBlock, true);

  }

  void Exit(CodeGenFunction &CGF) override {

    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);

  }

};


/// A class to track the execution mode when codegening directives within

/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry

/// to the target region and used by containing directives such as 'parallel'

/// to emit optimized code.

class ExecutionRuntimeModesRAII {

private:

  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =

      CGOpenMPRuntimeGPU::EM_Unknown;

  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;


public:

  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,

                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)

      : ExecMode(ExecMode) {

    SavedExecMode = ExecMode;

    ExecMode = EntryMode;

  }

  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }

};


static const ValueDecl *getPrivateItem(const Expr *RefExpr) {

  RefExpr = RefExpr->IgnoreParens();

  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {

    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();

    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))

      Base = TempASE->getBase()->IgnoreParenImpCasts();

    RefExpr = Base;

  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {

    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();

    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))

      Base = TempOASE->getBase()->IgnoreParenImpCasts();

    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))

      Base = TempASE->getBase()->IgnoreParenImpCasts();

    RefExpr = Base;

  }

  RefExpr = RefExpr->IgnoreParenImpCasts();

  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))

    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());

  const auto *ME = cast<MemberExpr>(RefExpr);

  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());

}


static RecordDecl *buildRecordForGlobalizedVars(

    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,

    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,

    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &MappedDeclsFields,

    int BufSize) {

  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;

  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())

    return nullptr;

  SmallVector<VarsDataTy, 4> GlobalizedVars;

  for (const ValueDecl *D : EscapedDecls)

    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  for (const ValueDecl *D : EscapedDeclsForTeams)

    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);


  // Build struct _globalized_locals_ty {

  //         /*  globalized vars  */[WarSize] align (decl_align)

  //         /*  globalized vars  */ for EscapedDeclsForTeams

  //       };

  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");

  GlobalizedRD->startDefinition();

  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(

      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());

  for (const auto &Pair : GlobalizedVars) {

    const ValueDecl *VD = Pair.second;

    QualType Type = VD->getType();

    if (Type->isLValueReferenceType())

      Type = C.getPointerType(Type.getNonReferenceType());

    else

      Type = Type.getNonReferenceType();

    SourceLocation Loc = VD->getLocation();

    FieldDecl *Field;

    if (SingleEscaped.count(VD)) {

      Field = FieldDecl::Create(

          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,

          C.getTrivialTypeSourceInfo(Type, SourceLocation()),

          /*BW=*/nullptr, /*Mutable=*/false,

          /*InitStyle=*/ICIS_NoInit);

      Field->setAccess(AS_public);

      if (VD->hasAttrs()) {

        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),

             E(VD->getAttrs().end());

             I != E; ++I)

          Field->addAttr(*I);

      }

    } else {

      if (BufSize > 1) {

        llvm::APInt ArraySize(32, BufSize);

        Type = C.getConstantArrayType(Type, ArraySize, nullptr,

                                      ArraySizeModifier::Normal, 0);

      }

      Field = FieldDecl::Create(

          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,

          C.getTrivialTypeSourceInfo(Type, SourceLocation()),

          /*BW=*/nullptr, /*Mutable=*/false,

          /*InitStyle=*/ICIS_NoInit);

      Field->setAccess(AS_public);

      llvm::APInt Align(32, Pair.first.getQuantity());

      Field->addAttr(AlignedAttr::CreateImplicit(

          C, /*IsAlignmentExpr=*/true,

          IntegerLiteral::Create(C, Align,

                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),

                                 SourceLocation()),

          {}, AlignedAttr::GNU_aligned));

    }

    GlobalizedRD->addDecl(Field);

    MappedDeclsFields.try_emplace(VD, Field);

  }

  GlobalizedRD->completeDefinition();

  return GlobalizedRD;

}


/// Get the list of variables that can escape their declaration context.

class CheckVarsEscapingDeclContext final

    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {

  CodeGenFunction &CGF;

  llvm::SetVector<const ValueDecl *> EscapedDecls;

  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;

  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;

  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;

  RecordDecl *GlobalizedRD = nullptr;

  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  bool AllEscaped = false;

  bool IsForCombinedParallelRegion = false;


  void markAsEscaped(const ValueDecl *VD) {

    // Do not globalize declare target variables.

    if (!isa<VarDecl>(VD) ||

        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))

      return;

    VD = cast<ValueDecl>(VD->getCanonicalDecl());

    // Use user-specified allocation.

    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())

      return;

    // Variables captured by value must be globalized.

    bool IsCaptured = false;

    if (auto *CSI = CGF.CapturedStmtInfo) {

      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {

        // Check if need to capture the variable that was already captured by

        // value in the outer region.

        IsCaptured = true;

        if (!IsForCombinedParallelRegion) {

          if (!FD->hasAttrs())

            return;

          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();

          if (!Attr)

            return;

          if (((Attr->getCaptureKind() != OMPC_map) &&

               !isOpenMPPrivate(Attr->getCaptureKind())) ||

              ((Attr->getCaptureKind() == OMPC_map) &&

               !FD->getType()->isAnyPointerType()))

            return;

        }

        if (!FD->getType()->isReferenceType()) {

          assert(!VD->getType()->isVariablyModifiedType() &&

                 "Parameter captured by value with variably modified type");

          EscapedParameters.insert(VD);

        } else if (!IsForCombinedParallelRegion) {

          return;

        }

      }

    }

    if ((!CGF.CapturedStmtInfo ||

         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&

        VD->getType()->isReferenceType())

      // Do not globalize variables with reference type.

      return;

    if (VD->getType()->isVariablyModifiedType()) {

      // If not captured at the target region level then mark the escaped

      // variable as delayed.

      if (IsCaptured)

        EscapedVariableLengthDecls.insert(VD);

      else

        DelayedVariableLengthDecls.insert(VD);

    } else

      EscapedDecls.insert(VD);

  }


  void VisitValueDecl(const ValueDecl *VD) {

    if (VD->getType()->isLValueReferenceType())

      markAsEscaped(VD);

    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {

      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {

        const bool SavedAllEscaped = AllEscaped;

        AllEscaped = VD->getType()->isLValueReferenceType();

        Visit(VarD->getInit());

        AllEscaped = SavedAllEscaped;

      }

    }

  }

  void VisitOpenMPCapturedStmt(const CapturedStmt *S,

                               ArrayRef<OMPClause *> Clauses,

                               bool IsCombinedParallelRegion) {

    if (!S)

      return;

    for (const CapturedStmt::Capture &C : S->captures()) {

      if (C.capturesVariable() && !C.capturesVariableByCopy()) {

        const ValueDecl *VD = C.getCapturedVar();

        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;

        if (IsCombinedParallelRegion) {

          // Check if the variable is privatized in the combined construct and

          // those private copies must be shared in the inner parallel

          // directive.

          IsForCombinedParallelRegion = false;

          for (const OMPClause *C : Clauses) {

            if (!isOpenMPPrivate(C->getClauseKind()) ||

                C->getClauseKind() == OMPC_reduction ||

                C->getClauseKind() == OMPC_linear ||

                C->getClauseKind() == OMPC_private)

              continue;

            ArrayRef<const Expr *> Vars;

            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))

              Vars = PC->getVarRefs();

            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))

              Vars = PC->getVarRefs();

            else

              llvm_unreachable("Unexpected clause.");

            for (const auto *E : Vars) {

              const Decl *D =

                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();

              if (D == VD->getCanonicalDecl()) {

                IsForCombinedParallelRegion = true;

                break;

              }

            }

            if (IsForCombinedParallelRegion)

              break;

          }

        }

        markAsEscaped(VD);

        if (isa<OMPCapturedExprDecl>(VD))

          VisitValueDecl(VD);

        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;

      }

    }

  }


  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {

    assert(!GlobalizedRD &&

           "Record for globalized variables is built already.");

    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;

    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;

    if (IsInTTDRegion)

      EscapedDeclsForTeams = EscapedDecls.getArrayRef();

    else

      EscapedDeclsForParallel = EscapedDecls.getArrayRef();

    GlobalizedRD = ::buildRecordForGlobalizedVars(

        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,

        MappedDeclsFields, WarpSize);

  }


public:

  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,

                               ArrayRef<const ValueDecl *> TeamsReductions)

      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {

  }

  virtual ~CheckVarsEscapingDeclContext() = default;

  void VisitDeclStmt(const DeclStmt *S) {

    if (!S)

      return;

    for (const Decl *D : S->decls())

      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))

        VisitValueDecl(VD);

  }

  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {

    if (!D)

      return;

    if (!D->hasAssociatedStmt())

      return;

    if (const auto *S =

            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {

      // Do not analyze directives that do not actually require capturing,

      // like `omp for` or `omp simd` directives.

      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;

      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());

      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {

        VisitStmt(S->getCapturedStmt());

        return;

      }

      VisitOpenMPCapturedStmt(

          S, D->clauses(),

          CaptureRegions.back() == OMPD_parallel &&

              isOpenMPDistributeDirective(D->getDirectiveKind()));

    }

  }

  void VisitCapturedStmt(const CapturedStmt *S) {

    if (!S)

      return;

    for (const CapturedStmt::Capture &C : S->captures()) {

      if (C.capturesVariable() && !C.capturesVariableByCopy()) {

        const ValueDecl *VD = C.getCapturedVar();

        markAsEscaped(VD);

        if (isa<OMPCapturedExprDecl>(VD))

          VisitValueDecl(VD);

      }

    }

  }

  void VisitLambdaExpr(const LambdaExpr *E) {

    if (!E)

      return;

    for (const LambdaCapture &C : E->captures()) {

      if (C.capturesVariable()) {

        if (C.getCaptureKind() == LCK_ByRef) {

          const ValueDecl *VD = C.getCapturedVar();

          markAsEscaped(VD);

          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))

            VisitValueDecl(VD);

        }

      }

    }

  }

  void VisitBlockExpr(const BlockExpr *E) {

    if (!E)

      return;

    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {

      if (C.isByRef()) {

        const VarDecl *VD = C.getVariable();

        markAsEscaped(VD);

        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())

          VisitValueDecl(VD);

      }

    }

  }

  void VisitCallExpr(const CallExpr *E) {

    if (!E)

      return;

    for (const Expr *Arg : E->arguments()) {

      if (!Arg)

        continue;

      if (Arg->isLValue()) {

        const bool SavedAllEscaped = AllEscaped;

        AllEscaped = true;

        Visit(Arg);

        AllEscaped = SavedAllEscaped;

      } else {

        Visit(Arg);

      }

    }

    Visit(E->getCallee());

  }

  void VisitDeclRefExpr(const DeclRefExpr *E) {

    if (!E)

      return;

    const ValueDecl *VD = E->getDecl();

    if (AllEscaped)

      markAsEscaped(VD);

    if (isa<OMPCapturedExprDecl>(VD))

      VisitValueDecl(VD);

    else if (VD->isInitCapture())

      VisitValueDecl(VD);

  }

  void VisitUnaryOperator(const UnaryOperator *E) {

    if (!E)

      return;

    if (E->getOpcode() == UO_AddrOf) {

      const bool SavedAllEscaped = AllEscaped;

      AllEscaped = true;

      Visit(E->getSubExpr());

      AllEscaped = SavedAllEscaped;

    } else {

      Visit(E->getSubExpr());

    }

  }

  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {

    if (!E)

      return;

    if (E->getCastKind() == CK_ArrayToPointerDecay) {

      const bool SavedAllEscaped = AllEscaped;

      AllEscaped = true;

      Visit(E->getSubExpr());

      AllEscaped = SavedAllEscaped;

    } else {

      Visit(E->getSubExpr());

    }

  }

  void VisitExpr(const Expr *E) {

    if (!E)

      return;

    bool SavedAllEscaped = AllEscaped;

    if (!E->isLValue())

      AllEscaped = false;

    for (const Stmt *Child : E->children())

      if (Child)

        Visit(Child);

    AllEscaped = SavedAllEscaped;

  }

  void VisitStmt(const Stmt *S) {

    if (!S)

      return;

    for (const Stmt *Child : S->children())

      if (Child)

        Visit(Child);

  }


  /// Returns the record that handles all the escaped local variables and used

  /// instead of their original storage.

  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {

    if (!GlobalizedRD)

      buildRecordForGlobalizedVars(IsInTTDRegion);

    return GlobalizedRD;

  }


  /// Returns the field in the globalized record for the escaped variable.

  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {

    assert(GlobalizedRD &&

           "Record for globalized variables must be generated already.");

    return MappedDeclsFields.lookup(VD);

  }


  /// Returns the list of the escaped local variables/parameters.

  ArrayRef<const ValueDecl *> getEscapedDecls() const {

    return EscapedDecls.getArrayRef();

  }


  /// Checks if the escaped local variable is actually a parameter passed by

  /// value.

  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {

    return EscapedParameters;

  }


  /// Returns the list of the escaped variables with the variably modified

  /// types.

  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {

    return EscapedVariableLengthDecls.getArrayRef();

  }


  /// Returns the list of the delayed variables with the variably modified

  /// types.

  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {

    return DelayedVariableLengthDecls.getArrayRef();

  }

};

} // anonymous namespace


/// Get the id of the warp in the block.

/// We assume that the warp size is 32, which is always the case

/// on the NVPTX device, to generate more efficient code.

static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {

  CGBuilderTy &Bld = CGF.Builder;

  unsigned LaneIDBits =

      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");

}


/// Get the id of the current lane in the Warp.

/// We assume that the warp size is 32, which is always the case

/// on the NVPTX device, to generate more efficient code.

static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {

  CGBuilderTy &Bld = CGF.Builder;

  unsigned LaneIDBits =

      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);

  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");

  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),

                       "nvptx_lane_id");

}


CGOpenMPRuntimeGPU::ExecutionMode

CGOpenMPRuntimeGPU::getExecutionMode() const {

  return CurrentExecutionMode;

}


CGOpenMPRuntimeGPU::DataSharingMode

CGOpenMPRuntimeGPU::getDataSharingMode() const {

  return CurrentDataSharingMode;

}


/// Check for inner (nested) SPMD construct, if any

static bool hasNestedSPMDDirective(ASTContext &Ctx,

                                   const OMPExecutableDirective &D) {

  const auto *CS = D.getInnermostCapturedStmt();

  const auto *Body =

      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);

  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);


  if (const auto *NestedDir =

          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {

    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();

    switch (D.getDirectiveKind()) {

    case OMPD_target:

      if (isOpenMPParallelDirective(DKind))

        return true;

      if (DKind == OMPD_teams) {

        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(

            /*IgnoreCaptured=*/true);

        if (!Body)

          return false;

        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

        if (const auto *NND =

                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {

          DKind = NND->getDirectiveKind();

          if (isOpenMPParallelDirective(DKind))

            return true;

        }

      }

      return false;

    case OMPD_target_teams:

      return isOpenMPParallelDirective(DKind);

    case OMPD_target_simd:

    case OMPD_target_parallel:

    case OMPD_target_parallel_for:

    case OMPD_target_parallel_for_simd:

    case OMPD_target_teams_distribute:

    case OMPD_target_teams_distribute_simd:

    case OMPD_target_teams_distribute_parallel_for:

    case OMPD_target_teams_distribute_parallel_for_simd:

    case OMPD_parallel:

    case OMPD_for:

    case OMPD_parallel_for:

    case OMPD_parallel_master:

    case OMPD_parallel_sections:

    case OMPD_for_simd:

    case OMPD_parallel_for_simd:

    case OMPD_cancel:

    case OMPD_cancellation_point:

    case OMPD_ordered:

    case OMPD_threadprivate:

    case OMPD_allocate:

    case OMPD_task:

    case OMPD_simd:

    case OMPD_sections:

    case OMPD_section:

    case OMPD_single:

    case OMPD_master:

    case OMPD_critical:

    case OMPD_taskyield:

    case OMPD_barrier:

    case OMPD_taskwait:

    case OMPD_taskgroup:

    case OMPD_atomic:

    case OMPD_flush:

    case OMPD_depobj:

    case OMPD_scan:

    case OMPD_teams:

    case OMPD_target_data:

    case OMPD_target_exit_data:

    case OMPD_target_enter_data:

    case OMPD_distribute:

    case OMPD_distribute_simd:

    case OMPD_distribute_parallel_for:

    case OMPD_distribute_parallel_for_simd:

    case OMPD_teams_distribute:

    case OMPD_teams_distribute_simd:

    case OMPD_teams_distribute_parallel_for:

    case OMPD_teams_distribute_parallel_for_simd:

    case OMPD_target_update:

    case OMPD_declare_simd:

    case OMPD_declare_variant:

    case OMPD_begin_declare_variant:

    case OMPD_end_declare_variant:

    case OMPD_declare_target:

    case OMPD_end_declare_target:

    case OMPD_declare_reduction:

    case OMPD_declare_mapper:

    case OMPD_taskloop:

    case OMPD_taskloop_simd:

    case OMPD_master_taskloop:

    case OMPD_master_taskloop_simd:

    case OMPD_parallel_master_taskloop:

    case OMPD_parallel_master_taskloop_simd:

    case OMPD_requires:

    case OMPD_unknown:

    default:

      llvm_unreachable("Unexpected directive.");

    }

  }


  return false;

}


static bool supportsSPMDExecutionMode(ASTContext &Ctx,

                                      const OMPExecutableDirective &D) {

  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();

  switch (DirectiveKind) {

  case OMPD_target:

  case OMPD_target_teams:

    return hasNestedSPMDDirective(Ctx, D);

  case OMPD_target_parallel_loop:

  case OMPD_target_parallel:

  case OMPD_target_parallel_for:

  case OMPD_target_parallel_for_simd:

  case OMPD_target_teams_distribute_parallel_for:

  case OMPD_target_teams_distribute_parallel_for_simd:

  case OMPD_target_simd:

  case OMPD_target_teams_distribute_simd:

    return true;

  case OMPD_target_teams_distribute:

    return false;

  case OMPD_target_teams_loop:

    // Whether this is true or not depends on how the directive will

    // eventually be emitted.

    if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))

      return TTLD->canBeParallelFor();

    return false;

  case OMPD_parallel:

  case OMPD_for:

  case OMPD_parallel_for:

  case OMPD_parallel_master:

  case OMPD_parallel_sections:

  case OMPD_for_simd:

  case OMPD_parallel_for_simd:

  case OMPD_cancel:

  case OMPD_cancellation_point:

  case OMPD_ordered:

  case OMPD_threadprivate:

  case OMPD_allocate:

  case OMPD_task:

  case OMPD_simd:

  case OMPD_sections:

  case OMPD_section:

  case OMPD_single:

  case OMPD_master:

  case OMPD_critical:

  case OMPD_taskyield:

  case OMPD_barrier:

  case OMPD_taskwait:

  case OMPD_taskgroup:

  case OMPD_atomic:

  case OMPD_flush:

  case OMPD_depobj:

  case OMPD_scan:

  case OMPD_teams:

  case OMPD_target_data:

  case OMPD_target_exit_data:

  case OMPD_target_enter_data:

  case OMPD_distribute:

  case OMPD_distribute_simd:

  case OMPD_distribute_parallel_for:

  case OMPD_distribute_parallel_for_simd:

  case OMPD_teams_distribute:

  case OMPD_teams_distribute_simd:

  case OMPD_teams_distribute_parallel_for:

  case OMPD_teams_distribute_parallel_for_simd:

  case OMPD_target_update:

  case OMPD_declare_simd:

  case OMPD_declare_variant:

  case OMPD_begin_declare_variant:

  case OMPD_end_declare_variant:

  case OMPD_declare_target:

  case OMPD_end_declare_target:

  case OMPD_declare_reduction:

  case OMPD_declare_mapper:

  case OMPD_taskloop:

  case OMPD_taskloop_simd:

  case OMPD_master_taskloop:

  case OMPD_master_taskloop_simd:

  case OMPD_parallel_master_taskloop:

  case OMPD_parallel_master_taskloop_simd:

  case OMPD_requires:

  case OMPD_unknown:

  default:

    break;

  }

  llvm_unreachable(

      "Unknown programming model for OpenMP directive on NVPTX target.");

}


void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,

                                             StringRef ParentName,

                                             llvm::Function *&OutlinedFn,

                                             llvm::Constant *&OutlinedFnID,

                                             bool IsOffloadEntry,

                                             const RegionCodeGenTy &CodeGen) {

  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);

  EntryFunctionState EST;

  WrapperFunctionsMap.clear();


  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  assert(!IsBareKernel && "bare kernel should not be at generic mode");


  // Emit target region as a standalone region.

  class NVPTXPrePostActionTy : public PrePostActionTy {

    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

    const OMPExecutableDirective &D;


  public:

    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,

                         const OMPExecutableDirective &D)

        : EST(EST), D(D) {}

    void Enter(CodeGenFunction &CGF) override {

      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);

      // Skip target region initialization.

      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

    }

    void Exit(CodeGenFunction &CGF) override {

      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

      RT.clearLocThreadIdInsertPt(CGF);

      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);

    }

  } Action(EST, D);

  CodeGen.setAction(Action);

  IsInTTDRegion = true;

  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,

                                   IsOffloadEntry, CodeGen);

  IsInTTDRegion = false;

}


void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,

                                        CodeGenFunction &CGF,

                                        EntryFunctionState &EST, bool IsSPMD) {

  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,

          MaxTeamsVal = -1;

  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,

                                  MinTeamsVal, MaxTeamsVal);


  CGBuilderTy &Bld = CGF.Builder;

  Bld.restoreIP(OMPBuilder.createTargetInit(

      Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));

  if (!IsSPMD)

    emitGenericVarsProlog(CGF, EST.Loc);

}


void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,

                                          EntryFunctionState &EST,

                                          bool IsSPMD) {

  if (!IsSPMD)

    emitGenericVarsEpilog(CGF);


  // This is temporary until we remove the fixed sized buffer.

  ASTContext &C = CGM.getContext();

  RecordDecl *StaticRD = C.buildImplicitRecord(

      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);

  StaticRD->startDefinition();

  for (const RecordDecl *TeamReductionRec : TeamsReductions) {

    QualType RecTy = C.getRecordType(TeamReductionRec);

    auto *Field = FieldDecl::Create(

        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,

        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),

        /*BW=*/nullptr, /*Mutable=*/false,

        /*InitStyle=*/ICIS_NoInit);

    Field->setAccess(AS_public);

    StaticRD->addDecl(Field);

  }

  StaticRD->completeDefinition();

  QualType StaticTy = C.getRecordType(StaticRD);

  llvm::Type *LLVMReductionsBufferTy =

      CGM.getTypes().ConvertTypeForMem(StaticTy);

  const auto &DL = CGM.getModule().getDataLayout();

  uint64_t ReductionDataSize =

      TeamsReductions.empty()

          ? 0

          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();

  CGBuilderTy &Bld = CGF.Builder;

  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,

                                C.getLangOpts().OpenMPCUDAReductionBufNum);

  TeamsReductions.clear();

}


void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,

                                          StringRef ParentName,

                                          llvm::Function *&OutlinedFn,

                                          llvm::Constant *&OutlinedFnID,

                                          bool IsOffloadEntry,

                                          const RegionCodeGenTy &CodeGen) {

  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);

  EntryFunctionState EST;


  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();


  // Emit target region as a standalone region.

  class NVPTXPrePostActionTy : public PrePostActionTy {

    CGOpenMPRuntimeGPU &RT;

    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

    bool IsBareKernel;

    DataSharingMode Mode;

    const OMPExecutableDirective &D;


  public:

    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,

                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,

                         bool IsBareKernel, const OMPExecutableDirective &D)

        : RT(RT), EST(EST), IsBareKernel(IsBareKernel),

          Mode(RT.CurrentDataSharingMode), D(D) {}

    void Enter(CodeGenFunction &CGF) override {

      if (IsBareKernel) {

        RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;

        return;

      }

      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);

      // Skip target region initialization.

      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);

    }

    void Exit(CodeGenFunction &CGF) override {

      if (IsBareKernel) {

        RT.CurrentDataSharingMode = Mode;

        return;

      }

      RT.clearLocThreadIdInsertPt(CGF);

      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);

    }

  } Action(*this, EST, IsBareKernel, D);

  CodeGen.setAction(Action);

  IsInTTDRegion = true;

  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,

                                   IsOffloadEntry, CodeGen);

  IsInTTDRegion = false;

}


void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(

    const OMPExecutableDirective &D, StringRef ParentName,

    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,

    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {

  if (!IsOffloadEntry) // Nothing to do.

    return;


  assert(!ParentName.empty() && "Invalid target region parent name!");


  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);

  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  if (Mode || IsBareKernel)

    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,

                   CodeGen);

  else

    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,

                      CodeGen);

}


CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)

    : CGOpenMPRuntime(CGM) {

  llvm::OpenMPIRBuilderConfig Config(

      CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),

      CGM.getLangOpts().OpenMPOffloadMandatory,

      /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,

      hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);

  OMPBuilder.setConfig(Config);


  if (!CGM.getLangOpts().OpenMPIsTargetDevice)

    llvm_unreachable("OpenMP can only handle device code.");


  if (CGM.getLangOpts().OpenMPCUDAMode)

    CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;


  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();

  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())

    return;


  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,

                              "__omp_rtl_debug_kind");

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,

                              "__omp_rtl_assume_teams_oversubscription");

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,

                              "__omp_rtl_assume_threads_oversubscription");

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,

                              "__omp_rtl_assume_no_thread_state");

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,

                              "__omp_rtl_assume_no_nested_parallelism");

}


void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,

                                              ProcBindKind ProcBind,

                                              SourceLocation Loc) {

  // Nothing to do.

}


void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,

                                                llvm::Value *NumThreads,

                                                SourceLocation Loc) {

  // Nothing to do.

}


void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,

                                              const Expr *NumTeams,

                                              const Expr *ThreadLimit,

                                              SourceLocation Loc) {}


llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(

    CodeGenFunction &CGF, const OMPExecutableDirective &D,

    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,

    const RegionCodeGenTy &CodeGen) {

  // Emit target region as a standalone region.

  bool PrevIsInTTDRegion = IsInTTDRegion;

  IsInTTDRegion = false;

  auto *OutlinedFun =

      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(

          CGF, D, ThreadIDVar, InnermostKind, CodeGen));

  IsInTTDRegion = PrevIsInTTDRegion;

  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {

    llvm::Function *WrapperFun =

        createParallelDataSharingWrapper(OutlinedFun, D);

    WrapperFunctionsMap[OutlinedFun] = WrapperFun;

  }


  return OutlinedFun;

}


/// Get list of lastprivate variables from the teams distribute ... or

/// teams {distribute ...} directives.

static void

getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,

                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {

  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&

         "expected teams directive.");

  const OMPExecutableDirective *Dir = &D;

  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {

    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(

            Ctx,

            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(

                /*IgnoreCaptured=*/true))) {

      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);

      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))

        Dir = nullptr;

    }

  }

  if (!Dir)

    return;

  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {

    for (const Expr *E : C->getVarRefs())

      Vars.push_back(getPrivateItem(E));

  }

}


/// Get list of reduction variables from the teams ... directives.

static void

getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,

                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {

  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&

         "expected teams directive.");

  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {

    for (const Expr *E : C->privates())

      Vars.push_back(getPrivateItem(E));

  }

}


llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(

    CodeGenFunction &CGF, const OMPExecutableDirective &D,

    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,

    const RegionCodeGenTy &CodeGen) {

  SourceLocation Loc = D.getBeginLoc();


  const RecordDecl *GlobalizedRD = nullptr;

  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;

  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;

  // Globalize team reductions variable unconditionally in all modes.

  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)

    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);

  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {

    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);

    if (!LastPrivatesReductions.empty()) {

      GlobalizedRD = ::buildRecordForGlobalizedVars(

          CGM.getContext(), std::nullopt, LastPrivatesReductions,

          MappedDeclsFields, WarpSize);

    }

  } else if (!LastPrivatesReductions.empty()) {

    assert(!TeamAndReductions.first &&

           "Previous team declaration is not expected.");

    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();

    std::swap(TeamAndReductions.second, LastPrivatesReductions);

  }


  // Emit target region as a standalone region.

  class NVPTXPrePostActionTy : public PrePostActionTy {

    SourceLocation &Loc;

    const RecordDecl *GlobalizedRD;

    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &MappedDeclsFields;


  public:

    NVPTXPrePostActionTy(

        SourceLocation &Loc, const RecordDecl *GlobalizedRD,

        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

            &MappedDeclsFields)

        : Loc(Loc), GlobalizedRD(GlobalizedRD),

          MappedDeclsFields(MappedDeclsFields) {}

    void Enter(CodeGenFunction &CGF) override {

      auto &Rt =

          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

      if (GlobalizedRD) {

        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;

        I->getSecond().MappedParams =

            std::make_unique<CodeGenFunction::OMPMapVars>();

        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;

        for (const auto &Pair : MappedDeclsFields) {

          assert(Pair.getFirst()->isCanonicalDecl() &&

                 "Expected canonical declaration");

          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));

        }

      }

      Rt.emitGenericVarsProlog(CGF, Loc);

    }

    void Exit(CodeGenFunction &CGF) override {

      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())

          .emitGenericVarsEpilog(CGF);

    }

  } Action(Loc, GlobalizedRD, MappedDeclsFields);

  CodeGen.setAction(Action);

  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(

      CGF, D, ThreadIDVar, InnermostKind, CodeGen);


  return OutlinedFun;

}


void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,

                                               SourceLocation Loc) {

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)

    return;


  CGBuilderTy &Bld = CGF.Builder;


  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);

  if (I == FunctionGlobalizedDecls.end())

    return;


  for (auto &Rec : I->getSecond().LocalVarData) {

    const auto *VD = cast<VarDecl>(Rec.first);

    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);

    QualType VarTy = VD->getType();


    // Get the local allocation of a firstprivate variable before sharing

    llvm::Value *ParValue;

    if (EscapedParam) {

      LValue ParLVal =

          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());

      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);

    }


    // Allocate space for the variable to be globalized

    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};

    llvm::CallBase *VoidPtr =

        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),

                            AllocArgs, VD->getName());

    // FIXME: We should use the variables actual alignment as an argument.

    VoidPtr->addRetAttr(llvm::Attribute::get(

        CGM.getLLVMContext(), llvm::Attribute::Alignment,

        CGM.getContext().getTargetInfo().getNewAlign() / 8));


    // Cast the void pointer and get the address of the globalized variable.

    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();

    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");

    LValue VarAddr =

        CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);

    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);

    Rec.second.GlobalizedVal = VoidPtr;


    // Assign the local allocation to the newly globalized location.

    if (EscapedParam) {

      CGF.EmitStoreOfScalar(ParValue, VarAddr);

      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));

    }

    if (auto *DI = CGF.getDebugInfo())

      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));

  }


  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {

    const auto *VD = cast<VarDecl>(ValueD);

    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =

        getKmpcAllocShared(CGF, VD);

    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);

    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),

                                     CGM.getContext().getDeclAlign(VD),

                                     AlignmentSource::Decl);

    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));

  }

  I->getSecond().MappedParams->apply(CGF);

}


bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,

                                                     const VarDecl *VD) const {

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);

  if (I == FunctionGlobalizedDecls.end())

    return false;


  // Check variable declaration is delayed:

  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);

}


std::pair<llvm::Value *, llvm::Value *>

CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,

                                       const VarDecl *VD) {

  CGBuilderTy &Bld = CGF.Builder;


  // Compute size and alignment.

  llvm::Value *Size = CGF.getTypeSize(VD->getType());

  CharUnits Align = CGM.getContext().getDeclAlign(VD);

  Size = Bld.CreateNUWAdd(

      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));

  llvm::Value *AlignVal =

      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());

  Size = Bld.CreateUDiv(Size, AlignVal);

  Size = Bld.CreateNUWMul(Size, AlignVal);


  // Allocate space for this VLA object to be globalized.

  llvm::Value *AllocArgs[] = {Size};

  llvm::CallBase *VoidPtr =

      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),

                          AllocArgs, VD->getName());

  VoidPtr->addRetAttr(llvm::Attribute::get(

      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));


  return std::make_pair(VoidPtr, Size);

}


void CGOpenMPRuntimeGPU::getKmpcFreeShared(

    CodeGenFunction &CGF,

    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {

  // Deallocate the memory for each globalized VLA object

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                          CGM.getModule(), OMPRTL___kmpc_free_shared),

                      {AddrSizePair.first, AddrSizePair.second});

}


void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)

    return;


  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);

  if (I != FunctionGlobalizedDecls.end()) {

    // Deallocate the memory for each globalized VLA object that was

    // globalized in the prolog (i.e. emitGenericVarsProlog).

    for (const auto &AddrSizePair :

         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {

      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                              CGM.getModule(), OMPRTL___kmpc_free_shared),

                          {AddrSizePair.first, AddrSizePair.second});

    }

    // Deallocate the memory for each globalized value

    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {

      const auto *VD = cast<VarDecl>(Rec.first);

      I->getSecond().MappedParams->restore(CGF);


      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,

                                 CGF.getTypeSize(VD->getType())};

      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                              CGM.getModule(), OMPRTL___kmpc_free_shared),

                          FreeArgs);

    }

  }

}


void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,

                                         const OMPExecutableDirective &D,

                                         SourceLocation Loc,

                                         llvm::Function *OutlinedFn,

                                         ArrayRef<llvm::Value *> CapturedVars) {

  if (!CGF.HaveInsertPoint())

    return;


  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();


  RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,

                                                         /*Name=*/".zero.addr");

  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);

  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

  // We don't emit any thread id function call in bare kernel, but because the

  // outlined function has a pointer argument, we emit a nullptr here.

  if (IsBareKernel)

    OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));

  else

    OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));

  OutlinedFnArgs.push_back(ZeroAddr.getPointer());

  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());

  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);

}


void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,

                                          SourceLocation Loc,

                                          llvm::Function *OutlinedFn,

                                          ArrayRef<llvm::Value *> CapturedVars,

                                          const Expr *IfCond,

                                          llvm::Value *NumThreads) {

  if (!CGF.HaveInsertPoint())

    return;


  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,

                        NumThreads](CodeGenFunction &CGF,

                                    PrePostActionTy &Action) {

    CGBuilderTy &Bld = CGF.Builder;

    llvm::Value *NumThreadsVal = NumThreads;

    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];

    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);

    if (WFn)

      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);


    // Create a private scope that will globalize the arguments

    // passed from the outside of the target region.

    // TODO: Is that needed?

    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);


    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(

        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),

        "captured_vars_addrs");

    // There's something to share.

    if (!CapturedVars.empty()) {

      // Prepare for parallel region. Indicate the outlined function.

      ASTContext &Ctx = CGF.getContext();

      unsigned Idx = 0;

      for (llvm::Value *V : CapturedVars) {

        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);

        llvm::Value *PtrV;

        if (V->getType()->isIntegerTy())

          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);

        else

          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);

        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,

                              Ctx.getPointerType(Ctx.VoidPtrTy));

        ++Idx;

      }

    }


    llvm::Value *IfCondVal = nullptr;

    if (IfCond)

      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,

                                    /* isSigned */ false);

    else

      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);


    if (!NumThreadsVal)

      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);

    else

      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),


      assert(IfCondVal && "Expected a value");

    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);

    llvm::Value *Args[] = {

        RTLoc,

        getThreadID(CGF, Loc),

        IfCondVal,

        NumThreadsVal,

        llvm::ConstantInt::get(CGF.Int32Ty, -1),

        FnPtr,

        ID,

        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),

                                   CGF.VoidPtrPtrTy),

        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};

    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                            CGM.getModule(), OMPRTL___kmpc_parallel_51),

                        Args);

  };


  RegionCodeGenTy RCG(ParallelGen);

  RCG(CGF);

}


void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {

  // Always emit simple barriers!

  if (!CGF.HaveInsertPoint())

    return;

  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);

  // This function does not use parameters, so we can emit just default values.

  llvm::Value *Args[] = {

      llvm::ConstantPointerNull::get(

          cast<llvm::PointerType>(getIdentTyPointerTy())),

      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),

                      Args);

}


void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,

                                           SourceLocation Loc,

                                           OpenMPDirectiveKind Kind, bool,

                                           bool) {

  // Always emit simple barriers!

  if (!CGF.HaveInsertPoint())

    return;

  // Build call __kmpc_cancel_barrier(loc, thread_id);

  unsigned Flags = getDefaultFlagsForBarriers(Kind);

  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),

                         getThreadID(CGF, Loc)};


  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                          CGM.getModule(), OMPRTL___kmpc_barrier),

                      Args);

}


void CGOpenMPRuntimeGPU::emitCriticalRegion(

    CodeGenFunction &CGF, StringRef CriticalName,

    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,

    const Expr *Hint) {

  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");

  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");

  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");

  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");

  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");


  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());


  // Get the mask of active threads in the warp.

  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));

  // Fetch team-local id of the thread.

  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);


  // Get the width of the team.

  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);


  // Initialize the counter variable for the loop.

  QualType Int32Ty =

      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);

  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");

  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);

  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,

                        /*isInit=*/true);


  // Block checks if loop counter exceeds upper bound.

  CGF.EmitBlock(LoopBB);

  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);

  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);

  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);


  // Block tests which single thread should execute region, and which threads

  // should go straight to synchronisation point.

  CGF.EmitBlock(TestBB);

  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);

  llvm::Value *CmpThreadToCounter =

      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);

  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);


  // Block emits the body of the critical region.

  CGF.EmitBlock(BodyBB);


  // Output the critical statement.

  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,

                                      Hint);


  // After the body surrounded by the critical region, the single executing

  // thread will jump to the synchronisation point.

  // Block waits for all threads in current team to finish then increments the

  // counter variable and returns to the loop.

  CGF.EmitBlock(SyncBB);

  // Reconverge active threads in the warp.

  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                                CGM.getModule(), OMPRTL___kmpc_syncwarp),

                            Mask);


  llvm::Value *IncCounterVal =

      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));

  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);

  CGF.EmitBranch(LoopBB);


  // Block that is reached when  all threads in the team complete the region.

  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);

}


/// Cast value to the specified type.

static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,

                                    QualType ValTy, QualType CastTy,

                                    SourceLocation Loc) {

  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&

         "Cast type must sized.");

  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&

         "Val type must sized.");

  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);

  if (ValTy == CastTy)

    return Val;

  if (CGF.getContext().getTypeSizeInChars(ValTy) ==

      CGF.getContext().getTypeSizeInChars(CastTy))

    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);

  if (CastTy->isIntegerType() && ValTy->isIntegerType())

    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,

                                     CastTy->hasSignedIntegerRepresentation());

  Address CastItem = CGF.CreateMemTemp(CastTy);

  Address ValCastItem = CastItem.withElementType(Val->getType());

  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,

                        LValueBaseInfo(AlignmentSource::Type),

                        TBAAAccessInfo());

  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,

                              LValueBaseInfo(AlignmentSource::Type),

                              TBAAAccessInfo());

}


/// This function creates calls to one of two shuffle functions to copy

/// variables between lanes in a warp.

static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,

                                                 llvm::Value *Elem,

                                                 QualType ElemType,

                                                 llvm::Value *Offset,

                                                 SourceLocation Loc) {

  CodeGenModule &CGM = CGF.CGM;

  CGBuilderTy &Bld = CGF.Builder;

  CGOpenMPRuntimeGPU &RT =

      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));

  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();


  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);

  assert(Size.getQuantity() <= 8 &&

         "Unsupported bitwidth in shuffle instruction.");


  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4

                                  ? OMPRTL___kmpc_shuffle_int32

                                  : OMPRTL___kmpc_shuffle_int64;


  // Cast all types to 32- or 64-bit values before calling shuffle routines.

  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(

      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);

  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);

  llvm::Value *WarpSize =

      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);


  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(

      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),

      {ElemCast, Offset, WarpSize});


  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);

}


static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,

                            Address DestAddr, QualType ElemType,

                            llvm::Value *Offset, SourceLocation Loc) {

  CGBuilderTy &Bld = CGF.Builder;


  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);

  // Create the loop over the big sized data.

  // ptr = (void*)Elem;

  // ptrEnd = (void*) Elem + 1;

  // Step = 8;

  // while (ptr + Step < ptrEnd)

  //   shuffle((int64_t)*ptr);

  // Step = 4;

  // while (ptr + Step < ptrEnd)

  //   shuffle((int32_t)*ptr);

  // ...

  Address ElemPtr = DestAddr;

  Address Ptr = SrcAddr;

  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(

      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);

  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {

    if (Size < CharUnits::fromQuantity(IntSize))

      continue;

    QualType IntType = CGF.getContext().getIntTypeForBitwidth(

        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),

        /*Signed=*/1);

    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);

    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),

                                                  IntTy);

    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

        ElemPtr, IntTy->getPointerTo(), IntTy);

    if (Size.getQuantity() / IntSize > 1) {

      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");

      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");

      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");

      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();

      CGF.EmitBlock(PreCondBB);

      llvm::PHINode *PhiSrc =

          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);

      PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB);

      llvm::PHINode *PhiDest =

          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);

      PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB);

      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());

      ElemPtr =

          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());

      llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);

      llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);

      llvm::Value *PtrDiff = Bld.CreatePtrDiff(

          CGF.Int8Ty, PtrEndRaw,

          Bld.CreatePointerBitCastOrAddrSpaceCast(PtrRaw, CGF.VoidPtrTy));

      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),

                       ThenBB, ExitBB);

      CGF.EmitBlock(ThenBB);

      llvm::Value *Res = createRuntimeShuffleFunction(

          CGF,

          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,

                               LValueBaseInfo(AlignmentSource::Type),

                               TBAAAccessInfo()),

          IntType, Offset, Loc);

      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,

                            LValueBaseInfo(AlignmentSource::Type),

                            TBAAAccessInfo());

      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);

      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);

      PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB);

      PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB);

      CGF.EmitBranch(PreCondBB);

      CGF.EmitBlock(ExitBB);

    } else {

      llvm::Value *Res = createRuntimeShuffleFunction(

          CGF,

          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,

                               LValueBaseInfo(AlignmentSource::Type),

                               TBAAAccessInfo()),

          IntType, Offset, Loc);

      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,

                            LValueBaseInfo(AlignmentSource::Type),

                            TBAAAccessInfo());

      Ptr = Bld.CreateConstGEP(Ptr, 1);

      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);

    }

    Size = Size % IntSize;

  }

}


namespace {

enum CopyAction : unsigned {

  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in

  // the warp using shuffle instructions.

  RemoteLaneToThread,

  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.

  ThreadCopy,

};

} // namespace


struct CopyOptionsTy {

  llvm::Value *RemoteLaneOffset;

  llvm::Value *ScratchpadIndex;

  llvm::Value *ScratchpadWidth;

};


/// Emit instructions to copy a Reduce list, which contains partially

/// aggregated values, in the specified direction.

static void emitReductionListCopy(

    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,

    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,

    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {


  CodeGenModule &CGM = CGF.CGM;

  ASTContext &C = CGM.getContext();

  CGBuilderTy &Bld = CGF.Builder;


  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;


  // Iterates, element-by-element, through the source Reduce list and

  // make a copy.

  unsigned Idx = 0;

  for (const Expr *Private : Privates) {

    Address SrcElementAddr = Address::invalid();

    Address DestElementAddr = Address::invalid();

    Address DestElementPtrAddr = Address::invalid();

    // Should we shuffle in an element from a remote lane?

    bool ShuffleInElement = false;

    // Set to true to update the pointer in the dest Reduce list to a

    // newly created element.

    bool UpdateDestListPtr = false;

    QualType PrivatePtrType = C.getPointerType(Private->getType());

    llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);


    switch (Action) {

    case RemoteLaneToThread: {

      // Step 1.1: Get the address for the src element in the Reduce list.

      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);

      SrcElementAddr = CGF.EmitLoadOfPointer(

          SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),

          PrivatePtrType->castAs<PointerType>());


      // Step 1.2: Create a temporary to store the element in the destination

      // Reduce list.

      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);

      DestElementAddr =

          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");

      ShuffleInElement = true;

      UpdateDestListPtr = true;

      break;

    }

    case ThreadCopy: {

      // Step 1.1: Get the address for the src element in the Reduce list.

      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);

      SrcElementAddr = CGF.EmitLoadOfPointer(

          SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),

          PrivatePtrType->castAs<PointerType>());


      // Step 1.2: Get the address for dest element.  The destination

      // element has already been created on the thread's stack.

      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);

      DestElementAddr = CGF.EmitLoadOfPointer(

          DestElementPtrAddr.withElementType(PrivateLlvmPtrType),

          PrivatePtrType->castAs<PointerType>());

      break;

    }

    }


    // Regardless of src and dest of copy, we emit the load of src

    // element as this is required in all directions

    SrcElementAddr = SrcElementAddr.withElementType(

        CGF.ConvertTypeForMem(Private->getType()));

    DestElementAddr =

        DestElementAddr.withElementType(SrcElementAddr.getElementType());


    // Now that all active lanes have read the element in the

    // Reduce list, shuffle over the value from the remote lane.

    if (ShuffleInElement) {

      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),

                      RemoteLaneOffset, Private->getExprLoc());

    } else {

      switch (CGF.getEvaluationKind(Private->getType())) {

      case TEK_Scalar: {

        llvm::Value *Elem = CGF.EmitLoadOfScalar(

            SrcElementAddr, /*Volatile=*/false, Private->getType(),

            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),

            TBAAAccessInfo());

        // Store the source element value to the dest element address.

        CGF.EmitStoreOfScalar(

            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),

            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());

        break;

      }

      case TEK_Complex: {

        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(

            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),

            Private->getExprLoc());

        CGF.EmitStoreOfComplex(

            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),

            /*isInit=*/false);

        break;

      }

      case TEK_Aggregate:

        CGF.EmitAggregateCopy(

            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),

            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),

            Private->getType(), AggValueSlot::DoesNotOverlap);

        break;

      }

    }


    // Step 3.1: Modify reference in dest Reduce list as needed.

    // Modifying the reference in Reduce list to point to the newly

    // created element.  The element is live in the current function

    // scope and that of functions it invokes (i.e., reduce_function).

    // RemoteReduceData[i] = (void*)&RemoteElem

    if (UpdateDestListPtr) {

      CGF.EmitStoreOfScalar(

          Bld.CreatePointerBitCastOrAddrSpaceCast(

              DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy),

          DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);

    }


    ++Idx;

  }

}


/// This function emits a helper that gathers Reduce lists from the first

/// lane of every active warp to lanes in the first warp.

///

/// void inter_warp_copy_func(void* reduce_data, num_warps)

///   shared smem[warp_size];

///   For all data entries D in reduce_data:

///     sync

///     If (I am the first lane in each warp)

///       Copy my local D to smem[warp_id]

///     sync

///     if (I am the first warp)

///       Copy smem[thread_id] to my local D

static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,

                                              ArrayRef<const Expr *> Privates,

                                              QualType ReductionArrayTy,

                                              SourceLocation Loc) {

  ASTContext &C = CGM.getContext();

  llvm::Module &M = CGM.getModule();


  // ReduceList: thread local Reduce list.

  // At the stage of the computation when this function is called, partially

  // aggregated values reside in the first lane of every active warp.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  // NumWarps: number of warps active in the parallel region.  This could

  // be smaller than 32 (max warps in a CTA) for partial block reduction.

  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                C.getIntTypeForBitwidth(32, /* Signed */ true),

                                ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&ReduceListArg);

  Args.push_back(&NumWarpsArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),

                                    llvm::GlobalValue::InternalLinkage,

                                    "_omp_reduction_inter_warp_copy_func", &M);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  // This array is used as a medium to transfer, one reduce element at a time,

  // the data from the first lane of every warp to lanes in the first warp

  // in order to perform the final step of a reduction in a parallel region

  // (reduction across warps).  The array is placed in NVPTX __shared__ memory

  // for reduced latency, as well as to have a distinct copy for concurrently

  // executing target regions.  The array is declared with common linkage so

  // as to be shared across compilation units.

  StringRef TransferMediumName =

      "__openmp_nvptx_data_transfer_temporary_storage";

  llvm::GlobalVariable *TransferMedium =

      M.getGlobalVariable(TransferMediumName);

  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;

  if (!TransferMedium) {

    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);

    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);

    TransferMedium = new llvm::GlobalVariable(

        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,

        llvm::UndefValue::get(Ty), TransferMediumName,

        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,

        SharedAddressSpace);

    CGM.addCompilerUsedGlobal(TransferMedium);

  }


  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the CUDA thread id of the current OpenMP thread on the GPU.

  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // nvptx_lane_id = nvptx_id % warpsize

  llvm::Value *LaneID = getNVPTXLaneID(CGF);

  // nvptx_warp_id = nvptx_id / warpsize

  llvm::Value *WarpID = getNVPTXWarpID(CGF);


  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);

  Address LocalReduceList(

      Bld.CreatePointerBitCastOrAddrSpaceCast(

          CGF.EmitLoadOfScalar(

              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,

              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),

          ElemTy->getPointerTo()),

      ElemTy, CGF.getPointerAlign());


  unsigned Idx = 0;

  for (const Expr *Private : Privates) {

    //

    // Warp master copies reduce element to transfer medium in __shared__

    // memory.

    //

    unsigned RealTySize =

        C.getTypeSizeInChars(Private->getType())

            .alignTo(C.getTypeAlignInChars(Private->getType()))

            .getQuantity();

    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {

      unsigned NumIters = RealTySize / TySize;

      if (NumIters == 0)

        continue;

      QualType CType = C.getIntTypeForBitwidth(

          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);

      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);

      CharUnits Align = CharUnits::fromQuantity(TySize);

      llvm::Value *Cnt = nullptr;

      Address CntAddr = Address::invalid();

      llvm::BasicBlock *PrecondBB = nullptr;

      llvm::BasicBlock *ExitBB = nullptr;

      if (NumIters > 1) {

        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");

        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,

                              /*Volatile=*/false, C.IntTy);

        PrecondBB = CGF.createBasicBlock("precond");

        ExitBB = CGF.createBasicBlock("exit");

        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");

        // There is no need to emit line number for unconditional branch.

        (void)ApplyDebugLocation::CreateEmpty(CGF);

        CGF.EmitBlock(PrecondBB);

        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);

        llvm::Value *Cmp =

            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));

        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);

        CGF.EmitBlock(BodyBB);

      }

      // kmpc_barrier.

      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,

                                             /*EmitChecks=*/false,

                                             /*ForceSimpleCall=*/true);

      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");

      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");

      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");


      // if (lane_id == 0)

      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");

      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);

      CGF.EmitBlock(ThenBB);


      // Reduce element = LocalReduceList[i]

      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);

      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(

          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

      // elemptr = ((CopyType*)(elemptrptr)) + I

      Address ElemPtr(ElemPtrPtr, CopyType, Align);

      if (NumIters > 1)

        ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt);


      // Get pointer to location in transfer medium.

      // MediumPtr = &medium[warp_id]

      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(

          TransferMedium->getValueType(), TransferMedium,

          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});

      // Casting to actual data type.

      // MediumPtr = (CopyType*)MediumPtrAddr;

      Address MediumPtr(MediumPtrVal, CopyType, Align);


      // elem = *elemptr

      //*MediumPtr = elem

      llvm::Value *Elem = CGF.EmitLoadOfScalar(

          ElemPtr, /*Volatile=*/false, CType, Loc,

          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());

      // Store the source element value to the dest element address.

      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,

                            LValueBaseInfo(AlignmentSource::Type),

                            TBAAAccessInfo());


      Bld.CreateBr(MergeBB);


      CGF.EmitBlock(ElseBB);

      Bld.CreateBr(MergeBB);


      CGF.EmitBlock(MergeBB);


      // kmpc_barrier.

      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,

                                             /*EmitChecks=*/false,

                                             /*ForceSimpleCall=*/true);


      //

      // Warp 0 copies reduce element from transfer medium.

      //

      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");

      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");

      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");


      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);

      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(

          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);


      // Up to 32 threads in warp 0 are active.

      llvm::Value *IsActiveThread =

          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");

      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);


      CGF.EmitBlock(W0ThenBB);


      // SrcMediumPtr = &medium[tid]

      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(

          TransferMedium->getValueType(), TransferMedium,

          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});

      // SrcMediumVal = *SrcMediumPtr;

      Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);


      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I

      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);

      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(

          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);

      Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);

      if (NumIters > 1)

        TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt);


      // *TargetElemPtr = SrcMediumVal;

      llvm::Value *SrcMediumValue =

          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);

      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,

                            CType);

      Bld.CreateBr(W0MergeBB);


      CGF.EmitBlock(W0ElseBB);

      Bld.CreateBr(W0MergeBB);


      CGF.EmitBlock(W0MergeBB);


      if (NumIters > 1) {

        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));

        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);

        CGF.EmitBranch(PrecondBB);

        (void)ApplyDebugLocation::CreateEmpty(CGF);

        CGF.EmitBlock(ExitBB);

      }

      RealTySize %= TySize;

    }

    ++Idx;

  }


  CGF.FinishFunction();

  return Fn;

}


/// Emit a helper that reduces data across two OpenMP threads (lanes)

/// in the same warp.  It uses shuffle instructions to copy over data from

/// a remote lane's stack.  The reduction algorithm performed is specified

/// by the fourth parameter.

///

/// Algorithm Versions.

/// Full Warp Reduce (argument value 0):

///   This algorithm assumes that all 32 lanes are active and gathers

///   data from these 32 lanes, producing a single resultant value.

/// Contiguous Partial Warp Reduce (argument value 1):

///   This algorithm assumes that only a *contiguous* subset of lanes

///   are active.  This happens for the last warp in a parallel region

///   when the user specified num_threads is not an integer multiple of

///   32.  This contiguous subset always starts with the zeroth lane.

/// Partial Warp Reduce (argument value 2):

///   This algorithm gathers data from any number of lanes at any position.

/// All reduced values are stored in the lowest possible lane.  The set

/// of problems every algorithm addresses is a super set of those

/// addressable by algorithms with a lower version number.  Overhead

/// increases as algorithm version increases.

///

/// Terminology

/// Reduce element:

///   Reduce element refers to the individual data field with primitive

///   data types to be combined and reduced across threads.

/// Reduce list:

///   Reduce list refers to a collection of local, thread-private

///   reduce elements.

/// Remote Reduce list:

///   Remote Reduce list refers to a collection of remote (relative to

///   the current thread) reduce elements.

///

/// We distinguish between three states of threads that are important to

/// the implementation of this function.

/// Alive threads:

///   Threads in a warp executing the SIMT instruction, as distinguished from

///   threads that are inactive due to divergent control flow.

/// Active threads:

///   The minimal set of threads that has to be alive upon entry to this

///   function.  The computation is correct iff active threads are alive.

///   Some threads are alive but they are not active because they do not

///   contribute to the computation in any useful manner.  Turning them off

///   may introduce control flow overheads without any tangible benefits.

/// Effective threads:

///   In order to comply with the argument requirements of the shuffle

///   function, we must keep all lanes holding data alive.  But at most

///   half of them perform value aggregation; we refer to this half of

///   threads as effective. The other half is simply handing off their

///   data.

///

/// Procedure

/// Value shuffle:

///   In this step active threads transfer data from higher lane positions

///   in the warp to lower lane positions, creating Remote Reduce list.

/// Value aggregation:

///   In this step, effective threads combine their thread local Reduce list

///   with Remote Reduce list and store the result in the thread local

///   Reduce list.

/// Value copy:

///   In this step, we deal with the assumption made by algorithm 2

///   (i.e. contiguity assumption).  When we have an odd number of lanes

///   active, say 2k+1, only k threads will be effective and therefore k

///   new values will be produced.  However, the Reduce list owned by the

///   (2k+1)th thread is ignored in the value aggregation.  Therefore

///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so

///   that the contiguity assumption still holds.

static llvm::Function *emitShuffleAndReduceFunction(

    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,

    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {

  ASTContext &C = CGM.getContext();


  // Thread local Reduce list used to host the values of data to be reduced.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  // Current lane id; could be logical.

  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,

                              ImplicitParamKind::Other);

  // Offset of the remote source lane relative to the current lane.

  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                        C.ShortTy, ImplicitParamKind::Other);

  // Algorithm version.  This is expected to be known at compile time.

  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                               C.ShortTy, ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&ReduceListArg);

  Args.push_back(&LaneIDArg);

  Args.push_back(&RemoteLaneOffsetArg);

  Args.push_back(&AlgoVerArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();


  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);

  Address LocalReduceList(

      Bld.CreatePointerBitCastOrAddrSpaceCast(

          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,

                               C.VoidPtrTy, SourceLocation()),

          ElemTy->getPointerTo()),

      ElemTy, CGF.getPointerAlign());


  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);

  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(

      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());


  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);

  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(

      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());


  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);

  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(

      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());


  // Create a local thread-private variable to host the Reduce list

  // from a remote lane.

  Address RemoteReduceList =

      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");


  // This loop iterates through the list of reduce elements and copies,

  // element by element, from a remote lane in the warp to RemoteReduceList,

  // hosted on the thread's stack.

  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,

                        LocalReduceList, RemoteReduceList,

                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,

                         /*ScratchpadIndex=*/nullptr,

                         /*ScratchpadWidth=*/nullptr});


  // The actions to be performed on the Remote Reduce list is dependent

  // on the algorithm version.

  //

  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&

  //  LaneId % 2 == 0 && Offset > 0):

  //    do the reduction value aggregation

  //

  //  The thread local variable Reduce list is mutated in place to host the

  //  reduced data, which is the aggregated value produced from local and

  //  remote lanes.

  //

  //  Note that AlgoVer is expected to be a constant integer known at compile

  //  time.

  //  When AlgoVer==0, the first conjunction evaluates to true, making

  //    the entire predicate true during compile time.

  //  When AlgoVer==1, the second conjunction has only the second part to be

  //    evaluated during runtime.  Other conjunctions evaluates to false

  //    during compile time.

  //  When AlgoVer==2, the third conjunction has only the second part to be

  //    evaluated during runtime.  Other conjunctions evaluates to false

  //    during compile time.

  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);


  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));

  llvm::Value *CondAlgo1 = Bld.CreateAnd(

      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));


  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));

  llvm::Value *CondAlgo2 = Bld.CreateAnd(

      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));

  CondAlgo2 = Bld.CreateAnd(

      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));


  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);

  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);


  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");

  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");

  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);


  CGF.EmitBlock(ThenBB);

  // reduce_function(LocalReduceList, RemoteReduceList)

  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);

  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);

  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(

      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});

  Bld.CreateBr(MergeBB);


  CGF.EmitBlock(ElseBB);

  Bld.CreateBr(MergeBB);


  CGF.EmitBlock(MergeBB);


  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local

  // Reduce list.

  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));

  llvm::Value *CondCopy = Bld.CreateAnd(

      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));


  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");

  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");

  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");

  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);


  CGF.EmitBlock(CpyThenBB);

  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,

                        RemoteReduceList, LocalReduceList);

  Bld.CreateBr(CpyMergeBB);


  CGF.EmitBlock(CpyElseBB);

  Bld.CreateBr(CpyMergeBB);


  CGF.EmitBlock(CpyMergeBB);


  CGF.FinishFunction();

  return Fn;

}


/// This function emits a helper that copies all the reduction variables from

/// the team into the provided global buffer for the reduction variables.

///

/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)

///   For all data entries D in reduce_data:

///     Copy local D to buffer.D[Idx]

static llvm::Value *emitListToGlobalCopyFunction(

    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,

    QualType ReductionArrayTy, SourceLocation Loc,

    const RecordDecl *TeamReductionRec,

    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &VarFieldMap) {

  ASTContext &C = CGM.getContext();


  // Buffer: global reduction buffer.

  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                              C.VoidPtrTy, ImplicitParamKind::Other);

  // Idx: index of the buffer.

  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,

                           ImplicitParamKind::Other);

  // ReduceList: thread local Reduce list.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&BufferArg);

  Args.push_back(&IdxArg);

  Args.push_back(&ReduceListArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);

  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);

  Address LocalReduceList(

      Bld.CreatePointerBitCastOrAddrSpaceCast(

          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,

                               C.VoidPtrTy, Loc),

          ElemTy->getPointerTo()),

      ElemTy, CGF.getPointerAlign());

  QualType StaticTy = C.getRecordType(TeamReductionRec);

  llvm::Type *LLVMReductionsBufferTy =

      CGM.getTypes().ConvertTypeForMem(StaticTy);

  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),

      LLVMReductionsBufferTy->getPointerTo());

  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),

                                              /*Volatile=*/false, C.IntTy,

                                              Loc)};

  unsigned Idx = 0;

  for (const Expr *Private : Privates) {

    // Reduce element = LocalReduceList[i]

    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);

    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(

        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

    // elemptr = ((CopyType*)(elemptrptr)) + I

    ElemTy = CGF.ConvertTypeForMem(Private->getType());

    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

        ElemPtrPtr, ElemTy->getPointerTo());

    Address ElemPtr =

        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));

    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();

    // Global = Buffer.VD[Idx];

    const FieldDecl *FD = VarFieldMap.lookup(VD);

    llvm::Value *BufferPtr =

        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);

    LValue GlobLVal = CGF.EmitLValueForField(

        CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);

    Address GlobAddr = GlobLVal.getAddress(CGF);

    GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),

                                CGF.ConvertTypeForMem(Private->getType()),

                                GlobAddr.getAlignment()));

    switch (CGF.getEvaluationKind(Private->getType())) {

    case TEK_Scalar: {

      llvm::Value *V = CGF.EmitLoadOfScalar(

          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,

          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());

      CGF.EmitStoreOfScalar(V, GlobLVal);

      break;

    }

    case TEK_Complex: {

      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(

          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);

      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);

      break;

    }

    case TEK_Aggregate:

      CGF.EmitAggregateCopy(GlobLVal,

                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),

                            Private->getType(), AggValueSlot::DoesNotOverlap);

      break;

    }

    ++Idx;

  }


  CGF.FinishFunction();

  return Fn;

}


/// This function emits a helper that reduces all the reduction variables from

/// the team into the provided global buffer for the reduction variables.

///

/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)

///  void *GlobPtrs[];

///  GlobPtrs[0] = (void*)&buffer.D0[Idx];

///  ...

///  GlobPtrs[N] = (void*)&buffer.DN[Idx];

///  reduce_function(GlobPtrs, reduce_data);

static llvm::Value *emitListToGlobalReduceFunction(

    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,

    QualType ReductionArrayTy, SourceLocation Loc,

    const RecordDecl *TeamReductionRec,

    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &VarFieldMap,

    llvm::Function *ReduceFn) {

  ASTContext &C = CGM.getContext();


  // Buffer: global reduction buffer.

  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                              C.VoidPtrTy, ImplicitParamKind::Other);

  // Idx: index of the buffer.

  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,

                           ImplicitParamKind::Other);

  // ReduceList: thread local Reduce list.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&BufferArg);

  Args.push_back(&IdxArg);

  Args.push_back(&ReduceListArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);

  QualType StaticTy = C.getRecordType(TeamReductionRec);

  llvm::Type *LLVMReductionsBufferTy =

      CGM.getTypes().ConvertTypeForMem(StaticTy);

  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),

      LLVMReductionsBufferTy->getPointerTo());


  // 1. Build a list of reduction variables.

  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};

  RawAddress ReductionList =

      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");

  auto IPriv = Privates.begin();

  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),

                                              /*Volatile=*/false, C.IntTy,

                                              Loc)};

  unsigned Idx = 0;

  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {

    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

    // Global = Buffer.VD[Idx];

    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();

    const FieldDecl *FD = VarFieldMap.lookup(VD);

    llvm::Value *BufferPtr =

        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);

    LValue GlobLVal = CGF.EmitLValueForField(

        CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);

    Address GlobAddr = GlobLVal.getAddress(CGF);

    CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,

                          /*Volatile=*/false, C.VoidPtrTy);

    if ((*IPriv)->getType()->isVariablyModifiedType()) {

      // Store array size.

      ++Idx;

      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

      llvm::Value *Size = CGF.Builder.CreateIntCast(

          CGF.getVLASize(

                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))

              .NumElts,

          CGF.SizeTy, /*isSigned=*/false);

      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),

                              Elem);

    }

  }


  // Call reduce_function(GlobalReduceList, ReduceList)

  llvm::Value *GlobalReduceList = ReductionList.getPointer();

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(

      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(

      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});

  CGF.FinishFunction();

  return Fn;

}


/// This function emits a helper that copies all the reduction variables from

/// the team into the provided global buffer for the reduction variables.

///

/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)

///   For all data entries D in reduce_data:

///     Copy buffer.D[Idx] to local D;

static llvm::Value *emitGlobalToListCopyFunction(

    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,

    QualType ReductionArrayTy, SourceLocation Loc,

    const RecordDecl *TeamReductionRec,

    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &VarFieldMap) {

  ASTContext &C = CGM.getContext();


  // Buffer: global reduction buffer.

  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                              C.VoidPtrTy, ImplicitParamKind::Other);

  // Idx: index of the buffer.

  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,

                           ImplicitParamKind::Other);

  // ReduceList: thread local Reduce list.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&BufferArg);

  Args.push_back(&IdxArg);

  Args.push_back(&ReduceListArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);

  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);

  Address LocalReduceList(

      Bld.CreatePointerBitCastOrAddrSpaceCast(

          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,

                               C.VoidPtrTy, Loc),

          ElemTy->getPointerTo()),

      ElemTy, CGF.getPointerAlign());

  QualType StaticTy = C.getRecordType(TeamReductionRec);

  llvm::Type *LLVMReductionsBufferTy =

      CGM.getTypes().ConvertTypeForMem(StaticTy);

  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),

      LLVMReductionsBufferTy->getPointerTo());


  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),

                                              /*Volatile=*/false, C.IntTy,

                                              Loc)};

  unsigned Idx = 0;

  for (const Expr *Private : Privates) {

    // Reduce element = LocalReduceList[i]

    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);

    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(

        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

    // elemptr = ((CopyType*)(elemptrptr)) + I

    ElemTy = CGF.ConvertTypeForMem(Private->getType());

    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

        ElemPtrPtr, ElemTy->getPointerTo());

    Address ElemPtr =

        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));

    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();

    // Global = Buffer.VD[Idx];

    const FieldDecl *FD = VarFieldMap.lookup(VD);

    llvm::Value *BufferPtr =

        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);

    LValue GlobLVal = CGF.EmitLValueForField(

        CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);

    Address GlobAddr = GlobLVal.getAddress(CGF);

    GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),

                                CGF.ConvertTypeForMem(Private->getType()),

                                GlobAddr.getAlignment()));

    switch (CGF.getEvaluationKind(Private->getType())) {

    case TEK_Scalar: {

      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);

      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),

                            LValueBaseInfo(AlignmentSource::Type),

                            TBAAAccessInfo());

      break;

    }

    case TEK_Complex: {

      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);

      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),

                             /*isInit=*/false);

      break;

    }

    case TEK_Aggregate:

      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),

                            GlobLVal, Private->getType(),

                            AggValueSlot::DoesNotOverlap);

      break;

    }

    ++Idx;

  }


  CGF.FinishFunction();

  return Fn;

}


/// This function emits a helper that reduces all the reduction variables from

/// the team into the provided global buffer for the reduction variables.

///

/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)

///  void *GlobPtrs[];

///  GlobPtrs[0] = (void*)&buffer.D0[Idx];

///  ...

///  GlobPtrs[N] = (void*)&buffer.DN[Idx];

///  reduce_function(reduce_data, GlobPtrs);

static llvm::Value *emitGlobalToListReduceFunction(

    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,

    QualType ReductionArrayTy, SourceLocation Loc,

    const RecordDecl *TeamReductionRec,

    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>

        &VarFieldMap,

    llvm::Function *ReduceFn) {

  ASTContext &C = CGM.getContext();


  // Buffer: global reduction buffer.

  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                              C.VoidPtrTy, ImplicitParamKind::Other);

  // Idx: index of the buffer.

  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,

                           ImplicitParamKind::Other);

  // ReduceList: thread local Reduce list.

  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,

                                  C.VoidPtrTy, ImplicitParamKind::Other);

  FunctionArgList Args;

  Args.push_back(&BufferArg);

  Args.push_back(&IdxArg);

  Args.push_back(&ReduceListArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);

  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);

  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);


  CGBuilderTy &Bld = CGF.Builder;


  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);

  QualType StaticTy = C.getRecordType(TeamReductionRec);

  llvm::Type *LLVMReductionsBufferTy =

      CGM.getTypes().ConvertTypeForMem(StaticTy);

  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(

      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),

      LLVMReductionsBufferTy->getPointerTo());


  // 1. Build a list of reduction variables.

  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};

  Address ReductionList =

      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");

  auto IPriv = Privates.begin();

  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),

                                              /*Volatile=*/false, C.IntTy,

                                              Loc)};

  unsigned Idx = 0;

  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {

    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

    // Global = Buffer.VD[Idx];

    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();

    const FieldDecl *FD = VarFieldMap.lookup(VD);

    llvm::Value *BufferPtr =

        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);

    LValue GlobLVal = CGF.EmitLValueForField(

        CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);

    Address GlobAddr = GlobLVal.getAddress(CGF);

    CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,

                          /*Volatile=*/false, C.VoidPtrTy);

    if ((*IPriv)->getType()->isVariablyModifiedType()) {

      // Store array size.

      ++Idx;

      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

      llvm::Value *Size = CGF.Builder.CreateIntCast(

          CGF.getVLASize(

                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))

              .NumElts,

          CGF.SizeTy, /*isSigned=*/false);

      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),

                              Elem);

    }

  }


  // Call reduce_function(ReduceList, GlobalReduceList)

  llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);

  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(

      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(

      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});

  CGF.FinishFunction();

  return Fn;

}


///

/// Design of OpenMP reductions on the GPU

///

/// Consider a typical OpenMP program with one or more reduction

/// clauses:

///

/// float foo;

/// double bar;

/// #pragma omp target teams distribute parallel for \

///             reduction(+:foo) reduction(*:bar)

/// for (int i = 0; i < N; i++) {

///   foo += A[i]; bar *= B[i];

/// }

///

/// where 'foo' and 'bar' are reduced across all OpenMP threads in

/// all teams.  In our OpenMP implementation on the NVPTX device an

/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads

/// within a team are mapped to CUDA threads within a threadblock.

/// Our goal is to efficiently aggregate values across all OpenMP

/// threads such that:

///

///   - the compiler and runtime are logically concise, and

///   - the reduction is performed efficiently in a hierarchical

///     manner as follows: within OpenMP threads in the same warp,

///     across warps in a threadblock, and finally across teams on

///     the NVPTX device.

///

/// Introduction to Decoupling

///

/// We would like to decouple the compiler and the runtime so that the

/// latter is ignorant of the reduction variables (number, data types)

/// and the reduction operators.  This allows a simpler interface

/// and implementation while still attaining good performance.

///

/// Pseudocode for the aforementioned OpenMP program generated by the

/// compiler is as follows:

///

/// 1. Create private copies of reduction variables on each OpenMP

///    thread: 'foo_private', 'bar_private'

/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned

///    to it and writes the result in 'foo_private' and 'bar_private'

///    respectively.

/// 3. Call the OpenMP runtime on the GPU to reduce within a team

///    and store the result on the team master:

///

///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,

///        reduceData, shuffleReduceFn, interWarpCpyFn)

///

///     where:

///       struct ReduceData {

///         double *foo;

///         double *bar;

///       } reduceData

///       reduceData.foo = &foo_private

///       reduceData.bar = &bar_private

///

///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two

///     auxiliary functions generated by the compiler that operate on

///     variables of type 'ReduceData'.  They aid the runtime perform

///     algorithmic steps in a data agnostic manner.

///

///     'shuffleReduceFn' is a pointer to a function that reduces data

///     of type 'ReduceData' across two OpenMP threads (lanes) in the

///     same warp.  It takes the following arguments as input:

///

///     a. variable of type 'ReduceData' on the calling lane,

///     b. its lane_id,

///     c. an offset relative to the current lane_id to generate a

///        remote_lane_id.  The remote lane contains the second

///        variable of type 'ReduceData' that is to be reduced.

///     d. an algorithm version parameter determining which reduction

///        algorithm to use.

///

///     'shuffleReduceFn' retrieves data from the remote lane using

///     efficient GPU shuffle intrinsics and reduces, using the

///     algorithm specified by the 4th parameter, the two operands

///     element-wise.  The result is written to the first operand.

///

///     Different reduction algorithms are implemented in different

///     runtime functions, all calling 'shuffleReduceFn' to perform

///     the essential reduction step.  Therefore, based on the 4th

///     parameter, this function behaves slightly differently to

///     cooperate with the runtime to ensure correctness under

///     different circumstances.

///

///     'InterWarpCpyFn' is a pointer to a function that transfers

///     reduced variables across warps.  It tunnels, through CUDA

///     shared memory, the thread-private data of type 'ReduceData'

///     from lane 0 of each warp to a lane in the first warp.

/// 4. Call the OpenMP runtime on the GPU to reduce across teams.

///    The last team writes the global reduced value to memory.

///

///     ret = __kmpc_nvptx_teams_reduce_nowait(...,

///             reduceData, shuffleReduceFn, interWarpCpyFn,

///             scratchpadCopyFn, loadAndReduceFn)

///

///     'scratchpadCopyFn' is a helper that stores reduced

///     data from the team master to a scratchpad array in

///     global memory.

///

///     'loadAndReduceFn' is a helper that loads data from

///     the scratchpad array and reduces it with the input

///     operand.

///

///     These compiler generated functions hide address

///     calculation and alignment information from the runtime.

/// 5. if ret == 1:

///     The team master of the last team stores the reduced

///     result to the globals in memory.

///     foo += reduceData.foo; bar *= reduceData.bar

///

///

/// Warp Reduction Algorithms

///

/// On the warp level, we have three algorithms implemented in the

/// OpenMP runtime depending on the number of active lanes:

///

/// Full Warp Reduction

///

/// The reduce algorithm within a warp where all lanes are active

/// is implemented in the runtime as follows:

///

/// full_warp_reduce(void *reduce_data,

///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {

///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)

///     ShuffleReduceFn(reduce_data, 0, offset, 0);

/// }

///

/// The algorithm completes in log(2, WARPSIZE) steps.

///

/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is

/// not used therefore we save instructions by not retrieving lane_id

/// from the corresponding special registers.  The 4th parameter, which

/// represents the version of the algorithm being used, is set to 0 to

/// signify full warp reduction.

///

/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:

///

/// #reduce_elem refers to an element in the local lane's data structure

/// #remote_elem is retrieved from a remote lane

/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);

/// reduce_elem = reduce_elem REDUCE_OP remote_elem;

///

/// Contiguous Partial Warp Reduction

///

/// This reduce algorithm is used within a warp where only the first

/// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the

/// number of OpenMP threads in a parallel region is not a multiple of

/// WARPSIZE.  The algorithm is implemented in the runtime as follows:

///

/// void

/// contiguous_partial_reduce(void *reduce_data,

///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,

///                           int size, int lane_id) {

///   int curr_size;

///   int offset;

///   curr_size = size;

///   mask = curr_size/2;

///   while (offset>0) {

///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);

///     curr_size = (curr_size+1)/2;

///     offset = curr_size/2;

///   }

/// }

///

/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:

///

/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);

/// if (lane_id < offset)

///     reduce_elem = reduce_elem REDUCE_OP remote_elem

/// else

///     reduce_elem = remote_elem

///

/// This algorithm assumes that the data to be reduced are located in a

/// contiguous subset of lanes starting from the first.  When there is

/// an odd number of active lanes, the data in the last lane is not

/// aggregated with any other lane's dat but is instead copied over.

///

/// Dispersed Partial Warp Reduction

///

/// This algorithm is used within a warp when any discontiguous subset of

/// lanes are active.  It is used to implement the reduction operation

/// across lanes in an OpenMP simd region or in a nested parallel region.

///

/// void

/// dispersed_partial_reduce(void *reduce_data,

///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {

///   int size, remote_id;

///   int logical_lane_id = number_of_active_lanes_before_me() * 2;

///   do {

///       remote_id = next_active_lane_id_right_after_me();

///       # the above function returns 0 of no active lane

///       # is present right after the current lane.

///       size = number_of_active_lanes_in_this_warp();

///       logical_lane_id /= 2;

///       ShuffleReduceFn(reduce_data, logical_lane_id,

///                       remote_id-1-threadIdx.x, 2);

///   } while (logical_lane_id % 2 == 0 && size > 1);

/// }

///

/// There is no assumption made about the initial state of the reduction.

/// Any number of lanes (>=1) could be active at any position.  The reduction

/// result is returned in the first active lane.

///

/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:

///

/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);

/// if (lane_id % 2 == 0 && offset > 0)

///     reduce_elem = reduce_elem REDUCE_OP remote_elem

/// else

///     reduce_elem = remote_elem

///

///

/// Intra-Team Reduction

///

/// This function, as implemented in the runtime call

/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP

/// threads in a team.  It first reduces within a warp using the

/// aforementioned algorithms.  We then proceed to gather all such

/// reduced values at the first warp.

///

/// The runtime makes use of the function 'InterWarpCpyFn', which copies

/// data from each of the "warp master" (zeroth lane of each warp, where

/// warp-reduced data is held) to the zeroth warp.  This step reduces (in

/// a mathematical sense) the problem of reduction across warp masters in

/// a block to the problem of warp reduction.

///

///

/// Inter-Team Reduction

///

/// Once a team has reduced its data to a single value, it is stored in

/// a global scratchpad array.  Since each team has a distinct slot, this

/// can be done without locking.

///

/// The last team to write to the scratchpad array proceeds to reduce the

/// scratchpad array.  One or more workers in the last team use the helper

/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,

/// the k'th worker reduces every k'th element.

///

/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to

/// reduce across workers and compute a globally reduced value.

///

void CGOpenMPRuntimeGPU::emitReduction(

    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,

    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,

    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {

  if (!CGF.HaveInsertPoint())

    return;


  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);

#ifndef NDEBUG

  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);

#endif


  if (Options.SimpleReduction) {

    assert(!TeamsReduction && !ParallelReduction &&

           "Invalid reduction selection in emitReduction.");

    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,

                                   ReductionOps, Options);

    return;

  }


  assert((TeamsReduction || ParallelReduction) &&

         "Invalid reduction selection in emitReduction.");


  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;

  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());

  int Cnt = 0;

  for (const Expr *DRE : Privates) {

    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();

    ++Cnt;

  }


  ASTContext &C = CGM.getContext();

  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(

      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);


  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),

  // RedList, shuffle_reduce_func, interwarp_copy_func);

  // or

  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);

  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);


  llvm::Value *Res;

  // 1. Build a list of reduction variables.

  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};

  auto Size = RHSExprs.size();

  for (const Expr *E : Privates) {

    if (E->getType()->isVariablyModifiedType())

      // Reserve place for array size.

      ++Size;

  }

  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);

  QualType ReductionArrayTy = C.getConstantArrayType(

      C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,

      /*IndexTypeQuals=*/0);

  Address ReductionList =

      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");

  auto IPriv = Privates.begin();

  unsigned Idx = 0;

  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {

    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

    CGF.Builder.CreateStore(

        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(

            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),

        Elem);

    if ((*IPriv)->getType()->isVariablyModifiedType()) {

      // Store array size.

      ++Idx;

      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);

      llvm::Value *Size = CGF.Builder.CreateIntCast(

          CGF.getVLASize(

                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))

              .NumElts,

          CGF.SizeTy, /*isSigned=*/false);

      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),

                              Elem);

    }

  }


  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(

      ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);

  llvm::Function *ReductionFn = emitReductionFunction(

      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),

      Privates, LHSExprs, RHSExprs, ReductionOps);

  llvm::Value *ReductionDataSize =

      CGF.getTypeSize(C.getRecordType(ReductionRec));

  ReductionDataSize =

      CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);

  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(

      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

  llvm::Value *InterWarpCopyFn =

      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);


  if (ParallelReduction) {

    llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,

                           InterWarpCopyFn};


    Res = CGF.EmitRuntimeCall(

        OMPBuilder.getOrCreateRuntimeFunction(

            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),

        Args);

  } else {

    assert(TeamsReduction && "expected teams reduction.");

    TeamsReductions.push_back(ReductionRec);

    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(

        OMPBuilder.getOrCreateRuntimeFunction(

            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),

        {}, "_openmp_teams_reductions_buffer_$_$ptr");

    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(

        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);

    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(

        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,

        ReductionFn);

    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(

        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);

    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(

        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,

        ReductionFn);


    llvm::Value *Args[] = {

        RTLoc,

        KernelTeamsReductionPtr,

        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),

        ReductionDataSize,

        RL,

        ShuffleAndReduceFn,

        InterWarpCopyFn,

        GlobalToBufferCpyFn,

        GlobalToBufferRedFn,

        BufferToGlobalCpyFn,

        BufferToGlobalRedFn};


    Res = CGF.EmitRuntimeCall(

        OMPBuilder.getOrCreateRuntimeFunction(

            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),

        Args);

  }


  // 5. Build if (res == 1)

  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");

  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(

      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));

  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);


  // 6. Build then branch: where we have reduced values in the master

  //    thread in each team.

  //    __kmpc_end_reduce{_nowait}(<gtid>);

  //    break;

  CGF.EmitBlock(ThenBB);


  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);

  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,

                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {

    auto IPriv = Privates.begin();

    auto ILHS = LHSExprs.begin();

    auto IRHS = RHSExprs.begin();

    for (const Expr *E : ReductionOps) {

      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),

                                  cast<DeclRefExpr>(*IRHS));

      ++IPriv;

      ++ILHS;

      ++IRHS;

    }

  };

  RegionCodeGenTy RCG(CodeGen);

  RCG(CGF);

  // There is no need to emit line number for unconditional branch.

  (void)ApplyDebugLocation::CreateEmpty(CGF);

  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);

}


const VarDecl *

CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,

                                       const VarDecl *NativeParam) const {

  if (!NativeParam->getType()->isReferenceType())

    return NativeParam;

  QualType ArgType = NativeParam->getType();

  QualifierCollector QC;

  const Type *NonQualTy = QC.strip(ArgType);

  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();

  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {

    if (Attr->getCaptureKind() == OMPC_map) {

      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,

                                                        LangAS::opencl_global);

    }

  }

  ArgType = CGM.getContext().getPointerType(PointeeTy);

  QC.addRestrict();

  enum { NVPTX_local_addr = 5 };

  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));

  ArgType = QC.apply(CGM.getContext(), ArgType);

  if (isa<ImplicitParamDecl>(NativeParam))

    return ImplicitParamDecl::Create(

        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),

        NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);

  return ParmVarDecl::Create(

      CGM.getContext(),

      const_cast<DeclContext *>(NativeParam->getDeclContext()),

      NativeParam->getBeginLoc(), NativeParam->getLocation(),

      NativeParam->getIdentifier(), ArgType,

      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);

}


Address

CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,

                                          const VarDecl *NativeParam,

                                          const VarDecl *TargetParam) const {

  assert(NativeParam != TargetParam &&

         NativeParam->getType()->isReferenceType() &&

         "Native arg must not be the same as target arg.");

  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);

  QualType NativeParamType = NativeParam->getType();

  QualifierCollector QC;

  const Type *NonQualTy = QC.strip(NativeParamType);

  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();

  unsigned NativePointeeAddrSpace =

      CGF.getTypes().getTargetAddressSpace(NativePointeeTy);

  QualType TargetTy = TargetParam->getType();

  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,

                                                 TargetTy, SourceLocation());

  // Cast to native address space.

  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(

      TargetAddr,

      llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));

  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);

  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,

                        NativeParamType);

  return NativeParamAddr;

}


void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(

    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,

    ArrayRef<llvm::Value *> Args) const {

  SmallVector<llvm::Value *, 4> TargetArgs;

  TargetArgs.reserve(Args.size());

  auto *FnType = OutlinedFn.getFunctionType();

  for (unsigned I = 0, E = Args.size(); I < E; ++I) {

    if (FnType->isVarArg() && FnType->getNumParams() <= I) {

      TargetArgs.append(std::next(Args.begin(), I), Args.end());

      break;

    }

    llvm::Type *TargetType = FnType->getParamType(I);

    llvm::Value *NativeArg = Args[I];

    if (!TargetType->isPointerTy()) {

      TargetArgs.emplace_back(NativeArg);

      continue;

    }

    TargetArgs.emplace_back(

        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));

  }

  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);

}


/// Emit function which wraps the outline parallel region

/// and controls the arguments which are passed to this function.

/// The wrapper ensures that the outlined function is called

/// with the correct arguments when data is shared.

llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(

    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {

  ASTContext &Ctx = CGM.getContext();

  const auto &CS = *D.getCapturedStmt(OMPD_parallel);


  // Create a function that takes as argument the source thread.

  FunctionArgList WrapperArgs;

  QualType Int16QTy =

      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);

  QualType Int32QTy =

      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);

  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),

                                     /*Id=*/nullptr, Int16QTy,

                                     ImplicitParamKind::Other);

  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),

                               /*Id=*/nullptr, Int32QTy,

                               ImplicitParamKind::Other);

  WrapperArgs.emplace_back(&ParallelLevelArg);

  WrapperArgs.emplace_back(&WrapperArg);


  const CGFunctionInfo &CGFI =

      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);


  auto *Fn = llvm::Function::Create(

      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,

      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());


  // Ensure we do not inline the function. This is trivially true for the ones

  // passed to __kmpc_fork_call but the ones calles in serialized regions

  // could be inlined. This is not a perfect but it is closer to the invariant

  // we want, namely, every data environment starts with a new function.

  // TODO: We should pass the if condition to the runtime function and do the

  //       handling there. Much cleaner code.

  Fn->addFnAttr(llvm::Attribute::NoInline);


  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);

  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  Fn->setDoesNotRecurse();


  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);

  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,

                    D.getBeginLoc(), D.getBeginLoc());


  const auto *RD = CS.getCapturedRecordDecl();

  auto CurField = RD->field_begin();


  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,

                                                      /*Name=*/".zero.addr");

  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);

  // Get the array of arguments.

  SmallVector<llvm::Value *, 8> Args;


  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));

  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));


  CGBuilderTy &Bld = CGF.Builder;

  auto CI = CS.capture_begin();


  // Use global memory for data sharing.

  // Handle passing of global args to workers.

  RawAddress GlobalArgs =

      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");

  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();

  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),

                      DataSharingArgs);


  // Retrieve the shared variables from the list of references returned

  // by the runtime. Pass the variables to the outlined function.

  Address SharedArgListAddress = Address::invalid();

  if (CS.capture_size() > 0 ||

      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {

    SharedArgListAddress = CGF.EmitLoadOfPointer(

        GlobalArgs, CGF.getContext()

                        .getPointerType(CGF.getContext().VoidPtrTy)

                        .castAs<PointerType>());

  }

  unsigned Idx = 0;

  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {

    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);

    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(

        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);

    llvm::Value *LB = CGF.EmitLoadOfScalar(

        TypedAddress,

        /*Volatile=*/false,

        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),

        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());

    Args.emplace_back(LB);

    ++Idx;

    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);

    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(

        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);

    llvm::Value *UB = CGF.EmitLoadOfScalar(

        TypedAddress,

        /*Volatile=*/false,

        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),

        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());

    Args.emplace_back(UB);

    ++Idx;

  }

  if (CS.capture_size() > 0) {

    ASTContext &CGFContext = CGF.getContext();

    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {

      QualType ElemTy = CurField->getType();

      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);

      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(

          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),

          CGF.ConvertTypeForMem(ElemTy));

      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,

                                              /*Volatile=*/false,

                                              CGFContext.getPointerType(ElemTy),

                                              CI->getLocation());

      if (CI->capturesVariableByCopy() &&

          !CI->getCapturedVar()->getType()->isAnyPointerType()) {

        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),

                              CI->getLocation());

      }

      Args.emplace_back(Arg);

    }

  }


  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);

  CGF.FinishFunction();

  return Fn;

}


void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,

                                              const Decl *D) {

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)

    return;


  assert(D && "Expected function or captured|block decl.");

  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&

         "Function is registered already.");

  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&

         "Team is set but not processed.");

  const Stmt *Body = nullptr;

  bool NeedToDelayGlobalization = false;

  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {

    Body = FD->getBody();

  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {

    Body = BD->getBody();

  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {

    Body = CD->getBody();

    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;

    if (NeedToDelayGlobalization &&

        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)

      return;

  }

  if (!Body)

    return;

  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);

  VarChecker.Visit(Body);

  const RecordDecl *GlobalizedVarsRecord =

      VarChecker.getGlobalizedRecord(IsInTTDRegion);

  TeamAndReductions.first = nullptr;

  TeamAndReductions.second.clear();

  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =

      VarChecker.getEscapedVariableLengthDecls();

  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =

      VarChecker.getDelayedVariableLengthDecls();

  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&

      DelayedVariableLengthDecls.empty())

    return;

  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;

  I->getSecond().MappedParams =

      std::make_unique<CodeGenFunction::OMPMapVars>();

  I->getSecond().EscapedParameters.insert(

      VarChecker.getEscapedParameters().begin(),

      VarChecker.getEscapedParameters().end());

  I->getSecond().EscapedVariableLengthDecls.append(

      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());

  I->getSecond().DelayedVariableLengthDecls.append(

      DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());

  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;

  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {

    assert(VD->isCanonicalDecl() && "Expected canonical declaration");

    Data.insert(std::make_pair(VD, MappedVarData()));

  }

  if (!NeedToDelayGlobalization) {

    emitGenericVarsProlog(CGF, D->getBeginLoc());

    struct GlobalizationScope final : EHScopeStack::Cleanup {

      GlobalizationScope() = default;


      void Emit(CodeGenFunction &CGF, Flags flags) override {

        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())

            .emitGenericVarsEpilog(CGF);

      }

    };

    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);

  }

}


Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,

                                                        const VarDecl *VD) {

  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {

    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();

    auto AS = LangAS::Default;

    switch (A->getAllocatorType()) {

      // Use the default allocator here as by default local vars are

      // threadlocal.

    case OMPAllocateDeclAttr::OMPNullMemAlloc:

    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:

    case OMPAllocateDeclAttr::OMPThreadMemAlloc:

    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:

    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:

      // Follow the user decision - use default allocation.

      return Address::invalid();

    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:

      // TODO: implement aupport for user-defined allocators.

      return Address::invalid();

    case OMPAllocateDeclAttr::OMPConstMemAlloc:

      AS = LangAS::cuda_constant;

      break;

    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:

      AS = LangAS::cuda_shared;

      break;

    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:

    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:

      break;

    }

    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());

    auto *GV = new llvm::GlobalVariable(

        CGM.getModule(), VarTy, /*isConstant=*/false,

        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),

        VD->getName(),

        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,

        CGM.getContext().getTargetAddressSpace(AS));

    CharUnits Align = CGM.getContext().getDeclAlign(VD);

    GV->setAlignment(Align.getAsAlign());

    return Address(

        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(

            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(

                    VD->getType().getAddressSpace()))),

        VarTy, Align);

  }


  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)

    return Address::invalid();


  VD = VD->getCanonicalDecl();

  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);

  if (I == FunctionGlobalizedDecls.end())

    return Address::invalid();

  auto VDI = I->getSecond().LocalVarData.find(VD);

  if (VDI != I->getSecond().LocalVarData.end())

    return VDI->second.PrivateAddr;

  if (VD->hasAttrs()) {

    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),

         E(VD->attr_end());

         IT != E; ++IT) {

      auto VDI = I->getSecond().LocalVarData.find(

          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())

              ->getCanonicalDecl());

      if (VDI != I->getSecond().LocalVarData.end())

        return VDI->second.PrivateAddr;

    }

  }


  return Address::invalid();

}


void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {

  FunctionGlobalizedDecls.erase(CGF.CurFn);

  CGOpenMPRuntime::functionFinished(CGF);

}


void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(

    CodeGenFunction &CGF, const OMPLoopDirective &S,

    OpenMPDistScheduleClauseKind &ScheduleKind,

    llvm::Value *&Chunk) const {

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {

    ScheduleKind = OMPC_DIST_SCHEDULE_static;

    Chunk = CGF.EmitScalarConversion(

        RT.getGPUNumThreads(CGF),

        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),

        S.getIterationVariable()->getType(), S.getBeginLoc());

    return;

  }

  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(

      CGF, S, ScheduleKind, Chunk);

}


void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(

    CodeGenFunction &CGF, const OMPLoopDirective &S,

    OpenMPScheduleClauseKind &ScheduleKind,

    const Expr *&ChunkExpr) const {

  ScheduleKind = OMPC_SCHEDULE_static;

  // Chunk size is 1 in this case.

  llvm::APInt ChunkSize(32, 1);

  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,

      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),

      SourceLocation());

}


void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(

    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {

  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&

         " Expected target-based directive.");

  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);

  for (const CapturedStmt::Capture &C : CS->captures()) {

    // Capture variables captured by reference in lambdas for target-based

    // directives.

    if (!C.capturesVariable())

      continue;

    const VarDecl *VD = C.getCapturedVar();

    const auto *RD = VD->getType()

                         .getCanonicalType()

                         .getNonReferenceType()

                         ->getAsCXXRecordDecl();

    if (!RD || !RD->isLambda())

      continue;

    Address VDAddr = CGF.GetAddrOfLocalVar(VD);

    LValue VDLVal;

    if (VD->getType().getCanonicalType()->isReferenceType())

      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());

    else

      VDLVal = CGF.MakeAddrLValue(

          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());

    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;

    FieldDecl *ThisCapture = nullptr;

    RD->getCaptureFields(Captures, ThisCapture);

    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {

      LValue ThisLVal =

          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);

      llvm::Value *CXXThis = CGF.LoadCXXThis();

      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);

    }

    for (const LambdaCapture &LC : RD->captures()) {

      if (LC.getCaptureKind() != LCK_ByRef)

        continue;

      const ValueDecl *VD = LC.getCapturedVar();

      // FIXME: For now VD is always a VarDecl because OpenMP does not support

      //  capturing structured bindings in lambdas yet.

      if (!CS->capturesVariable(cast<VarDecl>(VD)))

        continue;

      auto It = Captures.find(VD);

      assert(It != Captures.end() && "Found lambda capture without field.");

      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);

      Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));

      if (VD->getType().getCanonicalType()->isReferenceType())

        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,

                                               VD->getType().getCanonicalType())

                     .getAddress(CGF);

      CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);

    }

  }

}


bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,

                                                            LangAS &AS) {

  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())

    return false;

  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();

  switch(A->getAllocatorType()) {

  case OMPAllocateDeclAttr::OMPNullMemAlloc:

  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:

  // Not supported, fallback to the default mem space.

  case OMPAllocateDeclAttr::OMPThreadMemAlloc:

  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:

  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:

  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:

  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:

    AS = LangAS::Default;

    return true;

  case OMPAllocateDeclAttr::OMPConstMemAlloc:

    AS = LangAS::cuda_constant;

    return true;

  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:

    AS = LangAS::cuda_shared;

    return true;

  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:

    llvm_unreachable("Expected predefined allocator for the variables with the "

                     "static storage.");

  }

  return false;

}


// Get current CudaArch and ignore any unknown values

static CudaArch getCudaArch(CodeGenModule &CGM) {

  if (!CGM.getTarget().hasFeature("ptx"))

    return CudaArch::UNKNOWN;

  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {

    if (Feature.getValue()) {

      CudaArch Arch = StringToCudaArch(Feature.getKey());

      if (Arch != CudaArch::UNKNOWN)

        return Arch;

    }

  }

  return CudaArch::UNKNOWN;

}


/// Check to see if target architecture supports unified addressing which is

/// a restriction for OpenMP requires clause "unified_shared_memory".

void CGOpenMPRuntimeGPU::processRequiresDirective(

    const OMPRequiresDecl *D) {

  for (const OMPClause *Clause : D->clauselists()) {

    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {

      CudaArch Arch = getCudaArch(CGM);

      switch (Arch) {

      case CudaArch::SM_20:

      case CudaArch::SM_21:

      case CudaArch::SM_30:

      case CudaArch::SM_32_:

      case CudaArch::SM_35:

      case CudaArch::SM_37:

      case CudaArch::SM_50:

      case CudaArch::SM_52:

      case CudaArch::SM_53: {

        SmallString<256> Buffer;

        llvm::raw_svector_ostream Out(Buffer);

        Out << "Target architecture " << CudaArchToString(Arch)

            << " does not support unified addressing";

        CGM.Error(Clause->getBeginLoc(), Out.str());

        return;

      }

      case CudaArch::SM_60:

      case CudaArch::SM_61:

      case CudaArch::SM_62:

      case CudaArch::SM_70:

      case CudaArch::SM_72:

      case CudaArch::SM_75:

      case CudaArch::SM_80:

      case CudaArch::SM_86:

      case CudaArch::SM_87:

      case CudaArch::SM_89:

      case CudaArch::SM_90:

      case CudaArch::SM_90a:

      case CudaArch::GFX600:

      case CudaArch::GFX601:

      case CudaArch::GFX602:

      case CudaArch::GFX700:

      case CudaArch::GFX701:

      case CudaArch::GFX702:

      case CudaArch::GFX703:

      case CudaArch::GFX704:

      case CudaArch::GFX705:

      case CudaArch::GFX801:

      case CudaArch::GFX802:

      case CudaArch::GFX803:

      case CudaArch::GFX805:

      case CudaArch::GFX810:

      case CudaArch::GFX900:

      case CudaArch::GFX902:

      case CudaArch::GFX904:

      case CudaArch::GFX906:

      case CudaArch::GFX908:

      case CudaArch::GFX909:

      case CudaArch::GFX90a:

      case CudaArch::GFX90c:

      case CudaArch::GFX940:

      case CudaArch::GFX941:

      case CudaArch::GFX942:

      case CudaArch::GFX1010:

      case CudaArch::GFX1011:

      case CudaArch::GFX1012:

      case CudaArch::GFX1013:

      case CudaArch::GFX1030:

      case CudaArch::GFX1031:

      case CudaArch::GFX1032:

      case CudaArch::GFX1033:

      case CudaArch::GFX1034:

      case CudaArch::GFX1035:

      case CudaArch::GFX1036:

      case CudaArch::GFX1100:

      case CudaArch::GFX1101:

      case CudaArch::GFX1102:

      case CudaArch::GFX1103:

      case CudaArch::GFX1150:

      case CudaArch::GFX1151:

      case CudaArch::GFX1200:

      case CudaArch::GFX1201:

      case CudaArch::Generic:

      case CudaArch::UNUSED:

      case CudaArch::UNKNOWN:

        break;

      case CudaArch::LAST:

        llvm_unreachable("Unexpected Cuda arch.");

      }

    }

  }

  CGOpenMPRuntime::processRequiresDirective(D);

}


llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {

  CGBuilderTy &Bld = CGF.Builder;

  llvm::Module *M = &CGF.CGM.getModule();

  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";

  llvm::Function *F = M->getFunction(LocSize);

  if (!F) {

    F = llvm::Function::Create(

        llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),

        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());

  }

  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");

}


llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {

  ArrayRef<llvm::Value *> Args{};

  return CGF.EmitRuntimeCall(

      OMPBuilder.getOrCreateRuntimeFunction(

          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),

      Args);

}


llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {

  ArrayRef<llvm::Value *> Args{};

  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(

                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),

                             Args);

}

V
#define V(N, I)
Definition: ASTContext.h:3284

Attr.h

getNVPTXLaneID
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
Definition: CGOpenMPRuntimeGPU.cpp:518

getCudaArch
static CudaArch getCudaArch(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:3445

emitListToGlobalCopyFunction
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2158

emitGlobalToListReduceFunction
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2476

emitInterWarpCopyFunction
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
Definition: CGOpenMPRuntimeGPU.cpp:1708

getTeamsReductionVars
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
Definition: CGOpenMPRuntimeGPU.cpp:986

castValueToType
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
Definition: CGOpenMPRuntimeGPU.cpp:1412

emitReductionListCopy
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
Definition: CGOpenMPRuntimeGPU.cpp:1577

getDistributeLastprivateVars
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ... or teams {distribute ....
Definition: CGOpenMPRuntimeGPU.cpp:961

shuffleAndStore
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
Definition: CGOpenMPRuntimeGPU.cpp:1473

hasNestedSPMDDirective
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Definition: CGOpenMPRuntimeGPU.cpp:540

emitShuffleAndReduceFunction
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
Definition: CGOpenMPRuntimeGPU.cpp:2000

supportsSPMDExecutionMode
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Definition: CGOpenMPRuntimeGPU.cpp:642

emitListToGlobalReduceFunction
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
Definition: CGOpenMPRuntimeGPU.cpp:2269

emitGlobalToListCopyFunction
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
Definition: CGOpenMPRuntimeGPU.cpp:2364

createRuntimeShuffleFunction
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
Definition: CGOpenMPRuntimeGPU.cpp:1440

getNVPTXWarpID
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Definition: CGOpenMPRuntimeGPU.cpp:507

CGOpenMPRuntimeGPU.h

CodeGenFunction.h

DeclOpenMP.h
This file defines OpenMP nodes for declarative directives.

OpenMPClause.h
This file defines OpenMP AST classes for clauses.

getPrivateItem
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
Definition: SemaOpenMP.cpp:5404

Data
const char * Data
Definition: StandardLibrary.cpp:36

StmtOpenMP.h
This file defines OpenMP AST classes for executable directives and clauses.

StmtVisitor.h

Base

clang::ASTContext
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:182

clang::ASTContext::getPointerType
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
Definition: ASTContext.cpp:3310

clang::ASTContext::VoidPtrTy
CanQualType VoidPtrTy
Definition: ASTContext.h:1118

clang::ASTContext::getUIntPtrType
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Definition: ASTContext.cpp:6053

clang::ASTContext::getIntTypeForBitwidth
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
Definition: ASTContext.cpp:12218

clang::ASTContext::getSizeType
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
Definition: ASTContext.cpp:6015

clang::ASTContext::getDeclAlign
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
Definition: ASTContext.cpp:1639

clang::ASTContext::toBits
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
Definition: ASTContext.cpp:2427

clang::ASTContext::getTypeSizeInChars
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
Definition: ASTContext.cpp:2433

clang::ASTContext::VoidTy
CanQualType VoidTy
Definition: ASTContext.h:1091

clang::ASTContext::getAsVariableArrayType
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2770

clang::ASTContext::getTargetInfo
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:757

clang::ASTContext::getAddrSpaceQualType
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
Definition: ASTContext.cpp:3037

clang::ASTContext::getTargetAddressSpace
unsigned getTargetAddressSpace(LangAS AS) const
Definition: ASTContext.cpp:12500

clang::Attr
Attr - This represents one attribute.
Definition: Attr.h:42

clang::BlockDecl::Capture
A class which contains all the information about a particular captured value.
Definition: Decl.h:4501

clang::BlockDecl::captures
ArrayRef< Capture > captures() const
Definition: Decl.h:4622

clang::BlockExpr
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6173

clang::BlockExpr::getBlockDecl
const BlockDecl * getBlockDecl() const
Definition: Expr.h:6185

clang::CallExpr
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2820

clang::CallExpr::getCallee
Expr * getCallee()
Definition: Expr.h:2970

clang::CallExpr::arguments
arg_range arguments()
Definition: Expr.h:3059

clang::CapturedStmt::Capture
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3770

clang::CapturedStmt
This captures a statement into a function.
Definition: Stmt.h:3757

clang::CapturedStmt::getCapturedDecl
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1407

clang::CapturedStmt::getCapturedStmt
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3861

clang::CapturedStmt::capturesVariable
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1431

clang::CapturedStmt::captures
capture_range captures()
Definition: Stmt.h:3895

clang::CastExpr::getCastKind
CastKind getCastKind() const
Definition: Expr.h:3527

clang::CastExpr::getSubExpr
Expr * getSubExpr()
Definition: Expr.h:3533

clang::CharUnits
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38

clang::CharUnits::isZero
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122

clang::CharUnits::getAsAlign
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189

clang::CharUnits::getQuantity
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185

clang::CharUnits::fromQuantity
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63

clang::CodeGen::Address
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition: Address.h:111

clang::CodeGen::Address::invalid
static Address invalid()
Definition: Address.h:153

clang::CodeGen::Address::emitRawPointer
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
Definition: Address.h:220

clang::CodeGen::Address::getAlignment
CharUnits getAlignment() const
Definition: Address.h:166

clang::CodeGen::Address::getElementType
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:184

clang::CodeGen::Address::withElementType
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition: Address.h:241

clang::CodeGen::Address::getType
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:176

clang::CodeGen::AggValueSlot::DoesNotOverlap
@ DoesNotOverlap
Definition: CGValue.h:574

clang::CodeGen::ApplyDebugLocation::CreateEmpty
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:881

clang::CodeGen::CGBuilderTy
Definition: CGBuilder.h:50

clang::CodeGen::CGBuilderTy::CreateStore
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:136

clang::CodeGen::CGBuilderTy::CreateIsNull
llvm::Value * CreateIsNull(Address Addr, const Twine &Name="")
Definition: CGBuilder.h:355

clang::CodeGen::CGBuilderTy::CreateGEP
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:292

clang::CodeGen::CGBuilderTy::CreatePointerBitCastOrAddrSpaceCast
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:203

clang::CodeGen::CGBuilderTy::CreateConstArrayGEP
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ... produce name = getelementptr inbounds addr, i64 0, i64 index where i64 is a...
Definition: CGBuilder.h:241

clang::CodeGen::CGBuilderTy::CreateConstGEP
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:278

clang::CodeGen::CGBuilderTy::CreateConstInBoundsGEP
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:261

clang::CodeGen::CGBuilderTy::CreateInBoundsGEP
Address CreateInBoundsGEP(Address Addr, ArrayRef< llvm::Value * > IdxList, llvm::Type *ElementType, CharUnits Align, const Twine &Name="")
Definition: CGBuilder.h:345

clang::CodeGen::CGFunctionInfo
CGFunctionInfo - Class to encapsulate the information about a function definition.
Definition: CGFunctionInfo.h:554

clang::CodeGen::CGOpenMPRuntimeGPU
Definition: CGOpenMPRuntimeGPU.h:24

clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelCall
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
Definition: CGOpenMPRuntimeGPU.cpp:1230

clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
Definition: CGOpenMPRuntimeGPU.cpp:996

clang::CodeGen::CGOpenMPRuntimeGPU::emitProcBindClause
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
Definition: CGOpenMPRuntimeGPU.cpp:921

clang::CodeGen::CGOpenMPRuntimeGPU::emitReduction
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
Definition: CGOpenMPRuntimeGPU.cpp:2807

clang::CodeGen::CGOpenMPRuntimeGPU::DataSharingMode
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Definition: CGOpenMPRuntimeGPU.h:40

clang::CodeGen::CGOpenMPRuntimeGPU::DS_CUDA
@ DS_CUDA
CUDA data sharing mode.
Definition: CGOpenMPRuntimeGPU.h:42

clang::CodeGen::CGOpenMPRuntimeGPU::DS_Generic
@ DS_Generic
Generic data-sharing mode.
Definition: CGOpenMPRuntimeGPU.h:44

clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3332

clang::CodeGen::CGOpenMPRuntimeGPU::getAddressOfLocalVariable
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Definition: CGOpenMPRuntimeGPU.cpp:3258

clang::CodeGen::CGOpenMPRuntimeGPU::emitFunctionProlog
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
Definition: CGOpenMPRuntimeGPU.cpp:3191

clang::CodeGen::CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
Definition: CGOpenMPRuntimeGPU.cpp:3349

clang::CodeGen::CGOpenMPRuntimeGPU::emitNumTeamsClause
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
Definition: CGOpenMPRuntimeGPU.cpp:933

clang::CodeGen::CGOpenMPRuntimeGPU::emitCriticalRegion
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
Definition: CGOpenMPRuntimeGPU.cpp:1342

clang::CodeGen::CGOpenMPRuntimeGPU::emitTeamsCall
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
Definition: CGOpenMPRuntimeGPU.cpp:1205

clang::CodeGen::CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
Definition: CGOpenMPRuntimeGPU.cpp:3415

clang::CodeGen::CGOpenMPRuntimeGPU::getKmpcFreeShared
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
Definition: CGOpenMPRuntimeGPU.cpp:1168

clang::CodeGen::CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU
CGOpenMPRuntimeGPU(CodeGenModule &CGM)
Definition: CGOpenMPRuntimeGPU.cpp:890

clang::CodeGen::CGOpenMPRuntimeGPU::emitParallelOutlinedFunction
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Definition: CGOpenMPRuntimeGPU.cpp:938

clang::CodeGen::CGOpenMPRuntimeGPU::functionFinished
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntimeGPU.cpp:3327

clang::CodeGen::CGOpenMPRuntimeGPU::getGPUThreadID
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3563

clang::CodeGen::CGOpenMPRuntimeGPU::getGPUWarpSize
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
Definition: CGOpenMPRuntimeGPU.cpp:3571

clang::CodeGen::CGOpenMPRuntimeGPU::processRequiresDirective
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntimeGPU.cpp:3460

clang::CodeGen::CGOpenMPRuntimeGPU::emitOutlinedFunctionCall
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntimeGPU.cpp:3037

clang::CodeGen::CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Definition: CGOpenMPRuntimeGPU.cpp:1131

clang::CodeGen::CGOpenMPRuntimeGPU::getParameterAddress
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
Definition: CGOpenMPRuntimeGPU.cpp:3011

clang::CodeGen::CGOpenMPRuntimeGPU::ExecutionMode
ExecutionMode
Defines the execution mode.
Definition: CGOpenMPRuntimeGPU.h:27

clang::CodeGen::CGOpenMPRuntimeGPU::EM_NonSPMD
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
Definition: CGOpenMPRuntimeGPU.h:31

clang::CodeGen::CGOpenMPRuntimeGPU::EM_Unknown
@ EM_Unknown
Unknown execution mode (orphaned directive).
Definition: CGOpenMPRuntimeGPU.h:33

clang::CodeGen::CGOpenMPRuntimeGPU::EM_SPMD
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
Definition: CGOpenMPRuntimeGPU.h:29

clang::CodeGen::CGOpenMPRuntimeGPU::emitBarrierCall
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntimeGPU.cpp:1325

clang::CodeGen::CGOpenMPRuntimeGPU::getGPUNumThreads
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
Definition: CGOpenMPRuntimeGPU.cpp:3550

clang::CodeGen::CGOpenMPRuntimeGPU::translateParameter
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
Definition: CGOpenMPRuntimeGPU.cpp:2979

clang::CodeGen::CGOpenMPRuntimeGPU::getKmpcAllocShared
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
Definition: CGOpenMPRuntimeGPU.cpp:1142

clang::CodeGen::CGOpenMPRuntimeGPU::isGPU
bool isGPU() const override
Returns true if the current target is a GPU.
Definition: CGOpenMPRuntimeGPU.h:134

clang::CodeGen::CGOpenMPRuntimeGPU::emitNumThreadsClause
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Definition: CGOpenMPRuntimeGPU.cpp:927

clang::CodeGen::CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
Definition: CGOpenMPRuntimeGPU.cpp:3361

clang::CodeGen::CGOpenMPRuntime
Definition: CGOpenMPRuntime.h:229

clang::CodeGen::CGOpenMPRuntime::emitThreadIDAddress
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
Definition: CGOpenMPRuntime.cpp:1944

clang::CodeGen::CGOpenMPRuntime::CGM
CodeGenModule & CGM
Definition: CGOpenMPRuntime.h:309

clang::CodeGen::CGOpenMPRuntime::getSingleCompoundChild
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
Definition: CGOpenMPRuntime.cpp:5969

clang::CodeGen::CGOpenMPRuntime::emitUpdateLocation
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
Definition: CGOpenMPRuntime.cpp:1366

clang::CodeGen::CGOpenMPRuntime::functionFinished
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
Definition: CGOpenMPRuntime.cpp:1461

clang::CodeGen::CGOpenMPRuntime::emitTeamsOutlinedFunction
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
Definition: CGOpenMPRuntime.cpp:1271

clang::CodeGen::CGOpenMPRuntime::OMPBuilder
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
Definition: CGOpenMPRuntime.h:312

clang::CodeGen::CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams
void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, CodeGenFunction &CGF, int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, int32_t &MaxTeamsVal)
Helper to determine the min/max number of threads/teams for D.
Definition: CGOpenMPRuntime.cpp:5889

clang::CodeGen::CGOpenMPRuntime::emitTargetOutlinedFunctionHelper
virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen)
Helper to emit outlined function for 'target' directive.
Definition: CGOpenMPRuntime.cpp:5925

clang::CodeGen::CGOpenMPRuntime::hasRequiresUnifiedSharedMemory
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
Definition: CGOpenMPRuntime.cpp:10077

clang::CodeGen::CGOpenMPRuntime::processRequiresDirective
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
Definition: CGOpenMPRuntime.cpp:10024

clang::CodeGen::CGOpenMPRuntime::getThreadID
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
Definition: CGOpenMPRuntime.cpp:1391

clang::CodeGen::CGOpenMPRuntime::clearLocThreadIdInsertPt
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
Definition: CGOpenMPRuntime.cpp:1344

clang::CodeGen::CGOpenMPRuntime::emitBarrierCall
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
Definition: CGOpenMPRuntime.cpp:2343

clang::CodeGen::CGOpenMPRuntime::getDefaultFlagsForBarriers
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barier is going to ...
Definition: CGOpenMPRuntime.cpp:2310

clang::CodeGen::CGOpenMPRuntime::emitParallelOutlinedFunction
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
Definition: CGOpenMPRuntime.cpp:1261

clang::CodeGen::CGOpenMPRuntime::getDefaultDistScheduleAndChunk
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
Definition: CGOpenMPRuntime.h:1523

clang::CodeGen::CGOpenMPRuntime::getIdentTyPointerTy
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
Definition: CGOpenMPRuntime.cpp:1482

clang::CodeGen::CGOpenMPRuntime::emitSingleReductionCombiner
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Definition: CGOpenMPRuntime.cpp:4899

clang::CodeGen::CGOpenMPRuntime::getOMPBuilder
llvm::OpenMPIRBuilder & getOMPBuilder()
Definition: CGOpenMPRuntime.h:306

clang::CodeGen::CGOpenMPRuntime::emitCriticalRegion
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
Definition: CGOpenMPRuntime.cpp:2006

clang::CodeGen::CGOpenMPRuntime::emitReduction
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
Definition: CGOpenMPRuntime.cpp:4919

clang::CodeGen::CGOpenMPRuntime::emitOutlinedFunctionCall
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Definition: CGOpenMPRuntime.cpp:11220

clang::CodeGen::CGOpenMPRuntime::emitReductionFunction
llvm::Function * emitReductionFunction(StringRef ReducerName, SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
Definition: CGOpenMPRuntime.cpp:4809

clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::getKind
CapturedRegionKind getKind() const
Definition: CodeGenFunction.h:482

clang::CodeGen::CodeGenFunction::CGCapturedStmtInfo::isCXXThisExprCaptured
bool isCXXThisExprCaptured() const
Definition: CodeGenFunction.h:493

clang::CodeGen::CodeGenFunction::OMPPrivateScope
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Definition: CodeGenFunction.h:1120

clang::CodeGen::CodeGenFunction
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
Definition: CodeGenFunction.h:256

clang::CodeGen::CodeGenFunction::FinishFunction
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.

clang::CodeGen::CodeGenFunction::getEvaluationKind
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.

clang::CodeGen::CodeGenFunction::CapturedStmtInfo
CGCapturedStmtInfo * CapturedStmtInfo
Definition: CodeGenFunction.h:528

clang::CodeGen::CodeGenFunction::MakeNaturalAlignPointeeRawAddrLValue
LValue MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T)
Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known to be unsigned.

clang::CodeGen::CodeGenFunction::EmitLoadOfPointer
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.

clang::CodeGen::CodeGenFunction::CreateDefaultAlignTempAlloca
RawAddress CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...

clang::CodeGen::CodeGenFunction::getVLASize
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...

clang::CodeGen::CodeGenFunction::EmitLValue
LValue EmitLValue(const Expr *E, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.

clang::CodeGen::CodeGenFunction::createBasicBlock
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
Definition: CodeGenFunction.h:2544

clang::CodeGen::CodeGenFunction::EmitLValueForFieldInitialization
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...

clang::CodeGen::CodeGenFunction::EmitBlock
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.

clang::CodeGen::CodeGenFunction::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T)

clang::CodeGen::CodeGenFunction::EmitLValueForField
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)

clang::CodeGen::CodeGenFunction::CreateMemTemp
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...

clang::CodeGen::CodeGenFunction::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenFunction.h:2134

clang::CodeGen::CodeGenFunction::getTypeSize
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.

clang::CodeGen::CodeGenFunction::StartFunction
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.

clang::CodeGen::CodeGenFunction::EmitLoadOfComplex
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.

clang::CodeGen::CodeGenFunction::HaveInsertPoint
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
Definition: CodeGenFunction.h:2585

clang::CodeGen::CodeGenFunction::getDebugInfo
CGDebugInfo * getDebugInfo()
Definition: CodeGenFunction.h:2093

clang::CodeGen::CodeGenFunction::EmitBranch
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...

clang::CodeGen::CodeGenFunction::EmitAggregateCopy
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.

clang::CodeGen::CodeGenFunction::MakeNaturalAlignRawAddrLValue
LValue MakeNaturalAlignRawAddrLValue(llvm::Value *V, QualType T)

clang::CodeGen::CodeGenFunction::Builder
CGBuilderTy Builder
Definition: CodeGenFunction.h:294

clang::CodeGen::CodeGenFunction::EHStack
EHScopeStack EHStack
Definition: CodeGenFunction.h:671

clang::CodeGen::CodeGenFunction::getContext
ASTContext & getContext() const
Definition: CodeGenFunction.h:2092

clang::CodeGen::CodeGenFunction::EmitLoadOfScalar
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:4085

clang::CodeGen::CodeGenFunction::EmitRuntimeCall
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")

clang::CodeGen::CodeGenFunction::ConvertType
llvm::Type * ConvertType(QualType T)

clang::CodeGen::CodeGenFunction::getTypes
CodeGenTypes & getTypes() const
Definition: CodeGenFunction.h:2091

clang::CodeGen::CodeGenFunction::CGM
CodeGenModule & CGM
Definition: CodeGenFunction.h:286

clang::CodeGen::CodeGenFunction::EvaluateExprAsBool
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...

clang::CodeGen::CodeGenFunction::CurFn
llvm::Function * CurFn
Definition: CodeGenFunction.h:353

clang::CodeGen::CodeGenFunction::MakeAddrLValue
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
Definition: CodeGenFunction.h:2636

clang::CodeGen::CodeGenFunction::EmitStoreOfComplex
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.

clang::CodeGen::CodeGenFunction::LoadCXXThis
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
Definition: CodeGenFunction.h:2976

clang::CodeGen::CodeGenFunction::EmitLoadOfReferenceLValue
LValue EmitLoadOfReferenceLValue(LValue RefLVal)

clang::CodeGen::CodeGenFunction::GetAddrOfLocalVar
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Definition: CodeGenFunction.h:2892

clang::CodeGen::CodeGenFunction::EmitScalarConversion
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...

clang::CodeGen::CodeGenFunction::ComplexPairTy
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
Definition: CodeGenFunction.h:292

clang::CodeGen::CodeGenFunction::getLLVMContext
llvm::LLVMContext & getLLVMContext()
Definition: CodeGenFunction.h:2135

clang::CodeGen::CodeGenFunction::EmitStoreOfScalar
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
Definition: CodeGenFunction.h:4107

clang::CodeGen::CodeGenModule
This class organizes the cross-function state that is used while generating LLVM code.
Definition: CodeGenModule.h:280

clang::CodeGen::CodeGenModule::SetInternalFunctionAttributes
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
Definition: CodeGenModule.cpp:2663

clang::CodeGen::CodeGenModule::getModule
llvm::Module & getModule() const
Definition: CodeGenModule.h:751

clang::CodeGen::CodeGenModule::addCompilerUsedGlobal
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
Definition: CodeGenModule.cpp:2870

clang::CodeGen::CodeGenModule::getLangOpts
const LangOptions & getLangOpts() const
Definition: CodeGenModule.h:742

clang::CodeGen::CodeGenModule::getTypes
CodeGenTypes & getTypes()
Definition: CodeGenModule.h:768

clang::CodeGen::CodeGenModule::getTarget
const TargetInfo & getTarget() const
Definition: CodeGenModule.h:756

clang::CodeGen::CodeGenModule::Error
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
Definition: CodeGenModule.cpp:1480

clang::CodeGen::CodeGenModule::getOpenMPRuntime
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
Definition: CodeGenModule.h:648

clang::CodeGen::CodeGenModule::getContext
ASTContext & getContext() const
Definition: CodeGenModule.h:741

clang::CodeGen::CodeGenModule::getLLVMContext
llvm::LLVMContext & getLLVMContext()
Definition: CodeGenModule.h:762

clang::CodeGen::CodeGenTypes::GetFunctionType
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1632

clang::CodeGen::CodeGenTypes::arrangeBuiltinFunctionDeclaration
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:680

clang::CodeGen::CodeGenTypes::getTargetAddressSpace
unsigned getTargetAddressSpace(QualType T) const
Definition: CodeGenTypes.cpp:829

clang::CodeGen::CodeGenTypes::ConvertTypeForMem
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Definition: CodeGenTypes.cpp:92

clang::CodeGen::EHScopeStack::Cleanup
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141

clang::CodeGen::FunctionArgList
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:352

clang::CodeGen::LValueBaseInfo
Definition: CGValue.h:164

clang::CodeGen::LValue
LValue - This represents an lvalue references.
Definition: CGValue.h:181

clang::CodeGen::LValue::getAddress
Address getAddress(CodeGenFunction &CGF) const
Definition: CGValue.h:370

clang::CodeGen::LValue::getPointer
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:361

clang::CodeGen::LValue::setAddress
void setAddress(Address address)
Definition: CGValue.h:375

clang::CodeGen::PrePostActionTy
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
Definition: CGOpenMPRuntime.h:58

clang::CodeGen::RawAddress
An abstract representation of an aligned address.
Definition: Address.h:41

clang::CodeGen::RawAddress::getPointer
llvm::Value * getPointer() const
Definition: Address.h:65

clang::CodeGen::RegionCodeGenTy
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
Definition: CGOpenMPRuntime.h:68

clang::CodeGen::RegionCodeGenTy::setAction
void setAction(PrePostActionTy &Action) const
Definition: CGOpenMPRuntime.h:89

clang::ConstStmtVisitor
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195

clang::DeclContext
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1436

clang::DeclContext::addDecl
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1698

clang::DeclRefExpr
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1260

clang::DeclRefExpr::getDecl
ValueDecl * getDecl()
Definition: Expr.h:1328

clang::DeclStmt
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1497

clang::Decl
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86

clang::Decl::getAttr
T * getAttr() const
Definition: DeclBase.h:579

clang::Decl::hasAttrs
bool hasAttrs() const
Definition: DeclBase.h:524

clang::Decl::attr_end
attr_iterator attr_end() const
Definition: DeclBase.h:548

clang::Decl::isCanonicalDecl
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:974

clang::Decl::attr_begin
attr_iterator attr_begin() const
Definition: DeclBase.h:545

clang::Decl::getLocation
SourceLocation getLocation() const
Definition: DeclBase.h:445

clang::Decl::getDeclContext
DeclContext * getDeclContext()
Definition: DeclBase.h:454

clang::Decl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:437

clang::Decl::getAttrs
AttrVec & getAttrs()
Definition: DeclBase.h:530

clang::Decl::hasAttr
bool hasAttr() const
Definition: DeclBase.h:583

clang::Decl::getCanonicalDecl
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:968

clang::DeclaratorDecl::getBeginLoc
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:822

clang::Expr
This represents one expression.
Definition: Expr.h:110

clang::Expr::IgnoreParenImpCasts
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3059

clang::Expr::IgnoreParens
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3055

clang::Expr::isLValue
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:277

clang::Expr::getType
QualType getType() const
Definition: Expr.h:142

clang::FieldDecl
Represents a member of a struct/union/class.
Definition: Decl.h:3058

clang::FieldDecl::Create
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4547

clang::GlobalDecl
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56

clang::ImplicitCastExpr
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3655

clang::ImplicitParamDecl
Definition: Decl.h:1720

clang::ImplicitParamDecl::Create
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5381

clang::IntegerLiteral::Create
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:977

clang::LambdaCapture
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25

clang::LambdaExpr
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1948

clang::LambdaExpr::isInitCapture
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1290

clang::LambdaExpr::captures
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1303

clang::LangOptions::OMPHostIRFile
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:539

clang::NamedDecl::getIdentifier
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:270

clang::NamedDecl::getName
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:276

clang::OMPClause
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55

clang::OMPExecutableDirective
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266

clang::OMPExecutableDirective::getInnermostCapturedStmt
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:556

clang::OMPExecutableDirective::hasAssociatedStmt
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:531

clang::OMPExecutableDirective::getCapturedStmt
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:547

clang::OMPExecutableDirective::getDirectiveKind
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:569

clang::OMPExecutableDirective::getAssociatedStmt
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:534

clang::OMPExecutableDirective::getBeginLoc
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:502

clang::OMPExecutableDirective::clauses
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:586

clang::OMPExecutableDirective::getSingleClause
static const SpecificClause * getSingleClause(ArrayRef< OMPClause * > Clauses)
Gets a single clause of the specified kind associated with the current directive iff there is only on...
Definition: StmtOpenMP.h:477

clang::OMPExecutableDirective::getClausesOfKind
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:459

clang::OMPLastprivateClause
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:2896

clang::OMPLoopDirective
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1018

clang::OMPReductionClause
This represents clause 'reduction' in the '#pragma omp ...' directives.
Definition: OpenMPClause.h:3207

clang::OMPRequiresDecl
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:417

clang::OMPRequiresDecl::clauselists
clauselist_range clauselists()
Definition: DeclOpenMP.h:442

clang::OMPXBareClause
This represents 'ompx_bare' clause in the '#pragma omp target teams ...' directive.
Definition: OpenMPClause.h:9367

clang::ParmVarDecl::Create
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2915

clang::PointerType
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:3135

clang::QualType
A (possibly-)qualified type.
Definition: Type.h:940

clang::QualType::getAddressSpace
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:7481

clang::QualType::getNonReferenceType
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:7556

clang::QualType::getCanonicalType
QualType getCanonicalType() const
Definition: Type.h:7407

clang::QualifierCollector
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:7295

clang::QualifierCollector::strip
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:7302

clang::QualifierCollector::apply
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:4272

clang::Qualifiers::addAddressSpace
void addAddressSpace(LangAS space)
Definition: Type.h:583

clang::Qualifiers::addRestrict
void addRestrict()
Definition: Type.h:466

clang::RecordDecl
Represents a struct/union/class.
Definition: Decl.h:4169

clang::RecordDecl::completeDefinition
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:5083

clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:88

clang::StmtVisitorBase::Visit
RetTy Visit(PTR(Stmt) S, ParamTys... P)
Definition: StmtVisitor.h:44

clang::Stmt
Stmt - This represents one statement.
Definition: Stmt.h:84

clang::Stmt::children
child_range children()
Definition: Stmt.cpp:287

clang::Stmt::IgnoreContainers
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top,...
Definition: Stmt.cpp:197

clang::TagDecl::startDefinition
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4739

clang::TargetInfo::getNewAlign
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is g...
Definition: TargetInfo.h:722

clang::TargetInfo::getTargetOpts
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:308

clang::TargetInfo::getGridValue
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1618

clang::TargetInfo::hasFeature
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1452

clang::TargetOptions::FeatureMap
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled disabled based on the command line.
Definition: TargetOptions.h:62

clang::Type
The base class of the type hierarchy.
Definition: Type.h:1813

clang::Type::getAsCXXRecordDecl
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1870

clang::Type::isIntegerType
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7941

clang::Type::castAs
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:8186

clang::Type::isReferenceType
bool isReferenceType() const
Definition: Type.h:7620

clang::Type::getPointeeType
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:694

clang::Type::isLValueReferenceType
bool isLValueReferenceType() const
Definition: Type.h:7624

clang::Type::hasSignedIntegerRepresentation
bool hasSignedIntegerRepresentation() const
Determine whether this type has an signed integer representation of some sort, e.g....
Definition: Type.cpp:2174

clang::Type::isVariablyModifiedType
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2667

clang::UnaryOperator
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2183

clang::UnaryOperator::getSubExpr
Expr * getSubExpr() const
Definition: Expr.h:2228

clang::UnaryOperator::getOpcode
Opcode getOpcode() const
Definition: Expr.h:2223

clang::ValueDecl
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:706

clang::ValueDecl::getType
QualType getType() const
Definition: Decl.h:717

clang::ValueDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5373

clang::VarDecl
Represents a variable declaration or definition.
Definition: Decl.h:918

clang::VarDecl::getCanonicalDecl
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2254

clang::VarDecl::isInitCapture
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1558

clang::specific_attr_iterator
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:33

llvm::ArrayRef
Definition: LLVM.h:31

llvm::SmallPtrSet
Definition: ASTContext.h:49

llvm::SmallString
Definition: LLVM.h:34

llvm::SmallVectorImpl
Definition: Randstruct.h:18

llvm::SmallVector
Definition: LLVM.h:35

Cuda.h

clang::CodeGen::AlignmentSource::Type
@ Type
The l-value was considered opaque, so the alignment was determined from a type.

clang::CodeGen::AlignmentSource::Decl
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...

clang::CodeGen::TEK_Aggregate
@ TEK_Aggregate
Definition: CodeGenFunction.h:111

clang::CodeGen::TEK_Scalar
@ TEK_Scalar
Definition: CodeGenFunction.h:109

clang::CodeGen::TEK_Complex
@ TEK_Complex
Definition: CodeGenFunction.h:110

clang::CodeGen::NormalAndEHCleanup
@ NormalAndEHCleanup
Definition: EHScopeStack.h:86

clang::index::SymbolKind::Field
@ Field

clang::prec::Conditional
@ Conditional
Definition: OperatorPrecedence.h:30

clang
The JSON file list parser is used to communicate input to InstallAPI.
Definition: CalledOnceCheck.h:17

clang::OpenACCClauseKind::Private
@ Private
'private' clause, allowed on 'parallel', 'serial', 'loop', 'parallel loop', and 'serial loop' constru...

clang::CudaArch
CudaArch
Definition: Cuda.h:53

clang::CudaArch::GFX940
@ GFX940

clang::CudaArch::GFX810
@ GFX810

clang::CudaArch::GFX600
@ GFX600

clang::CudaArch::GFX1031
@ GFX1031

clang::CudaArch::GFX701
@ GFX701

clang::CudaArch::GFX1035
@ GFX1035

clang::CudaArch::GFX904
@ GFX904

clang::CudaArch::GFX1012
@ GFX1012

clang::CudaArch::GFX702
@ GFX702

clang::CudaArch::GFX805
@ GFX805

clang::CudaArch::GFX602
@ GFX602

clang::CudaArch::SM_21
@ SM_21

clang::CudaArch::GFX1010
@ GFX1010

clang::CudaArch::SM_72
@ SM_72

clang::CudaArch::GFX1034
@ GFX1034

clang::CudaArch::GFX90a
@ GFX90a

clang::CudaArch::UNUSED
@ UNUSED

clang::CudaArch::SM_61
@ SM_61

clang::CudaArch::GFX1036
@ GFX1036

clang::CudaArch::GFX1201
@ GFX1201

clang::CudaArch::GFX1011
@ GFX1011

clang::CudaArch::GFX703
@ GFX703

clang::CudaArch::GFX941
@ GFX941

clang::CudaArch::UNKNOWN
@ UNKNOWN

clang::CudaArch::SM_52
@ SM_52

clang::CudaArch::SM_50
@ SM_50

clang::CudaArch::SM_75
@ SM_75

clang::CudaArch::SM_89
@ SM_89

clang::CudaArch::GFX1100
@ GFX1100

clang::CudaArch::SM_53
@ SM_53

clang::CudaArch::GFX1200
@ GFX1200

clang::CudaArch::GFX1032
@ GFX1032

clang::CudaArch::SM_35
@ SM_35

clang::CudaArch::SM_60
@ SM_60

clang::CudaArch::Generic
@ Generic

clang::CudaArch::GFX942
@ GFX942

clang::CudaArch::GFX1151
@ GFX1151

clang::CudaArch::SM_20
@ SM_20

clang::CudaArch::SM_86
@ SM_86

clang::CudaArch::GFX1103
@ GFX1103

clang::CudaArch::SM_87
@ SM_87

clang::CudaArch::GFX909
@ GFX909

clang::CudaArch::SM_62
@ SM_62

clang::CudaArch::GFX704
@ GFX704

clang::CudaArch::SM_80
@ SM_80

clang::CudaArch::GFX803
@ GFX803

clang::CudaArch::GFX902
@ GFX902

clang::CudaArch::GFX700
@ GFX700

clang::CudaArch::GFX1033
@ GFX1033

clang::CudaArch::GFX601
@ GFX601

clang::CudaArch::SM_90
@ SM_90

clang::CudaArch::SM_30
@ SM_30

clang::CudaArch::SM_70
@ SM_70

clang::CudaArch::GFX1150
@ GFX1150

clang::CudaArch::SM_90a
@ SM_90a

clang::CudaArch::GFX1013
@ GFX1013

clang::CudaArch::GFX900
@ GFX900

clang::CudaArch::GFX801
@ GFX801

clang::CudaArch::SM_37
@ SM_37

clang::CudaArch::GFX705
@ GFX705

clang::CudaArch::GFX1101
@ GFX1101

clang::CudaArch::GFX90c
@ GFX90c

clang::CudaArch::GFX906
@ GFX906

clang::CudaArch::LAST
@ LAST

clang::CudaArch::SM_32_
@ SM_32_

clang::CudaArch::GFX1030
@ GFX1030

clang::CudaArch::GFX802
@ GFX802

clang::CudaArch::GFX1102
@ GFX1102

clang::CudaArch::GFX908
@ GFX908

clang::OpenMPDirectiveKind
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24

clang::LinkageSpecLanguageIDs::C
@ C

clang::ICIS_NoInit
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:269

clang::isOpenMPDistributeDirective
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
Definition: OpenMPKinds.cpp:649

clang::LCK_ByRef
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37

clang::StringToCudaArch
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:168

clang::CR_OpenMP
@ CR_OpenMP
Definition: CapturedStmt.h:19

clang::isOpenMPParallelDirective
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
Definition: OpenMPKinds.cpp:602

clang::isOpenMPPrivate
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
Definition: OpenMPKinds.cpp:661

clang::SC_None
@ SC_None
Definition: Specifiers.h:247

clang::OpenMPDistScheduleClauseKind
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103

clang::isOpenMPTargetExecutionDirective
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
Definition: OpenMPKinds.cpp:609

clang::isOpenMPTeamsDirective
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
Definition: OpenMPKinds.cpp:626

clang::ArraySizeModifier::Normal
@ Normal

clang::OpenACCDirectiveKind::Data
@ Data

clang::isOpenMPLoopBoundSharingDirective
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
Definition: OpenMPKinds.cpp:676

clang::LangAS
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25

clang::LangAS::opencl_global
@ opencl_global

clang::LangAS::cuda_shared
@ cuda_shared

clang::LangAS::cuda_constant
@ cuda_constant

clang::LangAS::Default
@ Default

clang::getOpenMPCaptureRegions
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
Definition: OpenMPKinds.cpp:705

clang::getLangASFromTargetAS
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86

clang::ImplicitParamKind::CXXThis
@ CXXThis
Parameter for C++ 'this' argument.

clang::ImplicitParamKind::Other
@ Other
Other implicit parameter.

clang::CudaArchToString
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:150

clang::OpenMPScheduleClauseKind
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30

clang::AS_public
@ AS_public
Definition: Specifiers.h:121

hlsl::uint64_t
unsigned long uint64_t
Definition: hlsl_basic_types.h:32

CopyOptionsTy
Definition: CGOpenMPRuntimeGPU.cpp:1569

CopyOptionsTy::ScratchpadIndex
llvm::Value * ScratchpadIndex
Definition: CGOpenMPRuntimeGPU.cpp:1571

CopyOptionsTy::ScratchpadWidth
llvm::Value * ScratchpadWidth
Definition: CGOpenMPRuntimeGPU.cpp:1572

CopyOptionsTy::RemoteLaneOffset
llvm::Value * RemoteLaneOffset
Definition: CGOpenMPRuntimeGPU.cpp:1570

clang::CodeGen::CGOpenMPRuntime::ReductionOptionsTy
Definition: CGOpenMPRuntime.h:1211

clang::CodeGen::CodeGenFunction::VlaSizePair::NumElts
llvm::Value * NumElts
Definition: CodeGenFunction.h:2956

clang::CodeGen::CodeGenTypeCache::VoidPtrTy
llvm::PointerType * VoidPtrTy
Definition: CodeGenTypeCache.h:57

clang::CodeGen::CodeGenTypeCache::Int64Ty
llvm::IntegerType * Int64Ty
Definition: CodeGenTypeCache.h:37

clang::CodeGen::CodeGenTypeCache::Int8Ty
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Definition: CodeGenTypeCache.h:37

clang::CodeGen::CodeGenTypeCache::SizeTy
llvm::IntegerType * SizeTy
Definition: CodeGenTypeCache.h:50

clang::CodeGen::CodeGenTypeCache::VoidPtrPtrTy
llvm::PointerType * VoidPtrPtrTy
Definition: CodeGenTypeCache.h:59

clang::CodeGen::CodeGenTypeCache::Int32Ty
llvm::IntegerType * Int32Ty
Definition: CodeGenTypeCache.h:37

clang::CodeGen::CodeGenTypeCache::IntTy
llvm::IntegerType * IntTy
int
Definition: CodeGenTypeCache.h:42

clang::CodeGen::CodeGenTypeCache::Int16Ty
llvm::IntegerType * Int16Ty
Definition: CodeGenTypeCache.h:37

clang::CodeGen::CodeGenTypeCache::Int8PtrTy
llvm::PointerType * Int8PtrTy
Definition: CodeGenTypeCache.h:58

clang::CodeGen::CodeGenTypeCache::getPointerAlign
CharUnits getPointerAlign() const
Definition: CodeGenTypeCache.h:117

clang::CodeGen::TBAAAccessInfo
Definition: CodeGenTBAA.h:42