/*===================== begin_copyright_notice ==================================

Copyright (c) 2017 Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


======================= end_copyright_notice ==================================*/
#include "Compiler/CISACodeGen/LinkTessControlShaderMCFPass.h"
#include "Compiler/IGCPassSupport.h"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/Constants.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/InstVisitor.h>
#include <llvm/IR/IRBuilder.h>
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include <llvmWrapper/IR/Function.h>
#include <llvmWrapper/IR/InstrTypes.h>

#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "common/LLVMWarningsPop.hpp"
#include "common/IGCIRBuilder.h"

#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CodeGenPublic.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;

// Summary: The role of this pass is to link the TCS generated by the OGL FE
// by adding a loop that iterates over the output control points and
// replacing all occurrences of HSControlPointID with the loop counter.
// When barriers are present in the TCS, this pass eliminates them by splitting
// the shader into multiple continuation functions (the code between barriers,
// see http://compilers.cs.uni-saarland.de/papers/karrenberg_opencl.pdf)
// and running every continuation function in a control point loop before
// moving to the next phase.
// Values that need to be passed between phases are converted into
// global allocas that are sized (as arrays) by the number of control points.
// Continuation functions receive pointers to the proper global alloca entries
// (i.e. indexed by control point ID) for data passing.
namespace IGC
{
    /// @brief Module pass that links a tessellation control shader and removes
    ///        barriers by splitting the shader into continuation functions.
    ///        Holds all per-run bookkeeping: barriers found, per-block metadata,
    ///        values live across barriers and the continuation functions built.
    class LinkTessControlShaderMCF : public llvm::ModulePass
    {
    public:
        // Pass identification, replacement for typeid
        static char ID;
        // NOTE(review): the use of this threshold is not visible in this part of
        // the file; presumably compared against mNumInstructions - confirm.
        static const uint32_t LARGE_INSTRUCTIONS_COUNT = 1000;

        /// @brief  Constructor
        LinkTessControlShaderMCF();
        /// @brief  Destructor. Releases the IR builder owned through mpBuilder
        ///         (deleting a null pointer is a no-op).
        ~LinkTessControlShaderMCF()
        {
            delete mpBuilder;
        };

        /// @brief  Provides name of pass
        virtual llvm::StringRef getPassName() const override {
            return "LinkTessControlShaderMCF";
        }

        /// @brief  Declares the analyses this pass requires: IGC metadata and
        ///         code-gen context wrappers plus (post-)dominator trees.
        virtual void getAnalysisUsage(llvm::AnalysisUsage& AU) const override
        {
            AU.addRequired<MetaDataUtilsWrapper>();
            AU.addRequired<CodeGenContextWrapper>();
            AU.addRequired<DominatorTreeWrapperPass>();
            AU.addRequired<PostDominatorTreeWrapperPass>();
        }

        /// @brief  Main entry point.
        /// @param  M The module to process.
        /// @return Per the llvm::ModulePass contract, true if the module was modified.
        virtual bool runOnModule(llvm::Module& M) override;

    private:
        llvm::IGCIRBuilder<>* mpBuilder{ nullptr };   // Owned; deleted in the destructor.
        Module* mpModule{ nullptr };
        DominatorTree* mpDT{ nullptr };               // Refreshed by FindBarrierLives().
        Function* mpMainFunction{ nullptr };
        uint32_t            mNumBarriers;             // Barriers found by FindBarriers().
        uint32_t            mOutputControlPointCount; // Read by DetermineDispatchMode().
        uint32_t            mNumInstructions;         // Instructions visited by FindBarriers().
        bool                m_useMultipleHardwareThread;

        // SIMD size for tessellation workloads is SIMD8
        static const uint32_t SIMDSize = 8;

        // Special state that indicates that program has exited.
        static const uint32_t ExitState = 0xFFFFFFFF;

        //////////////////////////////////////////////////////////////////////////
        /// BarrierInfo
        /// @brief Per-barrier record; ids are assigned by FindBarriers() and
        ///        start at 1, so 0 means "not yet assigned".
        //////////////////////////////////////////////////////////////////////////
        struct BarrierInfo
        {
            uint32_t id{ 0 };  // Unique identifier
        };

        // Barrier calls in discovery order; emptied by SplitBasicBlocksAtBarriers().
        std::vector<CallInst*> mBarriers;
        // Barrier call -> its info record.
        std::map<CallInst*, BarrierInfo> mBarrierInfo;
        // Barriers consulted by SplitBasicBlocksAtBarriers() for the in-loop case.
        std::vector<CallInst*> mBarriersPHIInLoop;
        // Memory fence calls collected by FindBarriers() and erased by
        // SplitBasicBlocksAtBarriers().
        std::vector<CallInst*> mMemoryFence;

        //////////////////////////////////////////////////////////////////////////
        /// GlobalAllocas
        /// @brief Used to keep track of all alloca values that are live across
        ///        barrier.
        //////////////////////////////////////////////////////////////////////////

        typedef llvm::Value* global_alloca_key_t;

        class GlobalAllocas
        {
            // Key -> alloca. The asserts below show the map has two phases,
            // switched by mGlobalAllocasUpdated: before the update it refers to
            // local allocas, afterwards to global ones.
            std::map<global_alloca_key_t, Value*>  mGlobalAllocas;
            bool mGlobalAllocasUpdated = { false };

        public:
            /// @brief  Returns LLVM type for the alloca specified by key.
            /// @note   Uses std::map::operator[], so an unknown key inserts a
            ///         null entry before the assert fires.
            Type* GetType(global_alloca_key_t key)
            {
                assert(!mGlobalAllocasUpdated && "map updated - type no longer available");
                Value* pVal = mGlobalAllocas[key];
                assert(pVal && "global alloca not found!");
                return pVal->getType();
            }

            /// @brief  Inserts alloca into a global map.
            ///         The alloca is registered under itself as the key.
            void AddAllocaRef(Value* pAlloca)
            {
                assert(!mGlobalAllocasUpdated && "map updated - cannot add new allocas");
                mGlobalAllocas[pAlloca] = pAlloca;
            }

            /// @brief Returns pointer to local alloca specified by key.
            ///        Returns nullptr (and inserts a null entry) for an unknown key.
            Value* GetAllocaRef(global_alloca_key_t allocaKey)
            {
                assert(!mGlobalAllocasUpdated && "map updated - cannot reference local allocas");
                return mGlobalAllocas[allocaKey];
            }

            /// @brief Returns pointer to global alloca specified by key.
            ///        Only valid once the map has been updated.
            Value* GetGlobalAlloca(global_alloca_key_t allocaKey)
            {
                assert(mGlobalAllocasUpdated && "map not yet updated - alloca not available");
                return mGlobalAllocas[allocaKey];
            }

            // Defined outside the visible part of the file.
            void MoveAllocas(Instruction* pInsert);
            void CreateGlobalAllocas(llvm::IGCIRBuilder<>& builder, uint32_t count);
        };

        GlobalAllocas mGlobalAllocas;

        //////////////////////////////////////////////////////////////////////////
        /// ContinuationFunction
        /// @brief Represents a sequence of basic blocks for a single shader phase
        ///        (up to a barrier) and pointers to allocas that are used for
        ///        across-barrier value passing.
        /// @todo Can change lists to vector.
        //////////////////////////////////////////////////////////////////////////
        struct ContinuationFunction
        {
            uint32_t id{ 0 };        ///< This id is used for switch case.

            Function* pFunc{ nullptr };
            BasicBlock* pEntry{ nullptr };

            std::vector<BasicBlock*> exits;   ///< terminator blocks
            std::vector<BasicBlock*> blocks;  ///< blocks in a continuation function
            std::set<global_alloca_key_t> inputs; ///< global allocas for ins/outs
        };
        std::set<global_alloca_key_t> fullInputsList;

        //////////////////////////////////////////////////////////////////////////
        /// EdgeLives - Tracks all lives between edges in CFG.
        //////////////////////////////////////////////////////////////////////////
        struct EdgeLives
        {
            BasicBlock* pFrom{ nullptr };   // Pre-barrier block.
            BasicBlock* pTo{ nullptr };     // Post-barrier block.

            std::set<Value*> lives;         // Values live across this edge.
        };
        // Keyed by the post-barrier block of each edge.
        std::map<BasicBlock*, EdgeLives> mEdgeLives;

        // Maps an entry block to each continuation function.
        std::map<BasicBlock*, ContinuationFunction> mEntryMap;
        std::vector<ContinuationFunction*> mContinuationFunctions;
        // Scratch "visited" set shared by the CFG traversals; callers clear it.
        std::map<BasicBlock*, bool> mVisited;

        //////////////////////////////////////////////////////////////////////////
        /// BasicBlockMeta
        /// @brief Barrier-related facts about a block, filled in by
        ///        SplitBasicBlocksAtBarriers().
        //////////////////////////////////////////////////////////////////////////
        struct BasicBlockMeta
        {
            bool isPreBarrierBlock{ false };            // Block ends where a barrier was.
            BarrierInfo* pBarrierInfo{ nullptr };       // Set on post-barrier blocks.
            BarrierInfo* pPreBarrierInfo{ nullptr };    // Set on pre-barrier blocks.
            BasicBlock* pPostBarrierBlock{ nullptr };   // Pre-barrier block's continuation.
        };

        // Maps metadata to basic blocks.
        std::map<BasicBlock*, BasicBlockMeta> mBlockMeta;

        // Each post barrier basic block will become an entry block for a continuation function.
        // The FunctionEntryBlock list contains all post-barrier blocks and the main entry block.
        std::vector<BasicBlock*> mFunctionEntryBlocks;

        // For 'switch' instructions each CF needs to receive full function arguments list.
        bool mIsPhiWith3OrMoreIncomingValues;

        //////////////////////////////////////////////////////////////////////////
        /// Internal methods.
        //////////////////////////////////////////////////////////////////////////

        /// Helper getters. Each asserts that the underlying pointer was set.
        llvm::IGCIRBuilder<>& Builder(void)
        {
            assert(mpBuilder);
            return *mpBuilder;
        }

        Module* GetModule(void)
        {
            assert(mpModule);
            return mpModule;
        }

        Function* GetMainFunction(void)
        {
            assert(mpMainFunction);
            return mpMainFunction;
        }

        DominatorTree* GetDT(void)
        {
            assert(mpDT);
            return mpDT;
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief Returns true if basic block is a pre-barrier block.
        /// @note  Looks the block up with operator[], so an unknown block gets a
        ///        default metadata entry created as a side effect.
        bool IsPreBarrierBlock(BasicBlock* pBlock)
        {
            BasicBlockMeta& meta = mBlockMeta[pBlock];
            return (meta.isPreBarrierBlock);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief Returns true if basic block is a post barrier block.
        /// @note  Same operator[] side effect as IsPreBarrierBlock().
        bool IsPostBarrierBlock(BasicBlock* pBlock)
        {
            BasicBlockMeta& meta = mBlockMeta[pBlock];
            return (meta.pBarrierInfo != nullptr);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief Returns the immediate dominator for the current basic block.
        ///        This is used by live-ness analysis to help walk up the CFG.
        /// @return The dominating block, or nullptr when the block's dominator
        ///         tree node has no immediate dominator.
        BasicBlock* GetImmediateDominator(BasicBlock* pBlock)
        {
            assert(GetDT()->getNode(pBlock) != nullptr);
            DomTreeNode* pNode = GetDT()->getNode(pBlock)->getIDom();
            return pNode ? pNode->getBlock() : nullptr;
        }

        // Forward declarations.
        HullShaderDispatchModes DetermineDispatchMode(void);
        void FindBarriers(Function& f);
        void ReplaceReturn(Function& f, uint32_t retValue);
        void SplitBasicBlocksAtBarriers(Function& f);
        void UnlinkPreBarrierBlocks(Function& f);
        void GetContinuationBlocks(BasicBlock* pCurBlock, ContinuationFunction& cf);
        bool IsBlockOutside(ContinuationFunction& cf, BasicBlock* pBlock);
        void AddLive(BasicBlock* pCurBlock, BasicBlock* pEndBlock, Instruction* pLive);
        void FindBarrierLives(Function& f);
        void ReplaceValueWithinBB(BasicBlock* pBlock, Value* pOld, Value* pNew);
        AllocaInst* ReplaceWithIntermittentAlloca(Instruction* pDef);
        void ReplaceLive(Value* pOld, Value* pNew);
        void ConvertBarrierLivesToAllocas(Function& f);
        void CollectArguments(ContinuationFunction& cf);
        void FixUpControlPointID(ContinuationFunction& cf);
        void DetectBarrierPHIInLoop(Function& cf);
        void FixUpHSControlPointIDVsGlobalVar(Function& f);
        void RemovePhiInstructions(Function& cf);
        void PurgeFunction(Function& f);
        void BuildContinuationFunction(ContinuationFunction& cf);
        void BuildContinuationFunctions(Function& f);
        void RebuildMainFunction(void);
        void TCSwHWBarriersSupport(MetaDataUtils* pMdUtils);
        bool SelectTCSwHWBarrierSupport(void);
        llvm::Function* CreateNewTCSFunction(llvm::Function* pCurrentFunc);
    };

    // Storage for the static pass ID declared in the class; the constructor
    // hands it to the llvm::ModulePass base.
    char LinkTessControlShaderMCF::ID = 0;
    // Register pass to igc-opt
    // The four PASS_* macros below are the arguments consumed by the
    // IGC_INITIALIZE_PASS_BEGIN/END pair that follows.
    // NOTE(review): that pair presumably defines
    // initializeLinkTessControlShaderMCFPass(), which the constructor calls -
    // confirm against IGCPassSupport.h.
#define PASS_FLAG "igc-LinkTessControlShaderMCF"
#define PASS_DESCRIPTION "Perform looping of tessellation function based on control point count"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
    IGC_INITIALIZE_PASS_BEGIN(LinkTessControlShaderMCF, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
        IGC_INITIALIZE_PASS_END(LinkTessControlShaderMCF, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)


        LinkTessControlShaderMCF::LinkTessControlShaderMCF() :
        llvm::ModulePass(ID),
        mNumBarriers(0),
        mOutputControlPointCount(0),
        mNumInstructions(0),
        m_useMultipleHardwareThread(false),
        mIsPhiWith3OrMoreIncomingValues(false)
    {
        initializeLinkTessControlShaderMCFPass(*llvm::PassRegistry::getPassRegistry());
    }


    //////////////////////////////////////////////////////////////////////////
    /// @brief Chooses between eight-patch and single-patch hull shader dispatch
    ///        and records the choice as a new operand of the
    ///        "HullShaderDispatchMode" named metadata node.
    /// @return The dispatch mode that was selected and recorded.
    HullShaderDispatchModes LinkTessControlShaderMCF::DetermineDispatchMode(void)
    {
        llvm::NamedMDNode* pDispatchModeMD = GetModule()->getOrInsertNamedMetadata("HullShaderDispatchMode");
        IGC::CodeGenContext* pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

        /* Instance Count
        **         This field determines the number of threads(minus one) spawned per input patch.

        **         If the HS kernel uses a barrier function, software must restrict the Instance Count
        **         to the number of threads that can be simultaneously active within a subslice.
        **         Factors which must be considered includes scratch memory availability.
        **         Value             Description
        **         [0, 15]             representing[1, 16] instances */

        // With the dispatch-GRF workaround active, 29 or more input control points
        // force single patch: there are not enough registers for push constants.
        bool forceSinglePatch = false;
        if (pCtx->platform.WaDispatchGRFHWIssueInGSAndHSUnit())
        {
            llvm::GlobalVariable* pInputCount = GetModule()->getGlobalVariable("TessInputControlPointCount");
            if (pInputCount && pInputCount->hasInitializer())
            {
                llvm::ConstantInt* pCountConst = llvm::cast<llvm::ConstantInt>(pInputCount->getInitializer());
                const unsigned int inputControlPointCount = int_cast<unsigned int>(pCountConst->getZExtValue());
                forceSinglePatch = (inputControlPointCount >= 29);
            }
        }

        // Multi-threaded shaders with 16 or more output control points are not
        // dispatched eight patches at a time.
        const bool multiThreadedWithManyOutputs =
            m_useMultipleHardwareThread && (mOutputControlPointCount >= 16);

        // Eight-patch is used when the platform mandates it, or when it is
        // supported and nothing above (or the debug flag) rules it out.
        const bool useEightPatch =
            pCtx->platform.useOnlyEightPatchDispatchHS() ||
            (pCtx->platform.supportHSEightPatchDispatch() &&
                !multiThreadedWithManyOutputs &&
                !forceSinglePatch &&
                IGC_IS_FLAG_DISABLED(EnableHSSinglePatchDispatch));

        const HullShaderDispatchModes selectedMode = useEightPatch
            ? HullShaderDispatchModes::EIGHT_PATCH_DISPATCH_MODE
            : HullShaderDispatchModes::SINGLE_PATCH_DISPATCH_MODE;

        // Record the decision as an i32 constant wrapped in a metadata node.
        Constant* pModeValue = llvm::ConstantInt::get(Builder().getInt32Ty(), selectedMode);
        llvm::MDNode* pModeNode = llvm::MDNode::get(
            Builder().getContext(),
            llvm::ConstantAsMetadata::get(pModeValue));
        pDispatchModeMD->addOperand(pModeNode);

        return selectedMode;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Finds all barriers in program and populates barrier info.
    ///        Also counts the visited instructions, collects memory fences for
    ///        later removal and notes whether a PHI with three or more incoming
    ///        values appears after a barrier.
    /// @param f- The function we're working on with this pass.
    void LinkTessControlShaderMCF::FindBarriers(Function& f)
    {
        mNumBarriers = 0;
        for (auto i = inst_begin(f), e = inst_end(f); i != e; ++i)
        {
            Instruction& inst = *i;
            mNumInstructions++;
            if (GenIntrinsicInst * pInst = dyn_cast<GenIntrinsicInst>(&inst))
            {
                GenISAIntrinsic::ID IID = pInst->getIntrinsicID();
                if (IID == GenISAIntrinsic::GenISA_threadgroupbarrier)
                {
                    BarrierInfo& info = mBarrierInfo[pInst];

                    // A non-zero id means this barrier was already registered.
                    if (info.id > 0) continue;

                    mNumBarriers++;
                    info.id = mNumBarriers;  // id starts at 1.

                    mBarriers.push_back(pInst);
                }
                if (IID == GenISAIntrinsic::GenISA_memoryfence)
                {
                    // MemoryFence inst is going to be removed for MCF solution.
                    mMemoryFence.push_back(pInst);
                }
            }
            else if (PHINode * pPhi = dyn_cast<PHINode>(&inst))
            {
                // Set this member only when 'barrier' instr is before PHI.
                // The flag is sticky: once a qualifying PHI has been seen, a later
                // PHI with fewer incoming values must not reset it.
                if ((pPhi->getNumIncomingValues() >= 3) && (mNumBarriers > 0))
                {
                    mIsPhiWith3OrMoreIncomingValues = true;
                }
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Replaces every return instruction in the function with a return
    ///        of the given 32-bit constant.
    /// @param f- The function we're working on with this pass.
    /// @param retValue- Constant returned by each rewritten return.
    void LinkTessControlShaderMCF::ReplaceReturn(Function& f, uint32_t retValue)
    {
        // Gather the returns first so that nothing is erased while the
        // instruction iterator is still walking the function.
        SmallVector<ReturnInst*, 8> pendingReturns;
        for (inst_iterator it = inst_begin(f), last = inst_end(f); it != last; ++it)
        {
            if (ReturnInst* pReturn = dyn_cast<ReturnInst>(&*it))
            {
                pendingReturns.push_back(pReturn);
            }
        }

        for (ReturnInst* pOldReturn : pendingReturns)
        {
            llvm::IGCIRBuilder<>& builder = Builder();

            // The new return goes right before the old one, which is then dropped.
            builder.SetInsertPoint(pOldReturn);
            builder.CreateRet(builder.getInt32(retValue));
            pOldReturn->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Splits all basic blocks with barriers and setup meta info.
    ///        For every recorded barrier the containing block is split right
    ///        after the barrier call; the first half becomes the pre-barrier
    ///        block, the second half the post-barrier block (an entry block of
    ///        a continuation function). Afterwards the barrier and memory fence
    ///        calls are erased.
    /// @param f- The function we're working on with this pass.
    void LinkTessControlShaderMCF::SplitBasicBlocksAtBarriers(Function& f)
    {
        // The function's own entry block is always the first entry block.
        mFunctionEntryBlocks.push_back(&f.getEntryBlock());

        for (auto pBarrier : mBarriers)
        {
            BasicBlock* pBarrierBlock = pBarrier->getParent();
            assert(pBarrierBlock != nullptr);

            Builder().SetInsertPoint(pBarrier);

            // Reference into the std::map; pointers to it stored below stay valid
            // for as long as the entry is not erased.
            BarrierInfo& info = mBarrierInfo[pBarrier];

            llvm::StringRef name = pBarrierBlock->getName();

            // New block is named "<original>.postbarrier.<barrier id>".
            std::string postfix = std::string(".postbarrier.") + std::to_string(info.id);

            // Split right after the barrier, so the barrier stays in the first half.
            BasicBlock* pPostBarrierBlock =
                pBarrierBlock->splitBasicBlock(pBarrier->getNextNode(), name + postfix.c_str());

            // Check whether the current barrier instruction is on the mBarriersPHIInLoop list.
            // This solution handles only barriers within loop in one BB i.e. without any control flow.
            for (auto pLocBarrier : mBarriersPHIInLoop)
            {
                if (pLocBarrier->getParent() == pBarrierBlock)
                {
                    // Find the first branch in the post-barrier block and retarget its
                    // first successor to the post-barrier block itself.
                    for (BasicBlock::iterator i = pPostBarrierBlock->begin(), e = pPostBarrierBlock->end(); i != e; ++i)
                    {
                        Instruction* pInst = &(*i);
                        if (llvm::BranchInst * pBrInst = dyn_cast<BranchInst>(pInst))
                        {
                            // Replace  :   br COND, BB_IF_TRUE, BB_IF_FALSE
                            // with     :   br COND, BB_IF_TRUE.postbarrier, BB_IF_FALSE
                            // NOTE(review): getSuccessor(1)/getCondition() assume the
                            // branch is conditional - confirm an unconditional branch
                            // cannot reach this point.
                            Builder().SetInsertPoint(pInst);
                            // The replacement branch is appended at the end of the
                            // post-barrier block before the old one is erased.
                            BranchInst::Create(pPostBarrierBlock, pBrInst->getSuccessor(1), pBrInst->getCondition(), pPostBarrierBlock);
                            pInst->eraseFromParent();
                            break;
                        }
                    }
                    break;
                }
            }

            // Each post barrier basic block is an entry block for a continuation function.
            mFunctionEntryBlocks.push_back(pPostBarrierBlock);

            // The pBarrierBlock is now the pre-barrier block after the split.
            BasicBlockMeta& preBarrierMeta = mBlockMeta[pBarrierBlock];
            preBarrierMeta.isPreBarrierBlock = true;
            preBarrierMeta.pPreBarrierInfo = &info;
            preBarrierMeta.pPostBarrierBlock = pPostBarrierBlock;

            BasicBlockMeta& postBarrierMeta = mBlockMeta[pPostBarrierBlock];
            postBarrierMeta.pBarrierInfo = &info;

            // Register the pre->post edge; its live set is filled in later.
            EdgeLives& edge = mEdgeLives[pPostBarrierBlock];
            edge.pFrom = pBarrierBlock;
            edge.pTo = pPostBarrierBlock;
        }

        // Remove barrier instructions.
        for (auto pBarrier : mBarriers)
        {
            pBarrier->eraseFromParent();
        }
        mBarriers.clear();

        // Remove barrier instructions from 'in loop' sections of code.
        // (Only the list is cleared here; the calls it pointed to were erased
        // above if they were also present in mBarriers.)
        mBarriersPHIInLoop.clear();

        // Remove 'memory fence' instructions.
        // NOTE(review): mMemoryFence is not cleared afterwards, so it keeps
        // pointers to erased instructions - confirm nothing reads it later.
        for (auto pMemoryFence : mMemoryFence)
        {
            pMemoryFence->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Terminates every pre-barrier block with `ret <barrier id>` in
    ///        place of its original terminator, which detaches it from the
    ///        post-barrier block. Continuation functions rely on these returns.
    /// @param f- The function we're working on with this pass.
    void LinkTessControlShaderMCF::UnlinkPreBarrierBlocks(Function& f)
    {
        for (BasicBlock& block : f)
        {
            BasicBlockMeta& meta = mBlockMeta[&block];
            if (!meta.isPreBarrierBlock)
            {
                continue;
            }

            Instruction* pOldTerminator = block.getTerminator();

            // Emit return(n), where n identifies the barrier this block ends at.
            Builder().SetInsertPoint(pOldTerminator);
            Builder().CreateRet(Builder().getInt32(meta.pPreBarrierInfo->id));

            // Dropping the original terminator removes the edge to the
            // post-barrier block.
            pOldTerminator->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Collects, depth first, every block reachable from pCurBlock that
    ///        belongs to the continuation function. Traversal stops at
    ///        pre-barrier blocks; those and blocks without successors are
    ///        recorded as exits.
    /// @param pCurBlock - Current block in depth-first traversal.
    /// @param cf - Continuation function we're adding to.
    void LinkTessControlShaderMCF::GetContinuationBlocks(BasicBlock* pCurBlock, ContinuationFunction& cf)
    {
        // Each block is handled once per traversal.
        bool& alreadyVisited = mVisited[pCurBlock];
        if (alreadyVisited)
        {
            return;
        }
        alreadyVisited = true;

        const bool endsAtBarrier = mBlockMeta[pCurBlock].isPreBarrierBlock;

        cf.blocks.push_back(pCurBlock);

        // A pre-barrier block is an exit by definition and is not followed further.
        bool isExit = true;
        if (!endsAtBarrier)
        {
            for (auto it = succ_begin(pCurBlock), last = succ_end(pCurBlock); it != last; ++it)
            {
                isExit = false;
                GetContinuationBlocks(*it, cf);
            }
        }

        if (isExit)
        {
            cf.exits.push_back(pCurBlock);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Tells whether a basic block lies outside a continuation function.
    /// @param cf - current continuation function
    /// @param pBlock - Block we're testing
    /// @return false when pBlock is listed in cf.blocks, true otherwise.
    bool LinkTessControlShaderMCF::IsBlockOutside(ContinuationFunction& cf, BasicBlock* pBlock)
    {
        bool isMember = false;
        for (BasicBlock* pMember : cf.blocks)
        {
            if (pMember == pBlock)
            {
                isMember = true;
                break;
            }
        }
        return !isMember;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Walks the CFG backwards from pCurBlock towards pEndBlock. Every
    ///        post-barrier block met on the way gets pLive added to the live
    ///        set of its pre/post barrier edge. These lives are later used to
    ///        generate spill/fills.
    /// @param pCurBlock - Block currently being visited.
    /// @param pEndBlock - Block at which the backwards walk stops.
    /// @param pLive - Value that is live along the walked paths.
    /// @note Relies on mVisited; the caller resets it before each walk.
    void LinkTessControlShaderMCF::AddLive(BasicBlock* pCurBlock, BasicBlock* pEndBlock, Instruction* pLive)
    {
        bool& alreadyVisited = mVisited[pCurBlock];
        if (alreadyVisited)
        {
            return;
        }
        alreadyVisited = true;

        // Predecessors are explored first; the end block itself is never entered
        // through a predecessor edge.
        if (pCurBlock != pEndBlock)
        {
            for (auto it = pred_begin(pCurBlock), last = pred_end(pCurBlock); it != last; ++it)
            {
                BasicBlock* pPredecessor = *it;
                if (pPredecessor == pEndBlock)
                {
                    continue;
                }
                AddLive(pPredecessor, pEndBlock, pLive);
            }
        }

        if (IsPostBarrierBlock(pCurBlock))
        {
            mEdgeLives[pCurBlock].lives.insert(pLive);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Analyzes all instructions in the function and for each use
    ///        determines the live range. If any barrier falls within this live
    ///        range then we need to generate a spill/fill.
    ///        The live range of a use is covered by climbing the dominator tree
    ///        from the use block to the defining block and, for every step,
    ///        letting AddLive() mark the post-barrier blocks in between.
    /// @param f- The function whose instructions are analyzed.
    /// @note Post-barrier blocks should never contain a phi. Algorithm assumes that.
    /// @note The dominator tree is taken for the main function, not for f.
    void LinkTessControlShaderMCF::FindBarrierLives(Function& f)
    {
        // Update dominator tree.
        mpDT = &getAnalysis<DominatorTreeWrapperPass>(*GetMainFunction()).getDomTree();

        for (auto i = inst_begin(f), e = inst_end(f); i != e; ++i)
        {
            Instruction* pDef = &*i;
            BasicBlock* pDefBlock = pDef->getParent();

            // Iterate over uses rather than users: a PHI may take the same value
            // from several predecessors, and each such edge is a separate use
            // with its own incoming block.
            for (Use& use : pDef->uses())
            {
                Instruction* pUserInst = cast<Instruction>(use.getUser());
                BasicBlock* pUseBlock = pUserInst->getParent();

                if (PHINode * pPhi = dyn_cast<PHINode>(pUserInst))
                {
                    // A PHI operand is consumed at the end of the incoming block
                    // that belongs to exactly this operand.
                    pUseBlock = pPhi->getIncomingBlock(use);
                }

                // Climb the dominator tree until the defining block is reached.
                // Running off the root (null idom) ends the walk as well instead
                // of feeding nullptr back into GetImmediateDominator().
                while (pUseBlock != nullptr && pUseBlock != pDefBlock)
                {
                    BasicBlock* pIDom = GetImmediateDominator(pUseBlock);

                    if (pIDom)
                    {
                        mVisited.clear();  // Reset visited for next traversal.
                        AddLive(pUseBlock, pIDom, pDef);
                    }

                    pUseBlock = pIDom;
                }
                assert(pUseBlock == pDefBlock && "definition does not dominate its use");
            }
        }
        mVisited.clear();  // Reset visited for future use.
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Rewrites uses of pOld to pNew, but only for users that are
    ///        instructions located in pBlock. Uses elsewhere are left alone.
    /// @param pBlock - Block whose instructions get rewritten.
    /// @param pOld - Value being replaced.
    /// @param pNew - Replacement value.
    void LinkTessControlShaderMCF::ReplaceValueWithinBB(BasicBlock* pBlock, Value* pOld, Value* pNew)
    {
        // Collect first: Use::set() edits pOld's use list, which must not happen
        // while that list is being walked.
        std::set<llvm::Use*> usesInBlock;
        for (Use& use : pOld->uses())
        {
            Instruction* pUserInst = dyn_cast<Instruction>(use.getUser());
            if (pUserInst == nullptr || pUserInst->getParent() != pBlock)
            {
                continue;
            }
            usesInBlock.insert(&use);
        }

        for (llvm::Use* pUse : usesInBlock)
        {
            pUse->set(pNew);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Create intermittent alloca for a value and add loads for uses.
    ///        Resulting layout in the defining block is: def, alloca, store of
    ///        def, load. Every other block that uses the value gets one load
    ///        at its first non-PHI position, and all uses are redirected to the
    ///        load of their block.
    /// @param pDef - value to be processed through alloca. Must not be null and
    ///        must be followed by another instruction in its block.
    /// @return The newly created alloca.
    AllocaInst* LinkTessControlShaderMCF::ReplaceWithIntermittentAlloca(Instruction* pDef)
    {
        assert(pDef);
        Type* pType = pDef->getType();
        BasicBlock* pDefBlock = pDef->getParent();
        // Explicit .str(): StringRef does not convert implicitly to std::string
        // in newer LLVM releases.
        std::string defName = pDef->getName().str();
        std::string newName = "l_" + defName;

        // Start with inserting new alloca after defining instruction.
        BasicBlock::iterator ii(pDef);
        Builder().SetInsertPoint(&(*(++ii)));
        AllocaInst* pAlloca = Builder().CreateAlloca(pType, nullptr, VALUE_NAME(defName));

        // Add load new value.
        // We replace def value even within defining BB just to be sure
        // algorithm is consistent. Compiler will optimize this later.
        Value* pNewValue = Builder().CreateLoad(pAlloca, VALUE_NAME(newName));

        // Resolves the block in which a user consumes pDef.
        // For phi instructions actual use block is incoming one.
        // NOTE(review): the PHI test is made on pDef, not on the user, so the
        // incoming-block path is only taken when pDef is a PHI that feeds
        // itself. This looks like it was meant to inspect the user; it is kept
        // as is because ReplaceValueWithinBB() matches on the user's parent
        // block and the later PHI handling is not visible here - confirm.
        PHINode* const pDefPhi = dyn_cast<PHINode>(pDef);
        auto resolveUseBlock = [pDef, pDefPhi](Instruction* pUser) -> BasicBlock*
        {
            BasicBlock* pUseBlock = pUser->getParent();
            if (pDefPhi != nullptr)
            {
                uint32_t numIncoming = pDefPhi->getNumIncomingValues();
                for (uint32_t incoming = 0; incoming < numIncoming; ++incoming)
                {
                    if (pDefPhi->getIncomingValue(incoming) == pDef)
                    {
                        pUseBlock = pDefPhi->getIncomingBlock(incoming);
                        break;
                    }
                }
            }
            return pUseBlock;
        };

        // Step 1.
        // Insert load instruction in all post-barrier basic blocks,
        // where this value is used.
        std::map<BasicBlock*, Value*> loadedValueMap;
        mVisited.clear();

        // The defining block already has its load (created above).
        loadedValueMap[pDefBlock] = pNewValue;
        mVisited[pDefBlock] = true;

        for (auto ui = pDef->user_begin(), ue = pDef->user_end(); ui != ue; ++ui)
        {
            Instruction* pUser = cast<Instruction>(*ui);
            BasicBlock* pUseBlock = resolveUseBlock(pUser);

            if (!mVisited[pUseBlock])
            {
                mVisited[pUseBlock] = true;
                Builder().SetInsertPoint(pUseBlock->getFirstNonPHI());
                Value* pBlockLoad = Builder().CreateLoad(pAlloca, VALUE_NAME(newName));
                loadedValueMap[pUseBlock] = pBlockLoad;
            }
        }
        mVisited.clear();

        // Step 2.
        // Replace all uses.
        // Each pass redirects every use inside one block, so the use list of
        // pDef shrinks until it is empty.
        while (pDef->use_begin() != pDef->use_end())
        {
            Instruction* pUser = cast<Instruction>(pDef->use_begin()->getUser());
            BasicBlock* pUseBlock = resolveUseBlock(pUser);

            // Step 1 created a load for every use block, so the lookup must
            // succeed; check before dereferencing the iterator.
            auto mi = loadedValueMap.find(pUseBlock);
            assert(mi != loadedValueMap.end() && "loaded value not found for added alloca!!!");
            pNewValue = mi->second;
            assert(pNewValue && "loaded value not found for added alloca!!!");
            if (pNewValue != pDef)
            {
                ReplaceValueWithinBB(pUseBlock, pDef, pNewValue);
            }
        }

        // Add 'store' inst for storing an old value.
        // It is placed right after the alloca, i.e. before the load created for
        // the defining block, and is added last so that it is not itself
        // rewritten by step 2.
        BasicBlock::iterator bbi(pAlloca);
        Builder().SetInsertPoint(&(*++bbi));
        Builder().CreateStore(pDef, pAlloca);

        return pAlloca;
    }


    //////////////////////////////////////////////////////////////////////////
    /// @brief Replace live value marked in our tables with a new one.
    void LinkTessControlShaderMCF::ReplaceLive(Value* pOld, Value* pNew)
    {
        // Visit every recorded edge and, wherever the stale value is listed
        // as live, swap it for its replacement.
        for (auto& entry : mEdgeLives)
        {
            auto& liveSet = entry.second.lives;

            auto found = liveSet.find(pOld);
            if (found == liveSet.end())
            {
                // This edge does not carry the old value.
                continue;
            }

            liveSet.erase(found);
            liveSet.insert(pNew);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Make any values crossing barriers use intermittent allocas.
    ///        The algorithm depends on only allocas being live at barriers;
    ///        these are then moved to the main function and converted into
    ///        continuation function arguments.
    void LinkTessControlShaderMCF::ConvertBarrierLivesToAllocas(Function& f)
    {
        // Each map entry is taken by value, so the inner loop walks a copy of
        // the live set while ReplaceLive() erases from and inserts into the
        // live sets that are actually stored in mEdgeLives.
        for (auto edgeSnapshot : mEdgeLives)
        {
            EdgeLives& snapshotEdge = edgeSnapshot.second;
            for (Value* pLiveValue : snapshotEdge.lives)
            {
                Instruction* pLiveInst = cast<Instruction>(pLiveValue);
                if (isa<AllocaInst>(pLiveInst))
                {
                    // Already an alloca: nothing to convert.
                    continue;
                }

                // Route all uses through a new intermittent alloca and record
                // the replacement in the live tables.
                Instruction* pReplacement = ReplaceWithIntermittentAlloca(pLiveInst);
                ReplaceLive(pLiveValue, pReplacement);
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Collects information about lives across barrier boundaries.
    void LinkTessControlShaderMCF::CollectArguments(ContinuationFunction& cf)
    {
        // The continuation function with id 0 must not have any live-ins.
        assert(cf.id != 0 || mEdgeLives[cf.pEntry].lives.size() == 0);

        // Live values on the edge entering this continuation function.
        EdgeLives& entryEdge = mEdgeLives[cf.pEntry];
        for (auto& pLiveIn : entryEdge.lives)
        {
            // Only allocas are expected to be live here.
            Instruction* pLiveInInst = dyn_cast<Instruction>(pLiveIn);
            assert(isa<AllocaInst>(pLiveInInst));
            cf.inputs.insert(pLiveIn);
            mGlobalAllocas.AddAllocaRef(pLiveIn);
        }

        // For 'switch' inst scenarios, each CF needs to get a complete list of
        // arguments, i.e. everything collected so far in fullInputsList.
        if (mIsPhiWith3OrMoreIncomingValues)
        {
            for (auto collectedArg : fullInputsList)
            {
                cf.inputs.insert(collectedArg);
            }
        }

        // Live values on the edges leaving through pre-barrier exit blocks.
        for (auto& pExitBlock : cf.exits)
        {
            BasicBlockMeta& exitMeta = mBlockMeta[pExitBlock];
            if (!exitMeta.isPreBarrierBlock)
            {
                continue;
            }

            BasicBlock* pPostBarrier = exitMeta.pPostBarrierBlock;
            assert(pPostBarrier != nullptr);

            EdgeLives& exitEdge = mEdgeLives[pPostBarrier];
            assert((exitEdge.pFrom != nullptr) && (pExitBlock == exitEdge.pFrom));

            for (auto pLiveOut : exitEdge.lives)
            {
                Instruction* pLiveOutInst = dyn_cast<Instruction>(pLiveOut);
                assert(isa<AllocaInst>(pLiveOutInst));
                // These values are also recorded in cf.inputs.
                cf.inputs.insert(pLiveOut);
                mGlobalAllocas.AddAllocaRef(pLiveOut);

                if (mIsPhiWith3OrMoreIncomingValues)
                {
                    // Remember the value so that CFs processed later receive it too.
                    fullInputsList.insert(pLiveOut);
                }
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Replace all uses of @genISA.DCL.HSControlPointID() with
    ///        first argument of continuation function.
    /// @param cf - current continuation function
    void LinkTessControlShaderMCF::FixUpControlPointID(ContinuationFunction& cf)
    {
        llvm::Value* pCPIdDecl = llvm::GenISAIntrinsic::getDeclaration(
            cf.pFunc->getParent(),
            GenISAIntrinsic::GenISA_DCL_HSControlPointID);

        // The first argument of the continuation function takes over the
        // role of the intrinsic's result.
        Argument* pCPIdArg = &(*cf.pFunc->arg_begin());

        // Redirect the users of every intrinsic user located in this
        // continuation function; the users themselves are erased afterwards
        // so the walk over the declaration's user list is not disturbed.
        SmallVector<Instruction*, 10> deadUsers;
        for (auto* pUser : pCPIdDecl->users())
        {
            Instruction* pUserInst = cast<Instruction>(pUser);
            if (pUserInst->getParent()->getParent() != cf.pFunc)
            {
                continue;
            }

            deadUsers.push_back(pUserInst);
            pUserInst->replaceAllUsesWith(pCPIdArg);
        }

        for (Instruction* pDead : deadUsers)
        {
            pDead->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Detect whether barrier inst is within 'inloop' section.
    ///        For such scenarios each relevant PHI instruction must be
    ///        replaced with set of alloca/store/load/store.
    ///        See RemovePhiInstructions description.
    /// @param f - main function
    void LinkTessControlShaderMCF::DetectBarrierPHIInLoop(Function& f)
    {
        // Only basic blocks that contain a barrier instruction are inspected.
        for (auto pBarrier : mBarriers)
        {
            BasicBlock* pBlock = pBarrier->getParent();
            assert(pBlock != nullptr);

            bool hasPhi = false;
            bool branchesToItself = false;

            for (Instruction& inst : *pBlock)
            {
                if (isa<PHINode>(&inst))
                {
                    hasPhi = true;
                    continue;
                }

                BranchInst* pBranch = dyn_cast<BranchInst>(&inst);
                if (pBranch == nullptr)
                {
                    continue;
                }

                // A branch back to the barrier's own block marks a single-block loop.
                const unsigned int numSuccessors = pBranch->getNumSuccessors();
                for (unsigned int idx = 0; idx < numSuccessors; ++idx)
                {
                    if (pBranch->getSuccessor(idx) == pBlock)
                    {
                        branchesToItself = true;
                    }
                }
            }

            if (hasPhi && branchesToItself)
            {
                // Remember the barrier; RemovePhiInstructions consumes this list.
                // Only a loop formed by this one block (no further control flow)
                // is detected here.
                mBarriersPHIInLoop.push_back(pBarrier);
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Replace each relevant PHI instruction with
    ///        set of alloca/store/load/store.
    ///        E.g.
    /// %Temp-91.i.i = phi float [ %Temp-87.i.i, %MainCode ], [ %Temp-92.i.i, %Label-88.i.i ]
    /// -->
    ///   %Temp-91.i.i28 = alloca float
    ///   store float %Temp-87.i.i, float* %Temp-91.i.i28
    /// ( Label.postbarrier: ) <-- added by SplitBasicBlocksAtBarriers()
    ///   %Temp-91.i.i29 = load float* %Temp-91.i.i28
    ///   store float %Temp-92.i.i, float* %Temp-91.i.i28
    /// @param f - main function
    void LinkTessControlShaderMCF::RemovePhiInstructions(Function& f)
    {
        // PHIs are collected here and erased only after every block has been
        // processed, so the instruction walk below never visits a deleted node.
        SmallVector<Instruction*, 4> phiToRemove;

        // Only blocks recorded in mBarriersPHIInLoop are rewritten, i.e. blocks
        // holding both a barrier and a PHI in a single-block loop.
        for (auto pBarrier : mBarriersPHIInLoop)
        {
            BasicBlock* pBarrierBlock = pBarrier->getParent();
            Instruction* pBarrierInst = cast<Instruction>(pBarrier);

            for (BasicBlock::iterator i = pBarrierBlock->begin(), e = pBarrierBlock->end(); i != e; ++i)
            {
                PHINode* pPhi = dyn_cast<PHINode>(&*i);
                if (pPhi == nullptr)
                {
                    continue;
                }

                phiToRemove.push_back(pPhi);

                // The alloca and the replacement load both reuse the PHI's name.
                const std::string phiName = pPhi->getName().str();
                Type* pType = pPhi->getType();

                // 1. Create 'alloca' inst (right before the PHI) to hold the PHI's incoming values.
                Builder().SetInsertPoint(pPhi);
                AllocaInst* pAllocaPHI = Builder().CreateAlloca(pType, nullptr, VALUE_NAME(phiName));

                // 2. Store the 1st of the PHI incoming values. Only a Value is
                //    needed for the store, so constants and arguments are
                //    accepted here as well as instructions.
                Value* pIncoming1stDef = pPhi->getIncomingValue(0);
                Builder().CreateStore(pIncoming1stDef, pAllocaPHI);

                // 3. Create 'load' inst right after the barrier inst. It first yields
                //    the 1st incoming value and later the 2nd one stored in step 4.
                BasicBlock::iterator iAfterThreadBarrier(pBarrierInst);
                Builder().SetInsertPoint(&(*++iAfterThreadBarrier));
                Value* pNewValue = Builder().CreateLoad(pAllocaPHI, VALUE_NAME(phiName));

                // 4. Store the 2nd PHI incoming value right after its definition.
                //    NOTE(review): this assumes the PHI has exactly two incoming
                //    values and that the 2nd one is an instruction (its position is
                //    needed as the insertion point) - confirm with the detection code.
                Instruction* pIncoming2ndDef = cast<Instruction>(pPhi->getIncomingValue(1));
                BasicBlock::iterator iAfterIncoming2ndDef(pIncoming2ndDef);
                Builder().SetInsertPoint(&(*++iAfterIncoming2ndDef));
                Builder().CreateStore(pIncoming2ndDef, pAllocaPHI);

                pPhi->replaceAllUsesWith(pNewValue);
            }
        }

        for (auto& phiInst : phiToRemove)
        {
            phiInst->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Makes a "shell" function removing all basic blocks.
    /// @param f - The function we're working on.
    void LinkTessControlShaderMCF::PurgeFunction(Function& f)
    {
        std::vector<BasicBlock*> doomedBlocks;

        // First pass: queue the blocks and drop their references.
        // Erasure is deferred to a second pass over the queued list.
        for (BasicBlock& block : f)
        {
            if (block.getParent() != &f)
            {
                continue;
            }

            doomedBlocks.push_back(&block);
            block.dropAllReferences();
        }

        // Second pass: actually delete the queued blocks.
        for (BasicBlock* pDoomed : doomedBlocks)
        {
            pDoomed->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Builds LLVM function for continuation.
    /// @param cf - The continuation function we're working on.
    void LinkTessControlShaderMCF::BuildContinuationFunction(ContinuationFunction& cf)
    {
        Function* pMainFunc = GetMainFunction();

        // The main function is expected to be argument-free.
        assert(pMainFunc->arg_begin() == pMainFunc->arg_end());

        // Signature: i32 (i32 CPId, <one parameter per collected input>...).
        std::vector<Type*> paramTypes;
        paramTypes.push_back(Builder().getInt32Ty());
        for (auto inputKey : cf.inputs)
        {
            paramTypes.push_back(mGlobalAllocas.GetType(inputKey));
        }

        std::string funcName = std::string("tcs.phase.") + std::to_string(cf.id);
        FunctionType* pFuncType = FunctionType::get(Builder().getInt32Ty(), paramTypes, false);
        cf.pFunc = Function::Create(
            pFuncType,
            llvm::GlobalValue::PrivateLinkage,
            funcName,
            mpModule);

        cf.pFunc->addFnAttr(llvm::Attribute::AlwaysInline);

        // Value map handed to the cloner, plus the list of cloned returns it fills in.
        ValueToValueMapTy vmap;
        SmallVector<ReturnInst*, 8> returns;

        // Name the arguments and map each referenced alloca onto its argument.
        Function::arg_iterator argIt = cf.pFunc->arg_begin();
        argIt->setName("CPId");
        ++argIt;
        for (auto inputKey : cf.inputs)
        {
            const Value* pAllocaRef = mGlobalAllocas.GetAllocaRef(inputKey);
            if (vmap.count(pAllocaRef) != 0)
            {
                // Already mapped; the argument iterator is not advanced.
                continue;
            }

            argIt->setName("arg" + pAllocaRef->getName());
            vmap[pAllocaRef] = static_cast<Argument*>(&*argIt);
            ++argIt;
        }

        // Move this continuation's entry block to the front of the main
        // function before cloning it into the new function.
        cf.pEntry->moveBefore(&*(pMainFunc->begin()));

        CloneAndPruneFunctionInto(cf.pFunc, pMainFunc, vmap, false, returns);
        FixUpControlPointID(cf);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Replace each usage of global loaded from HSControlPointID by
    ///        a call directly to it. As a result, instead of having one load
    ///        of CPID and multiple usages of the global variable initialized
    ///        by that value, there are calls to HSControlPointID before each
    ///        usage.
    ///        In other words, every 'load' instruction that reads the global
    ///        variable holding the CPId value is replaced with a call to
    ///        HSControlPointID(), e.g.:
    ///
    ///        FROM -->
    ///            % 3 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
    ///            store i32 % 3, i32* @0
    ///            . . .
    ///            % 4 = load i32, i32* @0
    ///            . . .
    ///            % 10 = insertelement <4 x i32> <i32 0, i32 0, i32 undef, i32 1>, i32 % 4, i32 2
    ///
    ///        TO -->
    ///            % 3 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
    ///            store i32 % 3, i32* @0
    ///            . . .
    ///            % 4 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
    ///            . . .
    ///            % 10 = insertelement <4 x i32> <i32 0, i32 0, i32 undef, i32 1>, i32 % 4, i32 2
    ///
    ///
    /// @param f - The function we're working on.
    void LinkTessControlShaderMCF::FixUpHSControlPointIDVsGlobalVar(Function& f)
    {
        // Loads to rewrite. The list is kept free of duplicates because every
        // entry is erased at the end, and the same load can be reached more
        // than once (e.g. several HSControlPointID results stored to one global).
        SmallVector<Instruction*, 10> instructionToUpdate;
        llvm::Value* pHSControlPointID = llvm::GenISAIntrinsic::getDeclaration(
            f.getParent(),
            GenISAIntrinsic::GenISA_DCL_HSControlPointID);

        // Pass through usages of the call to HSControlPointID.
        // e.g.: %3 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
        //      %16 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
        for (Value::user_iterator i = pHSControlPointID->user_begin(), e = pHSControlPointID->user_end();
            i != e;
            ++i)
        {
            Instruction* useInst = cast<Instruction>(*i);

            // Pass through the usages of each HSControlPointID call result.
            // e.g.: call void @llvm.genx.GenISA.OutputTessControlPoint(float %19, float %20, float %21, float %22, i32 %11, i32 %16, i32 %17)
            //       store i32 %3, i32* @0
            for (Value::user_iterator _i = useInst->user_begin(), _e = useInst->user_end();
                _i != _e;
                ++_i)
            {
                // Only 'store' instructions whose destination is a global variable matter.
                // e.g.: store i32 %3, i32* @0
                llvm::StoreInst* storeInst = llvm::dyn_cast<llvm::StoreInst>(*_i);
                if (storeInst == nullptr)
                {
                    continue;
                }

                llvm::Value* op1 = storeInst->getOperand(1);
                if (!isa<GlobalVariable>(op1))
                {
                    continue;
                }

                // Pass through usages of that global to find load instructions to replace.
                // e.g.: %5 = load i32, i32* @0
                //       %4 = load i32, i32* @0
                for (auto ui = op1->user_begin(), ue = op1->user_end(); ui != ue; ++ui)
                {
                    llvm::LoadInst* ldInst = llvm::dyn_cast<llvm::LoadInst>(*ui);
                    if (ldInst == nullptr)
                    {
                        continue;
                    }

                    // Queue each load once only; a second entry would lead to a
                    // second eraseFromParent() on an already deleted instruction.
                    bool alreadyQueued = false;
                    for (Instruction* pQueued : instructionToUpdate)
                    {
                        if (pQueued == ldInst)
                        {
                            alreadyQueued = true;
                            break;
                        }
                    }
                    if (!alreadyQueued)
                    {
                        instructionToUpdate.push_back(ldInst);
                    }
                }
            }
        }

        // Do the actual replacement: each queued load of the global becomes a
        // fresh call to HSControlPointID placed at the load's position.
        // e.g.:  %4 = load i32, i32* @0 <--replaced with --> %4 = call i32 @llvm.genx.GenISA.DCL.HSControlPointID()
        for (auto& ldInst : instructionToUpdate)
        {
            Builder().SetInsertPoint(ldInst);
            llvm::Value* pCPId = Builder().CreateCall(pHSControlPointID);
            ldInst->replaceAllUsesWith(pCPId);
            ldInst->eraseFromParent();
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Constructs a set of barrier-free functions from a main function
    ///        that contains barriers. Each new function is called a continuation
    ///        function that contains all of the basic blocks up to a barrier.
    /// @param f - Main function we're breaking up into multiple functions.
    void LinkTessControlShaderMCF::BuildContinuationFunctions(Function& f)
    {
        // Preparation steps on the main function, executed in this fixed order.
        FixUpHSControlPointIDVsGlobalVar(f);
        DetectBarrierPHIInLoop(f);
        RemovePhiInstructions(f);
        SplitBasicBlocksAtBarriers(f);
        FindBarrierLives(f);
        ConvertBarrierLivesToAllocas(f);

        // For every continuation entry block, gather the blocks that make up
        // its continuation function (see GetContinuationBlocks).
        for (auto pEntryBlock : mFunctionEntryBlocks)
        {
            // Looking the entry up in the map effectively allocates its
            // ContinuationFunction record.
            ContinuationFunction& contFunc = mEntryMap[pEntryBlock];
            contFunc.pEntry = pEntryBlock;

            // The id is only overwritten when barrier info is attached to the entry block.
            BasicBlockMeta& entryMeta = mBlockMeta[pEntryBlock];
            if (entryMeta.pBarrierInfo != nullptr)
            {
                contFunc.id = entryMeta.pBarrierInfo->id;
            }

            GetContinuationBlocks(pEntryBlock, contFunc);

            // This list drives argument collection and function creation below.
            mContinuationFunctions.push_back(&contFunc);

            // Start the next continuation function with a clean visited set.
            mVisited.clear();
        }

        // Collect arguments for continuation functions.
        for (auto pContFunc : mContinuationFunctions)
        {
            CollectArguments(*pContFunc);
        }

        // Analysis is finished; from here on the main function is transformed
        // and then cloned into the actual continuation functions.

        // Existing returns are rewritten using ExitState (end of program marker).
        ReplaceReturn(f, ExitState);
        // Remove BR instructions linking pre- and post-barrier blocks and replace
        // them with ret(N) instructions (where N is barrier id).
        UnlinkPreBarrierBlocks(f);

        // Park all global allocas in a new block placed in front of the current
        // entry block, so that alloca instructions won't be cloned into the
        // continuation functions.
        BasicBlock* pOldEntry = &(f.getEntryBlock());
        BasicBlock* pAllocaBlock = BasicBlock::Create(
            GetModule()->getContext(),
            VALUE_NAME("alloca_block"),
            &f,
            pOldEntry);
        Instruction* pAllocaBlockBranch = BranchInst::Create(pOldEntry, pAllocaBlock);
        mGlobalAllocas.MoveAllocas(pAllocaBlockBranch);

        for (auto pContFunc : mContinuationFunctions)
        {
            BuildContinuationFunction(*pContFunc);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Moves global allocas to the BB specified by insertion point.
    /// @param pInsert - Insertion point.
    void LinkTessControlShaderMCF::GlobalAllocas::MoveAllocas(Instruction* pInsert)
    {
        assert(pInsert);

        // Relocate every tracked alloca so that it sits right before pInsert.
        for (auto& allocaEntry : mGlobalAllocas)
        {
            cast<Instruction>(allocaEntry.second)->moveBefore(pInsert);
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Creates global allocas for values shared between phases.
    /// @param arraySize - number of elements in allocas.
    /// @note Caller should set IRBuilder insert point.
    void LinkTessControlShaderMCF::GlobalAllocas::CreateGlobalAllocas(
        llvm::IGCIRBuilder<>& builder,
        uint32_t arraySize)
    {
        // Every new alloca is named after the one it replaces plus "[<arraySize>]".
        const std::string arraySuffix = "[" + std::to_string(arraySize) + "]";

        for (auto& allocaEntry : mGlobalAllocas)
        {
            Value* pPreviousAlloca = allocaEntry.second;
            Type* pElementType = pPreviousAlloca->getType()->getPointerElementType();

            // Replace the tracked alloca with an array of arraySize elements
            // of the same element type, created at the builder's insert point.
            allocaEntry.second = builder.CreateAlloca(
                ArrayType::get(pElementType, arraySize),
                nullptr,
                pPreviousAlloca->getName().str() + arraySuffix);
        }

        mGlobalAllocasUpdated = true;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Builds the wrapper function that calls the continuation
    ///        functions. Each continuation function returns a state. The
    ///        wrapper implements a state machine using while loop and a switch.
    /// @note Takes no parameters; the original function is obtained via GetMainFunction().
    void LinkTessControlShaderMCF::RebuildMainFunction()
    {
        // Overview of the control flow built below:
        //   entry -> state_loop_header -> state_check_cf<N> chain
        //   state_check_cf<N> -> state_exec_cf<N> <-> cpid_exec_cf<N> (loop over CPID)
        //   state_exec_cf<N> -> state_done -> exit or back to state_loop_header
        LLVMContext& ctx = GetModule()->getContext();
        std::string oldEntryFuncName = GetMainFunction()->getName().str();

        // Need to create unconnected entry block for the new function first,
        // Because we need info from old main function to create global allocas.
        BasicBlock* pEntry = BasicBlock::Create(ctx, oldEntryFuncName);

        Builder().SetInsertPoint(pEntry);
        mGlobalAllocas.CreateGlobalAllocas(Builder(), mOutputControlPointCount);

        // Now we can delete old function.
        // Only its basic blocks are removed; the same Function object is
        // fetched again right below and refilled.
        PurgeFunction(*GetMainFunction());
        Function* pNewFunc = GetMainFunction();

        // Determine the dispatch mode
        HullShaderDispatchModes dispatchMode = DetermineDispatchMode();

        // Create new basic blocks for main function.
        pEntry->insertInto(pNewFunc);
        BasicBlock* pStateLoopHeader = BasicBlock::Create(ctx, "state_loop_header", pNewFunc);
        BasicBlock* pStateDone = BasicBlock::Create(ctx, "state_done", pNewFunc);
        BasicBlock* pExit = BasicBlock::Create(ctx, "exit", pNewFunc);

        Value* pConstant0 = llvm::ConstantInt::get(Builder().getInt32Ty(), 0);
        Value* pLoopLimit = llvm::ConstantInt::get(Builder().getInt32Ty(), mOutputControlPointCount);
        Value* pBaseCPID = nullptr;
        Value* pCpidIncrement = nullptr;
        // Map to keep pointers to global alloca variables.
        // NOTE(review): this map is not referenced again within this function.
        std::map<size_t, Value*> allocas;

        // Create rest of the entry block, setting starting CPID value and adding block terminator branch.
        // Starting CPID is zero for EIGHT_PATCH dispatch and lane ID for SINGLE_PATCH.
        Builder().SetInsertPoint(pEntry);
        switch (dispatchMode)
        {
        case SINGLE_PATCH_DISPATCH_MODE:
            // In single patch mode we will need SIMD lane ID to calculate CPID.
        {
            Value* pSimdLane = Builder().CreateCall(
                GenISAIntrinsic::getDeclaration(pNewFunc->getParent(), GenISAIntrinsic::GenISA_simdLaneId),
                None,
                VALUE_NAME("simd_lane_id"));
            pBaseCPID = Builder().CreateZExt(pSimdLane, Builder().getInt32Ty());
            // CPID advances by SIMDSize per iteration in this mode.
            pCpidIncrement = llvm::ConstantInt::get(Builder().getInt32Ty(), SIMDSize);
        }
        break;

        case EIGHT_PATCH_DISPATCH_MODE:
            // CPID starts at zero and advances by one per iteration.
            pBaseCPID = pConstant0;
            pCpidIncrement = llvm::ConstantInt::get(Builder().getInt32Ty(), 1);
            break;

        default:
            // NOTE(review): when asserts are compiled out, pBaseCPID and
            // pCpidIncrement stay nullptr on this path and are used below.
            assert(0 && "should not reach here");
            break;
        }
        Builder().CreateBr(pStateLoopHeader);

        // The barrier state machine part. It consists of the loop enclosing switch
        // over the next state == barrier ID. Loop is finished when the next state is (-1).

        // The "state done" phi is created empty here; its incoming values are
        // added after the per-continuation-function loop below.
        Builder().SetInsertPoint(pStateDone);
        PHINode* pNextStatePhi = Builder().CreatePHI(Builder().getInt32Ty(),
            mContinuationFunctions.size(),
            VALUE_NAME("next_state_phi"));

        // Current state: 0 when coming from the entry block, otherwise the
        // value of the "state done" phi.
        Builder().SetInsertPoint(pStateLoopHeader);
        PHINode* pStatePhi = Builder().CreatePHI(Builder().getInt32Ty(), 2, VALUE_NAME("state_phi"));
        pStatePhi->addIncoming(pConstant0, pEntry);
        pStatePhi->addIncoming(pNextStatePhi, pNextStatePhi->getParent());

        // State check block created in the previous loop iteration (if any).
        BasicBlock* pPrevStateCheckBlock = nullptr;

        // One "next_state" phi per continuation function, collected for the
        // "state done" phi.
        std::vector<Instruction*> nextState;

        for (auto pCF : mContinuationFunctions)
        {
            //switch(state)
            // Three blocks per continuation function, all inserted before pStateDone.
            std::string strId = std::to_string(pCF->id);
            BasicBlock* pStateCheckBlock = BasicBlock::Create(ctx,
                VALUE_NAME(std::string("state_check_cf") + strId),
                pNewFunc,
                pStateDone);
            BasicBlock* pStateExecBlock = BasicBlock::Create(ctx,
                VALUE_NAME(std::string("state_exec_cf") + strId),
                pNewFunc,
                pStateDone);

            BasicBlock* pCpidExecBlock = BasicBlock::Create(ctx,
                VALUE_NAME(std::string("cpid_exec_cf") + strId),
                pNewFunc,
                pStateDone);

            if (pPrevStateCheckBlock != nullptr)
            {
                // Chain the previous state check to this one.
                // NOTE(review): operand 1 of the conditional branch is presumably
                // its "condition false" destination (created as pExit below) -
                // confirm against the BranchInst operand layout.
                IGCLLVM::TerminatorInst* pTerminator = pPrevStateCheckBlock->getTerminator();
                BranchInst* pCondBranch = cast<BranchInst>(pTerminator);
                assert(pCondBranch->isConditional());

                pCondBranch->setOperand(1, pStateCheckBlock);
            }
            else
            {
                // First continuation function: the loop header jumps straight
                // to its state check.
                Builder().SetInsertPoint(pStateLoopHeader);
                Builder().CreateBr(pStateCheckBlock);
            }

            // Execute this continuation function when the current state equals
            // its id; otherwise branch to pExit (retargeted above once another
            // state check follows).
            Builder().SetInsertPoint(pStateCheckBlock);
            Value* pCond = Builder().CreateICmpEQ(pStatePhi, llvm::ConstantInt::get(Builder().getInt32Ty(), pCF->id));
            Builder().CreateCondBr(pCond, pStateExecBlock, pExit);

            // Create a loop over CPID.
            Builder().SetInsertPoint(pStateExecBlock);
            PHINode* pCurrentCPID = Builder().CreatePHI(Builder().getInt32Ty(), 2, VALUE_NAME("CPID"));

            // Create counter incrementation.
            Builder().SetInsertPoint(pCpidExecBlock);
            Value* pIncrementedCPID = Builder().CreateAdd(pCurrentCPID, pCpidIncrement, VALUE_NAME("cpid_inc"));

            // Continue with loop creation.
            Builder().SetInsertPoint(pStateExecBlock);
            pCurrentCPID->addIncoming(pBaseCPID, pStateCheckBlock);
            pCurrentCPID->addIncoming(pIncrementedCPID, pCpidExecBlock);

            // Next state: the current state when arriving from the state check,
            // the continuation function's result when arriving from the CPID
            // exec block (second incoming value is added further below).
            PHINode* pStateVal = Builder().CreatePHI(Builder().getInt32Ty(), 2, VALUE_NAME("next_state"));
            pStateVal->addIncoming(pStatePhi, pStateCheckBlock);
            nextState.push_back(pStateVal);

            // Create CPID loop header with checking of the loop condition: "CPID < NumOutputControlPoints"
            Value* pCounterConditionalRes = Builder().CreateICmpULT(
                pCurrentCPID,
                pLoopLimit,
                VALUE_NAME("tcs_if_ult_cond2"));
            Builder().CreateCondBr(pCounterConditionalRes, pCpidExecBlock, pStateDone);

            // Build cpid exec block by inserting GEPs and call to continuation function.
            Builder().SetInsertPoint(pCpidExecBlock);
            // Just forward arguments to continuation functions.
            std::vector<Value*> args;
            // First argument is control point ID.
            args.push_back(pCurrentCPID);
            for (auto input : pCF->inputs)
            {
                Value* pAlloca = mGlobalAllocas.GetGlobalAlloca(input);
                Value* indexList[2] = { pConstant0, pCurrentCPID };
                // Create GEP for current instance: element [CPID] of the global alloca.
                Value* pGEP = Builder().CreateGEP(pAlloca, ArrayRef<Value*>(indexList, 2),
                    VALUE_NAME(std::string("p_") + pAlloca->getName()));
                args.push_back(pGEP);
            }

            // CFn(uint arg0, type1* arg1, type2* arg2, ...);
            Instruction* pCfResult = Builder().CreateCall(
                pCF->pFunc,
                args,
                VALUE_NAME(std::string("cf_result") + strId));
            pStateVal->addIncoming(pCfResult, pCpidExecBlock);

            // Add CPID loop termination.
            Builder().CreateBr(pStateExecBlock);

            pPrevStateCheckBlock = pStateCheckBlock;
        }

        // Now complete the phi instruction for the "state done" block.
        Builder().SetInsertPoint(pStateDone);
        for (auto pNextStateCall : nextState)
        {
            // pNextStateCall should always be in immediate predecessor.
            pNextStatePhi->addIncoming(pNextStateCall, pNextStateCall->getParent());
        }
        // Add state loop condition check.
        // Leave the state loop once the next state equals ExitState,
        // otherwise go around again.
        Value* pCond = Builder().CreateICmpEQ(pNextStatePhi, llvm::ConstantInt::get(Builder().getInt32Ty(), ExitState));
        Builder().CreateCondBr(pCond, pExit, pStateLoopHeader);

        // Add function terminator.
        Builder().SetInsertPoint(pExit);
        Builder().CreateRetVoid();
    }

    llvm::Function* LinkTessControlShaderMCF::CreateNewTCSFunction(llvm::Function* pCurrentFunc)
    {
        llvm::IRBuilder<> typeBuilder(pCurrentFunc->getContext());
        CodeGenContext* pCodeGenCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

        // New signature: void (<parameters of the current function>..., i32).
        std::vector<llvm::Type*> paramTypes;
        for (auto argIt = pCurrentFunc->arg_begin(), argEnd = pCurrentFunc->arg_end();
            argIt != argEnd;
            ++argIt)
        {
            paramTypes.push_back(argIt->getType());
        }
        paramTypes.push_back(typeBuilder.getInt32Ty());

        llvm::FunctionType* pNewFuncType = llvm::FunctionType::get(
            typeBuilder.getVoidTy(), paramTypes, false);
        llvm::Function* pNewFunction = llvm::Function::Create(
            pNewFuncType,
            llvm::GlobalValue::PrivateLinkage,
            "tessControlShaderEntry",
            pCodeGenCtx->getModule());

        pNewFunction->addFnAttr(llvm::Attribute::AlwaysInline);

        // Transfer all basic blocks of the current function into the new one.
        pNewFunction->getBasicBlockList().splice(pNewFunction->begin(), pCurrentFunc->getBasicBlockList());

        // Rewire every old argument to its counterpart in the new function,
        // which also inherits the old argument's name.
        Function::arg_iterator newArgIt = pNewFunction->arg_begin();
        for (auto oldArgIt = pCurrentFunc->arg_begin(), oldArgEnd = pCurrentFunc->arg_end();
            oldArgIt != oldArgEnd;
            ++oldArgIt, ++newArgIt)
        {
            oldArgIt->replaceAllUsesWith(&(*newArgIt));
            newArgIt->takeName(&(*oldArgIt));
        }

        // The emptied original function is no longer needed.
        pCurrentFunc->eraseFromParent();

        // Replace the results of all HSControlPointID calls inside the new
        // function with its last (extra i32) argument.
        llvm::Value* pHSControlPointID = llvm::GenISAIntrinsic::getDeclaration(pNewFunction->getParent(),
            GenISAIntrinsic::GenISA_DCL_HSControlPointID);

        // Step an iterator forward to the last argument.
        const unsigned int lastArgIndex = IGCLLVM::GetFuncArgSize(pNewFunction) - 1;
        Function::arg_iterator lastArg = pNewFunction->arg_begin();
        for (unsigned int step = 0; step < lastArgIndex; ++step)
        {
            ++lastArg;
        }

        // Users are queued and erased only after the walk over the user list.
        SmallVector<Instruction*, 10> deadUsers;
        for (Value::user_iterator userIt = pHSControlPointID->user_begin(), userEnd = pHSControlPointID->user_end();
            userIt != userEnd;
            ++userIt)
        {
            Instruction* pUserInst = cast<Instruction>(*userIt);
            if (pUserInst->getParent()->getParent() != pNewFunction)
            {
                continue;
            }

            deadUsers.push_back(pUserInst);
            pUserInst->replaceAllUsesWith(&(*lastArg));
        }

        for (Instruction* pDead : deadUsers)
        {
            pDead->eraseFromParent();
        }
        return pNewFunction;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Rebuilds the shader entry point for the HW-barrier variant.
    ///
    /// The body of the current main function is moved into a helper created by
    /// CreateNewTCSFunction, which takes the control point ID (CPID) as its
    /// trailing argument. A new externally visible function carrying the old
    /// entry name is then created. Depending on the dispatch mode and on
    /// whether the shader contains barriers, it either
    ///  - loops over the output control points and calls the helper once per
    ///    iteration (no barriers), or
    ///  - calls the helper once, guarded by a CPID range check in single patch
    ///    dispatch mode (barriers present).
    /// @param pMdUtils Metadata utilities. Its function info list is cleared
    ///        and the new entry function is added to it.
    void LinkTessControlShaderMCF::TCSwHWBarriersSupport(MetaDataUtils* pMdUtils)
    {
        CodeGenContext* ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

        // The entry name has to be captured first: CreateNewTCSFunction erases
        // the function it is given, so mpMainFunction must not be dereferenced
        // after the call below.
        std::string oldEntryFuncName = GetMainFunction()->getName().str();
        llvm::Function* pNewTCSFunction = CreateNewTCSFunction(mpMainFunction);

        m_useMultipleHardwareThread = (mNumBarriers != 0);

        // Determine the dispatch mode
        HullShaderDispatchModes dispatchMode = DetermineDispatchMode();

        // This function is the new entry function
        llvm::Function* pNewLoopFunc = llvm::Function::Create(llvm::FunctionType::get(Builder().getVoidTy(), false),
            llvm::GlobalValue::ExternalLinkage,
            oldEntryFuncName,
            ctx->getModule());

        llvm::BasicBlock* pEntryBlock = llvm::BasicBlock::Create(
            pNewLoopFunc->getContext(),
            oldEntryFuncName,
            pNewLoopFunc);

        Builder().SetInsertPoint(pEntryBlock);

        // Compute the base control point ID for the selected dispatch mode.
        llvm::Value* pCPId = nullptr;
        llvm::Function* pFuncPatchInstanceIdOrSIMDLaneId = nullptr;
        switch (dispatchMode)
        {
        case SINGLE_PATCH_DISPATCH_MODE:
            // Start from the SIMD lane ID.
            pFuncPatchInstanceIdOrSIMDLaneId = llvm::GenISAIntrinsic::getDeclaration(
                pNewLoopFunc->getParent(), llvm::GenISAIntrinsic::GenISA_simdLaneId);
            pCPId = Builder().CreateCall(pFuncPatchInstanceIdOrSIMDLaneId);

            if (m_useMultipleHardwareThread)
            {
                // CPID = patchInstanceID * SIMDSize + simdLaneId;
                pFuncPatchInstanceIdOrSIMDLaneId = llvm::GenISAIntrinsic::getDeclaration(
                    pNewLoopFunc->getParent(), llvm::GenISAIntrinsic::GenISA_patchInstanceId);
                pCPId = Builder().CreateAdd(
                    Builder().CreateZExt(
                        pCPId,
                        Builder().getInt32Ty()),
                    Builder().CreateMul(
                        Builder().CreateCall(pFuncPatchInstanceIdOrSIMDLaneId),
                        llvm::ConstantInt::get(Builder().getInt32Ty(), SIMDSize)));
            }

            break;

        case EIGHT_PATCH_DISPATCH_MODE:
            // Without multiple hardware threads the CPID comes from the loop
            // counter below; otherwise it is the patch instance ID.
            pCPId = Builder().getInt32(0);
            if (m_useMultipleHardwareThread)
            {
                pFuncPatchInstanceIdOrSIMDLaneId = llvm::GenISAIntrinsic::getDeclaration(
                    pNewLoopFunc->getParent(), llvm::GenISAIntrinsic::GenISA_patchInstanceId);
                pCPId = Builder().CreateCall(pFuncPatchInstanceIdOrSIMDLaneId);
            }
            break;

        default:
            assert(0 && "should not reach here");
            break;
        }

        // We don't need to deal with any loops when we are using multiple hardware threads
        if (!m_useMultipleHardwareThread)
        {
            // initialize instanceCount to output control point count
            llvm::Value* pInstanceCount = Builder().getInt32(mOutputControlPointCount);

            // initialize loopCounter (kept in memory, zero-initialized in the entry block)
            llvm::Value* pLoopCounter = Builder().CreateAlloca(Builder().getInt32Ty(), 0, "loopCounter");
            llvm::Value* pConstInt = Builder().getInt32(0);
            Builder().CreateStore(pConstInt, pLoopCounter, false);

            // create loop-entry basic block
            llvm::BasicBlock* pLoopEntryBB = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_loop_entry"),
                pNewLoopFunc);

            // block that holds the call to the TCS function
            llvm::BasicBlock* pLoopConditionTrue = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_loop_condition_true"),
                pNewLoopFunc);

            // Create loop-continue basic block
            llvm::BasicBlock* pLoopContinueBB = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_loop_continue"),
                pNewLoopFunc);

            // create loop exit basic block
            llvm::BasicBlock* pAfterLoopBB = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_after_loop"),
                pNewLoopFunc);

            // terminate the entry block and continue emitting in loopEntryBB
            Builder().CreateBr(pLoopEntryBB);
            Builder().SetInsertPoint(pLoopEntryBB);

            // Load the loop counter
            llvm::LoadInst* pLoadLoopCounter = Builder().CreateLoad(pLoopCounter);
            llvm::Value* pMulLoopCounterRes = nullptr;
            llvm::Value* pCurrentCPID = nullptr;
            llvm::Value* pConditionalRes1 = nullptr;
            uint32_t loopIterationCount = 0;

            switch (dispatchMode)
            {
            case SINGLE_PATCH_DISPATCH_MODE:
                // currentCPID = pCPId + loopCounter x SIMDSize
                pMulLoopCounterRes = Builder().CreateMul(
                    pLoadLoopCounter,
                    llvm::ConstantInt::get(Builder().getInt32Ty(), SIMDSize));
                pCurrentCPID = Builder().CreateAdd(
                    Builder().CreateZExt(
                        pCPId,
                        Builder().getInt32Ty()),
                    pMulLoopCounterRes);

                // cmp currentCPID to instanceCount so we enable only the required lanes
                pConditionalRes1 = Builder().CreateICmpULT(
                    pCurrentCPID,
                    pInstanceCount,
                    VALUE_NAME("tcs_if_ult_cond1"));

                // if true go to pLoopConditionTrue else jump to pAfterLoopBB
                Builder().CreateCondBr(pConditionalRes1,
                    pLoopConditionTrue,
                    pAfterLoopBB);

                // Number of SIMDSize-wide passes needed to cover all output
                // control points (ceiling division). Uses the same SIMDSize as
                // the CPID stride above so the two cannot drift apart.
                loopIterationCount = ((mOutputControlPointCount - 1) / SIMDSize) + 1;
                break;

            case EIGHT_PATCH_DISPATCH_MODE:
                // One iteration per output control point; the loop counter is the CPID.
                pCurrentCPID = pLoadLoopCounter;
                loopIterationCount = mOutputControlPointCount;

                // unconditionally jump to pLoopConditionTrue
                Builder().CreateBr(pLoopConditionTrue);
                break;

            default:
                assert(false && "should not reach here");
                break;
            }

            // Emit the loop body block.
            Builder().SetInsertPoint(pLoopConditionTrue);

            // Create a call to the TCS function when condition is true to loop the function as many times as the number of control points
            Builder().CreateCall(pNewTCSFunction, pCurrentCPID);
            Builder().CreateBr(pLoopContinueBB);

            // setInsertPoint to pLoopContinueBB
            Builder().SetInsertPoint(pLoopContinueBB);
            // increment loop counter loopCounter = loopCounter + 1
            llvm::Value* pIncrementedLoopCounter = Builder().CreateAdd(
                pLoadLoopCounter,
                llvm::ConstantInt::get(Builder().getInt32Ty(), 1));
            Builder().CreateStore(pIncrementedLoopCounter, pLoopCounter, false);

            // now evaluate loop: if ( incrementedLoopCounter < loopIterationCount )
            // then continue loop else go to after loop
            llvm::Value* pNumberOfLoopIterationsRequired = llvm::ConstantInt::get(Builder().getInt32Ty(), loopIterationCount);

            llvm::Value* pConditionalRes2 = Builder().CreateICmpULT(
                pIncrementedLoopCounter,
                pNumberOfLoopIterationsRequired,
                VALUE_NAME("tcs_if_ult_cond2"));

            // create branch to LoopEntryBB or AfterLoopBB based on result of conditional branch
            Builder().CreateCondBr(pConditionalRes2,
                pLoopEntryBB,
                pAfterLoopBB);

            // set insert point to afterloop basic block
            Builder().SetInsertPoint(pAfterLoopBB);
        }
        else if (dispatchMode == SINGLE_PATCH_DISPATCH_MODE)
        {
            // In single patch dispatch mode the execution mask is 0xFF. Make
            // sure that only valid CPIDs execute.

            // Create the main basic block for the shader
            llvm::BasicBlock* pTcsBody = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_body"),
                pNewLoopFunc);
            // and the end block.
            llvm::BasicBlock* pAfterTcsBody = llvm::BasicBlock::Create(pNewLoopFunc->getContext(),
                VALUE_NAME("tcs_end"),
                pNewLoopFunc);

            // Compare current CPID to the number of CPIDs to enable only the required lanes.
            llvm::Value* pIsLaneEnabled = Builder().CreateICmpULT(
                pCPId,
                Builder().getInt32(mOutputControlPointCount),
                VALUE_NAME("tcs_if_ult_cond1"));

            Builder().CreateCondBr(pIsLaneEnabled,
                pTcsBody,
                pAfterTcsBody);

            Builder().SetInsertPoint(pTcsBody);

            // Call TCS function.
            Builder().CreateCall(pNewTCSFunction, pCPId);

            Builder().CreateBr(pAfterTcsBody);
            Builder().SetInsertPoint(pAfterTcsBody);
        }
        else
        {
            // when using multiple hardware threads just call the Control Point function once with the appropriate CPID
            Builder().CreateCall(pNewTCSFunction, pCPId);
        }
        // Terminate whichever block the builder currently points at.
        Builder().CreateRetVoid();

        // The new loop function is now the only function described in the metadata.
        pMdUtils->clearFunctionsInfo();
        IGCMetaDataHelper::addFunction(*pMdUtils, pNewLoopFunc);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Decides whether the TCS variant with HW barrier support should be
    /// used instead of the variant with SW barriers.
    /// @return true when the shader has no barriers, or when its instruction
    /// count exceeds LARGE_INSTRUCTIONS_COUNT; false otherwise.
    bool LinkTessControlShaderMCF::SelectTCSwHWBarrierSupport(void)
    {
        const bool hasNoBarriers = (mNumBarriers == 0);

        // Rationale for the instruction count criterion:
        // TCSs with SW barriers enabled and a large instruction count consume
        // much more scratch space per thread than TCSs with HW barriers
        // enabled. For a very large instruction count, TCSs with SW barriers
        // enabled can run in pull vertex mode, which negatively impacts
        // performance.
        const bool hasLargeInstructionCount = (mNumInstructions > LARGE_INSTRUCTIONS_COUNT);

        return hasNoBarriers || hasLargeInstructionCount;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Perform pass operations.
    ///
    /// Reads the output control point count from the "HSOutputControlPointCount"
    /// global, collects barrier information for the main function and then
    /// rewrites the shader using either the HW-barrier or the SW-barrier
    /// strategy, as chosen by SelectTCSwHWBarrierSupport.
    /// @param M- The original module.
    /// @return false when the function metadata does not describe exactly one
    /// function (nothing is changed in that case); true otherwise.
    bool LinkTessControlShaderMCF::runOnModule(llvm::Module& M)
    {
        mpModule = &M;

        MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        if (pMdUtils->size_FunctionsInfo() != 1)
        {
            return false;
        }

        mpMainFunction = pMdUtils->begin_FunctionsInfo()->first;
        assert(mpMainFunction);

        mpBuilder = new llvm::IGCIRBuilder<>(M.getContext());

        // Get the output control point count.
        llvm::GlobalVariable* pOCPCount = GetMainFunction()->getParent()->getGlobalVariable("HSOutputControlPointCount");
        // The global is dereferenced unconditionally below; make a missing or
        // uninitialized global fail loudly in debug builds.
        assert(pOCPCount && pOCPCount->hasInitializer() &&
            "HSOutputControlPointCount global is missing or has no initializer");
        mOutputControlPointCount = int_cast<uint32_t>(llvm::cast<llvm::ConstantInt>(pOCPCount->getInitializer())->getZExtValue());

        // Get the barriers count.
        FindBarriers(*mpMainFunction);

        if (SelectTCSwHWBarrierSupport())
        {
            TCSwHWBarriersSupport(pMdUtils);
        }
        else
        {
            // SW barriers support.
            BuildContinuationFunctions(*mpMainFunction);
            RebuildMainFunction();
        }

        // The builder is only needed while the module is being rewritten.
        // Release it here so repeated runs do not leak one builder per run, and
        // null the pointer so any later delete of it is a no-op.
        delete mpBuilder;
        mpBuilder = nullptr;

        return true;
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Factory function for the LinkTessControlShaderMCF pass.
    /// @return Pointer to a newly allocated pass instance.
    llvm::Pass* createLinkTessControlShaderMCF()
    {
        llvm::Pass* const pPass = new LinkTessControlShaderMCF();
        return pPass;
    }
}
