ignatenkobrain pushed to beignet (f22). "update to latest head and fix licensing issues (..more)"
notifications at fedoraproject.org
notifications at fedoraproject.org
Tue May 19 10:03:43 UTC 2015
From bbfccf3794ded8b19d0e05a8aefa85c7cf20b61c Mon Sep 17 00:00:00 2001
From: Igor Gnatenko <i.gnatenko.brain at gmail.com>
Date: Tue, 19 May 2015 12:42:33 +0300
Subject: update to latest head and fix licensing issues
Signed-off-by: Igor Gnatenko <i.gnatenko.brain at gmail.com>
diff --git a/0001-Remove-some-LGPL-incompatible-code.patch b/0001-Remove-some-LGPL-incompatible-code.patch
new file mode 100644
index 0000000..b0fb06d
--- /dev/null
+++ b/0001-Remove-some-LGPL-incompatible-code.patch
@@ -0,0 +1,1506 @@
+From 347116ef2855e73c25dc09c0a8ee1c7f58c46099 Mon Sep 17 00:00:00 2001
+From: Zhigang Gong <zhigang.gong at intel.com>
+Date: Tue, 19 May 2015 10:36:03 +0800
+Subject: [PATCH 1/9] Remove some LGPL incompatible code.
+
+Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
+---
+ backend/src/CMakeLists.txt | 2 -
+ backend/src/ir/structural_analysis.cpp | 1092 --------------------------------
+ backend/src/ir/structural_analysis.hpp | 342 ----------
+ backend/src/llvm/llvm_to_gen.cpp | 11 -
+ 4 files changed, 1447 deletions(-)
+ delete mode 100644 backend/src/ir/structural_analysis.cpp
+ delete mode 100644 backend/src/ir/structural_analysis.hpp
+
+diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
+index a6736ec..45f18d8 100644
+--- a/backend/src/CMakeLists.txt
++++ b/backend/src/CMakeLists.txt
+@@ -66,8 +66,6 @@ set (GBE_SRC
+ ir/lowering.hpp
+ ir/printf.cpp
+ ir/printf.hpp
+- ir/structural_analysis.cpp
+- ir/structural_analysis.hpp
+ ir/immediate.hpp
+ ir/immediate.cpp
+ backend/context.cpp
+diff --git a/backend/src/ir/structural_analysis.cpp b/backend/src/ir/structural_analysis.cpp
+deleted file mode 100644
+index 101570a..0000000
+--- a/backend/src/ir/structural_analysis.cpp
++++ /dev/null
+@@ -1,1092 +0,0 @@
+-/*
+- * structural_analysis.hpp
+- * This code is derived from the ControlTree.h and ControlTree.cpp of
+- * project gpuocelot by Yongjia Zhang.
+- * The original copyright of gpuocelot appears below in its entirety.
+- */
+-
+-/*
+- * Copyright 2011
+- * GEORGIA TECH RESEARCH CORPORATION
+- * ALL RIGHTS RESERVED
+- *
+- * Redistribution and use in source and binary forms, with or without
+- * modification, are permitted provided that the following conditions are
+- * met:
+- * * Redistributions of source code must retain the above copyright
+- * notice, this list of conditions and the following disclaimers.
+- * * Redistributions in binary form must reproduce the above copyright
+- * notice, this list of conditions and the following disclaimers in the
+- * documentation and/or other materials provided with the
+- * distribution.
+- * * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
+- * names of its contributors may be used to endorse or promote
+- * products derived from this software without specific prior
+- * written permission.
+- *
+- * THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
+- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
+- * CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- *
+- * You agree that the Software will not be shipped, transferred, exported,
+- * or re-exported directly into any country prohibited by the United States
+- * Export Administration Act and the regulations thereunder nor will be
+- * used for any purpose prohibited by the Act.
+- */
+-
+-
+-#include "structural_analysis.hpp"
+-
+-namespace analysis
+-{
+- ControlTree::~ControlTree()
+- {
+- NodeVector::iterator iter = nodes.begin();
+- NodeVector::iterator iter_end = nodes.end();
+- while(iter != iter_end)
+- {
+- delete *iter;
+- iter++;
+- }
+- }
+- void ControlTree::handleSelfLoopNode(Node *loopnode, ir::LabelIndex& whileLabel)
+- {
+- //NodeList::iterator child_iter = (*it)->children.begin();
+- ir::BasicBlock *pbb = loopnode->getExit();
+- GBE_ASSERT(pbb->isLoopExit);
+- ir::BasicBlock::iterator it = pbb->end();
+- it--;
+- if (pbb->hasExtraBra)
+- it--;
+- ir::BranchInstruction* pinsn = static_cast<ir::BranchInstruction *>(&*it);
+-
+- if(!pinsn->isPredicated()){
+- std::cout << "WARNING:" << "endless loop detected!" << std::endl;
+- return;
+- }
+- ir::Register reg = pinsn->getPredicateIndex();
+- /* since this node is an while node, so we remove the BRA instruction at the bottom of the exit BB of 'node',
+- * and insert WHILE instead
+- */
+- whileLabel = pinsn->getLabelIndex();
+- ir::Instruction insn = ir::WHILE(whileLabel, reg);
+- ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+- pbb->insertAt(it, *p_new_insn);
+- pbb->whileLabel = whileLabel;
+- pbb->erase(it);
+- }
+-
+- /* recursive mark the bbs' variable needEndif, the bbs all belong to node.*/
+- void ControlTree::markNeedIf(Node *node, bool status)
+- {
+- if(node->type() == BasicBlock)
+- {
+- ir::BasicBlock* bb = ((BasicBlockNode*)node)->getBasicBlock();
+- bb->needIf = status;
+- return;
+- }
+- NodeList::iterator it = node->children.begin();
+- while(it != node->children.end())
+- {
+- markNeedIf(*it,status);
+- it++;
+- }
+- }
+-
+- /* recursive mark the bbs' variable needIf, the bbs all belong to node.*/
+- void ControlTree::markNeedEndif(Node *node, bool status)
+- {
+- if(node->type() == BasicBlock)
+- {
+- ir::BasicBlock* bb = ((BasicBlockNode*)node)->getBasicBlock();
+- bb->needEndif = status;
+- return;
+- }
+-
+- NodeList::iterator it = node->children.begin();
+- while(it != node->children.end())
+- {
+- markNeedEndif(*it, status);
+- it++;
+- }
+- }
+-
+- /* recursive mark the bbs' variable mark, the bbs all belong to node. */
+- void ControlTree::markStructuredNodes(Node *node, bool status)
+- {
+- if(node->type() == BasicBlock)
+- {
+- BasicBlockNode* pbb = static_cast<BasicBlockNode *>(node);
+- pbb->getBasicBlock()->belongToStructure = true;
+- }
+- node->mark = status;
+- NodeList::iterator it = node->children.begin();
+- while(it != node->children.end())
+- {
+- markStructuredNodes(*it, status);
+- it++;
+- }
+- }
+-
+- void ControlTree::handleIfNode(Node *node, ir::LabelIndex& matchingEndifLabel, ir::LabelIndex& matchingElseLabel)
+- {
+- ir::BasicBlock *pbb = node->getExit();
+- ir::BranchInstruction* pinsn = static_cast<ir::BranchInstruction *>(pbb->getLastInstruction());
+- ir::Register reg = pinsn->getPredicateIndex();
+- ir::BasicBlock::iterator it = pbb->end();
+- it--;
+- /* since this node is an if node, so we remove the BRA instruction at the bottom of the exit BB of 'node',
+- * and insert IF instead
+- */
+- pbb->erase(it);
+- ir::Instruction insn = ir::IF(matchingElseLabel, reg, node->inversePredicate);
+- ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+- pbb->append(*p_new_insn);
+- pbb->matchingEndifLabel = matchingEndifLabel;
+- pbb->matchingElseLabel = matchingElseLabel;
+- }
+-
+- void ControlTree::handleThenNode(Node *node, ir::LabelIndex& endiflabel)
+- {
+- ir::BasicBlock *pbb = node->getExit();
+- ir::BasicBlock::iterator it = pbb->end();
+- it--;
+- ir::Instruction *p_last_insn = pbb->getLastInstruction();
+-
+- endiflabel = fn->newLabel();
+- //pbb->thisEndifLabel = endiflabel;
+-
+- ir::Instruction insn = ir::ENDIF(endiflabel);
+- ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+- // we need to insert ENDIF before the BRA(if exists).
+- bool append_bra = false;
+- if((*it).getOpcode() == ir::OP_BRA)
+- {
+- pbb->erase(it);
+- append_bra = true;
+- }
+- pbb->append(*p_new_insn);
+- if(append_bra)
+- pbb->append(*p_last_insn);
+- }
+-
+-
+- void ControlTree::handleThenNode2(Node *node, Node *elsenode, ir::LabelIndex elseBBLabel)
+- {
+- ir::BasicBlock *pbb = node->getExit();
+- ir::BasicBlock::iterator it = pbb->end();
+- it--;
+- if((*it).getOpcode() == ir::OP_BRA)
+- pbb->erase(it);
+-
+- if(node->getExit()->getNextBlock() == elsenode->getEntry())
+- return;
+-
+- // Add an unconditional jump to 'else' block
+- ir::Instruction insn = ir::BRA(elseBBLabel);
+- ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+- pbb->append(*p_new_insn);
+- }
+-
+-
+- void ControlTree::handleElseNode(Node* node, ir::LabelIndex& elselabel, ir::LabelIndex& endiflabel)
+- {
+- // to insert ENDIF properly
+- handleThenNode(node, endiflabel);
+-
+- ir::BasicBlock *pbb = node->getEntry();
+- ir::BasicBlock::iterator it = pbb->begin();
+- it++;
+-
+- elselabel = fn->newLabel();
+- pbb->thisElseLabel = elselabel;
+-
+- // insert ELSE properly
+- ir::Instruction insn = ir::ELSE(endiflabel);
+- ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+-
+- pbb->insertAt(it, *p_new_insn);
+- }
+-
+-
+- void ControlTree::handleStructuredNodes()
+- {
+- NodeVector::iterator it;
+- NodeVector::iterator end = nodes.end();
+- NodeVector::iterator begin = nodes.begin();
+- it = end;
+- it--;
+- NodeVector::reverse_iterator rit = nodes.rbegin();
+- /* structured bbs only need if and endif insn to handle the execution
+- * in structure entry and exit BasicBlock, so we process the nodes backward, since
+- * the node at the back of nodes is always a 'not smaller' structure then
+- * the ones before it. we mark the nodes which are sub-nodes of the node
+- * we are dealing with, in order to ensure we are always handling the 'biggest'
+- * structures */
+- while(rit != nodes.rend())
+- {
+- if((*rit)->type() == IfThen || (*rit)->type() == IfElse|| (*rit)->type() == SelfLoop)
+- {
+- if(false == (*rit)->mark && (*rit)->canBeHandled)
+- {
+- markStructuredNodes(*rit, true);
+- /* only the entry bb of this structure needs 'if' at backend and
+- * only the exit bb of this structure needs 'endif' at backend
+- * see comment about needEndif and needIf at function.hpp for detail. */
+- markNeedEndif(*rit, false);
+- markNeedIf(*rit, false);
+- ir::BasicBlock* entry = (*rit)->getEntry();
+- ir::BasicBlock* eexit = (*rit)->getExit();
+- entry->needIf = true;
+- eexit->needEndif = true;
+- entry->endifLabel = fn->newLabel();
+- eexit->endifLabel = entry->endifLabel;
+- eexit->isStructureExit = true;
+- eexit->matchingStructureEntry = entry;
+- }
+- }
+- rit++;
+- }
+-
+- rit = nodes.rbegin();
+- gbe::vector<ir::BasicBlock *> &blocks = fn->getBlocks();
+- std::vector<ir::BasicBlock *> bbs;
+- bbs.resize(blocks.size());
+-
+- /* here insert the bras to the BBs, which would
+- * simplify the reorder of basic blocks */
+- for(size_t i = 0; i < blocks.size(); ++i)
+- {
+- bbs[i] = blocks[i];
+- if(i != blocks.size() -1 &&
+- (bbs[i]->getLastInstruction()->getOpcode() != ir::OP_BRA ||
+- (bbs[i]->isStructureExit && bbs[i]->isLoopExit)))
+- {
+- ir::Instruction insn = ir::BRA(bbs[i]->getNextBlock()->getLabelIndex());
+- ir::Instruction* pNewInsn = bbs[i]->getParent().newInstruction(insn);
+- bbs[i]->append(*pNewInsn);
+- if (bbs[i]->isStructureExit && bbs[i]->isLoopExit)
+- bbs[i]->hasExtraBra = true;
+- }
+- }
+-
+- /* now, reorder the basic blocks to reduce the unconditional jump we inserted whose
+- * targets are the 'else' nodes. the algorithm is quite simple, just put the unstructured
+- * BBs(maybe belong to another structure, but not this one) in front of the entry BB of
+- * this structure in front of all the others and put the other unstructured BBs at the
+- * back of the others. the sequence of structured is get through function getStructureSequence.
+- */
+- while(rit != nodes.rend())
+- {
+- if(((*rit)->type() == IfThen || (*rit)->type() == IfElse || (*rit)->type() == Block ||(*rit)->type() == SelfLoop) &&
+- (*rit)->canBeHandled && (*rit)->mark == true)
+- {
+- markStructuredNodes(*rit, false);
+- std::set<int> ns = getStructureBasicBlocksIndex(*rit, bbs);
+- ir::BasicBlock *entry = (*rit)->getEntry();
+-
+- int entryIndex = *(ns.begin());
+- for(size_t i=0; i<bbs.size(); ++i)
+- {
+- if(bbs[i] == entry)
+- entryIndex = i;
+- }
+-
+- std::set<int>::iterator iter = ns.begin();
+- int index = *iter;
+-
+- std::vector<ir::BasicBlock *> unstruSeqHead;
+- std::vector<ir::BasicBlock *> unstruSeqTail;
+-
+- iter = ns.begin();
+- while(iter != ns.end())
+- {
+- if(index != *iter)
+- {
+- if(index < entryIndex)
+- unstruSeqHead.push_back(bbs[index]);
+- else
+- unstruSeqTail.push_back(bbs[index]);
+- index++;
+- }
+- else
+- {
+- index++;
+- iter++;
+- }
+- }
+-
+- std::vector<ir::BasicBlock *> struSeq;
+- getStructureSequence(*rit, struSeq);
+-
+- int firstindex = *(ns.begin());
+- for(size_t i = 0; i < unstruSeqHead.size(); ++i)
+- bbs[firstindex++] = unstruSeqHead[i];
+- for(size_t i = 0; i < struSeq.size(); ++i)
+- bbs[firstindex++] = struSeq[i];
+- for(size_t i = 0; i < unstruSeqTail.size(); ++i)
+- bbs[firstindex++] = unstruSeqTail[i];
+- }
+- rit++;
+- }
+-
+- /* now, erase the BRAs inserted before whose targets are their fallthrough blocks */
+- for(size_t i=0; i<bbs.size(); ++i)
+- {
+- if(bbs[i]->getLastInstruction()->getOpcode() == ir::OP_BRA &&
+- !((ir::BranchInstruction*)(bbs[i]->getLastInstruction()))->isPredicated())
+- {
+- if(((ir::BranchInstruction *)bbs[i]->getLastInstruction())->getLabelIndex() == bbs[i+1]->getLabelIndex())
+- {
+- ir::BasicBlock::iterator it= bbs[i]->end();
+- it--;
+-
+- bbs[i]->erase(it);
+-
+- if (bbs[i]->hasExtraBra)
+- bbs[i]->hasExtraBra = false;
+- }
+- }
+- }
+- for(size_t i=0; i<bbs.size(); ++i)
+- blocks[i] = bbs[i];
+-
+- fn->sortLabels();
+- fn->computeCFG();
+-
+- it = begin;
+- while(it != end)
+- {
+- if((*it)->canBeHandled)
+- {
+- switch((*it)->type())
+- {
+- case IfThen:
+- {
+- NodeList::iterator child_iter = (*it)->children.end();
+- ir::LabelIndex endiflabel;
+- child_iter--;
+- handleThenNode(*child_iter, endiflabel); // this call would pass out the proper endiflabel for handleIfNode's use.
+- child_iter--;
+- handleIfNode(*child_iter, endiflabel, endiflabel);
+- }
+- break;
+-
+- case IfElse:
+- {
+- NodeList::iterator child_iter = (*it)->children.end();
+- ir::LabelIndex endiflabel;
+- ir::LabelIndex elselabel;
+- NodeList::iterator else_node;
+- child_iter--;
+- else_node = child_iter;
+- handleElseNode(*child_iter, elselabel, endiflabel);
+- ir::LabelIndex elseBBLabel = (*child_iter)->getEntry()->getLabelIndex();
+- child_iter--;
+- handleThenNode2(*child_iter, *else_node, elseBBLabel);
+- child_iter--;
+- handleIfNode(*child_iter, endiflabel, elselabel);
+- }
+- break;
+-
+- case SelfLoop:
+- {
+- ir::LabelIndex whilelabel;
+- handleSelfLoopNode(*it, whilelabel);
+- }
+- break;
+-
+- default:
+- break;
+- }
+- }
+-
+- it++;
+- }
+-
+- }
+-
+- void ControlTree::getStructureSequence(Node *node, std::vector<ir::BasicBlock*> &seq)
+- {
+- /* in the control tree, for if-then, if node is before then node; for if-else, the
+- * stored sequence is if-then-else, for block structure, the stored sequence is just
+- * their executed sequence. so we could just get the structure sequence by recrusive
+- * calls getStructureSequence to all the elements in children one by one.
+- */
+- if(node->type() == BasicBlock)
+- {
+- seq.push_back(((BasicBlockNode *)node)->getBasicBlock());
+- return;
+- }
+-
+- NodeList::iterator iter = node->children.begin();
+- while(iter != node->children.end())
+- {
+- getStructureSequence(*iter, seq);
+- iter++;
+- }
+-
+- }
+-
+-
+- std::set<int> ControlTree::getStructureBasicBlocksIndex(Node* node, std::vector<ir::BasicBlock *> &bbs)
+- {
+- std::set<int> result;
+- if(node->type() == BasicBlock)
+- {
+- for(size_t i=0; i<bbs.size(); i++)
+- {
+- if(bbs[i] == ((BasicBlockNode *)node)->getBasicBlock())
+- {
+- result.insert(i);
+- break;
+- }
+- }
+- return result;
+- }
+- NodeList::iterator iter = (node->children).begin();
+- NodeList::iterator end = (node->children).end();
+- while(iter != end)
+- {
+- std::set<int> ret = getStructureBasicBlocksIndex(*iter, bbs);
+- result.insert(ret.begin(), ret.end());
+- iter++;
+- }
+- return result;
+- }
+-
+-
+- std::set<ir::BasicBlock *> ControlTree::getStructureBasicBlocks(Node *node)
+- {
+- std::set<ir::BasicBlock *> result;
+- if(node->type() == BasicBlock)
+- {
+- result.insert(((BasicBlockNode *)node)->getBasicBlock());
+- return result;
+- }
+- NodeList::iterator iter = (node->children).begin();
+- NodeList::iterator end = (node->children).end();
+- while(iter != end)
+- {
+- std::set<ir::BasicBlock *> ret = getStructureBasicBlocks(*iter);
+- result.insert(ret.begin(), ret.end());
+- iter++;
+- }
+- return result;
+- }
+-
+-
+- Node* ControlTree::insertNode(Node *p_node)
+- {
+- nodes.push_back(p_node);
+- return p_node;
+- }
+-
+-
+- bool ControlTree::checkForBarrier(const ir::BasicBlock* bb)
+- {
+- ir::BasicBlock::const_iterator iter = bb->begin();
+- ir::BasicBlock::const_iterator iter_end = bb->end();
+- while(iter != iter_end)
+- {
+- if((*iter).getOpcode() == ir::OP_SYNC)
+- return true;
+- iter++;
+- }
+-
+- return false;
+- }
+-
+-
+- void ControlTree::getLiveIn(ir::BasicBlock& bb, std::set<ir::Register>& livein)
+- {
+- ir::BasicBlock::iterator iter = bb.begin();
+- std::set<ir::Register> varKill;
+- while(iter != bb.end())
+- {
+- ir::Instruction& insn = *iter;
+- const uint32_t srcNum = insn.getSrcNum();
+- const uint32_t dstNum = insn.getDstNum();
+- for(uint32_t srcID = 0; srcID < srcNum; ++srcID)
+- {
+- const ir::Register reg = insn.getSrc(srcID);
+- if(varKill.find(reg) == varKill.end())
+- livein.insert(reg);
+- }
+- for(uint32_t dstID = 0; dstID < dstNum; ++dstID)
+- {
+- const ir::Register reg = insn.getDst(dstID);
+- varKill.insert(reg);
+- }
+-
+- iter++;
+- }
+- }
+-
+- void ControlTree::calculateNecessaryLiveout()
+- {
+- NodeVector::iterator iter = nodes.begin();
+-
+- while(iter != nodes.end())
+- {
+- switch((*iter)->type())
+- {
+- case IfElse:
+- {
+- std::set<ir::BasicBlock *> bbs;
+- NodeList::iterator thenIter = (*iter)->children.begin();
+- thenIter++;
+- bbs = getStructureBasicBlocks(*thenIter);
+-
+- Node *elseNode = *((*iter)->children.rbegin());
+- std::set<ir::Register> livein;
+- getLiveIn(*(elseNode->getEntry()), livein);
+-
+- std::set<ir::BasicBlock *>::iterator bbiter = bbs.begin();
+- while(bbiter != bbs.end())
+- {
+- (*bbiter)->liveout.insert(livein.begin(), livein.end());
+- bbiter++;
+- }
+- }
+-
+- default:
+- break;
+- }
+- iter++;
+- }
+- }
+-
+-
+- void ControlTree::initializeNodes()
+- {
+- ir::BasicBlock& tmp_bb = fn->getTopBlock();
+- ir::BasicBlock* p_tmp_bb = &tmp_bb;
+- Node* p = NULL;
+-
+- if(NULL != p_tmp_bb)
+- {
+- Node *p_tmp_node = new BasicBlockNode(p_tmp_bb);
+- p_tmp_node->label = p_tmp_bb->getLabelIndex();
+-
+- if(checkForBarrier(p_tmp_bb))
+- p_tmp_node->hasBarrier() = true;
+-
+- nodes.push_back(p_tmp_node);
+- bbmap[p_tmp_bb] = p_tmp_node;
+- p_tmp_bb = p_tmp_bb->getNextBlock();
+- p = p_tmp_node;
+- }
+-
+- while(p_tmp_bb != NULL)
+- {
+- Node *p_tmp_node = new BasicBlockNode(p_tmp_bb);
+- p_tmp_node->label = p_tmp_bb->getLabelIndex();
+-
+- if(checkForBarrier(p_tmp_bb))
+- p_tmp_node->hasBarrier() = true;
+-
+- p->fallthrough() = p_tmp_node;
+- p = p_tmp_node;
+- nodes.push_back(p_tmp_node);
+- bbmap[p_tmp_bb] = p_tmp_node;
+- p_tmp_bb = p_tmp_bb->getNextBlock();
+- }
+-
+- if(NULL != p)
+- p->fallthrough() = NULL;
+-
+- p_tmp_bb = &tmp_bb;
+-
+- this->nodes_entry = bbmap[p_tmp_bb];
+-
+- while(p_tmp_bb != NULL)
+- {
+- ir::BlockSet::const_iterator iter_begin = p_tmp_bb->getPredecessorSet().begin();
+- ir::BlockSet::const_iterator iter_end = p_tmp_bb->getPredecessorSet().end();
+- while(iter_begin != iter_end)
+- {
+- bbmap[p_tmp_bb]->preds().insert(bbmap[*iter_begin]);
+- iter_begin++;
+- }
+-
+- iter_begin = p_tmp_bb->getSuccessorSet().begin();
+- iter_end = p_tmp_bb->getSuccessorSet().end();
+- while(iter_begin != iter_end)
+- {
+- bbmap[p_tmp_bb]->succs().insert(bbmap[*iter_begin]);
+- iter_begin++;
+- }
+-
+- p_tmp_bb = p_tmp_bb->getNextBlock();
+- }
+- }
+-
+-
+- void ControlTree::DFSPostOrder(Node *start)
+- {
+- visited.insert(start);
+- NodeSet::iterator y;
+- NodeSet::iterator iter_begin = start->succs().begin();
+- NodeSet::iterator iter_end = start->succs().end();
+- for(y = iter_begin; y != iter_end; ++y )
+- {
+- if(visited.find(*y) != visited.end())
+- continue;
+- DFSPostOrder(*y);
+- }
+- post_order.push_back(start);
+- }
+-
+-
+- bool ControlTree::isCyclic(Node* node)
+- {
+- if(node->type() == NaturalLoop ||
+- node->type() == WhileLoop ||
+- node->type() == SelfLoop)
+- return true;
+-
+- return false;
+- }
+-
+-
+- bool ControlTree::isBackedge(const Node* head, const Node* tail)
+- {
+- const Node* match[] = {head, tail};
+- NodeList::iterator n = find_first_of(post_order.begin(), post_order.end(), match, match + 2);
+-
+- if(*n == head)
+- return true;
+- if(*n == tail)
+- return false;
+-
+- return false;
+- }
+-
+-
+- /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+- Node* ControlTree::acyclicRegionType(Node* node, NodeSet& nset)
+- {
+- nset.clear();
+- Node *n;
+- bool p, s, barrier;
+- NodeList nodes;
+-
+- n = node;
+- p = true;
+- s = (n->succs().size()==1);
+- barrier = n->hasBarrier();
+- while(p && s && !barrier)
+- {
+- if(nset.insert(n).second)
+- nodes.push_back(n);
+- n = *(n->succs().begin());
+- barrier = n->hasBarrier();
+- p = (n->preds().size() == 1);
+- s = (n->succs().size() == 1);
+- }
+-
+- if(p && !barrier)
+- {
+- if(nset.insert(n).second)
+- nodes.push_back(n);
+- }
+-
+- n = node;
+- p = (n->preds().size() == 1);
+- s = true;
+- barrier = n->hasBarrier();
+-
+- while(p && s && !barrier)
+- {
+- if(nset.insert(n).second)
+- nodes.push_front(n);
+- n = *(n->preds().begin());
+- barrier = n->hasBarrier();
+- p = (n->preds().size() == 1);
+- s = (n->succs().size() == 1);
+- }
+-
+- if(s && !barrier)
+- {
+- if(nset.insert(n).second)
+- nodes.push_front(n);
+- }
+-
+- node = n;
+-
+- if(nodes.size() >=2 )
+- {
+- Node* p = new BlockNode(nodes);
+- NodeList::iterator iter = nodes.begin();
+- while(iter != nodes.end())
+- {
+- if((*iter)->canBeHandled == false)
+- {
+- p->canBeHandled = false;
+- break;
+- }
+- iter++;
+- }
+-
+- return insertNode(p);
+- }
+-
+- else if(node->succs().size() == 2)
+- {
+- Node *m;
+- m = *(node->succs().begin());
+- n = *(++(node->succs().begin()));
+-
+- /* check for if node then n */
+- if( n->succs().size() == 1 &&
+- n->preds().size() == 1 &&
+- *(n->succs().begin()) == m &&
+- !n->hasBarrier() && !node->hasBarrier())
+- {
+- nset.clear();
+- nset.insert(node);
+- nset.insert(n);
+-
+- Node* p = new IfThenNode(node, n);
+- if(node->fallthrough() == m)
+- node->inversePredicate = false;
+-
+- if(node->canBeHandled == false || n->canBeHandled == false)
+- p->canBeHandled = false;
+-
+- return insertNode(p);
+- }
+-
+- /* check for if node then m */
+- if(m->succs().size() == 1 &&
+- m->preds().size() == 1 &&
+- *(m->succs().begin()) == n &&
+- !m->hasBarrier() && !node->hasBarrier())
+- {
+- nset.clear();
+- nset.insert(node);
+- nset.insert(m);
+-
+- Node* p = new IfThenNode(node, m);
+- if(node->fallthrough() == n)
+- node->inversePredicate = false;
+-
+- if(node->canBeHandled == false || m->canBeHandled == false)
+- p->canBeHandled = false;
+-
+- return insertNode(p);
+- }
+-
+- /* check for if node then n else m */
+- if(m->succs().size() == 1 && n->succs().size() == 1 &&
+- m->preds().size() == 1 && n->preds().size() == 1 &&
+- *(m->succs().begin()) == *(n->succs().begin()) &&
+- node->fallthrough() == n && !m->hasBarrier() && !n->hasBarrier() && !node->hasBarrier())
+- {
+- nset.clear();
+- nset.insert(node);
+- nset.insert(n);
+- nset.insert(m);
+-
+- Node* p = new IfElseNode(node, n, m);
+-
+- if(node->canBeHandled == false ||
+- m->canBeHandled == false ||
+- n->canBeHandled == false)
+- p->canBeHandled = false;
+-
+- return insertNode(p);
+- }
+-
+- /* check for if node then m else n */
+- if(m->succs().size() == 1 && n->succs().size() == 1 &&
+- m->preds().size() == 1 && n->preds().size() == 1 &&
+- *(m->succs().begin()) == *(n->succs().begin()) &&
+- node->fallthrough() == m && !m->hasBarrier() && !n->hasBarrier() &&!node->hasBarrier())
+- {
+- nset.clear();
+- nset.insert(node);
+- nset.insert(m);
+- nset.insert(n);
+-
+- Node* p = new IfElseNode(node, m, n);
+-
+- if(node->canBeHandled == false ||
+- m->canBeHandled == false ||
+- n->canBeHandled == false)
+- p->canBeHandled = false;
+- return insertNode(p);
+- }
+- }
+-
+- return NULL;
+- }
+-
+- bool ControlTree::path(Node *from, Node *to, Node *notthrough)
+- {
+-
+- if(from == notthrough || visited.find(from) != visited.end())
+- return false;
+-
+- if(from == to)
+- return true;
+-
+- visited.insert(from);
+-
+- for(NodeSet::const_iterator s = from->succs().begin(); s != from->succs().end(); s++)
+- {
+- if(path(*s, to, notthrough))
+- return true;
+- }
+-
+- return false;
+- }
+-
+-
+- Node * ControlTree::cyclicRegionType(Node *node, NodeList &nset)
+- {
+- /* check for self-loop */
+- if(nset.size() == 1)
+- {
+- if(node->succs().find(node) != node->succs().end())
+- {
+- Node* p = new SelfLoopNode(node);
+-
+- p->canBeHandled = true;
+- node->getExit()->isLoopExit = true;
+- return insertNode(p);
+- }
+- else
+- return NULL;
+- }
+-
+- //FIXME: as our IR could only handle self loop, the while loop node
+- //is disabled to avoid performace regression by the path function.
+-#if 0
+- /* check for improper region */
+- for(NodeList::const_iterator m = nset.begin(); m != nset.end(); m++)
+- {
+- visited.clear();
+- if(!path(node, *m))
+- return NULL;
+- }
+-
+- /* check for while loop */
+- NodeList::iterator m;
+- for(m = nset.begin(); m != nset.end(); ++m)
+- {
+- if(*m == node)
+- continue;
+- if(node->succs().size() == 2 && (*m)->succs().size() == 1 &&
+- node->preds().size() == 2 && (*m)->preds().size() == 1)
+- {
+- Node* p = new WhileLoopNode(node, *m);
+-
+- p->canBeHandled = false;
+-
+- return insertNode(p);
+- }
+- }
+-#endif
+-
+- return NULL;
+- }
+-
+-
+- /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+- void ControlTree::reduce(Node* node, NodeSet nodeSet)
+- {
+- NodeSet::iterator n;
+- for(n = nodeSet.begin(); n != nodeSet.end(); n++)
+- {
+- NodeSet::iterator p;
+- for(p = (*n)->preds().begin(); p != (*n)->preds().end(); p++)
+- {
+- if(nodeSet.find(*p) != nodeSet.end())
+- continue;
+-
+- (*p)->succs().erase(*n);
+-
+- (*p)->succs().insert(node);
+- node->preds().insert(*p);
+-
+- if((*p)->fallthrough() == *n)
+- (*p)->fallthrough() = node;
+- }
+-
+-
+- NodeSet::iterator s;
+- for(s = (*n)->succs().begin(); s != (*n)->succs().end(); s++)
+- {
+- if(nodeSet.find(*s) != nodeSet.end())
+- continue;
+-
+- (*s)->preds().erase(*n);
+-
+- (*s)->preds().insert(node);
+- node->succs().insert(*s);
+-
+- if((*n)->fallthrough() == *s)
+- node->fallthrough() = *s;
+- }
+- }
+-
+- if(!isCyclic(node))
+- {
+- for(n = nodeSet.begin(); n != nodeSet.end(); n++)
+- {
+- bool shouldbreak = false;
+- NodeSet::iterator p;
+- for(p = (*n)->preds().begin(); p != (*n)->preds().end(); p++)
+- {
+- if(nodeSet.find(*p) == nodeSet.end())
+- continue;
+-
+- if(isBackedge(*p, *n))
+- {
+- node->preds().insert(node);
+- node->succs().insert(node);
+-
+- shouldbreak = true;
+- break;
+- }
+- }
+-
+- if(shouldbreak)
+- break;
+- }
+- }
+-
+- compact(node, nodeSet);
+- }
+-
+-
+- /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+- void ControlTree::compact(Node* node, NodeSet nodeSet)
+- {
+- NodeList::iterator n, pos;
+- for(n = post_order.begin(); n!= post_order.end() && !nodeSet.empty();)
+- {
+- if(!nodeSet.erase(*n))
+- {
+- n++;
+- continue;
+- }
+-
+- n = post_order.erase(n);
+- pos = n;
+- }
+-
+- post_ctr = post_order.insert(pos, node);
+- }
+-
+-
+- /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+- void ControlTree::structuralAnalysis(Node *entry)
+- {
+- Node* n;
+- NodeSet nset;
+- NodeList reachUnder;
+- bool changed;
+- do
+- {
+- changed = false;
+- post_order.clear();
+- visited.clear();
+-
+- DFSPostOrder(entry);
+- post_ctr = post_order.begin();
+-
+- while(post_order.size() > 1 && post_ctr != post_order.end())
+- {
+- n = *post_ctr;
+- Node* region = acyclicRegionType(n, nset);
+-
+- if( NULL != region)
+- {
+- changed = true;
+-
+- reduce(region, nset);
+-
+- if(nset.find(entry) != nset.end())
+- entry = region;
+- }
+- else
+- {
+- reachUnder.clear();
+- nset.clear();
+-
+- //reuse the loop info from llvm gaterLoopInfo.
+- const gbe::vector<ir::Loop *> &loops = fn->getLoops();
+- if(loops.size() == 0){
+- post_ctr++;
+- continue;
+- }
+-
+- Node* loop_header = NULL;
+- //if n is basic block node, query the llvm loop info to find the loop whoose loop header is n;
+- if(n->type() == BasicBlock){
+- for (auto l : loops) {
+- ir::BasicBlock &a = fn->getBlock(l->bbs[0]);
+- loop_header = bbmap.find(&a)->second;
+-
+- if(loop_header == n){
+- for (auto bb : l->bbs) {
+- ir::BasicBlock &tmp = fn->getBlock(bb);
+- Node* node_ = bbmap.find(&tmp)->second;
+- reachUnder.push_front(node_);
+- nset.insert(node_);
+- }
+- break;
+- }
+- }
+- }else{
+- //n is compacted node, it would have a successor pointed to itself for self loop.
+- if(n->succs().find(n) != n->succs().end())
+- {
+- reachUnder.push_front(n);
+- nset.insert(n);
+- }
+- }
+-
+- region = cyclicRegionType(n, reachUnder);
+-
+- if(NULL != region)
+- {
+- reduce(region, nset);
+- changed = true;
+-
+- if(nset.find(entry) != nset.end())
+- entry = region;
+- }
+- else
+- {
+- post_ctr++;
+- }
+- }
+- }
+-
+- if(!changed)
+- break;
+-
+- } while(post_order.size()>1);
+-
+- }
+-
+- void ControlTree::analyze()
+- {
+- initializeNodes();
+- structuralAnalysis(nodes_entry);
+- handleStructuredNodes();
+- calculateNecessaryLiveout();
+- }
+-}
+diff --git a/backend/src/ir/structural_analysis.hpp b/backend/src/ir/structural_analysis.hpp
+deleted file mode 100644
+index 7aaa533..0000000
+--- a/backend/src/ir/structural_analysis.hpp
++++ /dev/null
+@@ -1,342 +0,0 @@
+-/*
+- * structural_analysis.hpp
+- * This code is derived from the ControlTree.h and ControlTree.cpp of
+- * project gpuocelot by Yongjia Zhang.
+- * The original copyright of gpuocelot appears below in its entirety.
+- */
+-
+-/*
+- * Copyright 2011
+- * GEORGIA TECH RESEARCH CORPORATION
+- * ALL RIGHTS RESERVED
+- *
+- * Redistribution and use in source and binary forms, with or without
+- * modification, are permitted provided that the following conditions are
+- * met:
+- * * Redistributions of source code must retain the above copyright
+- * notice, this list of conditions and the following disclaimers.
+- * * Redistributions in binary form must reproduce the above copyright
+- * notice, this list of conditions and the following disclaimers in the
+- * documentation and/or other materials provided with the
+- * distribution.
+- * * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
+- * names of its contributors may be used to endorse or promote
+- * products derived from this software without specific prior
+- * written permission.
+- *
+- * THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
+- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
+- * CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+- *
+- * You agree that the Software will not be shipped, transferred, exported,
+- * or re-exported directly into any country prohibited by the United States
+- * Export Administration Act and the regulations thereunder nor will be
+- * used for any purpose prohibited by the Act.
+- */
+-
+-
+-#ifndef __STRUCTURAL_ANALYSIS_HPP__
+-#define __STRUCTURAL_ANALYSIS_HPP__
+-
+-#include "ir/unit.hpp"
+-#include "ir/function.hpp"
+-#include "ir/instruction.hpp"
+-
+-#include <iostream>
+-#include <unordered_set>
+-#include <unordered_map>
+-#include <vector>
+-#include <map>
+-#include <list>
+-#include <algorithm>
+-#include <set>
+-#define TRANSFORM_UNSTRUCTURE
+-
+-namespace analysis
+-{
+- using namespace std;
+- using namespace gbe;
+-
+- enum RegionType
+- {
+- BasicBlock = 0,
+- Block,
+- IfThen,
+- IfElse,
+- SelfLoop,
+- WhileLoop,
+- NaturalLoop
+- } ;
+-
+- /* control tree virtual node */
+- class Node;
+-
+- typedef unordered_set<Node *> NodeSet;
+- typedef list<Node *> NodeList;
+- typedef std::vector<Node *> NodeVector;
+-
+- /* control tree virtual node */
+- class Node
+- {
+- public:
+- Node(RegionType rtype, const NodeList& children): has_barrier(false), mark(false), canBeHandled(true), inversePredicate(true)
+- {
+- this->rtype = rtype;
+- this->children = children;
+- }
+- virtual ~Node() {}
+- NodeSet& preds() { return pred; }
+- NodeSet& succs() { return succ; }
+- Node*& fallthrough() { return fall_through; }
+- bool& hasBarrier() { return has_barrier; }
+- RegionType type() { return rtype; }
+- virtual ir::BasicBlock* getEntry()
+- {
+- return (*(children.begin()))->getEntry();
+- }
+- virtual ir::BasicBlock* getExit()
+- {
+- return (*(children.rbegin()))->getExit();
+- }
+-
+- public:
+- RegionType rtype;
+- NodeSet pred;
+- NodeSet succ;
+- NodeList children;
+- Node* fall_through;
+- bool has_barrier;
+- bool mark;
+- bool canBeHandled;
+- //label is for debug
+- int label;
+- /* inversePredicate should be false under two circumstance,
+- * fallthrough is the same with succs:
+- * (1) n->succs == m && node->fallthrough == m
+- * node
+- * | \
+- * | \
+- * m<--n
+- * (2) m->succs == n && node->fallthrough == n
+- * node
+- * | \
+- * | \
+- * m-->n
+- * */
+- bool inversePredicate;
+- };
+-
+- /* represents basic block */
+- class BasicBlockNode : public Node
+- {
+- public:
+- BasicBlockNode(ir::BasicBlock *p_bb) : Node(BasicBlock, NodeList()) { this->p_bb = p_bb; }
+- virtual ~BasicBlockNode() {}
+- ir::BasicBlock* getBasicBlock() { return p_bb; }
+- virtual ir::BasicBlock* getEntry() { return p_bb; }
+- virtual ir::BasicBlock* getExit() { return p_bb; }
+- virtual ir::BasicBlock* getFirstBB() { return p_bb; }
+- private:
+- ir::BasicBlock *p_bb;
+- };
+-
+- /* a sequence of nodes */
+- class BlockNode : public Node
+- {
+- public:
+- BlockNode(NodeList& children) : Node(Block, children) {}
+- virtual ~BlockNode(){}
+- };
+-
+- /* If-Then structure node */
+- class IfThenNode : public Node
+- {
+- public:
+- IfThenNode(Node* cond, Node* ifTrue) : Node(IfThen, BuildChildren(cond, ifTrue)) {}
+- virtual ~IfThenNode() {}
+-
+- private:
+- const NodeList BuildChildren(Node* cond, Node* ifTrue)
+- {
+- NodeList children;
+- children.push_back(cond);
+- children.push_back(ifTrue);
+- return children;
+- }
+- };
+-
+- /* If-Else structure node */
+- class IfElseNode : public Node
+- {
+- public:
+- IfElseNode(Node* cond, Node* ifTrue, Node* ifFalse) : Node(IfElse, BuildChildren(cond, ifTrue, ifFalse)) {}
+- virtual ~IfElseNode() {}
+-
+- private:
+- const NodeList BuildChildren(Node* cond, Node* ifTrue, Node* ifFalse)
+- {
+- NodeList children;
+- children.push_back(cond);
+- children.push_back(ifTrue);
+- children.push_back(ifFalse);
+- return children;
+- }
+- };
+-
+- /* Self loop structure node */
+- class SelfLoopNode : public Node
+- {
+- public:
+- SelfLoopNode(Node* node) : Node(SelfLoop, BuildChildren(node)) {}
+- virtual ~SelfLoopNode() {}
+- virtual ir::BasicBlock* getEntry()
+- {
+- return (*(children.begin()))->getEntry();
+- }
+- virtual ir::BasicBlock* getExit()
+- {
+- return (*(children.begin()))->getExit();
+- }
+-
+- private:
+- const NodeList BuildChildren(Node *node)
+- {
+- NodeList children;
+- children.push_back(node);
+- return children;
+- }
+- };
+-
+- /* While loop structure node */
+- class WhileLoopNode : public Node
+- {
+- public:
+- WhileLoopNode(Node* cond, Node* execute) : Node(WhileLoop, BuildChildren(cond, execute)) {}
+- virtual ~WhileLoopNode() {}
+- virtual ir::BasicBlock* getEntry()
+- {
+- return (*(children.begin()))->getEntry();
+- }
+- virtual ir::BasicBlock* getExit()
+- {
+- return (*(children.begin()))->getExit();
+- }
+-
+- private:
+- const NodeList BuildChildren(Node* cond, Node* execute)
+- {
+- NodeList children;
+- children.push_back(cond);
+- children.push_back(execute);
+- return children;
+- }
+-
+- };
+-
+- /* Natural loop structure node */
+- class NaturalLoopNode : public Node
+- {
+- public:
+- NaturalLoopNode(const NodeList& children): Node(NaturalLoop, children){}
+- virtual ~NaturalLoopNode() {}
+- virtual ir::BasicBlock* getEntry()
+- {
+- //TODO implement it
+- return NULL;
+- }
+- virtual ir::BasicBlock* getExit()
+- {
+- //TODO implement it
+- return NULL;
+- }
+- };
+-
+- /* computes the control tree, and do the structure identification during the computation */
+- class ControlTree
+- {
+- public:
+- void analyze();
+-
+- ControlTree(ir::Function* fn) { this->fn = fn; }
+- ~ControlTree();
+-
+- private:
+- /* create a sequence of BasicBlockNodes, which are refered to the basic blocks in the function */
+- void initializeNodes();
+- /* insert a new Node, and returns the pointer of the node */
+- Node* insertNode(Node *);
+- /* do the structural analysis */
+- void structuralAnalysis(Node * entry);
+- /* do the dfs post order traverse of the current CFG */
+- void DFSPostOrder(Node *start);
+- /* returns true if there is a (possibly empty) path from m to k that does not pass through n */
+- bool path(Node *m, Node *k, Node *n = NULL);
+- /* link region node into abstract flowgraph, adjust the predecessor and successor functions, and augment the control tree */
+- void reduce(Node* node, NodeSet nodeSet);
+- /* adds node to the control tree, inserts node into _post
+- * at the highest-numbered position of a node in nodeSet, removes
+- * the nodes in nodeSet from _post, compacts the remaining nodes at
+- * the beginning of _post, and sets _postCtr to the index of node
+- * in the resulting postorder */
+- void compact(Node* node, NodeSet nodeSet);
+- Node* getNodesEntry() const { return nodes_entry;}
+- /* determines whether node is the entry node of an acyclic
+- * control structure and returns its region. Stores in nset the set
+- * of nodes in the identified control structure */
+- Node* acyclicRegionType(Node* node, NodeSet& nset);
+- /* determines whether node is the entry node of a cyclic
+- * control structure and returns its region. Stores in nset the set
+- * of nodes in the identified control structure */
+- Node* cyclicRegionType(Node*, NodeList&);
+- /* is this a cyclic region? */
+- bool isCyclic(Node*);
+- /* is this a back edge? */
+- bool isBackedge(const Node*, const Node*);
+- /* check if there is a barrier in a basic block */
+- bool checkForBarrier(const ir::BasicBlock*);
+- /* insert while instruction at the proper position of Node */
+- void handleSelfLoopNode(Node *, ir::LabelIndex&);
+- /* mark all the BasicBlockNodes of the control tree node n as status */
+- void markStructuredNodes(Node *n, bool status);
+- /* mark all the ir::BasicBlocks' needEndIf of n as status */
+- void markNeedEndif(Node * n, bool status);
+- /* mark all the ir::BasicBlocks' needIf of n as status */
+- void markNeedIf(Node *, bool);
+- /* insert IF instruction at the proper position of Node */
+- void handleIfNode(Node *, ir::LabelIndex&, ir::LabelIndex&);
+- /* insert ENDIF instruction at the proper position of Node, this Node is
+- the 'then' node of identified if-then structure */
+- void handleThenNode(Node *, ir::LabelIndex&);
+- /* handle the then node of identified if-else structure */
+- void handleThenNode2(Node *, Node *, ir::LabelIndex);
+- /* insert ELSE instruction at the proper position of Node */
+- void handleElseNode(Node *, ir::LabelIndex&, ir::LabelIndex&);
+- /* this calls other functions to finish the handling of identified structure blocks */
+- void handleStructuredNodes();
+- std::set<int> getStructureBasicBlocksIndex(Node *, std::vector<ir::BasicBlock *> &);
+- std::set<ir::BasicBlock *> getStructureBasicBlocks(Node*);
+- /* get livein of bb */
+- void getLiveIn(ir::BasicBlock& bb, std::set<ir::Register>& livein);
+- /* see comment of BasicBlock::liveout in function.hpp for detail. */
+- void calculateNecessaryLiveout();
+- /* get the exectutive sequence of structure n */
+- void getStructureSequence(Node* n, std::vector<ir::BasicBlock*> &v);
+- private:
+- ir::Function *fn;
+- NodeVector nodes;
+- Node* nodes_entry;
+- unordered_map<ir::BasicBlock *, Node *> bbmap;
+- NodeList post_order;
+- NodeSet visited;
+- NodeList::iterator post_ctr;
+- };
+-}
+-#endif
+diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
+index a4ce4a2..3e11fdb 100644
+--- a/backend/src/llvm/llvm_to_gen.cpp
++++ b/backend/src/llvm/llvm_to_gen.cpp
+@@ -61,7 +61,6 @@
+ #include "sys/cvar.hpp"
+ #include "sys/platform.hpp"
+ #include "ir/unit.hpp"
+-#include "ir/structural_analysis.hpp"
+
+ #include <clang/CodeGen/CodeGenAction.h>
+
+@@ -308,16 +307,6 @@ namespace gbe
+ // Print the code extra optimization passes
+ OUTPUT_BITCODE(AFTER_GEN, mod);
+
+- const ir::Unit::FunctionSet& fs = unit.getFunctionSet();
+- ir::Unit::FunctionSet::const_iterator iter = fs.begin();
+- while(iter != fs.end())
+- {
+- analysis::ControlTree *ct = new analysis::ControlTree(iter->second);
+- ct->analyze();
+- delete ct;
+- iter++;
+- }
+-
+ delete libraryInfo;
+ return true;
+ }
+--
+2.4.1
+
diff --git a/0002-GBE-Fix-the-immediate-data-type.patch b/0002-GBE-Fix-the-immediate-data-type.patch
new file mode 100644
index 0000000..27d7582
--- /dev/null
+++ b/0002-GBE-Fix-the-immediate-data-type.patch
@@ -0,0 +1,62 @@
+From ae59e9fd4d2777eb5a2990060d70870597f7ce05 Mon Sep 17 00:00:00 2001
+From: Ruiling Song <ruiling.song at intel.com>
+Date: Thu, 30 Apr 2015 11:49:47 +0800
+Subject: [PATCH 2/9] GBE: Fix the immediate data type
+
+Signed-off-by: Ruiling Song <ruiling.song at intel.com>
+Reviewed-by: Zhigang Gong <zhigang.gong at intel.com>
+---
+ backend/src/ir/immediate.hpp | 12 ++++++------
+ backend/src/llvm/llvm_gen_backend.cpp | 2 +-
+ 2 files changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
+index 6b27e8b..ff37a29 100644
+--- a/backend/src/ir/immediate.hpp
++++ b/backend/src/ir/immediate.hpp
+@@ -130,11 +130,11 @@ namespace ir {
+ DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8)
+ DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8)
+ DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16)
+- DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16)
++ DECL_CONSTRUCTOR(uint16_t, u16, TYPE_U16)
+ DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32)
+- DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32)
++ DECL_CONSTRUCTOR(uint32_t, u32, TYPE_U32)
+ DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64)
+- DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64)
++ DECL_CONSTRUCTOR(uint64_t, u64, TYPE_U64)
+ DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT)
+ DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE)
+ #undef DECL_CONSTRUCTOR
+@@ -155,11 +155,11 @@ namespace ir {
+ DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8, elemNum)
+ DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8, elemNum)
+ DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16, elemNum)
+- DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16, elemNum)
++ DECL_CONSTRUCTOR(uint16_t, u16, TYPE_U16, elemNum)
+ DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32, elemNum)
+- DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32, elemNum)
++ DECL_CONSTRUCTOR(uint32_t, u32, TYPE_U32, elemNum)
+ DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64, elemNum)
+- DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64, elemNum)
++ DECL_CONSTRUCTOR(uint64_t, u64, TYPE_U64, elemNum)
+ DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT, elemNum)
+ DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
+ #undef DECL_CONSTRUCTOR
+diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
+index 0487bcb..c2c016c 100644
+--- a/backend/src/llvm/llvm_gen_backend.cpp
++++ b/backend/src/llvm/llvm_gen_backend.cpp
+@@ -3546,7 +3546,7 @@ namespace gbe
+ if (step != 0) {
+ ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+ ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+- ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
++ ctx.LOADI(ir::TYPE_U32, stepReg, stepImm);
+ ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+ ctx.getFunction().pushStackSize(step);
+ }
+--
+2.4.1
+
diff --git a/0003-correct-the-src-output-of-alu3-when-OCL_OUTPUT_ASM-1.patch b/0003-correct-the-src-output-of-alu3-when-OCL_OUTPUT_ASM-1.patch
new file mode 100644
index 0000000..11c94fd
--- /dev/null
+++ b/0003-correct-the-src-output-of-alu3-when-OCL_OUTPUT_ASM-1.patch
@@ -0,0 +1,54 @@
+From 0b10bc0dbd201b892299b897ef5038dc3c52c724 Mon Sep 17 00:00:00 2001
+From: Guo Yejun <yejun.guo at intel.com>
+Date: Mon, 4 May 2015 16:47:21 +0800
+Subject: [PATCH 3/9] correct the src output of alu3 when OCL_OUTPUT_ASM=1
+
+Signed-off-by: Guo Yejun <yejun.guo at intel.com>
+Reviewed-by: "Song, Ruiling" <ruiling.song at intel.com>
+---
+ backend/src/backend/gen/gen_mesa_disasm.c | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
+index 4822de3..2615276 100644
+--- a/backend/src/backend/gen/gen_mesa_disasm.c
++++ b/backend/src/backend/gen/gen_mesa_disasm.c
+@@ -844,7 +844,10 @@ static int src0_3src(FILE *file, const void* inst)
+ return 0;
+ if (GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr))
+ format(file, ".%d", GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr));
+- string(file, "<4,1,1>");
++ if (GEN_BITS_FIELD(inst, bits2.da3src.src0_rep_ctrl))
++ string(file, "<0,1,0>");
++ else
++ string(file, "<8,8,1>");
+ err |= control(file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+@@ -889,7 +892,10 @@ static int src1_3src(FILE *file, const void* inst)
+ return 0;
+ if (src1_subreg_nr)
+ format(file, ".%d", src1_subreg_nr);
+- string(file, "<4,1,1>");
++ if (GEN_BITS_FIELD(inst, bits2.da3src.src1_rep_ctrl))
++ string(file, "<0,1,0>");
++ else
++ string(file, "<8,8,1>");
+ err |= control(file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+@@ -931,7 +937,10 @@ static int src2_3src(FILE *file, const void* inst)
+ return 0;
+ if (GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr))
+ format(file, ".%d", GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr));
+- string(file, "<4,1,1>");
++ if (GEN_BITS_FIELD(inst, bits3.da3src.src2_rep_ctrl))
++ string(file, "<0,1,0>");
++ else
++ string(file, "<8,8,1>");
+ err |= control(file, "src da16 reg type", reg_encoding,
+ GEN_TYPE_F, NULL);
+ /*
+--
+2.4.1
+
diff --git a/0004-Add-a-sanity-test-in-clGetDeviceIDs.patch b/0004-Add-a-sanity-test-in-clGetDeviceIDs.patch
new file mode 100644
index 0000000..a0d77a4
--- /dev/null
+++ b/0004-Add-a-sanity-test-in-clGetDeviceIDs.patch
@@ -0,0 +1,134 @@
+From a7f3944cf5437e7648b43d4aaba8f48616d5dcfc Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Sat, 16 May 2015 18:48:37 +0100
+Subject: [PATCH 4/9] Add a sanity test in clGetDeviceIDs
+
+Run a small __local-using kernel in clGetDeviceIDs; if this returns
+the wrong result, return CL_DEVICE_NOT_FOUND.
+
+As far as I can see, there's no way to tell in advance (except
+unreliably with a global version check) whether __local-using batches
+will be accepted...so the easiest solution is probably to just try
+running one and see what result we get.
+
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: "Luo, Xionghu" <xionghu.luo at intel.com>
+Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ src/cl_device_id.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ utests/setenv.sh.in | 2 ++
+ 2 files changed, 84 insertions(+)
+
+diff --git a/src/cl_device_id.c b/src/cl_device_id.c
+index 0fd4a69..0c1c52f 100644
+--- a/src/cl_device_id.c
++++ b/src/cl_device_id.c
+@@ -436,6 +436,74 @@ brw_gt3_break:
+ return ret;
+ }
+
++/* Runs a small kernel to check that the device works; returns
++ * 0 for success, 1 for silently wrong result, 2 for error */
++LOCAL cl_int
++cl_self_test(cl_device_id device)
++{
++ cl_int status, ret;
++ cl_context ctx;
++ cl_command_queue queue;
++ cl_program program;
++ cl_kernel kernel;
++ cl_mem buffer;
++ cl_event kernel_finished;
++ size_t n = 3;
++ cl_int test_data[3] = {3, 7, 5};
++ const char* kernel_source = "__kernel void self_test(__global int *buf) {"
++ " __local int tmp[3];"
++ " tmp[get_local_id(0)] = buf[get_local_id(0)];"
++ " barrier(CLK_LOCAL_MEM_FENCE);"
++ " buf[get_global_id(0)] = tmp[2 - get_local_id(0)] + buf[get_global_id(0)];"
++ "}"; // using __local to catch the "no SLM on Haswell" problem
++ ret = 2;
++ ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
++ if (status == CL_SUCCESS) {
++ queue = clCreateCommandQueue(ctx, device, 0, &status);
++ if (status == CL_SUCCESS) {
++ program = clCreateProgramWithSource(ctx, 1, &kernel_source, NULL, &status);
++ if (status == CL_SUCCESS) {
++ status = clBuildProgram(program, 1, &device, "", NULL, NULL);
++ if (status == CL_SUCCESS) {
++ kernel = clCreateKernel(program, "self_test", &status);
++ if (status == CL_SUCCESS) {
++ buffer = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n*4, test_data, &status);
++ if (status == CL_SUCCESS) {
++ status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
++ if (status == CL_SUCCESS) {
++ status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, &n, 0, NULL, &kernel_finished);
++ if (status == CL_SUCCESS) {
++ status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n*4, test_data, 1, &kernel_finished, NULL);
++ if (status == CL_SUCCESS) {
++ if (test_data[0] == 8 && test_data[1] == 14 && test_data[2] == 8){
++ ret = 0;
++ } else {
++ ret = 1;
++ printf("Beignet: self-test failed: (3, 7, 5) + (5, 7, 3) returned (%i, %i, %i)\n"
++ "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n",
++ test_data[0], test_data[1], test_data[2]);
++ }
++ }
++ }
++ }
++ }
++ clReleaseMemObject(buffer);
++ }
++ clReleaseKernel(kernel);
++ }
++ }
++ clReleaseProgram(program);
++ }
++ clReleaseCommandQueue(queue);
++ }
++ clReleaseContext(ctx);
++ if (ret == 2) {
++ printf("Beignet: self-test failed: error %i\n"
++ "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n", status);
++ }
++ return ret;
++}
++
+ LOCAL cl_int
+ cl_get_device_ids(cl_platform_id platform,
+ cl_device_type device_type,
+@@ -447,6 +515,20 @@ cl_get_device_ids(cl_platform_id platform,
+
+ /* Do we have a usable device? */
+ device = cl_get_gt_device();
++ if (device && cl_self_test(device)) {
++ int disable_self_test = 0;
++ // can't use BVAR (backend/src/sys/cvar.hpp) here as it's C++
++ const char *env = getenv("OCL_IGNORE_SELF_TEST");
++ if (env != NULL) {
++ sscanf(env, "%i", &disable_self_test);
++ }
++ if (disable_self_test) {
++ printf("Beignet: Warning - overriding self-test failure\n");
++ } else {
++ printf("Beignet: disabling non-working device\n");
++ device = 0;
++ }
++ }
+ if (!device) {
+ if (num_devices)
+ *num_devices = 0;
+diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
+index ac06b10..67e3bf1 100644
+--- a/utests/setenv.sh.in
++++ b/utests/setenv.sh.in
+@@ -6,3 +6,5 @@ export OCL_PCH_PATH=@LOCAL_OCL_PCH_OBJECT@
+ export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
+ export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
+ export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
++#disable self-test so we can get something more precise than "doesn't work"
++export OCL_IGNORE_SELF_TEST=1
+--
+2.4.1
+
diff --git a/0005-Docs-update-clarify-Haswell-issues.patch b/0005-Docs-update-clarify-Haswell-issues.patch
new file mode 100644
index 0000000..9f8f8af
--- /dev/null
+++ b/0005-Docs-update-clarify-Haswell-issues.patch
@@ -0,0 +1,124 @@
+From b19ed97e5c4127f04f1aff05f111162cb96629e6 Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Sat, 16 May 2015 18:52:59 +0100
+Subject: [PATCH 5/9] Docs: update/clarify Haswell issues
+
+Reflect recent beignet and Linux changes.
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: "Luo, Xionghu" <xionghu.luo at intel.com>
+Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ docs/Beignet.mdwn | 40 ++++++++++++++++++++++++++--------------
+ src/cl_device_id.c | 14 +++++++++-----
+ 2 files changed, 35 insertions(+), 19 deletions(-)
+
+diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
+index 57041ba..bd7fd98 100644
+--- a/docs/Beignet.mdwn
++++ b/docs/Beignet.mdwn
+@@ -142,7 +142,7 @@ Supported Targets
+
+ * 3rd Generation Intel Core Processors
+ * Intel “Bay Trail” platforms with Intel HD Graphics
+- * 4th Generation Intel Core Processors, need kernel patch currently, see the "Known Issues" section.
++ * 4th Generation Intel Core Processors "Haswell", need kernel patch currently, see the "Known Issues" section.
+ * 5th Generation Intel Core Processors "Broadwell".
+
+ Known Issues
+@@ -163,22 +163,34 @@ Known Issues
+ But this command is a little bit dangerous, as if your kernel really hang, then the gpu will lock up
+ forever until a reboot.
+
+-* Almost all unit tests fail.
+- There is a known issue in some versions of linux kernel which enable register whitelist feature
+- but miss some necessary registers which are required for beignet. For non-HSW platforms, the
+- problematic version are around 3.15 and 3.16 which have commit f0a346b... but haven't commit
+- c9224f... If it is the case, you can apply c9224f... manually and rebuild the kernel or just
+- disable the parse command by invoke the following command (use Ubuntu as an example):
++* "Beignet: self-test failed" and almost all unit tests fail.
++ Linux 3.15 and 3.16 (commits [f0a346b](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=f0a346bdafaf6fc4a51df9ddf1548fd888f860d8)
++ to [c9224fa](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c9224faa59c3071ecfa2d4b24592f4eb61e57069))
++ enable the register whitelist by default but miss some registers needed
++ for Beignet.
++
++ This can be fixed by upgrading Linux, or by disabling the whitelist:
+
+ `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
+
+- For HSW platforms, this issue exists in all linux kernel version after 3.15. We always need
+- to execute the above command.
+-
+-* Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platform.
+- _The 4th Generation Intel Core Processors's support requires some Linux kernel
+- modification_. You need to apply the patch at:
+- [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
++ On Haswell hardware, Beignet 1.0.1 to 1.0.3 also required the
++ above workaround on later Linux versions, but this _should not_ be
++ required in current (after [83f8739](http://cgit.freedesktop.org/beignet/commit/?id=83f8739b6fc4893fac60145326052ccb5cf653dc))
++ git master.
++
++* "Beignet: self-test failed" and 15-30 unit tests fail on 4th Generation (Haswell) hardware.
++ On Haswell, shared local memory (\_\_local) does not work at all on
++ Linux <= 4.0, and requires the i915.enable_ppgtt=2 [boot parameter](https://wiki.ubuntu.com/Kernel/KernelBootParameters)
++ on Linux 4.1.
++
++ This will be fixed in Linux 4.2; older versions can be fixed with
++ [this patch](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support).
++
++ If you do not need \_\_local, you can override the self-test with
++
++ `export OCL_IGNORE_SELF_TEST=1`
++
++ but using \_\_local after this may silently give wrong results.
+
+ * Precision issue.
+ Currently Gen does not provide native support of high precision math functions
+diff --git a/src/cl_device_id.c b/src/cl_device_id.c
+index 0c1c52f..9cd7c00 100644
+--- a/src/cl_device_id.c
++++ b/src/cl_device_id.c
+@@ -441,7 +441,7 @@ brw_gt3_break:
+ LOCAL cl_int
+ cl_self_test(cl_device_id device)
+ {
+- cl_int status, ret;
++ cl_int status;
+ cl_context ctx;
+ cl_command_queue queue;
+ cl_program program;
+@@ -456,7 +456,11 @@ cl_self_test(cl_device_id device)
+ " barrier(CLK_LOCAL_MEM_FENCE);"
+ " buf[get_global_id(0)] = tmp[2 - get_local_id(0)] + buf[get_global_id(0)];"
+ "}"; // using __local to catch the "no SLM on Haswell" problem
+- ret = 2;
++ static int tested = 0;
++ static cl_int ret = 2;
++ if (tested != 0)
++ return ret;
++ tested = 1;
+ ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
+ if (status == CL_SUCCESS) {
+ queue = clCreateCommandQueue(ctx, device, 0, &status);
+@@ -480,8 +484,8 @@ cl_self_test(cl_device_id device)
+ } else {
+ ret = 1;
+ printf("Beignet: self-test failed: (3, 7, 5) + (5, 7, 3) returned (%i, %i, %i)\n"
+- "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n",
+- test_data[0], test_data[1], test_data[2]);
++ "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n",
++ test_data[0], test_data[1], test_data[2]);
+ }
+ }
+ }
+@@ -499,7 +503,7 @@ cl_self_test(cl_device_id device)
+ clReleaseContext(ctx);
+ if (ret == 2) {
+ printf("Beignet: self-test failed: error %i\n"
+- "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n", status);
++ "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n", status);
+ }
+ return ret;
+ }
+--
+2.4.1
+
diff --git a/0006-utest_pow-don-t-fail-on-declared-lack-of-denormals.patch b/0006-utest_pow-don-t-fail-on-declared-lack-of-denormals.patch
new file mode 100644
index 0000000..6ef1e34
--- /dev/null
+++ b/0006-utest_pow-don-t-fail-on-declared-lack-of-denormals.patch
@@ -0,0 +1,53 @@
+From 028f5ab22dfa70c1e2d6a71799f41badbd19eafd Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Wed, 29 Apr 2015 13:58:13 +0800
+Subject: [PATCH 6/9] utest_pow: don't fail on declared lack of denormals.
+
+0.01**20.5 is denormal; at least Ivy Bridge does not support
+denormals and hence returns 0. As this is allowed by the
+OpenCL standard, it shouldn't fail the test.
+
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: "Song, Ruiling" <ruiling.song at intel.com>
+---
+ utests/builtin_pow.cpp | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
+index a18f31e..a8523d3 100644
+--- a/utests/builtin_pow.cpp
++++ b/utests/builtin_pow.cpp
+@@ -37,6 +37,9 @@ static void builtin_pow(void)
+ input_data2[i*count_input_ori+k] = ori_data[k];
+ }
+
++ cl_device_fp_config fp_config;
++ clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, 0);
++ bool denormals_supported = fp_config & CL_FP_DENORM;
+ const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+ float ULPSIZE_FACTOR = 16.0;
+ if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+@@ -75,7 +78,9 @@ static void builtin_pow(void)
+ #if udebug
+ if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
+ (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
+- (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR) )
++ (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR
++ && (denormals_supported || gpu_data[index_cur]!=0 || std::fpclassify(cpu_data[index_cur])!=FP_SUBNORMAL) ) )
++
+ {
+ printf_c("%d/%d: x:%f, y:%f -> gpu:%f cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+ }
+@@ -88,7 +93,8 @@ static void builtin_pow(void)
+ OCL_ASSERT(isnan(gpu_data[index_cur]));
+ else
+ {
+- OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR);
++ OCL_ASSERT((fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR) ||
++ (!denormals_supported && gpu_data[index_cur]==0 && std::fpclassify(cpu_data[index_cur])==FP_SUBNORMAL) );
+ }
+ #endif
+ }
+--
+2.4.1
+
diff --git a/0007-Make-tgamma-meet-the-accuracy-standard.patch b/0007-Make-tgamma-meet-the-accuracy-standard.patch
new file mode 100644
index 0000000..b51d40c
--- /dev/null
+++ b/0007-Make-tgamma-meet-the-accuracy-standard.patch
@@ -0,0 +1,135 @@
+From e4c38e56576e58cd384c156e0bcfe6290ff9f33a Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Wed, 29 Apr 2015 14:15:09 +0800
+Subject: [PATCH 7/9] Make tgamma meet the accuracy standard.
+
+The old tgamma=exp(lgamma) implementation had high rounding error on
+large outputs, exceeding the 16ulp specification for approx. x>8
+(hence the test failure in strict conformance mode).
+
+Replace this with an implementation based on glibc's
+http://sources.debian.net/src/glibc/2.19-17/sysdeps/ieee754/flt-32/e_gammaf_r.c/
+
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: "Song, Ruiling" <ruiling.song at intel.com>
+---
+ backend/src/libocl/tmpl/ocl_math.tmpl.cl | 96 +++++++++++++++++++++++++++++---
+ 1 file changed, 89 insertions(+), 7 deletions(-)
+
+diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+index fcc60fd..f6e53c3 100644
+--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
++++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+@@ -1746,13 +1746,6 @@ OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+ }
+ }
+
+-INLINE_OVERLOADABLE float tgamma(float x) {
+- float y;
+- int s;
+- y=lgamma_r(x,&s);
+- return __gen_ocl_internal_exp(y)*s;
+-}
+-
+ /* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+@@ -2963,6 +2956,95 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+ return sn*z;
+ }
+
++OVERLOADABLE float tgamma (float x)
++{
++ /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper at cygnus.com> */
++
++ unsigned int hx;
++ GEN_OCL_GET_FLOAT_WORD(hx,x);
++ if (hx == 0xff800000)
++ {
++ /* x == -Inf. According to ISO this is NaN. */
++ return NAN;
++ }
++ if ((hx & 0x7f800000) == 0x7f800000)
++ {
++ /* Positive infinity (return positive infinity) or NaN (return
++ NaN). */
++ return x;
++ }
++ if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
++ {
++ /* integer x < 0 */
++ return NAN;
++ }
++
++ if (x >= 36.0f)
++ {
++ /* Overflow. */
++ return INFINITY;
++ }
++ else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
++ {
++ return 1.0f / x;
++ }
++ else
++ {
++ float sinpix = __gen_ocl_internal_sinpi(x);
++ if (x <= -42.0f)
++ /* Underflow. */
++ {return 0.0f * sinpix /*for sign*/;}
++ int exp2_adj = 0;
++ float x_abs = __gen_ocl_fabs(x);
++ float gam0;
++
++ if (x_abs < 4.0f) {
++ /* gamma = exp(lgamma) is only accurate for small lgamma */
++ float prod,x_adj;
++ if (x_abs < 0.5f) {
++ prod = 1.0f / x_abs;
++ x_adj = x_abs + 1.0f;
++ } else if (x_abs <= 1.5f) {
++ prod = 1.0f;
++ x_adj = x_abs;
++ } else if (x_abs < 2.5f) {
++ x_adj = x_abs - 1.0f;
++ prod = x_adj;
++ } else {
++ x_adj = x_abs - 2.0f;
++ prod = x_adj * (x_abs - 1.0f);
++ }
++ gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
++ }
++ else {
++ /* Compute gamma (X) using Stirling's approximation,
++ starting by computing pow (X, X) with a power of 2
++ factored out to avoid intermediate overflow. */
++ float x_int = __gen_ocl_internal_round (x_abs);
++ float x_frac = x_abs - x_int;
++ int x_log2;
++ float x_mant = frexp (x_abs, &x_log2);
++ if (x_mant < M_SQRT1_2_F)
++ {
++ x_log2--;
++ x_mant *= 2.0f;
++ }
++ exp2_adj = x_log2 * (int) x_int;
++ float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
++ * exp2 (x_log2 * x_frac)
++ * __gen_ocl_internal_exp (-x_abs)
++ * sqrt (2.0f * M_PI_F / x_abs) );
++
++ float x2 = x_abs * x_abs;
++ float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
++ gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
++ }
++ if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
++ float gam1 = M_PI_F / (-x * sinpix * gam0);
++ return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
++ }
++}
++
+ float __gen_ocl_internal_pown(float x, int y) {
+ const float
+ bp[] = {1.0, 1.5,},
+--
+2.4.1
+
diff --git a/0008-Allow-building-with-Python-3.patch b/0008-Allow-building-with-Python-3.patch
new file mode 100644
index 0000000..9fa5108
--- /dev/null
+++ b/0008-Allow-building-with-Python-3.patch
@@ -0,0 +1,119 @@
+From 48b2cc4757cef526961f62cd5c41023690b7d2f1 Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Wed, 29 Apr 2015 11:26:41 +0100
+Subject: [PATCH 8/9] Allow building with Python 3
+
+Make the build scripts work in both Python 2 and Python 3.
+(CMake prefers Python 2 if both are available, but will use
+Python 3 if only it is installed.)
+
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ backend/src/libocl/script/gen_vector.py | 23 ++++++++++++-----------
+ utests/utest_generator.py | 3 ++-
+ 2 files changed, 14 insertions(+), 12 deletions(-)
+
+diff --git a/backend/src/libocl/script/gen_vector.py b/backend/src/libocl/script/gen_vector.py
+index ffc573a..92582f5 100755
+--- a/backend/src/libocl/script/gen_vector.py
++++ b/backend/src/libocl/script/gen_vector.py
+@@ -20,13 +20,14 @@
+
+ # This file is to generate inline code to lower down those builtin
+ # vector functions to scalar functions.
++from __future__ import print_function
+ import re
+ import sys
+ import os
+
+ if len(sys.argv) != 4:
+- print "Invalid argument {0}".format(sys.argv)
+- print "use {0} spec_file_name output_file_name just_proto".format(sys.argv[0])
++ print("Invalid argument {0}".format(sys.argv))
++ print("use {0} spec_file_name output_file_name just_proto".format(sys.argv[0]))
+ raise
+
+ all_vector = 1,2,3,4,8,16
+@@ -61,8 +62,8 @@ all_type = all_int_type + all_float_type
+
+ # all vector/scalar types
+ for t in all_type:
+- exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
+- exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
++ exec("{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t))
++ exec("s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t))
+
+ # Predefined type sets according to the Open CL spec.
+ math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
+@@ -124,8 +125,8 @@ def check_type(types):
+ for t in types:
+ memspace, t = stripMemSpace(t)
+ if not t in type_dict:
+- print t
+- raise "found invalid type."
++ print(t)
++ raise TypeError("found invalid type.")
+
+ def match_unsigned(dtype):
+ if dtype[0] == 'float':
+@@ -187,8 +188,8 @@ def fixup_type(dstType, srcType, n):
+ if (len(dstType) == len(srcType)):
+ return dstType[n]
+
+- print dstType, srcType
+- raise "type mispatch"
++ print(dstType, srcType)
++ raise TypeError("type mispatch")
+
+ class builtinProto():
+ valueTypeStr = ""
+@@ -226,7 +227,7 @@ class builtinProto():
+
+ def init_from_line(self, t):
+ self.append('//{0}'.format(t))
+- line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
++ line = [_f for _f in re.split(',| |\(', t.rstrip(')\n')) if _f]
+ self.paramCount = 0
+ stripped = 0
+ memSpace = ''
+@@ -310,7 +311,7 @@ class builtinProto():
+ vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
+ if vtype[1] != ptype[1]:
+ if ptype[1] != 1:
+- raise "parameter is not a scalar but has different width with result value."
++ raise TypeError("parameter is not a scalar but has different width with result value.")
+ if isPointer(ptype):
+ formatStr += '&'
+ formatStr += 'param{0}'.format(n)
+@@ -333,7 +334,7 @@ class builtinProto():
+
+ def output(self):
+ for line in self.outputStr:
+- print line
++ print(line)
+
+ def output(self, outFile):
+ for line in self.outputStr:
+diff --git a/utests/utest_generator.py b/utests/utest_generator.py
+index 510c41a..7d2d3a0 100644
+--- a/utests/utest_generator.py
++++ b/utests/utest_generator.py
+@@ -1,4 +1,5 @@
+ #!/usr/bin/python
++from __future__ import print_function
+ import os,sys,re
+
+ FLT_MAX_POSI='0x1.fffffep127f'
+@@ -326,7 +327,7 @@ which can print more values and information to assist debuging the issue.
+ file_object.close()
+
+ def nameForCmake(self,content,namesuffix):
+- print("generated/%s_%s.cpp"%(self.fileName,namesuffix)),
++ print("generated/%s_%s.cpp"%(self.fileName,namesuffix),end=" ")
+
+ def utestFunc(self,index):
+ funcLines=[]
+--
+2.4.1
+
diff --git a/0009-utests-fix-test-case-builtin_tgamma.patch b/0009-utests-fix-test-case-builtin_tgamma.patch
new file mode 100644
index 0000000..a3191b7
--- /dev/null
+++ b/0009-utests-fix-test-case-builtin_tgamma.patch
@@ -0,0 +1,65 @@
+From a6810c54b5bfa4c88d9fec6fb1f86fdea4af75af Mon Sep 17 00:00:00 2001
+From: "Rebecca N. Palmer" <rebecca_palmer at zoho.com>
+Date: Wed, 29 Apr 2015 16:25:47 +0100
+Subject: [PATCH 9/9] utests: fix test case builtin_tgamma.
+
+Compare with tgamma instead of tgammaf for better accuracy.
+Include negative inputs, and handle the resulting denormals.
+Print maximum error found.
+
+Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
+Reviewed-by: "Song, Ruiling" <ruiling.song at intel.com>
+Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
+---
+ utests/builtin_tgamma.cpp | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
+index 47cc5f4..b7db69b 100644
+--- a/utests/builtin_tgamma.cpp
++++ b/utests/builtin_tgamma.cpp
+@@ -20,10 +20,15 @@ void builtin_tgamma(void)
+ if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+ ULPSIZE_FACTOR = 10000.;
+
+- for (int j = 0; j < 1024; j ++) {
++ cl_device_fp_config fp_config;
++ clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, 0);
++ bool denormals_supported = fp_config & CL_FP_DENORM;
++ float max_ulp = 0, max_ulp_at = 0;
++
++ for (int j = 0; j < 128; j ++) {
+ OCL_MAP_BUFFER(0);
+ for (int i = 0; i < n; ++i) {
+- src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
++ src[i] = ((float*)buf_data[0])[i] = j - 64 + i*0.001f;
+ }
+ OCL_UNMAP_BUFFER(0);
+
+@@ -32,7 +37,14 @@ void builtin_tgamma(void)
+ OCL_MAP_BUFFER(1);
+ float *dst = (float*)buf_data[1];
+ for (int i = 0; i < n; ++i) {
+- float cpu = tgammaf(src[i]);
++ float cpu = tgamma(src[i]);
++ if (!denormals_supported && std::fpclassify(cpu)==FP_SUBNORMAL && dst[i]==0) {
++ cpu = 0;
++ }
++ if (fabsf(cpu - dst[i]) > cl_FLT_ULP(cpu) * max_ulp) {
++ max_ulp = fabsf(cpu - dst[i]) / cl_FLT_ULP(cpu);
++ max_ulp_at = src[i];
++ }
+ if (isinf(cpu)) {
+ OCL_ASSERT(isinf(dst[i]));
+ } else if (fabsf(cpu - dst[i]) >= cl_FLT_ULP(cpu) * ULPSIZE_FACTOR) {
+@@ -42,6 +54,7 @@ void builtin_tgamma(void)
+ }
+ OCL_UNMAP_BUFFER(1);
+ }
++ printf("max error=%f ulp at x=%f ", max_ulp, max_ulp_at);
+ }
+
+ MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
+--
+2.4.1
+
diff --git a/beignet.spec b/beignet.spec
index cfd6700..8dd4d78 100644
--- a/beignet.spec
+++ b/beignet.spec
@@ -1,6 +1,6 @@
Name: beignet
Version: 1.0.3
-Release: 1%{?dist}
+Release: 2%{?dist}
Summary: Open source implementation of the OpenCL for Intel GPUs
License: LGPLv2+
@@ -9,12 +9,28 @@ Source0: https://01.org/sites/default/files/%{name}-%{version}-source.tar
BuildRequires: cmake
BuildRequires: llvm-devel >= 3.3 clang-devel >= 3.3 llvm-static >= 3.3
-BuildRequires: libdrm-devel mesa-libGL-devel mesa-libEGL-devel mesa-libgbm-devel ocl-icd-devel python-devel
+BuildRequires: libdrm-devel mesa-libGL-devel mesa-libEGL-devel mesa-libgbm-devel ocl-icd-devel
BuildRequires: zlib-devel libedit-devel
BuildRequires: opencl-headers
+%if 0%{?fedora} > 22
+BuildRequires: python3-devel
+%else
+BuildRequires: python2-devel
+%endif
+
Requires: opencl-filesystem
+Patch0: 0001-Remove-some-LGPL-incompatible-code.patch
+Patch1: 0002-GBE-Fix-the-immediate-data-type.patch
+Patch2: 0003-correct-the-src-output-of-alu3-when-OCL_OUTPUT_ASM-1.patch
+Patch3: 0004-Add-a-sanity-test-in-clGetDeviceIDs.patch
+Patch4: 0005-Docs-update-clarify-Haswell-issues.patch
+Patch5: 0006-utest_pow-don-t-fail-on-declared-lack-of-denormals.patch
+Patch6: 0007-Make-tgamma-meet-the-accuracy-standard.patch
+Patch7: 0008-Allow-building-with-Python-3.patch
+Patch8: 0009-utests-fix-test-case-builtin_tgamma.patch
+
ExclusiveArch: x86_64 %{ix86}
%description
@@ -34,23 +50,24 @@ Devel package for Beignet is an open source implementation of the OpenCL
specification - a generic compute oriented API.
%prep
-%autosetup -n Beignet-%{version}-Source
+%autosetup -n Beignet-%{version}-Source -S git
mkdir build
%build
pushd build
%cmake ../ -DLLVM_INSTALL_DIR=%{_bindir}/
- make %{?_smp_mflags}
+ %make_build
popd
%install
pushd build
%make_install
popd
-find %{buildroot}%{_includedir}/CL/ -regextype posix-egrep -not -regex ".*(cl_intel.h)" -type f -delete
+find %{buildroot}%{_includedir}/CL/ -not -name "cl_intel.h" -type f -delete
%files
-%doc COPYING README.md
+%license COPYING
+%doc README.md
%{_libdir}/beignet/
%{_sysconfdir}/OpenCL/vendors/intel-beignet.icd
@@ -59,6 +76,12 @@ find %{buildroot}%{_includedir}/CL/ -regextype posix-egrep -not -regex ".*(cl_in
%{_includedir}/CL/cl_intel.h
%changelog
+* Tue May 19 2015 Igor Gnatenko <i.gnatenko.brain at gmail.com> - 1.0.3-2
+- Fix licensing issues with not compatipble LGPL code
+- use python3-devel for fedora23+
+- use license macro
+- use make_build macro
+
* Fri May 08 2015 Igor Gnatenko <i.gnatenko.brain at gmail.com> - 1.0.3-1
- Update to 1.0.3 (RHBZ #1202329)
--
cgit v0.10.2
http://pkgs.fedoraproject.org/cgit/beignet.git/commit/?h=f22&id=bbfccf3794ded8b19d0e05a8aefa85c7cf20b61c
More information about the scm-commits
mailing list