Interpretation of GBDT source code

Recently I have been reading about GBDT and found that many of the articles online are vague, so I dug up some source code on GitHub and walked through it.

import numpy as np
from GBDTReg import GBDT
class Config(object):
    learningRate=0.1
    maxTreeLength=4
    maxLeafCount=30
    maxTreeNum=4
    def buildGbdt(self,x_train,y_train):
        size = len(x_train)
        dim = len(x_train[0])
        x_train=np.array(x_train)
        y_train=np.array(y_train)
        x_train_feature=[]

        # Initialize the ensemble: the raw score starts at 0 for every sample
        treePreviousValue=0*y_train
        treeValues=[]
        treeValues.append(treePreviousValue)

        # Predicted probability of the initial (empty) ensemble: sigmoid(0) = 0.5
        curValue = self.sigmoid(0*y_train)
        dataFeatures=[]
        self.tree=[]   # list of fitted trees
        for i in range(self.maxTreeNum):
            print("building the %d-th tree" % i)
            # Negative gradient of the log loss (y - p), scaled by the learning rate
            residualGradient = -1*self.learningRate*(curValue-y_train)
            curTree = self.splitTree(x_train,residualGradient,1)
            self.tree.append(curTree)
            print(curTree)
            # Evaluate the new tree on every training sample so the accumulated
            # score (and hence the residual) can be updated
            curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree)
            curTreeValue=[]
            for singleX in x_train:
                xValue,xFeature = self.scanTree(curTree,singleX,curTreeLeafNodeNum)
                curTreeValue.append(xValue)

            # Add the new tree's output to the accumulated raw score and
            # recompute the predicted probabilities
            treePreviousValue=np.array(curTreeValue)+treePreviousValue
            curValue=self.sigmoid(treePreviousValue)
            print(y_train)
            print("curValue")
            print(curValue)


First, buildGbdt loops over range(self.maxTreeNum) trees. On each iteration it computes the residual to fit, residualGradient = learningRate * (y_train - curValue), which is the negative gradient of the log loss scaled by the learning rate (curValue is the sigmoid of the accumulated raw score).
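To make that residual concrete, here is a minimal, self-contained sketch on a toy label vector; the sigmoid helper here is an assumption, since its implementation is not part of the excerpt.

import numpy as np

def sigmoid(x):
    # Assumed implementation of self.sigmoid (not shown in the excerpt)
    return 1.0 / (1.0 + np.exp(-x))

y_train = np.array([1, 0, 1, 1])
learningRate = 0.1

# Before any tree is built the raw score is 0, so every predicted probability is 0.5
curValue = sigmoid(0 * y_train)

# Negative gradient of the log loss, scaled by the learning rate:
# +0.05 for the y=1 samples, -0.05 for the y=0 sample
residualGradient = -1 * learningRate * (curValue - y_train)
print(residualGradient)   # [ 0.05 -0.05  0.05  0.05]

With that residual in hand, splitTree builds the tree that fits it: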

    def splitTree(self,x_train,residualGradient,treeHeight):
        """

        :param x_train:Training data
        :param residualGradient:Current gradient residual value to be fitted
        :param treeHeight:Height of tree
        :return:Built up GBDT tree
        """
        size = len(x_train)
        dim = len(x_train[0])
        #Convention: left subtree is less than or equal to, right subtree is greater than
        bestSplitPointDim=-1
        bestSplitPointValue=-1
        curLoss = self.calculateSquareLoss(residualGradient)
        minLossValue=curLoss
        # At the maximum depth, stop splitting and return a leaf value (the mean residual)
        if treeHeight==self.maxTreeLength:
            return np.mean(residualGradient)
        tree=dict([])
        for i in range(dim):
            for j in range(size):
                splitNum = x_train[j,i]
                leftSubTree=[]
                rightSubTree=[]
                for k in range(size):
                    tmpNum=x_train[k,i]
                    if tmpNum<=splitNum:
                        leftSubTree.append(residualGradient[k])
                    else:
                        rightSubTree.append(residualGradient[k])
                # Loss after this candidate split: sum of the two children's losses
                sumLoss=0.0
                sumLoss+=self.calculateSquareLoss(np.array(leftSubTree))
                sumLoss+=self.calculateSquareLoss(np.array(rightSubTree))
                if sumLoss<minLossValue:
                    bestSplitPointDim=i
                    bestSplitPointValue=splitNum
                    minLossValue=sumLoss
        # If no split reduces the loss, return a leaf whose value is the mean residual
        if minLossValue==curLoss:
            return np.mean(residualGradient)
        else:

            leftSplit=[(x_train[i],residualGradient[i]) for i in range(size) if x_train[i,bestSplitPointDim]<=bestSplitPointValue ]
            rightSplit=[(x_train[i],residualGradient[i]) for i in range(size) if x_train[i,bestSplitPointDim]>bestSplitPointValue ]
             
            newLeftSubTree = list(zip(*leftSplit))[0]
            newLeftResidual = list(zip(*leftSplit))[1]
            leftTree = self.splitTree(np.array(newLeftSubTree),newLeftResidual,treeHeight+1)

            newRightSubTree = list(zip(*rightSplit))[0]
            newRightResidual = list(zip(*rightSplit))[1]
            rightTree = self.splitTree(np.array(newRightSubTree),newRightResidual,treeHeight+1)

            tree[(bestSplitPointDim,bestSplitPointValue)]=[leftTree,rightTree]
            return tree
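
splitTree relies on calculateSquareLoss, which is not shown in the excerpt. A minimal sketch of it, assuming it measures the squared deviation of a group's residuals from their mean (the error left if the node predicts a single constant value):

import numpy as np

def calculateSquareLoss(residual):
    # Assumed helper: squared deviation of the residuals from their mean
    residual = np.asarray(residual, dtype=float)
    if residual.size == 0:
        return 0.0
    return float(np.sum((residual - residual.mean()) ** 2))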

Reading splitTree, we can see what "gradient boosting" means here: each tree is built by recursively choosing, at every node, the (feature, value) split that most reduces the squared loss of the residuals, and partitioning the data into a left and a right subtree, until no split reduces the loss any further or the maximum depth is reached; a node that cannot be improved becomes a leaf holding the mean residual. Each finished tree is stored with self.tree.append(curTree), and the final model is the superposition (sum) of the outputs of all the trees.
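
To make the "superposition of multiple trees" concrete, here is a hypothetical prediction sketch (predictSingle and predictProba are not part of the original source). It walks the nested dict returned by splitTree, where every internal node is {(featureDim, splitValue): [leftSubTree, rightSubTree]} and every leaf is a plain number:

import numpy as np

def predictSingle(tree, x):
    # Walk one tree: internal nodes are single-key dicts, leaves are numbers
    # (the mean residual stored by splitTree).
    while isinstance(tree, dict):
        (dim, value), (left, right) = next(iter(tree.items()))
        tree = left if x[dim] <= value else right
    return tree

def predictProba(trees, x):
    # The ensemble's raw score is the sum of all tree outputs; the predicted
    # probability is its sigmoid, matching curValue in buildGbdt.
    score = sum(predictSingle(t, x) for t in trees)
    return 1.0 / (1.0 + np.exp(-score))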

Source link
