Machine Learning Series: Gradient Descent Method (June 6, 2020)

Preface

In this section we study the gradient descent method:

  • A search-based optimization method
  • Used to minimize a loss function

1. Principle and a simple implementation of gradient descent


Reading the figure (gradient descent on a quadratic loss function):

  • Compute the gradient of the loss function at the current point
  • At each step, move against the gradient by an amount scaled by the learning rate
  • Repeat until the minimum, i.e. the optimal solution, is reached

The learning rate affects how quickly the optimal solution is reached (an experiment with several learning rates is sketched after the implementation below):

  • Too small: convergence is slow
  • Too large: the iteration may fail to converge, or even diverge
  • Both the learning rate and the starting point need to be tuned

The implementation is as follows

import numpy as np
import matplotlib.pyplot as plt
"""Simulated gradient descent method"""
# Taking a quadratic function as loss function
plot_x = np.linspace(-1., 6., 141)
plot_y = (plot_x-2.5)**2 - 1
# loss function 
def J(theta):
    try:
        return (theta-2.5)**2 - 1.
    except:
        return float('inf')
# derivatives
def dJ(theta):
    return 2 * (theta - 2.5)
"""
# Gradient descent method
eta = 0.1 #Learning rate
theta = 0.0 #starting point
epsilon = 1e-8 #judge
theta_history = [theta]
while True:
    gradient = dJ(theta) #gradient
    last_theta = theta
    theta = theta - eta * gradient #Step down
    theta_history.append(theta)
    if (abs(J(theta) - J(last_theta)) < epsilon):
        break
plt.plot(plot_x, J(plot_x))
plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
plt.show()
print(theta)
print(J(theta))"""
# Encapsulate gradient descent as a function
theta_history = []
def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    i_iter = 0
    while i_iter < n_iters:  # cap the number of iterations in case the learning rate is too large
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
        if (abs(J(theta) - J(last_theta)) < epsilon):
            break
        i_iter += 1
    return
def plot_theta_history():
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)), color="r", marker='+')
    plt.show()
eta = 0.01
theta_history = []
gradient_descent(0, eta)
plot_theta_history()
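
To see the effect of the learning rate described above, we can rerun the encapsulated function with a few different values of eta. This is a small illustrative experiment (the specific eta values are chosen here for demonstration, not from the original run); with eta = 1.1 each step overshoots further, so the iteration diverges and is only stopped by the n_iters cap.

# Illustrative experiment: rerun gradient descent with several learning rates
for eta in (0.01, 0.1, 0.8, 1.1):
    theta_history = []
    gradient_descent(0, eta)
    # number of steps taken and the final theta reached for this learning rate
    print("eta = %.2f -> %d steps, final theta = %s" % (eta, len(theta_history) - 1, theta_history[-1]))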

2. Gradient descent method in linear regression


The formula is as follows, where X_b denotes X with a column of ones prepended for the intercept term:

    J(θ) = (1/m) · Σᵢ ( y⁽ⁱ⁾ − X_b⁽ⁱ⁾ · θ )²

    ∇J(θ) = (2/m) · X_bᵀ · ( X_b · θ − y )

The implementation is as follows

import numpy as np
import matplotlib.pyplot as plt
"""Gradient descent in linear regression"""
# For visualization, make a one-dimensional array
np.random.seed(666) #Random seed
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
X = x.reshape(-1, 1)
# loss function 
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')
# gradient, computed feature by feature (a vectorized version is shown after this block and used in section 4)
def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)  # partial derivative w.r.t. the intercept term
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:,i])
    return res * 2 / len(X_b)
# gradient descent 
def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
        cur_iter += 1
    return theta
# build X_b (a column of ones plus the feature) and run gradient descent
X_b = np.hstack([np.ones((len(x), 1)), x.reshape(-1,1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
print(theta)
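
The gradient above is computed feature by feature with a Python loop. The same gradient can be written as a single matrix product, which is the vectorized form used inside fit_gd in section 4. A minimal sketch (the name dJ_vectorized is just illustrative):

# Vectorized gradient: produces the same values as the loop-based dJ above,
# but in a single matrix product (this is the form used inside fit_gd in section 4)
def dJ_vectorized(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)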

3. Stochastic gradient descent

Stochastic gradient descent (SGD):

  • Each step follows the gradient of a single random sample, so the direction is noisy, which can help jump out of a local optimum
  • Trades some precision for speed
  • The learning rate is very important and should decrease gradually
  • This is the idea behind simulated annealing
  • The implementation in scikit-learn is more elaborate and better optimized (a rough comparison is sketched after the implementation below)

The implementation is as follows

import numpy as np
import matplotlib.pyplot as plt
"""Random gradient descent method"""
# data
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1,1)
y = 4.*x + 3. + np.random.normal(0, 3, size=m)
# loss function 
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except:
        return float('inf')
# gradient
def dJ_sgd(theta, X_b_i, y_i):  # takes a single sample (one row of X_b), not the whole data set
    return 2 * X_b_i.T.dot(X_b_i.dot(theta) - y_i)
# Stochastic gradient descent
def sgd(X_b, y, initial_theta, n_iters):
    # t0 and t1 are the two hyperparameters of the learning-rate schedule eta = t0 / (t + t1)
    t0, t1 = 5, 50
    # decaying learning rate (the simulated-annealing idea)
    def learning_rate(t):
        return t0 / (t + t1)
    # starting point
    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))              # pick a random sample index
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])  # gradient of that single sample
        theta = theta - learning_rate(cur_iter) * gradient
    return theta
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=m//3)  # n_iters is an important hyperparameter; here only m/3 samples are seen
print(theta)
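
As noted in the list above, scikit-learn ships its own, more heavily optimized SGD implementation. A rough comparison on the same data, assuming a recent scikit-learn is installed (the max_iter value here is illustrative):

from sklearn.linear_model import SGDRegressor

# scikit-learn's optimized stochastic gradient descent on the same data
sgd_reg = SGDRegressor(max_iter=100)
sgd_reg.fit(X, y)
print(sgd_reg.intercept_, sgd_reg.coef_)  # should also be close to the true intercept 3 and slope 4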

4. Adding gradient descent to the linear regression class from the previous article

import numpy as np
from sklearn.metrics import r2_score

"""Linear regression function with gradient descent method"""
class LinearRegression:
    def __init__(self):
        """initialization Linear Regression Model"""
        self.coef_ = None
        self.intercept_ = None
        self._theta = None
    
    def fit_normal(self, X_train, y_train):
        """According to training data set X_train, y_train train Linear Regression Model"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self
    
    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """According to training data set X_train, y_train, Training with gradient descent method Linear Regression Model"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        # loss function 
        def J(theta, X_b, y):
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except:
                return float('inf')
        # gradient
        def dJ(theta, X_b, y):
            return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y) #Vectorization
        # gradient descent 
        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            # parameter
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
                    break
                cur_iter += 1
            return theta
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self
    
    def fit_sgd(self, X_train, y_train, n_iters=50, t0=5, t1=50):
        """According to training data set X_train, y_train, Training with random gradient descent method Linear Regression Model"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1
        # gradient
        def dJ_sgd(theta, X_b_i, y_i):
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2
        # Stochastic gradient descent
        def sgd(X_b, y, initial_theta, n_iters=5, t0=5, t1=50):  # t0, t1 are hyperparameters of the learning-rate schedule
            def learning_rate(t):
                return t0 / (t + t1)
            theta = initial_theta
            m = len(X_b)
            for i_iter in range(n_iters):
                # shuffle the indexes so that every sample is seen once per pass
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes,:]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(i_iter * m + i) * gradient
            return theta
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self
    
    def predict(self, X_predict):
        """Given data set to be predicted X_predict´╝îReturn representation X_predict Result vector of"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)
    
    def score(self, X_test, y_test):
        """According to the test data set X_test and y_test Determine the accuracy of the current model"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)
   
    def __repr__(self):
        return "LinearRegression()"

5. Debugging the gradient

The numerical (debug) gradient:

  • Gives correct results
  • But runs slowly
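
What dJ_debug below does is approximate each partial derivative with a central difference: for each dimension i, nudge θᵢ up and down by a small ε (0.01 in the code) and take

    ∂J/∂θᵢ ≈ ( J(θ + ε·eᵢ) − J(θ − ε·eᵢ) ) / (2ε)

where eᵢ is the i-th unit vector. This requires no derivation of the gradient formula, which is why it is useful for checking, but it evaluates J twice per dimension, which is why it is slow.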

The implementation is as follows

import numpy as np
import matplotlib.pyplot as plt
import datetime

"""Adjustment of gradient descent method"""
# data
np.random.seed(666)
X = np.random.random(size=(1000, 10))
true_theta = np.arange(1, 12, dtype=float)  # the true parameters we hope to recover
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)  # add Gaussian noise
# loss function 
def J(theta, X_b, y):
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    except:
        return float('inf')
# analytic gradient (the mathematical formula derived earlier)
def dJ_math(theta, X_b, y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)
# numerical gradient for debugging (central difference in each dimension)
def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        # One dimension at a time
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res
# gradient descent 
def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
            break
        cur_iter += 1
    return theta
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
startTime = datetime.datetime.now()
theta1 = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
print(theta1)
endTime = datetime.datetime.now()
print("The running time is:%ss" % (endTime - startTime).seconds)
startTime = datetime.datetime.now()
theta2 = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
print(theta2)
endTime = datetime.datetime.now()
print("The running time is:%ss" % (endTime - startTime).seconds)

Epilogue

In this section we studied the gradient descent method,
including batch gradient descent and stochastic gradient descent,
and applied it to the linear regression model from the previous article.
Gradient descent is a good way for many machine learning models to find the optimal solution.

There is also mini-batch gradient descent,
which simply adds a batch-size hyperparameter (a minimal sketch follows).
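
A minimal sketch of the mini-batch idea, reusing the conventions of section 3 (X_b with a ones column, the same decaying learning-rate schedule); the batch_size value and the function name are illustrative, not from the original article:

import numpy as np

# Minimal mini-batch gradient descent sketch (illustrative)
def mini_batch_gd(X_b, y, initial_theta, n_iters=50, batch_size=16, t0=5, t1=50):
    def learning_rate(t):
        return t0 / (t + t1)
    theta = initial_theta
    m = len(X_b)
    for i_iter in range(n_iters):
        indexes = np.random.permutation(m)  # shuffle once per pass
        for start in range(0, m, batch_size):
            batch = indexes[start:start + batch_size]
            X_batch, y_batch = X_b[batch], y[batch]
            # average gradient over the mini-batch
            gradient = X_batch.T.dot(X_batch.dot(theta) - y_batch) * 2 / len(batch)
            theta = theta - learning_rate(i_iter * m + start) * gradient
    return theta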
