# preface

In this section we study the gradient descent method

• Search based optimization method
• To minimize the loss function

## 1. Principle and simple realization of gradient descent method

• At each step, move in the direction opposite to the gradient, by a step scaled by the learning rate
• Repeating this eventually yields the optimal solution

The value of the learning rate affects how quickly the optimal solution is found

• If it is too small, convergence is too slow
• If it is too large, the method may fail to converge
• Both the learning rate and the starting point need tuning

The implementation is as follows

import numpy as np
import matplotlib.pyplot as plt

"""Simulated gradient descent on a one-dimensional quadratic loss function."""

# Take a quadratic function as the loss function.
plot_x = np.linspace(-1., 6., 141)
plot_y = (plot_x - 2.5) ** 2 - 1


def J(theta):
    """Loss function; returns +inf when the computation overflows (divergence)."""
    try:
        return (theta - 2.5) ** 2 - 1.
    except OverflowError:
        return float('inf')


def dJ(theta):
    """Derivative of the loss function."""
    return 2 * (theta - 2.5)


# Function encapsulation of the gradient descent method.
theta_history = []


def gradient_descent(initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Run gradient descent from initial_theta with learning rate eta.

    Stops when the loss improvement falls below epsilon or after n_iters
    steps.  Every visited theta is appended to the module-level
    theta_history so the path can be plotted afterwards.
    """
    theta = initial_theta
    theta_history.append(initial_theta)
    i_iter = 0
    while i_iter < n_iters:
        gradient = dJ(theta)  # bug fix: `gradient` was previously undefined
        last_theta = theta
        theta = theta - eta * gradient  # step downhill
        theta_history.append(theta)
        if abs(J(theta) - J(last_theta)) < epsilon:
            break
        i_iter += 1
    return


def plot_theta_history():
    """Plot the loss curve and the path taken by gradient descent."""
    plt.plot(plot_x, J(plot_x))
    plt.plot(np.array(theta_history), J(np.array(theta_history)),
             color="r", marker='+')
    plt.show()


eta = 0.01
theta_history = []
gradient_descent(0., eta)  # bug fix: descent was never run before plotting
plot_theta_history()

## 2. Gradient descent method in linear regression

The gradient of the MSE loss is $\nabla J(\theta) = \frac{2}{m} X_b^T (X_b \theta - y)$, where $X_b$ is the design matrix with a leading column of ones

The implementation is as follows

import numpy as np

"""Batch gradient descent applied to simple linear regression."""

# For easy visualization, use a single feature: y = 3x + 4 plus noise.
np.random.seed(666)  # fixed seed for reproducibility
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
X = x.reshape(-1, 1)


def J(theta, X_b, y):
    """MSE loss; returns +inf when the computation overflows (divergence)."""
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except Exception:
        return float('inf')


def dJ(theta, X_b, y):
    """Gradient of the MSE loss, computed one component at a time."""
    res = np.empty(len(theta))
    res[0] = np.sum(X_b.dot(theta) - y)  # intercept component
    for i in range(1, len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:, i])
    return res * 2 / len(X_b)


def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Run batch gradient descent and return the fitted parameter vector."""
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        # Stop once the loss no longer improves meaningfully.
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break
        cur_iter += 1
    return theta


# Build the design matrix (leading column of ones for the intercept) and fit.
X_b = np.hstack([np.ones((len(x), 1)), x.reshape(-1, 1)])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta)
print(theta)

## 3. Random gradient descent method

• The descent direction is random, which can help jump out of a local optimum
• It trades some precision for a shorter running time
• The learning rate is very important here; it should decrease gradually
• This is the idea behind simulated annealing
• The algorithm in scikit is more complex and optimized

The implementation is as follows

import numpy as np

"""Stochastic gradient descent for linear regression."""

# Synthetic data: y = 4x + 3 plus Gaussian noise.
np.random.seed(666)  # fixed seed for reproducibility, consistent with earlier sections
m = 100000
x = np.random.normal(size=m)
X = x.reshape(-1, 1)
y = 4. * x + 3. + np.random.normal(0, 3, size=m)


def J(theta, X_b, y):
    """MSE loss; returns +inf when the computation overflows (divergence)."""
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except Exception:
        return float('inf')


def dJ_sgd(theta, X_b_i, y_i):
    """Gradient estimate from a single sample (one row), not the whole data."""
    return 2 * X_b_i.T.dot(X_b_i.dot(theta) - y_i)


def sgd(X_b, y, initial_theta, n_iters):
    """Stochastic gradient descent with a decaying learning rate."""
    # t0, t1 are the two hyper-parameters (a, b) of the schedule a / (t + b).
    t0, t1 = 5, 50

    def learning_rate(t):
        return t0 / (t + t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))  # pick one random sample
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    return theta


X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=m // 3)  # n_iters is an important hyper-parameter
print(theta)

## 4. Encapsulating gradient descent method in the linear regression function of the previous article

import numpy as np

"""Linear regression model trainable by normal equation, batch GD, or SGD."""


class LinearRegression:

    def __init__(self):
        """Initialize the Linear Regression model."""
        self.coef_ = None        # feature coefficients (theta[1:])
        self.intercept_ = None   # intercept term (theta[0])
        self._theta = None       # full parameter vector

    def fit_normal(self, X_train, y_train):
        """Train the model by solving the normal equation on X_train, y_train."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Train the model with batch gradient descent.

        eta is the learning rate; n_iters caps the iteration count.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            # MSE loss; +inf signals numeric overflow (divergence).
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
            except Exception:
                return float('inf')

        def dJ(theta, X_b, y):
            # Vectorized gradient of the MSE loss.
            return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                # Stop once the loss no longer improves meaningfully.
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def fit_sgd(self, X_train, y_train, n_iters=50, t0=5, t1=50):
        """Train the model with stochastic gradient descent.

        n_iters is the number of full passes over the training set;
        t0, t1 are the hyper-parameters of the decaying learning rate.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert n_iters >= 1

        def dJ_sgd(theta, X_b_i, y_i):
            # Gradient estimate from a single sample.
            return X_b_i * (X_b_i.dot(theta) - y_i) * 2

        def sgd(X_b, y, initial_theta, n_iters=5, t0=5, t1=50):
            def learning_rate(t):
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)
            for i_iter in range(n_iters):
                # Shuffle so every sample is visited exactly once per pass.
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes, :]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    theta = theta - learning_rate(i_iter * m + i) * gradient
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.random.randn(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the prediction vector for data set X_predict."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return the R^2 score of the model on X_test, y_test."""
        # Imported lazily so the class works without scikit-learn installed.
        from sklearn.metrics import r2_score
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"

## 5. Debugging of gradient descent method

• Produces reliable gradients
• But runs slowly

The implementation is as follows

import numpy as np
import datetime

"""Debugging the gradient: compare a numerical gradient with the analytic one."""

# Synthetic data with a known parameter vector.
np.random.seed(666)  # fixed seed for reproducibility
X = np.random.random(size=(1000, 10))
true_theta = np.arange(1, 12, dtype=float)  # the parameters we hope to recover
X_b = np.hstack([np.ones((len(X), 1)), X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)  # add noise


def J(theta, X_b, y):
    """MSE loss; returns +inf when the computation overflows (divergence)."""
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except Exception:
        return float('inf')


def dJ_math(theta, X_b, y):
    """Analytic (closed-form) gradient derived in the earlier sections."""
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)


def dJ_debug(theta, X_b, y, epsilon=0.01):
    """Numerical gradient via central differences, one dimension at a time."""
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res


def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Batch gradient descent driven by the supplied gradient function dJ."""
    theta = initial_theta
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break
        cur_iter += 1
    return theta


X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01

# Time the slow-but-safe numerical gradient ...
startTime = datetime.datetime.now()
theta1 = gradient_descent(dJ_debug, X_b, y, initial_theta, eta)
print(theta1)
endTime = datetime.datetime.now()
print("The running time is:%ss" % (endTime - startTime).seconds)

# ... against the fast analytic one; the results should agree.
startTime = datetime.datetime.now()
theta2 = gradient_descent(dJ_math, X_b, y, initial_theta, eta)
print(theta2)
endTime = datetime.datetime.now()
print("The running time is:%ss" % (endTime - startTime).seconds)

# epilogue

In this section, the gradient descent method is studied
Including batch gradient descent method and random gradient descent method
And used the linear regression of the previous article
Gradient descent is a good way for many machine learning methods to obtain the optimal solution

There is also a mini-batch gradient descent method,
which introduces an additional batch-size hyper-parameter

Posted on Fri, 05 Jun 2020 23:25:45 -0700 by Kower