From 45b30b2d914a77a7c816b7f6a49446aa1c79ad31 Mon Sep 17 00:00:00 2001
From: mysjkin
Date: Fri, 12 Oct 2018 13:39:36 +0200
Subject: [PATCH 1/3] Gradient descent implementation.

---
 .../C++/SimpleGradientDescent.cpp           | 59 +++++++++++++++++++
 .../Gradient Descent/C++/Utility.h          | 31 ++++++++++
 Machine Learning/Gradient Descent/README.md | 10 ++++
 3 files changed, 100 insertions(+)
 create mode 100644 Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
 create mode 100644 Machine Learning/Gradient Descent/C++/Utility.h
 create mode 100644 Machine Learning/Gradient Descent/README.md

diff --git a/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
new file mode 100644
index 00000000..8c27c2b4
--- /dev/null
+++ b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
@@ -0,0 +1,59 @@
+
+#include "Utility.h"
+
+using namespace std;
+
+vector<double> Predict(vector<double> data, pair<double, double> model){
+    int numDataPoints = data.size();
+    vector<double> predictions(numDataPoints);
+    for(int i = 0; i < numDataPoints; ++i){
+        predictions[i] = i * model.first + model.second;
+    }
+    return predictions;
+}
+
+pair<double, double> GradientDescent(vector<double> data, double learningRate, pair<double, double> model){
+    return pair<double, double>(0.0, 0.0);
+}
+
+pair<double, double> LinearRegression(vector<double> data, unsigned int epochs, double learningRate){
+    // Initialize our linear regression model as: 1x + 1.
+    pair<double, double> model(0, 0);
+    for(int i = 0; i < epochs; ++i){
+        auto predictions = Predict(data, model);
+
+        // Calculate gradients for the slope and the y intercept.
+        int numSamples = data.size();
+        double gradientX = 0;
+        double gradientY = 0;
+        for(int k = 0; k < numSamples; ++k){
+            double loss = data[k] - predictions[k];
+            gradientY -= loss;
+            gradientX -= loss * data[k];
+        }
+        gradientX *= (double) 2/numSamples;
+        gradientY *= (double) 2/numSamples;
+
+        // Perform gradient descent step.
+        model.first = model.first - (learningRate * gradientX);
+        model.second = model.second - (learningRate * gradientY);
+    }
+    return model;
+}
+
+int main(){
+    // Define the x range for data generation.
+    pair<int, int> range = pair<int, int>(0,100);
+
+    // Get data from the following linear function: 2x + 5.
+    vector<double> data = GetLinearFunctionData(range, 2, 5);
+
+    pair<double, double> model = LinearRegression(data, 100000, 0.0001);
+    auto predictions = Predict(data, model);
+
+    cout << "Data generating function: 2x + 5" << endl;
+    cout << "Mean squared error: " << MSE(data, predictions) << endl;
+    cout << "Learned model: " << model.first << "x + " << model.second << endl;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/Machine Learning/Gradient Descent/C++/Utility.h b/Machine Learning/Gradient Descent/C++/Utility.h
new file mode 100644
index 00000000..6928284a
--- /dev/null
+++ b/Machine Learning/Gradient Descent/C++/Utility.h
@@ -0,0 +1,31 @@
+
+/* Header-only utility functions used for gradient descent */
+
+#include <iostream>
+#include <vector>
+#include <utility>
+#include <cmath>
+
+using namespace std;
+
+// Generates data from a linear function.
+vector<double> GetLinearFunctionData(pair<int, int> range, double slope, double yIntercept){
+    vector<double> data(range.second);
+    for(int i = 0; i < range.second; ++i){
+        data[i] = (i + range.first) * slope + yIntercept;
+    }
+    return data;
+}
+
+// Mean squared error.
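+// (For reference: MSE = (1/N) * sum over i of (actual_i - predicted_i)^2.)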
+double MSE(vector<double> actual, vector<double> predicted){
+    auto actualItt = actual.begin();
+    auto predictedItt = predicted.begin();
+    double sum = 0;
+    for( ; actualItt != actual.end() && predictedItt != predicted.end(); ++actualItt, ++predictedItt){
+        sum += pow(*actualItt - *predictedItt, 2);
+    }
+    return sum/actual.size();
+}
\ No newline at end of file
diff --git a/Machine Learning/Gradient Descent/README.md b/Machine Learning/Gradient Descent/README.md
new file mode 100644
index 00000000..24b23cc7
--- /dev/null
+++ b/Machine Learning/Gradient Descent/README.md
@@ -0,0 +1,10 @@
+
+# Gradient Descent Optimisation Algorithm
+
+$ \lambda = 2 $.
+
+## Derive Gradient Descent Rule
+
+## Pseudocode
+
+## Example

From 5d3bc9c093fe4eafebfd8938410728a2c7c3b32c Mon Sep 17 00:00:00 2001
From: mysjkin
Date: Wed, 17 Oct 2018 09:54:47 +0200
Subject: [PATCH 2/3] Small fixes in implementation.

Split the data into separate x and y vectors instead of keeping everything in one data vector.
---
 .../C++/SimpleGradientDescent.cpp  | 43 +++++++++----------
 .../Gradient Descent/C++/Utility.h | 14 ++++--
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
index 8c27c2b4..3e773293 100644
--- a/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
+++ b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
@@ -3,36 +3,31 @@
 using namespace std;
 
-vector<double> Predict(vector<double> data, pair<double, double> model){
-    int numDataPoints = data.size();
+vector<double> Predict(vector<double> x, pair<double, double> model){
+    int numDataPoints = x.size();
     vector<double> predictions(numDataPoints);
     for(int i = 0; i < numDataPoints; ++i){
-        predictions[i] = i * model.first + model.second;
+        predictions[i] = x[i] * model.first + model.second;
     }
     return predictions;
 }
 
-pair<double, double> GradientDescent(vector<double> data, double learningRate, pair<double, double> model){
-    return pair<double, double>(0.0, 0.0);
-}
-
-pair<double, double> LinearRegression(vector<double> data, unsigned int epochs, double learningRate){
-    // Initialize our linear regression model as: 1x + 1.
+pair<double, double> LinearRegression(vector<double> x, vector<double> y, unsigned int epochs, double learningRate){
+    // Initialize our linear regression model as: 0x + 0.
     pair<double, double> model(0, 0);
     for(int i = 0; i < epochs; ++i){
-        auto predictions = Predict(data, model);
+        auto predictions = Predict(x, model);
 
         // Calculate gradients for the slope and the y intercept.
-        int numSamples = data.size();
-        double gradientX = 0;
-        double gradientY = 0;
+        int numSamples = x.size();
+        double gradientX = 0.0;
+        double gradientY = 0.0;
+
         for(int k = 0; k < numSamples; ++k){
-            double loss = data[k] - predictions[k];
-            gradientY -= loss;
-            gradientX -= loss * data[k];
+            double error = y[k] - predictions[k];
+            gradientX += ((-2.0) / (double) numSamples) * error * x[k];
+            gradientY += ((-2.0) / (double) numSamples) * error;
         }
-        gradientX *= (double) 2/numSamples;
-        gradientY *= (double) 2/numSamples;
 
         // Perform gradient descent step.
         model.first = model.first - (learningRate * gradientX);
@@ -46,13 +41,17 @@ int main(){
     pair<int, int> range = pair<int, int>(0,100);
 
     // Get data from the following linear function: 2x + 5.
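+    // GetLinearFunctionData now returns the x samples and the y samples as two separate vectors.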
-    vector<double> data = GetLinearFunctionData(range, 2, 5);
+    pair<vector<double>, vector<double>> data = GetLinearFunctionData(range, 2, 5);
+    vector<double> xData = data.first;
+    vector<double> yData = data.second;
 
-    pair<double, double> model = LinearRegression(data, 100000, 0.0001);
-    auto predictions = Predict(data, model);
+    pair<double, double> model = LinearRegression(xData, yData, 10000, 0.0001);
+    auto predictions = Predict(xData, model);
 
     cout << "Data generating function: 2x + 5" << endl;
-    cout << "Mean squared error: " << MSE(data, predictions) << endl;
+    // The learned model should end up close to 2x + 5; the intercept converges
+    // slowly at this learning rate, so expect a small but nonzero error.
+    cout << "Mean squared error: " << MSE(yData, predictions) << endl;
     cout << "Learned model: " << model.first << "x + " << model.second << endl;
 
     return 0;
diff --git a/Machine Learning/Gradient Descent/C++/Utility.h b/Machine Learning/Gradient Descent/C++/Utility.h
index 6928284a..66bbea1f 100644
--- a/Machine Learning/Gradient Descent/C++/Utility.h
+++ b/Machine Learning/Gradient Descent/C++/Utility.h
@@ -11,11 +11,17 @@ using namespace std;
 
 // Generates data from a linear function.
-vector<double> GetLinearFunctionData(pair<int, int> range, double slope, double yIntercept){
-    vector<double> data(range.second);
-    for(int i = 0; i < range.second; ++i){
-        data[i] = (i + range.first) * slope + yIntercept;
+pair<vector<double>, vector<double>> GetLinearFunctionData(pair<int, int> range, double slope, double yIntercept){
+    int numSamples = range.second - range.first;
+    vector<double> xData(numSamples);
+    vector<double> yData(numSamples);
+
+    for(int i = range.first, k = 0; i < range.second && k < numSamples; ++i, ++k){
+        xData[k] = i;
+        yData[k] = i * slope + yIntercept;
     }
+
+    pair<vector<double>, vector<double>> data(xData, yData);
     return data;
 }

From 9c3e85b5697d941565f45629bd49ec52d0dd93d2 Mon Sep 17 00:00:00 2001
From: mysjkin
Date: Thu, 18 Oct 2018 19:56:13 +0200
Subject: [PATCH 3/3] Fixes and README documentation.

---
 .../C++/SimpleGradientDescent.cpp           | 47 ++++++++++++------
 Machine Learning/Gradient Descent/README.md | 13 +++--
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
index 3e773293..90f0238c 100644
--- a/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
+++ b/Machine Learning/Gradient Descent/C++/SimpleGradientDescent.cpp
@@ -3,41 +3,53 @@
 using namespace std;
 
-vector<double> Predict(vector<double> x, pair<double, double> model){
+// Performs predictions with the given model.
+vector<double> Predict(vector<double> x, pair<double, double> model){
     int numDataPoints = x.size();
     vector<double> predictions(numDataPoints);
+
     for(int i = 0; i < numDataPoints; ++i){
         predictions[i] = x[i] * model.first + model.second;
     }
+
     return predictions;
 }
 
+// Performs one batch gradient descent step and returns the updated model.
+pair<double, double> BatchGradientDescentStep(vector<double> x, vector<double> predictions, vector<double> y, double learningRate, pair<double, double> model){
+    int numSamples = y.size();
+    double gradientX = 0.0;
+    double gradientY = 0.0;
+
+    for(int k = 0; k < numSamples; ++k){
+        double error = y[k] - predictions[k];
+        gradientX += ((-2.0) / (double) numSamples) * error * x[k];
+        gradientY += ((-2.0) / (double) numSamples) * error;
+    }
+
+    model.first = model.first - (learningRate * gradientX);
+    model.second = model.second - (learningRate * gradientY);
+
+    return model;
+}
+
+// Runs through all the epochs, updating the model based on the calculated gradients.
 pair<double, double> LinearRegression(vector<double> x, vector<double> y, unsigned int epochs, double learningRate){
     // Initialize our linear regression model as: 0x + 0.
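+    // The squared-error loss is convex in the slope and intercept, so starting
+    // from (0, 0) is safe: gradient descent heads towards the single global minimum.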
     pair<double, double> model(0, 0);
+
     for(int i = 0; i < epochs; ++i){
         auto predictions = Predict(x, model);
-
-        // Calculate gradients for the slope and the y intercept.
-        int numSamples = x.size();
-        double gradientX = 0.0;
-        double gradientY = 0.0;
-
-        for(int k = 0; k < numSamples; ++k){
-            double error = y[k] - predictions[k];
-            gradientX += ((-2.0) / (double) numSamples) * error * x[k];
-            gradientY += ((-2.0) / (double) numSamples) * error;
-        }
-
-        // Perform gradient descent step.
-        model.first = model.first - (learningRate * gradientX);
-        model.second = model.second - (learningRate * gradientY);
+        model = BatchGradientDescentStep(x, predictions, y, learningRate, model);
     }
+
     return model;
 }
 
 int main(){
-    // Define the x range for data generation.
+    // Define the x range for data generation.
+    // Note: larger data values might cause exploding gradients.
+    // One possible solution is to reduce the learning rate.
     pair<int, int> range = pair<int, int>(0,100);
 
     // Get data from the following linear function: 2x + 5.
@@ -45,6 +57,7 @@ int main(){
     vector<double> xData = data.first;
     vector<double> yData = data.second;
 
+    // Run for 10000 epochs with a learning rate of 0.0001.
     pair<double, double> model = LinearRegression(xData, yData, 10000, 0.0001);
     auto predictions = Predict(xData, model);
 
diff --git a/Machine Learning/Gradient Descent/README.md b/Machine Learning/Gradient Descent/README.md
index 24b23cc7..51a273c4 100644
--- a/Machine Learning/Gradient Descent/README.md
+++ b/Machine Learning/Gradient Descent/README.md
@@ -1,10 +1,17 @@
 
 # Gradient Descent Optimisation Algorithm
 
-$ \lambda = 2 $.
-
-## Derive Gradient Descent Rule
+This explanation does not go into mathematical detail; the mathematics is important, but it is covered more thoroughly online and in textbooks. The focus here is a high-level description of the algorithm.
+
+Gradient descent is a mathematical optimisation algorithm, often used when training machine learning models. It is essentially a hill-climbing algorithm that follows the gradient of the function being optimised in order to search for optimal parameter values. It is called gradient *descent* because we minimise a function by incrementally moving against the gradient towards a local minimum.
+
+A gradient generalises the derivative to multi-variable functions, but it is a vector rather than a scalar: it collects the partial derivatives of the function with respect to each of its parameters. The gradient tells us how the function behaves as its inputs change, and gradient descent exploits one property in particular: the gradient vector points in the direction of steepest ascent. Minimising a function iteratively is therefore a matter of calculating the gradient and moving in the opposite direction.
+
+Practically, we derive the partial derivative of the error function with respect to each model parameter, evaluate those partial derivatives, and incrementally update each parameter in the direction opposite its partial derivative.
 
 ## Pseudocode
+Where lr is the learning rate, num_epochs is the number of iterations, and w holds the model parameters.
 
-## Example
+    for i in 1..num_epochs:
+        for w_i in w:
+            w_i = w_i - lr * partial_derivative(loss, w_i)
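+
+## Derivation for the Linear Model
+
+As a concrete instance of the update rule (a sketch matching the C++ example, where the model is $ f(x) = mx + b $ and the loss is the mean squared error over $ N $ samples):
+
+$ L(m, b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - (m x_i + b))^2 $
+
+Differentiating with respect to each parameter gives the gradients the code accumulates:
+
+$ \frac{\partial L}{\partial m} = -\frac{2}{N} \sum_{i=1}^{N} x_i (y_i - (m x_i + b)) $
+
+$ \frac{\partial L}{\partial b} = -\frac{2}{N} \sum_{i=1}^{N} (y_i - (m x_i + b)) $
+
+Each epoch then updates $ m \leftarrow m - lr \cdot \frac{\partial L}{\partial m} $ and $ b \leftarrow b - lr \cdot \frac{\partial L}{\partial b} $.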