diff --git a/README.md b/README.md new file mode 100644 index 0000000..4afddc8 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +# machinelearningregression +boilerplate code, scripts, modules, data for Regression Analysis workshop + +Before starting, check that your working environment is ready by running `runme.py`: + +``` +$ python runme.py +``` + +If the following message appears: + +``` +Good to go, all packages installed ok, ready to code. +``` + +You're ready for the workshop! diff --git a/functions.py b/functions.py index 151a533..47777e1 100644 --- a/functions.py +++ b/functions.py @@ -1,4 +1,4 @@ -#cca regression functions +# Advanced functions from sklearn.pipeline import make_pipeline from sklearn.linear_model import LinearRegression from sklearn.preprocessing import StandardScaler, PolynomialFeatures @@ -23,23 +23,20 @@ def model_plot_3d(ax, model, x1, x2): positions = np.vstack([X1.ravel(), X2.ravel()]).T y = model.predict(positions) Y = y.reshape(X1.shape) - ax.plot_surface(X1, X2, Y, alpha = 0.2 , cmap = 'jet') - -def save_figure(name, i): - plt.savefig('../figures/'+name+str(i), bbox_inches='tight') - plt.gcf().clear() + ax.plot_surface(X1, X2, Y, alpha = 0.2 , cmap = 'jet',\ + linewidth=0.5, rstride=1, cstride=1, shade=True) def PolynomialRegression(degree = 1): - return make_pipeline(PolynomialFeatures(degree = degree), LinearRegression()) + return make_pipeline(PolynomialFeatures(degree = degree,\ + include_bias = False), LinearRegression()) def PolynomialRidge(degree = 1, alpha = 1): - return make_pipeline(PolynomialFeatures(degree = degree), StandardScaler(), Ridge(alpha = alpha)) + return make_pipeline(PolynomialFeatures(degree = degree,\ + include_bias = False), StandardScaler(), Ridge(alpha = alpha)) def PolynomialLasso(degree = 1, alpha = 1): - return make_pipeline(PolynomialFeatures(degree = degree), StandardScaler(), Lasso(alpha = alpha)) - - - + return make_pipeline(PolynomialFeatures(degree = degree,\ + include_bias = False), 
StandardScaler(), Lasso(alpha = alpha)) def polynomial_residual(degree, X, y): polynomial_regression= PolynomialRegression(degree = degree) @@ -48,17 +45,7 @@ def polynomial_residual(degree, X, y): mae = mean_absolute_error(y, y_pred) return mae - def organize_data(to_forecast, window, horizon): - """ - Input: - to_forecast, univariate time series organized as numpy array - window, number of items to use in the forecast window - horizon, horizon of the forecast - Output: - X, a matrix where each row contains a forecast window - y, the target values for each row of X - """ shape = to_forecast.shape[:-1] + (to_forecast.shape[-1] - window + 1, window) strides = to_forecast.strides + (to_forecast.strides[-1],) X = np.lib.stride_tricks.as_strided(to_forecast, @@ -66,36 +53,3 @@ def organize_data(to_forecast, window, horizon): strides=strides) y = np.array([X[i+horizon][-1] for i in range(len(X)-horizon)]) return X[:-horizon], y - - -class NonLinearRegression(object): - - def __init__(self, fun): - self.fun = fun - self._find_n_params() - - def fit(self, X, y): - self.params = basinhopping(lambda p: self._get_residual(X,y,p), np.random.randn(self._n_params), niter = 10000, niter_success = 50)['x'] - - def predict(self, X): - return self.fun(X.flatten(), self.params) - - def get_params(self, deep=True): - return {"fun": self.fun} - - def set_params(self, **parameters): - for parameter, value in parameters.items(): - self.setattr(parameter, value) - return self - - def _get_residual(self, X, y, p): - return np.mean((self.fun(X.flatten(), p)-y)**2) - - def _find_n_params(self): - for n_params in range(1,100)[::-1]: - try: - self.fun(1,np.random.rand(n_params)) - except: - n_params += 1 - break - self._n_params = n_params diff --git a/functions.pyc b/functions.pyc deleted file mode 100644 index 4fd1ff3..0000000 Binary files a/functions.pyc and /dev/null differ diff --git a/module3.py b/module3.py index 20f3db6..58eefde 100644 --- a/module3.py +++ b/module3.py @@ -1,16 +1,10 
@@ -from functions import save_figure -from os.path import splitext -figname = splitext(__file__)[0]+'_' -ifig = 0 +# Module 3: Linear regression +# New imports import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression +from sklearn import metrics -bikes_df = pd.read_csv('./data/bikes_subsampled.csv') - -# We select the variables temperature and bikes_count - -temperature = bikes_df['temperature'].values -bikes_count = bikes_df['count'].values +# Code after this diff --git a/module4.py b/module4.py index 9dd7177..e92f9a4 100644 --- a/module4.py +++ b/module4.py @@ -1,27 +1,18 @@ -from functions import save_figure -from os.path import splitext -figname = splitext(__file__)[0]+'_' -ifig = 0 - -################################################################################ -################################### MODULE 4 ################################### -##################### Multiple variables linear regression ##################### -################################################################################ +# Module 4: Multiple and polynomial regression +# Previous imports import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression -from mpl_toolkits.mplot3d.axes3d import Axes3D -from sklearn.preprocessing import PolynomialFeatures - -from functions import PolynomialRegression, model_plot_3d +# New imports +from functions import PolynomialRegression +from mpl_toolkits.mplot3d.axes3d import Axes3D +from functions import model_plot_3d +from functions import polynomial_residual +# Load dataset bikes_df = pd.read_csv('./data/bikes_subsampled.csv') -# Learning activity 1: Fit a model of 2 variables and plot the model - -features = ['temperature','humidity'] -X = bikes_df[features].values -y = bikes_df['count'].values +# Code after this diff --git a/module5.py b/module5.py index ea09f9c..17aea76 100644 --- a/module5.py +++ b/module5.py @@ 
-1,23 +1,19 @@ -from functions import save_figure -from os.path import splitext -figname = splitext(__file__)[0]+'_' -ifig = 0 - -################################################################################ -################################### MODULE 5 ################################### -############################# Model evaluation ################################# -################################################################################ -# To be separated in a unique file # +# Module 5: Evaluating model performance +# Previous imports import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from functions import PolynomialRegression -from sklearn.preprocessing import PolynomialFeatures + +# New imports from sklearn.cross_validation import train_test_split, cross_val_score from sklearn.metrics import mean_absolute_error +# Load dataset bikes_df = pd.read_csv('./data/bikes_subsampled.csv') temperature = bikes_df[['temperature']].values bikes_count = bikes_df['count'].values + +# Code after this diff --git a/module6.py b/module6.py index 382ed66..8ab6477 100644 --- a/module6.py +++ b/module6.py @@ -1,26 +1,18 @@ -from functions import save_figure -from os.path import splitext -figname = splitext(__file__)[0]+'_' -ifig = 0 +# Module 6: Avoid overfitting with regularisation - -################################################################################ -################################### MODULE 6 ################################### -############################# Regularisation ################################### -################################################################################ - - -# Learning activity 2: Ridge and Lasso regularisations +# Previous imports import pandas as pd import numpy as np import matplotlib.pyplot as plt -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler, PolynomialFeatures -from sklearn.linear_model import 
Ridge, Lasso +from sklearn.linear_model import LinearRegression +from functions import PolynomialRegression +from sklearn.cross_validation import cross_val_score -from functions import PolynomialRidge, PolynomialLasso, PolynomialRegression +# New imports +from functions import PolynomialRidge, PolynomialLasso +from sklearn.grid_search import GridSearchCV +# Load dataset bikes_df = pd.read_csv('./data/bikes.csv') -features = ['temperature','humidity','windspeed'] -X = bikes_df[features].values -y = bikes_df['count'].values + +# Code after this diff --git a/module7.py b/module7.py index a71cb45..9411cc9 100644 --- a/module7.py +++ b/module7.py @@ -1,30 +1,17 @@ -from functions import save_figure -from os.path import splitext -figname = splitext(__file__)[0]+'_' -ifig = 0 +# Module 7: Predict the future with autoregression -################################################################################ -################################### MODULE 7 ################################### -############################# Advanced fitting methods ######################### -################################################################################ - -#Learning activity 1: use any sklearn model - -from sklearn.tree import DecisionTreeRegressor -from sklearn.cross_validation import cross_val_score -from sklearn.linear_model import LinearRegression -from sklearn import metrics -from sklearn.ensemble import RandomForestRegressor -from sklearn.neighbors import KNeighborsRegressor -from sklearn.svm import SVR -import numpy as np +# Previous imports import pandas as pd +import numpy as np import matplotlib.pyplot as plt +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_absolute_error -from functions import NonLinearRegression, organize_data - +# New imports +from functions import organize_data -#Learning activity 2: Custom nonlinear regression +# Load dataset bikes_df = pd.read_csv('./data/bikes.csv') -temperature = 
bikes_df[['temperature']].values bikes = bikes_df['count'].values + +# Code after this diff --git a/runme.py b/runme.py index 4c01d04..fba7918 100644 --- a/runme.py +++ b/runme.py @@ -1,11 +1,7 @@ -import pandas -import matplotlib.pyplot as plt -from sklearn import preprocessing -from sklearn.cross_validation import train_test_split -from sklearn import metrics -import functions -from sklearn import linear_model -from sklearn import cross_validation -from sklearn.grid_search import GridSearchCV +import module3 +import module4 +import module5 +import module6 +import module7 print "Good to go, all packages installed ok, ready to code."