diff --git a/Houston_J/Houston_J-sub3.ipynb b/Houston_J/Houston_J-sub3.ipynb new file mode 100644 index 0000000..da1236e --- /dev/null +++ b/Houston_J/Houston_J-sub3.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compared to sub2, a few changes was made,\n", + "For feature engineering, apply a log scale to \"GR\" and \"PHIND\", which has non-sysmetric hist. \n", + "For xgboost model, fine tuning the parameters for better performance. \n", + "The code was developed in spyder, and pasted directly here. For comments, please refer to submit 2. \n", + "\n", + "The Facies classification project for SEG A test notes is appended at bottom\n", + "The project has three parts :\n", + "1. Raw Data analysis (Small data, quick statistics)\n", + "\n", + "2. Feature Engineering\n", + " a. Missing \"PE\" data : Regressional fillin is better than median and mean (https://github.com/seg/2016-ml-contest/blob/master/LA_Team/Facies_classification_LA_TEAM_05.ipynb)\n", + " b. How many features to include : \n", + " Current tests from other groups use only pre-defined features. \n", + " I found Formation has predicting power too, including Formation info give extra uplift to my model See test 8,9\n", + " c. Feature augmentation : https://github.com/seg/2016-ml-contest/blob/master/ispl/facies_classification_try02.ipynb\n", + " Great works, included the depth information in a nature way\n", + " d. Robust model scaling \n", + "\n", + "3. Model Selection\n", + " XGBOOST is superior to SVC (My benchmark)\n", + " A brutal gridsearch was done on XGBOOST on top of the best feature engineering " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.colors as colors\n", + "from mpl_toolkits.axes_grid1 import make_axes_locatable\n", + "\n", + "train_raw = pd.read_csv('../01_raw_data/facies_vectors.csv')\n", + "train = train_raw.copy()\n", + "cols = train.columns.values\n", + "well = train[\"Well Name\"].values\n", + "depth = train[\"Depth\"].values\n", + "\n", + "\n", + "## 01 Raw data analysis\n", + "print(\"No. of Wells is \" + str(len(train[\"Well Name\"].unique())))\n", + "print(\"No. of Formation is \" + str(len(train[\"Formation\"].unique())))\n", + "well_PE_Miss = train.loc[train[\"PE\"].isnull(),\"Well Name\"].unique()\n", + "#print(\"Wells with Missing PE \" + well_PE_Miss)\n", + "#print(train.loc[train[\"Well Name\"] == well_PE_Miss[0],[\"PE\",\"Depth\"]].count())\n", + "#print(train.loc[train[\"Well Name\"] == well_PE_Miss[1],[\"PE\",\"Depth\"]].count())\n", + "#print(train.loc[train[\"Well Name\"] == well_PE_Miss[2],[\"PE\",\"Depth\"]].count())\n", + "(train.groupby(\"Well Name\"))[\"PE\"].mean()\n", + "(train.groupby(\"Well Name\"))[\"PE\"].median()\n", + "#\n", + "### 02 Feature definition and QC functions\n", + "features = ['GR', 'ILD_log10', 'DeltaPHI', \n", + " 'PHIND','PE','NM_M', 'RELPOS']\n", + "feature_vectors = train[features]\n", + "facies_labels = train['Facies']\n", + "## 1=sandstone 2=c_siltstone 3=f_siltstone \n", + "## 4=marine_silt_shale 5=mudstone 6=wackestone 7=dolomite\n", + "## 8=packstone 9=bafflestone\n", + "facies_colors = ['#F4D03F', '#F5B041','#DC7633','#6E2C00',\n", + " '#1B4F72','#2E86C1', '#AED6F1', '#A569BD', '#196F3D']\n", + "\n", + "facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',\n", + " 'WS', 'D','PS', 'BS']\n", + "#facies_color_map is a dictionary that maps facies labels\n", + "#to their respective colors\n", + "\n", + "facies_color_map = {}\n", + "for ind, label in enumerate(facies_labels):\n", + " facies_color_map[label] = facies_colors[ind]\n", + "\n", + "def label_facies(row, labels):\n", + " return labels[ row['Facies'] -1]\n", + " \n", + "train.loc[:,'FaciesLabels'] = train.apply(lambda row: label_facies(row, facies_labels), axis=1)\n", + "##\n", + "#\n", + "def make_facies_log_plot(logs, facies_colors):\n", + " #make sure logs are sorted by depth\n", + " logs = logs.sort_values(by='Depth')\n", + " cmap_facies = colors.ListedColormap(\n", + " facies_colors[0:len(facies_colors)], 'indexed')\n", + " \n", + " ztop=logs.Depth.min(); zbot=logs.Depth.max()\n", + " \n", + " cluster=np.repeat(np.expand_dims(logs['Facies'].values,1), 100, 1)\n", + " \n", + " f, ax = plt.subplots(nrows=1, ncols=7, figsize=(10, 12))\n", + " ax[0].plot(logs.GR, logs.Depth, '-g')\n", + " ax[1].plot(logs.ILD_log10, logs.Depth, '-')\n", + " ax[2].plot(logs.DeltaPHI, logs.Depth, '-', color='0.5')\n", + " ax[3].plot(logs.PHIND, logs.Depth, '-', color='r')\n", + " ax[4].plot(logs.PE, logs.Depth, '-', color='black')\n", + " ax[5].plot(logs.NM_M, logs.Depth, '-', color='black')\n", + " im=ax[6].imshow(cluster, interpolation='none', aspect='auto',\n", + " cmap=cmap_facies,vmin=1,vmax=9)\n", + " \n", + " divider = make_axes_locatable(ax[5])\n", + " cax = divider.append_axes(\"right\", size=\"20%\", pad=0.05)\n", + " cbar=plt.colorbar(im, cax=cax)\n", + " cbar.set_label((17*' ').join([' SS ', 'CSiS', 'FSiS', \n", + " 'SiSh', ' MS ', ' WS ', ' D ', \n", + " ' PS ', ' BS ']))\n", + " cbar.set_ticks(range(0,1)); cbar.set_ticklabels('')\n", + " \n", + " for i in range(len(ax)-1):\n", + " ax[i].set_ylim(ztop,zbot)\n", + " ax[i].invert_yaxis()\n", + " ax[i].grid()\n", + " ax[i].locator_params(axis='x', nbins=3)\n", + " \n", + " ax[0].set_xlabel(\"GR\")\n", + " ax[0].set_xlim(logs.GR.min(),logs.GR.max())\n", + " ax[1].set_xlabel(\"ILD_log10\")\n", + " ax[1].set_xlim(logs.ILD_log10.min(),logs.ILD_log10.max())\n", + " ax[2].set_xlabel(\"DeltaPHI\")\n", + " ax[2].set_xlim(logs.DeltaPHI.min(),logs.DeltaPHI.max())\n", + " ax[3].set_xlabel(\"PHIND\")\n", + " ax[3].set_xlim(logs.PHIND.min(),logs.PHIND.max())\n", + " ax[4].set_xlabel(\"PE\")\n", + " ax[4].set_xlim(logs.PE.min(),logs.PE.max())\n", + " ax[5].set_xlabel('NoMarine/Marine')\n", + " ax[6].set_xlabel('Facies')\n", + " \n", + " ax[1].set_yticklabels([]); ax[2].set_yticklabels([]); ax[3].set_yticklabels([])\n", + " ax[4].set_yticklabels([]); ax[5].set_yticklabels([]); ax[6].set_yticklabels([])\n", + " ax[5].set_xticklabels([])\n", + " f.suptitle('Well: %s'%logs.iloc[0]['Well Name'], fontsize=14,y=0.94)\n", + "\n", + "#\n", + "##\n", + "##\n", + "#\n", + "#\n", + "#\n", + "### 03 Feature Engineering tests (SVC and XGB were used to test this)\n", + "## a. Fill in missing PE values : Median, mean, NN regressor\n", + "## b. Feature augmentaions\n", + "## c. Additional dummy features : Formation \n", + "## d. Featuere scaling\n", + "# Feature windows concatenation function\n", + "def augment_features_window(X, N_neig):\n", + " \n", + " # Parameters\n", + " N_row = X.shape[0]\n", + " N_feat = X.shape[1]\n", + "\n", + " # Zero padding\n", + " X = np.vstack((np.zeros((N_neig, N_feat)), X, (np.zeros((N_neig, N_feat)))))\n", + "\n", + " # Loop over windows\n", + " X_aug = np.zeros((N_row, N_feat*(2*N_neig+1)))\n", + " for r in np.arange(N_row)+N_neig:\n", + " this_row = []\n", + " for c in np.arange(-N_neig,N_neig+1):\n", + " this_row = np.hstack((this_row, X[r+c]))\n", + " X_aug[r-N_neig] = this_row\n", + "\n", + " return X_aug\n", + "\n", + "\n", + "# Feature gradient computation function\n", + "def augment_features_gradient(X, depth):\n", + " \n", + " # Compute features gradient\n", + " d_diff = np.diff(depth).reshape((-1, 1))\n", + " d_diff[d_diff==0] = 0.001\n", + " X_diff = np.diff(X, axis=0)\n", + " X_grad = X_diff / d_diff\n", + " \n", + " # Compensate for last missing value\n", + " X_grad = np.concatenate((X_grad, np.zeros((1, X_grad.shape[1]))))\n", + " \n", + " return X_grad\n", + "\n", + "\n", + "# Feature augmentation function\n", + "def augment_features(X, well, depth, N_neig=1):\n", + " \n", + " # Augment features\n", + " X_aug = np.zeros((X.shape[0], X.shape[1]*(N_neig*2+2)))\n", + " for w in np.unique(well):\n", + " w_idx = np.where(well == w)[0]\n", + " X_aug_win = augment_features_window(X[w_idx, :], N_neig)\n", + " X_aug_grad = augment_features_gradient(X[w_idx, :], depth[w_idx])\n", + " X_aug[w_idx, :] = np.concatenate((X_aug_win, X_aug_grad), axis=1)\n", + " \n", + " # Find padded rows\n", + " padded_rows = np.unique(np.where(X_aug[:, 0:7] == np.zeros((1, 7)))[0])\n", + " return X_aug, padded_rows\n", + "##\n", + "#### LA_Team Feature engineering\n", + "##train[\"PE\"] = train_raw[\"PE\"].fillna(train_raw[\"PE\"].median())\n", + "##X1 = train[features].values \n", + "##X_aug, padded_rows = augment_features(X1, well, depth,N_neig = 1)\n", + "##X_feat.update({\"X_aug\" : X_aug})\n", + "###\n", + "#\n", + "#\n", + "X_feat = {}\n", + "## Feature Engeering 1 : With dummy variable from Formation\n", + "## Create dummy variables for Well Name, Formation, which may have geologic or geospatial information\n", + "train_dummy = pd.get_dummies(train[[\"Formation\"]])\n", + "train_dummy.describe()\n", + "cols_dummy = train_dummy.columns.values\n", + "train[cols_dummy] = train_dummy[cols_dummy]\n", + "train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\"],axis =1)\n", + "X_fe1 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "X_feat.update({\"X_fe1\" : X_fe1})\n", + "#\n", + "## Feature Engeering 2 : With dummy variable from Formation and feature augmentation\n", + "train[\"PE\"] = train_raw[\"PE\"].fillna(train_raw[\"PE\"].median())\n", + "train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\"],axis =1)\n", + "X_fe1 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "X_fe1_aug, padded_rows = augment_features(X_fe1, well, depth,N_neig = 1)\n", + "X_feat.update({\"X_fe2\" : X_fe1_aug})\n", + "\n", + "\n", + "## Feature Engeering 3 : With dummy variable from Formation and feature augmentation\n", + "## Fill Nan PE with mean\n", + "train[\"PE\"] = train_raw[\"PE\"].fillna(train_raw[\"PE\"].mean())\n", + "train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\"],axis =1)\n", + "X_fe1 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "X_fe1_aug, padded_rows = augment_features(X_fe1, well, depth,N_neig = 1)\n", + "X_feat.update({\"X_fe3\" : X_fe1_aug})\n", + "\n", + "\n", + "### Feature Engeering 4c : Modified GR and PHIND With dummy variable from Formation and feature augmentation\n", + "### Fill Nan PE with MPRRegressor\n", + "train[\"GR\"] = train[\"GR\"].apply(lambda x : np.log(x))\n", + "train[\"PHIND\"] = train[\"PHIND\"].apply(lambda x : np.log(x))\n", + "from sklearn.neural_network import MLPRegressor\n", + "reg = MLPRegressor()\n", + "DataImpAll = train_raw.drop(['Formation', 'Well Name', 'Depth'], axis=1).copy()\n", + "DataImp = DataImpAll.dropna(axis = 0, inplace=False)\n", + "Ximp=DataImp.loc[:, DataImp.columns != 'PE']\n", + "Yimp=DataImp.loc[:, 'PE']\n", + "reg.fit(Ximp, Yimp)\n", + "train.loc[np.array(DataImpAll.PE.isnull()),\"PE\"] = reg.predict(DataImpAll.loc[DataImpAll.PE.isnull(),:].drop('PE',axis=1,inplace=False))\n", + "train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\"],axis =1)\n", + "X_fe1 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "X_fe1_aug, padded_rows = augment_features(X_fe1, well, depth,N_neig = 1)\n", + "X_feat.update({\"X_fe4\" : X_fe1_aug})\n", + "\n", + "\n", + "#### Feature Engeering 6 : Drop low-correlating feature RELPOS\n", + "#train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\",\"RELPOS\"],axis =1)\n", + "#X_fe6 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "#X_fe6_aug, padded_rows = augment_features(X_fe6, well, depth,N_neig = 1)\n", + "#X_feat.update({\"X_fe6\" : X_fe6_aug})\n", + "\n", + "### Feature Engeering 7 : Drop low-correlating feature RELPOS\n", + "#train[\"GR\"] = train[\"GR\"].apply(lambda x : np.log(x))\n", + "#train[\"PHIND\"] = train[\"PHIND\"].apply(lambda x : np.log(x))\n", + "#train_inp = train.drop([\"Formation\",\"Well Name\",'FaciesLabels',\"Depth\"],axis =1)\n", + "#X_fe7 = train_inp.drop([\"Facies\"],axis = 1).values\n", + "#X_fe7_aug, padded_rows = augment_features(X_fe6, well, depth,N_neig = 1)\n", + "#X_feat.update({\"X_fe7\" : X_fe7_aug})\n", + "\n", + "\n", + "## Select which feature engineering for next model test\n", + "# Feature enginering Selection \n", + "X_tr = X_feat[\"X_fe4\"]\n", + "y = train[\"Facies\"].values\n", + "## Feature Scaling\n", + "from sklearn import preprocessing\n", + "scaler = preprocessing.RobustScaler(quantile_range=(25.0, 75.0)).fit(X_tr)\n", + "X = scaler.transform(X_tr)\n", + "\n", + "## Removing Padded Rows\n", + "#X = np.delete(X,padded_rows,axis = 0)\n", + "#y = np.delete(y,padded_rows,axis = 0)\n", + "Well_no_pad = well #np.delete(well,padded_rows,axis=0)\n", + "\n", + "#\n", + "#\n", + "#\n", + "## Reading Test dataset and process the same way as trainning\n", + "test = pd.read_csv('../01_raw_data/validation_data_nofacies.csv')\n", + "## Test data Check\n", + "print(test.count()) # Make sure no missing data in test\n", + "print(\"No. of Formation in test is \" + str(len(test[\"Formation\"].unique())))\n", + "## Dummy formation\n", + "test_dummy = pd.get_dummies(test[[\"Formation\"]])\n", + "test_cols_dummy = test_dummy.columns.values\n", + "test[test_cols_dummy] = test_dummy[cols_dummy]\n", + "## Feature augmentaion\n", + "Well_test = test[\"Well Name\"].values\n", + "Depth_test = test[\"Depth\"].values\n", + "test[\"GR\"] = test[\"GR\"].apply(lambda x : np.log(x))\n", + "test[\"PHIND\"] = test[\"PHIND\"].apply(lambda x : np.log(x))\n", + "test_inp = test.drop([\"Formation\",\"Well Name\",\"Depth\"],axis =1)\n", + "test_fe = test_inp.values\n", + "test_aug,t_pad_row = augment_features(test_fe,Well_test,Depth_test)\n", + "## Scaling\n", + "X_test = scaler.transform(test_aug)\n", + "\n", + "\n", + "\n", + "\n", + "# Split Group\n", + "from sklearn.model_selection import LeavePGroupsOut\n", + "lpgo = LeavePGroupsOut(n_groups=2)\n", + "#split_no = lpgo.get_n_splits(X,y,wellgroups)\n", + "train_index=[]\n", + "val_index = []\n", + "for tr_i,val_i in lpgo.split(X, y, groups=Well_no_pad):\n", + " hist_tr = np.histogram(y[tr_i], bins=np.arange(len(facies_labels)+1)+0.5)\n", + " hist_val = np.histogram(y[val_i], bins=np.arange(len(facies_labels)+1)+0.5)\n", + " if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0): \n", + " train_index.append(tr_i)\n", + " val_index.append(val_i)\n", + "split_no = len(train_index)\n", + "##\n", + "from sklearn.multiclass import OneVsOneClassifier\n", + "import xgboost as xgb\n", + "from xgboost.sklearn import XGBClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import f1_score\n", + "from sklearn.svm import SVC\n", + "from scipy.signal import medfilt\n", + "\n", + "\n", + "#\n", + "#tuned_param = {'learning_rate': [0.1],\n", + "# 'n_estimators': [200],\n", + "# 'reg_alpha': [0.6,0.8,1.2],\n", + "# 'reg_lambda':[1],\n", + "# 'colsample_bytree': [0.8], \n", + "# 'gamma': [0.6],\n", + "# 'max_depth':[3],\n", + "# 'min_child_weight': [1],\n", + "# 'subsample': [0.7]}\n", + "\n", + "#param = {'alpha': 0.2,\n", + "# 'colsamplebytree': 0.8,\n", + "# 'gamma': 0.3,\n", + "# 'learningrate': 0.2,\n", + "# 'maxdepth': 5,\n", + "# 'minchildweight': 1,\n", + "# 'n_estimators': 200,\n", + "# 'subsample': 0.9}\n", + "##\n", + "#param = {'alpha': 0.2,\n", + "# 'colsamplebytree': 0.8,\n", + "# 'gamma': 0.3,\n", + "# 'learningrate': 0.05,\n", + "# 'maxdepth': 3,\n", + "# 'minchildweight': 1,\n", + "# 'n_estimators': 200,\n", + "# 'subsample': 0.7}\n", + "seed = np.random.randint(100)\n", + "clf = XGBClassifier(\n", + " learning_rate = 0.05,\n", + " n_estimators=400,\n", + " max_depth=3,\n", + " min_child_weight=1,\n", + " gamma = 0.6,\n", + " subsample= 0.7,\n", + " colsample_bytree=0.8,\n", + " reg_alpha = 0.8,\n", + " reg_lambda= 1,\n", + " nthread =-1,\n", + " seed = seed,\n", + " ) \n", + "#\n", + "#import datetime\n", + "\n", + "\n", + "#scores = ['precision', 'recall']\n", + "#clf = GridSearchCV(XGBClassifier(nthread=-1,seed = seed),\n", + "# tuned_param, cv=lpgo.split(X, y, groups=train['Well Name'].values),\n", + "# scoring='%s_micro' % scores[0],n_jobs = -1,verbose = 10)\n", + "#clf.fit(X,y)\n", + "#print(clf.best_score_)\n", + "#print(clf.best_params_)\n", + "#\n", + "#\n", + "svc_best = SVC(C = 10, gamma = 0.01, kernel = 'rbf')\n", + "#\n", + "#\n", + "f1 = np.zeros((split_no,1))\n", + "f1_svc = np.zeros((split_no,1))\n", + "for i in range(split_no):\n", + " split_train_no_pad = np.setdiff1d(train_index[i],padded_rows)\n", + "# print(len(train_index[i]),len(split_train_no_pad))\n", + " X_train = X[split_train_no_pad,:]\n", + " Y_train = y[split_train_no_pad]\n", + " X_val = X[val_index[i],:]\n", + " Y_val = y[val_index[i]]\n", + " print(i)\n", + " ### XGBOOST\n", + " clf.fit(X_train,Y_train)\n", + " y_pred = clf.predict(X_val)\n", + "# y_pred = medfilt(y_pred,kernel_size=5)\n", + " f1[i] = f1_score(Y_val, y_pred, average='micro') \n", + " \n", + " \n", + " svc_best.fit(X_train,Y_train)\n", + " Y_pred = svc_best.predict(X_val)\n", + "# Y_pred = medfilt(Y_pred,kernel_size=5)\n", + " f1_svc[i] = f1_score(Y_val, Y_pred, average='micro') \n", + " \n", + "print(\"XGBOOST score \" + str(np.mean(f1)))\n", + "print(\"SVC score\" + str(np.mean(f1_svc)))\n", + "\n", + "\n", + "#\n", + "# Predict for testing data\n", + "##Plot predicted labels\n", + "test_facies = clf.fit(X,y).predict(X_test)\n", + "test[\"Facies\"] = test_facies\n", + "test.to_csv(\"HoustonJ_sub3.csv\")\n", + "\n", + "make_facies_log_plot(\n", + " test[test['Well Name'] == 'STUART'],\n", + " facies_colors=facies_colors)\n", + "\n", + "make_facies_log_plot(\n", + " test[test['Well Name'] == 'CRAWFORD'],\n", + " facies_colors=facies_colors)\n", + "\n", + "\n", + "\n", + "#plt.figure(figsize=(6,4))\n", + "#sns.heatmap(train.corr(method='pearson', min_periods=1),cmap='seismic')\n", + "#plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Facies Classification :\n", + " \n", + "facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',\n", + " 'WS', 'D','PS', 'BS']\n", + "\n", + "Facies Description Label Adjacen facies\n", + "1 Nonmarine sandstone SS 2\n", + "2 Nonmarine coarse siltstone CSiS 1,3\n", + "3 Nonmarine fine siltstone FSiS 2\n", + "4 Marine siltstone and shale SiSh 5\n", + "5 Mudstone MS 4,6\n", + "6 Wackestone WS 5,7,8\n", + "7 Dolomite D 6,8\n", + "8 Packstone-grainstone PS 6,7,9\n", + "9 Phylloid-algal bafflestone BS 7,8\n", + "\n", + "Features : 'Facies', 'Formation', 'Well Name', 'Depth', 'GR', 'ILD_log10','DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS'\n", + "\n", + "Pre-processing 1: \n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in with median values \n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Robust normalization, dropping depth\n", + "\n", + "\n", + "Pre-processing 2: \n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in with median values \n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "Pre-processing 3: \n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in with mean values \n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "Pre-processing 4: \n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in NN Regressor\n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "Pre-processing 5: \n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in with mean values \n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "Pre-processing 6: Derive from 4, and drop the low-correlation feature RELPOS\n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in NN Regressor\n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "Pre-processing 6: Derive from 4, and Applied a log function to GR and PHIND\n", + "1. Well 'ALEXANDER D', 'KIMZEY A' has missing PE, filled in NN Regressor\n", + "2. Map \"Formation\", \"Well Name\" into dummy features ; Effective features increase from 8 to 21\n", + "3. Feature augmentation\n", + "4. Robust normalization, dropping depth\n", + "\n", + "\n", + "Model_selection_pre:\n", + "Split group by Well Name : 2 test 8 train\n", + "\n", + "Test 1 : The pre-processing 1\n", + "Radomly choose one slipt and run SVC vs XGBOOST\n", + "f1 score : SVC 0.486 < XGBOOST 0.535\n", + "Conclusion_pre : XGBOOST > SVC \n", + "\n", + "Feature Engineering Selection : \n", + "Test 2 : The pre-processing 1\n", + "XGBOOST for all splits : 0.535\n", + "Houmath Best : 0.563\n", + "\n", + "Test3 : The feature augmentation \n", + "XGBOOST score 0.552620109649\n", + "SVC score0.502307800369\n", + "\n", + "Test 4 : The feature augmentation N_neig = 2\n", + "XGBOOST score 0.544176923417\n", + "SVC score0.489872101252\n", + "\n", + "Test 5 : The pre-processing 2 \n", + "XGBOOST score 0.557558000862\n", + "SVC score0.499220019065\n", + "\n", + "Test 6 : The pre-processing 3 \n", + "XGBOOST score 0.557884804169\n", + "SVC score0.500650895029\n", + "\n", + "Test 7 : The pre-processing 3 y_pre = medfil size 5\n", + "XGBOOST score 0.559944170153\n", + "SVC score0.509190227257\n", + "\n", + "Test 8 : The pre-processing 4 y_pre = medfil size 5\n", + "XGBOOST score 0.566146182295\n", + "SVC score0.507362308656\n", + "\n", + "Test 9 : The pre-processing 5 Drop Formation dummy y_pre = medfil size 5\n", + "XGBOOST score 0.555870232144\n", + "SVC score0.509423764916\n", + "\n", + "Test 10 : The pre-processing 6 Drop RELPOS\n", + "XGBOOST score 0.559226819725\n", + "SVC score0.509379374599\n", + "\n", + "Test 11 : The pre-processing 7 Apply Log function to GR and PHIND\n", + "XGBOOST score 0.559652979267\n", + "SVC score0.512544201578\n", + "\n", + "Test 12 : The pre-processing 8 Drop RELPOS and Apply log function to GR and PHIND (Best)\n", + "XGBOOST score 0.562391525842\n", + "SVC score0.51057543881\n", + "\n", + "\n", + "Removing padded rows in cross-validation\n", + "XGBOOST score 0.559949591075\n", + "SVC score0.512978370131\n", + "\n", + "\n", + "Model Optimization : \n", + "Test 10 : The pre-processing 4 y_pre = medfil size 5\n", + "Grid Search for XGBOOST parameters\n", + "\n", + "Learning_rate 0.05, n_estimators : 200 ; Hope not overfitting\n", + "XGBOOST score 0.563599674886\n", + "SVC score0.510516447302\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}