import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
train_data = pd.read_csv('/kaggle/input/playground-series-s3e17/train.csv', index_col=0)
test_data = pd.read_csv('/kaggle/input/playground-series-s3e17/test.csv', index_col=0)
train_data.head()
Product ID | Type | Air temperature [K] | Process temperature [K] | Rotational speed [rpm] | Torque [Nm] | Tool wear [min] | Machine failure | TWF | HDF | PWF | OSF | RNF | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||
0 | L50096 | L | 300.6 | 309.6 | 1596 | 36.1 | 140 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | M20343 | M | 302.6 | 312.1 | 1759 | 29.1 | 200 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | L49454 | L | 299.3 | 308.5 | 1805 | 26.5 | 25 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | L53355 | L | 301.0 | 310.9 | 1524 | 44.3 | 197 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | M24050 | M | 298.0 | 309.0 | 1641 | 35.4 | 34 | 0 | 0 | 0 | 0 | 0 | 0 |
def preprocess_data(data):
# Dropping some columns
drop_cols = ['TWF','HDF','PWF','OSF','RNF', 'Product ID']
data.drop(drop_cols, axis=1, inplace=True)
# One-Hot-Encoding the Type column
OH_encoder = OneHotEncoder(sparse_output=False)
OH_type = OH_encoder.fit_transform(data.Type.array.reshape(-1, 1))
cats = OH_encoder.categories_[0]
OH_type = pd.DataFrame(OH_type, index=data.index, columns=cats)
# Adding the new one-hot-encoded Type to the data
new_data = pd.concat([data, OH_type], axis=1)
new_data.drop('Type', inplace=True, axis=1)
# Renaming the columns
col_names = {
'Air temperature [K]':'AirTemp',
'Process temperature [K]':'ProcessTemp',
'Rotational speed [rpm]':'RotationalSpeed',
'Torque [Nm]':'Torque',
'Tool wear [min]':'ToolWear'}
new_data.rename(columns=col_names, inplace=True)
return new_data
prepped_train_data = preprocess_data(train_data)
prepped_test_data = preprocess_data(test_data)
prepped_train_data
AirTemp | ProcessTemp | RotationalSpeed | Torque | ToolWear | Machine failure | H | L | M | |
---|---|---|---|---|---|---|---|---|---|
id | |||||||||
0 | 300.6 | 309.6 | 1596 | 36.1 | 140 | 0 | 0.0 | 1.0 | 0.0 |
1 | 302.6 | 312.1 | 1759 | 29.1 | 200 | 0 | 0.0 | 0.0 | 1.0 |
2 | 299.3 | 308.5 | 1805 | 26.5 | 25 | 0 | 0.0 | 1.0 | 0.0 |
3 | 301.0 | 310.9 | 1524 | 44.3 | 197 | 0 | 0.0 | 1.0 | 0.0 |
4 | 298.0 | 309.0 | 1641 | 35.4 | 34 | 0 | 0.0 | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
136424 | 300.1 | 311.4 | 1530 | 37.5 | 210 | 0 | 0.0 | 0.0 | 1.0 |
136425 | 297.5 | 308.5 | 1447 | 49.1 | 2 | 0 | 1.0 | 0.0 | 0.0 |
136426 | 300.5 | 311.8 | 1524 | 38.5 | 214 | 0 | 0.0 | 1.0 | 0.0 |
136427 | 301.7 | 310.9 | 1447 | 46.3 | 42 | 0 | 0.0 | 1.0 | 0.0 |
136428 | 296.9 | 308.1 | 1557 | 39.3 | 229 | 0 | 0.0 | 1.0 | 0.0 |
136429 rows × 9 columns
prepped_train_data.describe()
AirTemp | ProcessTemp | RotationalSpeed | Torque | ToolWear | Machine failure | H | L | M | |
---|---|---|---|---|---|---|---|---|---|
count | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 | 136429.000000 |
mean | 299.862776 | 309.941070 | 1520.331110 | 40.348643 | 104.408901 | 0.015744 | 0.065404 | 0.698928 | 0.235668 |
std | 1.862247 | 1.385173 | 138.736632 | 8.502229 | 63.965040 | 0.124486 | 0.247238 | 0.458726 | 0.424417 |
min | 295.300000 | 305.800000 | 1181.000000 | 3.800000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 298.300000 | 308.700000 | 1432.000000 | 34.600000 | 48.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 300.000000 | 310.000000 | 1493.000000 | 40.400000 | 106.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
75% | 301.200000 | 310.900000 | 1580.000000 | 46.100000 | 159.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
max | 304.400000 | 313.800000 | 2886.000000 | 76.600000 | 253.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
prepped_train_data['TempDiffer'] = prepped_train_data['ProcessTemp'] - prepped_train_data['AirTemp']
prepped_test_data['TempDiffer'] = prepped_test_data['ProcessTemp'] - prepped_test_data['AirTemp']
prepped_train_data.head()
AirTemp | ProcessTemp | RotationalSpeed | Torque | ToolWear | Machine failure | H | L | M | TempDiffer | |
---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||
0 | 300.6 | 309.6 | 1596 | 36.1 | 140 | 0 | 0.0 | 1.0 | 0.0 | 9.0 |
1 | 302.6 | 312.1 | 1759 | 29.1 | 200 | 0 | 0.0 | 0.0 | 1.0 | 9.5 |
2 | 299.3 | 308.5 | 1805 | 26.5 | 25 | 0 | 0.0 | 1.0 | 0.0 | 9.2 |
3 | 301.0 | 310.9 | 1524 | 44.3 | 197 | 0 | 0.0 | 1.0 | 0.0 | 9.9 |
4 | 298.0 | 309.0 | 1641 | 35.4 | 34 | 0 | 0.0 | 0.0 | 1.0 | 11.0 |
X=prepped_train_data.drop(['Machine failure'], axis=1)
y=prepped_train_data['Machine failure']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=99)
mi_scores= mutual_info_regression(X, y)
mi_scores_series = pd.Series(mi_scores, name='MI scores', index=X.columns)
mi_scores_series.sort_values(ascending=False)
Torque 0.021782
RotationalSpeed 0.015154
TempDiffer 0.007294
ToolWear 0.004929
AirTemp 0.004925
ProcessTemp 0.002126
M 0.000677
H 0.000000
L 0.000000
Name: MI scores, dtype: float64
pca=PCA()
X_pca = pca.fit_transform(X_train)
X_pca_components = pd.DataFrame(pca.components_, index=X_train.columns)
X_pca_components
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
AirTemp | 0.000243 | 0.000132 | 9.988607e-01 | -4.765407e-02 | 2.520727e-03 | 0.000003 | -0.000004 | 0.000002 | -0.000111 |
ProcessTemp | 0.000494 | 0.000248 | -2.521373e-03 | 5.033723e-05 | 9.999966e-01 | 0.000015 | -0.000055 | 0.000040 | -0.000246 |
RotationalSpeed | 0.004191 | 0.001422 | 4.765190e-02 | 9.988504e-01 | 6.676347e-05 | -0.000072 | -0.000005 | 0.000077 | -0.002769 |
Torque | -0.801315 | -0.536329 | 5.243256e-04 | 4.835591e-03 | 5.951679e-04 | 0.000984 | 0.000778 | -0.001762 | 0.264985 |
ToolWear | 0.156622 | -0.615604 | -1.340616e-04 | -1.915458e-03 | -1.153781e-04 | -0.001484 | -0.008262 | 0.009746 | -0.772226 |
H | -0.003296 | 0.006797 | -4.139783e-06 | -1.716147e-05 | -6.644867e-05 | 0.079459 | -0.743423 | 0.663965 | 0.010093 |
L | 0.001579 | -0.001139 | 6.580431e-07 | 8.110767e-05 | -1.228493e-05 | 0.812619 | -0.337519 | -0.475100 | -0.002718 |
M | -0.577350 | 0.577350 | -0.000000e+00 | -3.122502e-17 | -1.110223e-16 | -0.000727 | -0.000727 | -0.000727 | -0.577350 |
TempDiffer | 0.000727 | -0.000727 | -3.144186e-18 | -5.825215e-17 | 2.168404e-19 | -0.577350 | -0.577350 | -0.577350 | 0.000727 |
sns.heatmap(X_pca_components)
<Axes: >
sns.heatmap(prepped_train_data.corr())
<Axes: >
prepped_train_data.columns
Index(['AirTemp', 'ProcessTemp', 'RotationalSpeed', 'Torque', 'ToolWear',
'Machine failure', 'H', 'L', 'M', 'TempDiffer'],
dtype='object')
y.value_counts()
0 134281
1 2148
Name: Machine failure, dtype: int64
figure, axes = plt.subplots(5, 1, figsize=(10,20))
sns.histplot(x='AirTemp', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[0])
sns.histplot(x='ProcessTemp', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[1])
sns.histplot(x='RotationalSpeed', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[2])
sns.histplot(x='Torque', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[3])
sns.histplot(x='ToolWear', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[4])
<Axes: xlabel='ToolWear', ylabel='Count'>
sns.pairplot(prepped_train_data, vars=['AirTemp', 'ProcessTemp', 'RotationalSpeed', 'Torque', 'ToolWear'], hue='Machine failure')
<seaborn.axisgrid.PairGrid at 0x7b506a3bca30>
model = XGBRegressor(n_estimators=3000, learning_rate=0.003)
model.fit(X_train, y_train,
early_stopping_rounds=5,
eval_set=[(X_valid, y_valid)], verbose=30)
[0] validation_0-rmse:0.49856
/opt/conda/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.
warnings.warn(
[30] validation_0-rmse:0.45755
[60] validation_0-rmse:0.42024
[90] validation_0-rmse:0.38633
[120] validation_0-rmse:0.35554
[150] validation_0-rmse:0.32760
[180] validation_0-rmse:0.30230
[210] validation_0-rmse:0.27942
[240] validation_0-rmse:0.25876
[270] validation_0-rmse:0.24014
[300] validation_0-rmse:0.22334
[330] validation_0-rmse:0.20827
[360] validation_0-rmse:0.19479
[390] validation_0-rmse:0.18276
[420] validation_0-rmse:0.17205
[450] validation_0-rmse:0.16257
[480] validation_0-rmse:0.15420
[510] validation_0-rmse:0.14684
[540] validation_0-rmse:0.14041
[570] validation_0-rmse:0.13480
[600] validation_0-rmse:0.12991
[630] validation_0-rmse:0.12567
[660] validation_0-rmse:0.12204
[690] validation_0-rmse:0.11891
[720] validation_0-rmse:0.11623
[750] validation_0-rmse:0.11395
[780] validation_0-rmse:0.11200
[810] validation_0-rmse:0.11034
[840] validation_0-rmse:0.10892
[870] validation_0-rmse:0.10773
[900] validation_0-rmse:0.10672
[930] validation_0-rmse:0.10584
[960] validation_0-rmse:0.10509
[990] validation_0-rmse:0.10447
[1020] validation_0-rmse:0.10393
[1050] validation_0-rmse:0.10345
[1080] validation_0-rmse:0.10305
[1110] validation_0-rmse:0.10273
[1140] validation_0-rmse:0.10244
[1170] validation_0-rmse:0.10221
[1200] validation_0-rmse:0.10201
[1230] validation_0-rmse:0.10184
[1260] validation_0-rmse:0.10171
[1290] validation_0-rmse:0.10157
[1320] validation_0-rmse:0.10147
[1350] validation_0-rmse:0.10136
[1380] validation_0-rmse:0.10127
[1410] validation_0-rmse:0.10120
[1440] validation_0-rmse:0.10114
[1470] validation_0-rmse:0.10109
[1500] validation_0-rmse:0.10104
[1530] validation_0-rmse:0.10101
[1560] validation_0-rmse:0.10098
[1590] validation_0-rmse:0.10095
[1620] validation_0-rmse:0.10092
[1650] validation_0-rmse:0.10089
[1680] validation_0-rmse:0.10087
[1710] validation_0-rmse:0.10085
[1740] validation_0-rmse:0.10084
[1770] validation_0-rmse:0.10084
[1800] validation_0-rmse:0.10083
[1830] validation_0-rmse:0.10082
[1860] validation_0-rmse:0.10081
[1890] validation_0-rmse:0.10080
[1920] validation_0-rmse:0.10080
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.003, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=3000, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
score = mean_absolute_error(y_valid, model.predict(X_valid))
score
0.022236713043428454
predictions = model.predict(prepped_test_data)
predictions = pd.DataFrame(predictions, index=prepped_test_data.index, columns=['Machine failure'])
from sklearn.metrics import roc_curve
roc = roc_curve(y_valid, model.predict(X_valid))
sns.lineplot(x=roc[0], y=roc[1]);
sns.lineplot(x=(0,1), y=(0,1));
plt.xlabel('False Positives');
plt.ylabel('True Positves');
plt.title('ROC curve');
# predictions['Machine failure'] = (predictions['Machine failure'] > 0.5).astype(int)
predictions.to_csv('submission.csv')