Skip to content

Prediction of Machine failure to improve the cost efficiency of a hypothetical factory.

Notifications You must be signed in to change notification settings

DuanBoomer/Machine-Failure-Prediction

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

3 Commits
 
 
 
 

Repository files navigation

1. Importing Modules

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

2. Loading Data

train_data = pd.read_csv('/kaggle/input/playground-series-s3e17/train.csv', index_col=0)
test_data = pd.read_csv('/kaggle/input/playground-series-s3e17/test.csv', index_col=0)
train_data.head()
Product ID Type Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min] Machine failure TWF HDF PWF OSF RNF
id
0 L50096 L 300.6 309.6 1596 36.1 140 0 0 0 0 0 0
1 M20343 M 302.6 312.1 1759 29.1 200 0 0 0 0 0 0
2 L49454 L 299.3 308.5 1805 26.5 25 0 0 0 0 0 0
3 L53355 L 301.0 310.9 1524 44.3 197 0 0 0 0 0 0
4 M24050 M 298.0 309.0 1641 35.4 34 0 0 0 0 0 0

3. Prepping Data

def preprocess_data(data):
    # Dropping some columns
    drop_cols = ['TWF','HDF','PWF','OSF','RNF', 'Product ID']
    data.drop(drop_cols, axis=1, inplace=True)
    
    # One-Hot-Encoding the Type column
    OH_encoder = OneHotEncoder(sparse_output=False)
    OH_type = OH_encoder.fit_transform(data.Type.array.reshape(-1, 1))
    cats = OH_encoder.categories_[0]
    OH_type = pd.DataFrame(OH_type, index=data.index, columns=cats)
    
    # Adding the new one-hot-encoded Type to the data
    new_data = pd.concat([data, OH_type], axis=1)
    new_data.drop('Type', inplace=True, axis=1)
    
    # Renaming the columns
    col_names = {
        'Air temperature [K]':'AirTemp', 
        'Process temperature [K]':'ProcessTemp', 
        'Rotational speed [rpm]':'RotationalSpeed', 
        'Torque [Nm]':'Torque', 
        'Tool wear [min]':'ToolWear'}
    new_data.rename(columns=col_names, inplace=True)
    
    return new_data
prepped_train_data = preprocess_data(train_data)
prepped_test_data = preprocess_data(test_data)
prepped_train_data
AirTemp ProcessTemp RotationalSpeed Torque ToolWear Machine failure H L M
id
0 300.6 309.6 1596 36.1 140 0 0.0 1.0 0.0
1 302.6 312.1 1759 29.1 200 0 0.0 0.0 1.0
2 299.3 308.5 1805 26.5 25 0 0.0 1.0 0.0
3 301.0 310.9 1524 44.3 197 0 0.0 1.0 0.0
4 298.0 309.0 1641 35.4 34 0 0.0 0.0 1.0
... ... ... ... ... ... ... ... ... ...
136424 300.1 311.4 1530 37.5 210 0 0.0 0.0 1.0
136425 297.5 308.5 1447 49.1 2 0 1.0 0.0 0.0
136426 300.5 311.8 1524 38.5 214 0 0.0 1.0 0.0
136427 301.7 310.9 1447 46.3 42 0 0.0 1.0 0.0
136428 296.9 308.1 1557 39.3 229 0 0.0 1.0 0.0

136429 rows × 9 columns

prepped_train_data.describe()
AirTemp ProcessTemp RotationalSpeed Torque ToolWear Machine failure H L M
count 136429.000000 136429.000000 136429.000000 136429.000000 136429.000000 136429.000000 136429.000000 136429.000000 136429.000000
mean 299.862776 309.941070 1520.331110 40.348643 104.408901 0.015744 0.065404 0.698928 0.235668
std 1.862247 1.385173 138.736632 8.502229 63.965040 0.124486 0.247238 0.458726 0.424417
min 295.300000 305.800000 1181.000000 3.800000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 298.300000 308.700000 1432.000000 34.600000 48.000000 0.000000 0.000000 0.000000 0.000000
50% 300.000000 310.000000 1493.000000 40.400000 106.000000 0.000000 0.000000 1.000000 0.000000
75% 301.200000 310.900000 1580.000000 46.100000 159.000000 0.000000 0.000000 1.000000 0.000000
max 304.400000 313.800000 2886.000000 76.600000 253.000000 1.000000 1.000000 1.000000 1.000000

6. Adding new features

prepped_train_data['TempDiffer'] = prepped_train_data['ProcessTemp'] - prepped_train_data['AirTemp']
prepped_test_data['TempDiffer'] = prepped_test_data['ProcessTemp'] - prepped_test_data['AirTemp']
prepped_train_data.head()
AirTemp ProcessTemp RotationalSpeed Torque ToolWear Machine failure H L M TempDiffer
id
0 300.6 309.6 1596 36.1 140 0 0.0 1.0 0.0 9.0
1 302.6 312.1 1759 29.1 200 0 0.0 0.0 1.0 9.5
2 299.3 308.5 1805 26.5 25 0 0.0 1.0 0.0 9.2
3 301.0 310.9 1524 44.3 197 0 0.0 1.0 0.0 9.9
4 298.0 309.0 1641 35.4 34 0 0.0 0.0 1.0 11.0

4. Spliting Data

X=prepped_train_data.drop(['Machine failure'], axis=1)
y=prepped_train_data['Machine failure']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=99)

5. Analysing Features

mi_scores= mutual_info_regression(X, y)
mi_scores_series = pd.Series(mi_scores, name='MI scores', index=X.columns)
mi_scores_series.sort_values(ascending=False)
Torque             0.021782
RotationalSpeed    0.015154
TempDiffer         0.007294
ToolWear           0.004929
AirTemp            0.004925
ProcessTemp        0.002126
M                  0.000677
H                  0.000000
L                  0.000000
Name: MI scores, dtype: float64
pca=PCA()
X_pca = pca.fit_transform(X_train)
X_pca_components = pd.DataFrame(pca.components_, index=X_train.columns)
X_pca_components
0 1 2 3 4 5 6 7 8
AirTemp 0.000243 0.000132 9.988607e-01 -4.765407e-02 2.520727e-03 0.000003 -0.000004 0.000002 -0.000111
ProcessTemp 0.000494 0.000248 -2.521373e-03 5.033723e-05 9.999966e-01 0.000015 -0.000055 0.000040 -0.000246
RotationalSpeed 0.004191 0.001422 4.765190e-02 9.988504e-01 6.676347e-05 -0.000072 -0.000005 0.000077 -0.002769
Torque -0.801315 -0.536329 5.243256e-04 4.835591e-03 5.951679e-04 0.000984 0.000778 -0.001762 0.264985
ToolWear 0.156622 -0.615604 -1.340616e-04 -1.915458e-03 -1.153781e-04 -0.001484 -0.008262 0.009746 -0.772226
H -0.003296 0.006797 -4.139783e-06 -1.716147e-05 -6.644867e-05 0.079459 -0.743423 0.663965 0.010093
L 0.001579 -0.001139 6.580431e-07 8.110767e-05 -1.228493e-05 0.812619 -0.337519 -0.475100 -0.002718
M -0.577350 0.577350 -0.000000e+00 -3.122502e-17 -1.110223e-16 -0.000727 -0.000727 -0.000727 -0.577350
TempDiffer 0.000727 -0.000727 -3.144186e-18 -5.825215e-17 2.168404e-19 -0.577350 -0.577350 -0.577350 0.000727
sns.heatmap(X_pca_components)
<Axes: >

png

sns.heatmap(prepped_train_data.corr())
<Axes: >

png

prepped_train_data.columns
Index(['AirTemp', 'ProcessTemp', 'RotationalSpeed', 'Torque', 'ToolWear',
       'Machine failure', 'H', 'L', 'M', 'TempDiffer'],
      dtype='object')
y.value_counts()
0    134281
1      2148
Name: Machine failure, dtype: int64
figure, axes = plt.subplots(5, 1, figsize=(10,20))
sns.histplot(x='AirTemp', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[0])
sns.histplot(x='ProcessTemp', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[1])
sns.histplot(x='RotationalSpeed', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[2])
sns.histplot(x='Torque', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[3])
sns.histplot(x='ToolWear', data=prepped_train_data, bins=30, hue='Machine failure', ax=axes[4])
<Axes: xlabel='ToolWear', ylabel='Count'>

png

sns.pairplot(prepped_train_data, vars=['AirTemp', 'ProcessTemp', 'RotationalSpeed', 'Torque', 'ToolWear'], hue='Machine failure')
<seaborn.axisgrid.PairGrid at 0x7b506a3bca30>

png

7. Model Training

model = XGBRegressor(n_estimators=3000, learning_rate=0.003)
model.fit(X_train, y_train, 
          early_stopping_rounds=5, 
          eval_set=[(X_valid, y_valid)], verbose=30)
[0]	validation_0-rmse:0.49856


/opt/conda/lib/python3.10/site-packages/xgboost/sklearn.py:835: UserWarning: `early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.
  warnings.warn(


[30]	validation_0-rmse:0.45755
[60]	validation_0-rmse:0.42024
[90]	validation_0-rmse:0.38633
[120]	validation_0-rmse:0.35554
[150]	validation_0-rmse:0.32760
[180]	validation_0-rmse:0.30230
[210]	validation_0-rmse:0.27942
[240]	validation_0-rmse:0.25876
[270]	validation_0-rmse:0.24014
[300]	validation_0-rmse:0.22334
[330]	validation_0-rmse:0.20827
[360]	validation_0-rmse:0.19479
[390]	validation_0-rmse:0.18276
[420]	validation_0-rmse:0.17205
[450]	validation_0-rmse:0.16257
[480]	validation_0-rmse:0.15420
[510]	validation_0-rmse:0.14684
[540]	validation_0-rmse:0.14041
[570]	validation_0-rmse:0.13480
[600]	validation_0-rmse:0.12991
[630]	validation_0-rmse:0.12567
[660]	validation_0-rmse:0.12204
[690]	validation_0-rmse:0.11891
[720]	validation_0-rmse:0.11623
[750]	validation_0-rmse:0.11395
[780]	validation_0-rmse:0.11200
[810]	validation_0-rmse:0.11034
[840]	validation_0-rmse:0.10892
[870]	validation_0-rmse:0.10773
[900]	validation_0-rmse:0.10672
[930]	validation_0-rmse:0.10584
[960]	validation_0-rmse:0.10509
[990]	validation_0-rmse:0.10447
[1020]	validation_0-rmse:0.10393
[1050]	validation_0-rmse:0.10345
[1080]	validation_0-rmse:0.10305
[1110]	validation_0-rmse:0.10273
[1140]	validation_0-rmse:0.10244
[1170]	validation_0-rmse:0.10221
[1200]	validation_0-rmse:0.10201
[1230]	validation_0-rmse:0.10184
[1260]	validation_0-rmse:0.10171
[1290]	validation_0-rmse:0.10157
[1320]	validation_0-rmse:0.10147
[1350]	validation_0-rmse:0.10136
[1380]	validation_0-rmse:0.10127
[1410]	validation_0-rmse:0.10120
[1440]	validation_0-rmse:0.10114
[1470]	validation_0-rmse:0.10109
[1500]	validation_0-rmse:0.10104
[1530]	validation_0-rmse:0.10101
[1560]	validation_0-rmse:0.10098
[1590]	validation_0-rmse:0.10095
[1620]	validation_0-rmse:0.10092
[1650]	validation_0-rmse:0.10089
[1680]	validation_0-rmse:0.10087
[1710]	validation_0-rmse:0.10085
[1740]	validation_0-rmse:0.10084
[1770]	validation_0-rmse:0.10084
[1800]	validation_0-rmse:0.10083
[1830]	validation_0-rmse:0.10082
[1860]	validation_0-rmse:0.10081
[1890]	validation_0-rmse:0.10080
[1920]	validation_0-rmse:0.10080
XGBRegressor(base_score=None, booster=None, callbacks=None,
         colsample_bylevel=None, colsample_bynode=None,
         colsample_bytree=None, early_stopping_rounds=None,
         enable_categorical=False, eval_metric=None, feature_types=None,
         gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
         interaction_constraints=None, learning_rate=0.003, max_bin=None,
         max_cat_threshold=None, max_cat_to_onehot=None,
         max_delta_step=None, max_depth=None, max_leaves=None,
         min_child_weight=None, missing=nan, monotone_constraints=None,
         n_estimators=3000, n_jobs=None, num_parallel_tree=None,
         predictor=None, random_state=None, ...)

8. Scoring and Submission

score = mean_absolute_error(y_valid, model.predict(X_valid))
score
0.022236713043428454
predictions = model.predict(prepped_test_data)
predictions = pd.DataFrame(predictions, index=prepped_test_data.index, columns=['Machine failure'])
from sklearn.metrics import roc_curve
roc = roc_curve(y_valid, model.predict(X_valid))
sns.lineplot(x=roc[0], y=roc[1]);
sns.lineplot(x=(0,1), y=(0,1));
plt.xlabel('False Positives');
plt.ylabel('True Positves');
plt.title('ROC curve');

png

# predictions['Machine failure'] = (predictions['Machine failure'] > 0.5).astype(int)
predictions.to_csv('submission.csv')

THE END
PyPI Downloads

About

Prediction of Machine failure to improve the cost efficiency of a hypothetical factory.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published