import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#=================Project Overview=========================#
st.title("Real Estate Price Prediction")
st.write("""
### Project Overview
This project involves predicting real estate prices based on various features such as house age, distance to the nearest MRT station, and the number of nearby convenience stores.
We will perform data cleaning, feature engineering, scaling, and visualization to prepare the data for modeling.
""")

#=================1: Data Cleaning and Structuring=========================#
# Load the datasets.
# NOTE(review): hard-coded absolute Windows paths make the app non-portable;
# consider a config value or st.file_uploader instead.
train_data_path = 'C:/Users/71591/Desktop/dataset/Train Real estate.csv'
test_data_path = 'C:/Users/71591/Desktop/dataset/Test Real estate.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

# Show the raw frames so the reader can sanity-check the columns.
st.write("### Train Data Overview")
st.write(train_df.head())

st.write("### Test Data Overview")
st.write(test_df.head())


def convert_transaction_date(date):
    """Convert a fractional-year transaction date (e.g. 2013.250) to a Timestamp.

    The integer part is the calendar year; the fractional part appears to
    encode the month in twelfths of a year (0.0 -> January) -- TODO confirm
    this encoding against the dataset's documentation.

    FIX: the original used pd.to_datetime(date, format='%Y.%f'), which parses
    the fraction as *microseconds*, collapsing every row to January and making
    the derived 'transaction_month' feature constant.  The bare ``except:``
    also hid any parse failure; only conversion errors are caught now, and
    unparseable values yield NaT instead of a bogus date.
    """
    try:
        value = float(date)
    except (TypeError, ValueError):
        return pd.NaT
    year = int(value)
    # Map the fractional year onto a 1-12 month, clamped against rounding noise.
    month = min(int(round((value - year) * 12)) + 1, 12)
    return pd.Timestamp(year=year, month=month, day=1)


# Apply the conversion for train and test datasets.
train_df['transaction_date'] = train_df['X1 transaction date'].apply(convert_transaction_date)
test_df['transaction_date'] = test_df['X1 transaction date'].apply(convert_transaction_date)

# Extract year and month features from 'transaction_date'.
train_df['transaction_year'] = train_df['transaction_date'].dt.year
train_df['transaction_month'] = train_df['transaction_date'].dt.month

test_df['transaction_year'] = test_df['transaction_date'].dt.year
test_df['transaction_month'] = test_df['transaction_date'].dt.month

# Drop the raw column now that it is fully encoded.
train_df = train_df.drop(columns=['X1 transaction date'])
test_df = test_df.drop(columns=['X1 transaction date'])

st.write("### Processed Train Data with Date Features")
st.write(train_df.head())

st.write("### Processed Test Data with Date Features")
st.write(test_df.head())

#=================Scaling Numerical Features=============================#
# Standardize the continuous predictors (zero mean, unit variance).
scaler = StandardScaler()
features_to_scale = ['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores']
# Fit the scaler on the training data only, then reuse it on the test data so
# no information leaks from the test set into the transformation.
train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

st.write("### Scaled Train Data")
st.write(train_df.head())

st.write("### Scaled Test Data")
st.write(test_df.head())

#=================Data Visualization=========================#
st.write("### Data Visualizations")

# 1. Correlation Heatmap
st.write("#### Correlation Heatmap")
# FIX: restrict corr() to numeric columns -- the frame contains the
# datetime64 'transaction_date' column, which DataFrame.corr() cannot handle
# with pandas 2.x defaults (numeric_only=False).
corr_matrix = train_df.corr(numeric_only=True)
# FIX: draw on an explicit figure; st.pyplot(plt) relies on deprecated
# matplotlib global-figure state in Streamlit.
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax_corr)
st.pyplot(fig_corr)

# 2. Scatter plots of each predictor against the target.
st.write("#### Scatter Plots")
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

sns.scatterplot(x='X2 house age', y='Y house price of unit area', data=train_df, ax=ax[0])
ax[0].set_title('House Age vs House Price')

sns.scatterplot(x='X3 distance to the nearest MRT station', y='Y house price of unit area', data=train_df, ax=ax[1])
ax[1].set_title('Distance to MRT vs House Price')

sns.scatterplot(x='X4 number of convenience stores', y='Y house price of unit area', data=train_df, ax=ax[2])
ax[2].set_title('Number of Convenience Stores vs House Price')

st.pyplot(fig)

# 3. Histograms for Key Features
st.write("#### Histograms")
fig, ax = plt.subplots(1, 3, figsize=(18, 6))

sns.histplot(train_df['Y house price of unit area'], bins=20, kde=True, ax=ax[0])
ax[0].set_title('House Price Distribution')

sns.histplot(train_df['X2 house age'], bins=20, kde=True, ax=ax[1])
ax[1].set_title('House Age Distribution')

sns.histplot(train_df['X3 distance to the nearest MRT station'], bins=20, kde=True, ax=ax[2])
ax[2].set_title('Distance to MRT Distribution')

st.pyplot(fig)

# 4. Box plots of price by transaction year / month.
st.write("#### Box Plots")

fig, ax = plt.subplots(1, 2, figsize=(18, 6))
sns.boxplot(x='transaction_year', y='Y house price of unit area', data=train_df, ax=ax[0])
ax[0].set_title('House Price by Transaction Year')

sns.boxplot(x='transaction_month', y='Y house price of unit area', data=train_df, ax=ax[1])
ax[1].set_title('House Price by Transaction Month')

st.pyplot(fig)

#=================Model Building: Linear Regression=========================#
st.write("### Model Building: Linear Regression")

# One shared feature list keeps the train and test design matrices aligned.
feature_columns = ['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'transaction_year', 'transaction_month']
X_train = train_df[feature_columns]
y_train = train_df['Y house price of unit area']

X_test = test_df[feature_columns]
y_test = test_df['Y house price of unit area']

# Fit a plain OLS baseline on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Hold-out predictions consumed by the evaluation section below.
y_pred_lr = lr_model.predict(X_test)
#=================Model Evaluation=========================#
# Evaluation metrics for the Linear Regression baseline.
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
# FIX: mean_squared_error(..., squared=False) was deprecated in scikit-learn
# 1.4 and removed in 1.6; take the square root of the MSE instead.
rmse_lr = mse_lr ** 0.5
r2_lr = r2_score(y_test, y_pred_lr)

st.write(f"### Linear Regression Model Evaluation")
st.write(f"Mean Absolute Error (MAE): {mae_lr:.2f}")
st.write(f"Mean Squared Error (MSE): {mse_lr:.2f}")
st.write(f"Root Mean Squared Error (RMSE): {rmse_lr:.2f}")
st.write(f"R-squared (R2): {r2_lr:.2f}")


def _plot_actual_vs_predicted(y_true, y_predicted, title):
    """Render one actual-vs-predicted scatter with the y = x reference line."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_true, y_predicted, alpha=0.5)
    ax.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', linewidth=2)
    ax.set_title(title)
    ax.set_xlabel("Actual House Price")
    ax.set_ylabel("Predicted House Price")
    st.pyplot(fig)


_plot_actual_vs_predicted(y_test, y_pred_lr, "Actual vs Predicted - Linear Regression")

#=================Model Tuning: Ridge and Lasso=========================#
st.write("### Model Tuning: Ridge and Lasso Regression")

# L2- and L1-regularized variants of the linear baseline.
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)

ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

y_pred_ridge = ridge_model.predict(X_test)
y_pred_lasso = lasso_model.predict(X_test)

# Evaluate Ridge.
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
st.write(f"Ridge Model MAE: {mae_ridge:.2f}, R2: {r2_ridge:.2f}")
_plot_actual_vs_predicted(y_test, y_pred_ridge, "Actual vs Predicted - Ridge Regression")

# Evaluate Lasso.
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
st.write(f"Lasso Model MAE: {mae_lasso:.2f}, R2: {r2_lasso:.2f}")
_plot_actual_vs_predicted(y_test, y_pred_lasso, "Actual vs Predicted - Lasso Regression")

#=================Additional Models: Decision Tree, Random Forest, Gradient Boosting=========================#
st.write("### Additional Models: Decision Tree, Random Forest, Gradient Boosting")

# Fixed random_state keeps the tree-based results reproducible across reruns.
dt_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)

# MAE and R2 for each tree-based model.
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

st.write(f"Decision Tree MAE: {mae_dt:.2f}, R2: {r2_dt:.2f}")
st.write(f"Random Forest MAE: {mae_rf:.2f}, R2: {r2_rf:.2f}")
st.write(f"Gradient Boosting MAE: {mae_gb:.2f}, R2: {r2_gb:.2f}")

# One actual-vs-predicted panel per tree-based model; the red diagonal marks
# perfect predictions.
for model_label, model_predictions in (
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("Gradient Boosting", y_pred_gb),
):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, model_predictions, alpha=0.5)
    ax.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
    ax.set_title(f"Actual vs Predicted - {model_label} Regression")
    ax.set_xlabel("Actual House Price")
    ax.set_ylabel("Predicted House Price")
    st.pyplot(fig)

#=================Cross-Validation=========================#
st.write("### Cross-Validation with Random Forest")

# 5-fold CV on the training split; sklearn returns negated MAE, so flip sign.
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
st.write(f"Random Forest Cross-Validation MAE: {(-rf_cv_scores.mean()):.2f}")

# ===================== Time_series.py =====================
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

#============= Streamlit Title and Description ======================================

st.title("Time Series Analysis and Forecasting of Coffee Sales")
st.write("""
### Project Overview:
This analysis aims to study coffee sales data collected from vending machines.
The purpose is to uncover underlying patterns in the data and forecast future sales to assist in inventory and financial planning.
**SARIMA model** was chosen for its ability to handle seasonality in the time series data.

- **Data**: The dataset includes daily coffee sales with features such as payment type, coffee type, and transaction time.
- **Objective**: The primary goal is to predict future coffee sales to optimize inventory and stock management.
""")

#============= 1- Data Preprocessing ======================================

# NOTE(review): absolute Windows path -- non-portable.
file_path = r"C:\Users\71591\Desktop\dataset\Train Coffee Sales.csv"
df = pd.read_csv(file_path)

# Parse both date columns up front so time-based features can be derived later.
df['date'] = pd.to_datetime(df['date'])
df['datetime'] = pd.to_datetime(df['datetime'])

# Forward-fill gaps in the card identifier (presumably cash transactions have
# no card number -- verify against the raw data).
df['card'] = df['card'].ffill()

# Total revenue per coffee type and per payment method, attached back onto
# every transaction row as aggregate features (computed before encoding so the
# original categorical columns are still available as join keys).
coffee_totals = df.groupby('coffee_name')['money'].sum().reset_index()
payment_totals = df.groupby('cash_type')['money'].sum().reset_index()

df = df.merge(coffee_totals, on='coffee_name', how='left', suffixes=('', '_total_by_coffee'))
df = df.merge(payment_totals, on='cash_type', how='left', suffixes=('', '_total_by_payment'))

# One-hot encode the categorical columns for the modelling frame.
df_encoded = pd.get_dummies(df, columns=['cash_type', 'coffee_name'])
#============= Data Visualization ======================================

# FIX throughout this section: every chart previously called st.pyplot(plt),
# handing Streamlit the pyplot *module* (deprecated global-figure usage) and
# never releasing figures, which leaks memory across reruns.  Each chart now
# draws on an explicit figure and closes it after rendering.

# Time series of individual transaction amounts.
st.subheader('Time Series Plot: Coffee Sales Over Time')
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_encoded['datetime'], df_encoded['money'])
ax.set_title('Coffee Sales Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Sales (Money)')
st.pyplot(fig)
plt.close(fig)

# Revenue per coffee type.
st.subheader('Bar Plot: Total Sales by Coffee Type')
fig, ax = plt.subplots(figsize=(8, 5))
df.groupby('coffee_name')['money'].sum().plot(kind='bar', ax=ax)
ax.set_title('Total Sales by Coffee Type')
ax.set_xlabel('Coffee Type')
ax.set_ylabel('Total Sales')
ax.tick_params(axis='x', rotation=45)
st.pyplot(fig)
plt.close(fig)

# Revenue per payment method.
st.subheader('Bar Plot: Total Sales by Payment Method')
fig, ax = plt.subplots(figsize=(8, 5))
df.groupby('cash_type')['money'].sum().plot(kind='bar', ax=ax)
ax.set_title('Total Sales by Payment Method')
ax.set_xlabel('Payment Method')
ax.set_ylabel('Total Sales')
ax.tick_params(axis='x', rotation=45)
st.pyplot(fig)
plt.close(fig)

# Distribution of transaction amounts.
st.subheader('Distribution Plot: Coffee Sales')
fig, ax = plt.subplots(figsize=(8, 5))
df_encoded['money'].plot(kind='hist', bins=20, edgecolor='black', ax=ax)
ax.set_title('Distribution of Coffee Sales')
ax.set_xlabel('Sales Amount (Money)')
ax.set_ylabel('Frequency')
st.pyplot(fig)
plt.close(fig)

#============= 2- Feature Engineering =====================================

# Calendar features derived from the transaction timestamp.
df_encoded['hour'] = df_encoded['datetime'].dt.hour
df_encoded['day_of_week'] = df_encoded['datetime'].dt.dayofweek
df_encoded['month'] = df_encoded['datetime'].dt.month
df_encoded['week_of_year'] = df_encoded['datetime'].dt.isocalendar().week

# Previous-transaction amount and 7-observation rolling mean.
# NOTE(review): these roll over *rows/transactions*, not calendar days --
# confirm that is the intended granularity.
df_encoded['lag_1'] = df_encoded['money'].shift(1)
df_encoded['rolling_mean_7'] = df_encoded['money'].rolling(window=7).mean()

# shift/rolling leave NaNs in the first rows; drop them.
df_encoded.dropna(inplace=True)

#============= Decomposition Plot ======================================
st.subheader("Time Series Decomposition")
st.write("""
To understand the components of our coffee sales data, we decompose the time series into **trend**, **seasonality**, and **residual** components.
This allows us to observe the underlying patterns that are influencing sales performance over time.
""")

# Additive decomposition with a 7-observation period.
# NOTE(review): the series is per-transaction, so period=7 means seven
# observations rather than seven days -- confirm this matches the intent.
decomposition = seasonal_decompose(df_encoded['money'], model='additive', period=7)

# Stack the four components vertically, one axis each.
fig, axes = plt.subplots(4, 1, figsize=(10, 8))
components = (
    (decomposition.observed, 'Observed'),
    (decomposition.trend, 'Trend'),
    (decomposition.seasonal, 'Seasonality'),
    (decomposition.resid, 'Residuals'),
)
for axis, (component, label) in zip(axes, components):
    component.plot(ax=axis, title=label, legend=False)

plt.tight_layout()
st.pyplot(fig)

#============= ACF and PACF Plots ======================================

st.subheader("ACF and PACF Plots")
st.write("""
The **ACF (Autocorrelation Function)** and **PACF (Partial Autocorrelation Function)** plots are used to identify the presence of any autoregressive or moving average components in the time series.
These plots help in selecting the appropriate lags for our SARIMA model.
""")
# Autocorrelation and partial autocorrelation over the first 40 lags.
fig_acf, ax_acf = plt.subplots(1, 1, figsize=(10, 4))
plot_acf(df_encoded['money'], lags=40, ax=ax_acf)
st.pyplot(fig_acf)

fig_pacf, ax_pacf = plt.subplots(1, 1, figsize=(10, 4))
plot_pacf(df_encoded['money'], lags=40, ax=ax_pacf)
st.pyplot(fig_pacf)

#============== 3- Model Building and Evaluation ============================

st.subheader("SARIMA Model Building and Evaluation")

# Training series comes from the preprocessed/encoded frame above.
df_train = df_encoded.copy()

# NOTE(review): absolute Windows path -- non-portable.
df_test = pd.read_csv(r"C:\Users\71591\Desktop\dataset\Test Coffee Sales.csv")

# Index the test data by timestamp for alignment and plotting.
df_test['datetime'] = pd.to_datetime(df_test['datetime'])
df_test.set_index('datetime', inplace=True)

# Target series for fitting and hold-out evaluation.
y_train = df_train['money']
y_test = df_test['money']

# Fit a SARIMA(1,2,1)x(1,1,1,7) model on the training series.
# NOTE(review): d=2 combined with seasonal D=1 is heavy differencing for
# sales data -- worth validating against AIC / residual diagnostics.
model = SARIMAX(y_train, order=(1, 2, 1), seasonal_order=(1, 1, 1, 7))
sarima_fit = model.fit()

st.write(sarima_fit.summary())

# Out-of-sample predictions spanning exactly the test horizon.
y_pred = sarima_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1, dynamic=False)

# Hold-out accuracy metrics.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

st.write(f"Mean Absolute Error: {mae:.2f}")
st.write(f"Root Mean Squared Error: {rmse:.2f}")
st.write(f"R Squared Error: {r2:.2f}")

# Reset the indexes so train and test both expose 'datetime' as a column for
# the combined plot below.
df_train = df_train.reset_index()
df_test = df_test.reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
# Overlay training data, held-out actuals, and model predictions.
ax.plot(df_train['datetime'], y_train, label='Training Data', color='blue')
ax.plot(df_test['datetime'], y_test, label='Test Data', color='green')
ax.plot(df_test['datetime'], y_pred, label='Predictions', color='red')

# Slanted date labels keep the x-axis readable.
plt.gcf().autofmt_xdate()
ax.set_title('SARIMA Model Predictions vs Actual Sales')
ax.set_xlabel('Date')
ax.set_ylabel('Sales (Money)')
ax.legend()
st.pyplot(fig)

#=======================4-Forecasting========================#
st.subheader("Forecasting Future Coffee Sales")

n_steps = 30  # forecast horizon in steps
forecast = sarima_fit.get_forecast(steps=n_steps)

forecasted_values = forecast.predicted_mean
confidence_intervals = forecast.conf_int()

# NOTE(review): get_forecast() continues from the end of the *training*
# sample, but these labels start the day after the test period -- the
# forecast values and the date labels may be misaligned; confirm before
# relying on this chart.
last_date = df_test['datetime'].max()
forecast_dates = pd.date_range(last_date, periods=n_steps + 1, freq='D')[1:]

# Forecast line with its shaded confidence band.
fig_forecast, ax_forecast = plt.subplots(figsize=(10, 5))
ax_forecast.plot(forecast_dates, forecasted_values, label='Forecasted Sales', color='orange')
ax_forecast.fill_between(forecast_dates, confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1], color='orange', alpha=0.3)

plt.gcf().autofmt_xdate()
ax_forecast.set_title('Forecasted Coffee Sales for Next 30 Days')
ax_forecast.set_xlabel('Date')
ax_forecast.set_ylabel('Sales (Money)')
ax_forecast.legend()
st.pyplot(fig_forecast)

st.write(forecasted_values)