diff --git a/LR3/Dockerfile b/LR3/Dockerfile
new file mode 100644
index 0000000..cffcaa9
--- /dev/null
+++ b/LR3/Dockerfile
@@ -0,0 +1,14 @@
+# slim (glibc) base: the scientific-Python wheels install cleanly here, while
+# Alpine (musl) typically forces slow source builds that need a full toolchain
+FROM python:3.12.2-slim
+
+WORKDIR /app
+
+COPY . .
+
+RUN pip install -r requirements.txt
+
+# Streamlit serves on port 8501 by default
+EXPOSE 8501
+
+CMD ["streamlit", "run", "regression.py"]
\ No newline at end of file
diff --git a/LR3/README.md b/LR3/README.md
new file mode 100644
index 0000000..3c0474b
--- /dev/null
+++ b/LR3/README.md
@@ -0,0 +1,7 @@
+Run the following commands to open the app:
+
+docker pull ziyadabd/regression
+
+docker run --name regression-ziyad -p 80:8501 ziyadabd/regression
+
+Open http://localhost:80 in a browser.
diff --git a/LR3/regression.py b/LR3/regression.py
new file mode 100644
index 0000000..976fe2d
--- /dev/null
+++ b/LR3/regression.py
@@ -0,0 +1,193 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LinearRegression, Ridge, Lasso
+from sklearn.metrics import mean_squared_error, r2_score
+
+st.title('Real Estate Price Prediction')
+
+st.markdown("""
+This app walks through five stages:
+
+1. **Data Cleaning and Structuring**
+2. **Exploratory Data Analysis (EDA)**
+3. **Model Building**
+4. **Model Evaluation**
+5. **Model Deployment**
+""")
+
+train_data = pd.read_csv('Train Real estate.csv')
+test_data = pd.read_csv('Test Real estate.csv')
+
+# Rename the target column in the test data so it is not mistaken for a feature
+if 'Y house price of unit area' in test_data.columns:
+    test_data.rename(columns={'Y house price of unit area': 'Predicted price'}, inplace=True)
+
+st.subheader('1. Data Cleaning and Structuring')
+
+# Display the first few rows of the datasets
+st.markdown("### Initial Dataset Overview")
+st.write('**Training Data:**')
+st.dataframe(train_data.head())
+st.write('**Test Data:**')
+st.dataframe(test_data.head())
+
+# Handling missing values
+st.markdown("### Handling Missing Values")
+st.write("Checking for missing values in the training data:")
+st.write(train_data.isnull().sum())
+st.write("Checking for missing values in the test data:")
+st.write(test_data.isnull().sum())
+
+# Filling missing values with the column mean; numeric_only avoids a TypeError on any non-numeric columns in pandas 2.x
+train_data.fillna(train_data.mean(numeric_only=True), inplace=True)
+test_data.fillna(test_data.mean(numeric_only=True), inplace=True)
+st.write("Missing values have been filled with the mean of each column.")
+
+# Checking for duplicates
+st.markdown("### Checking for Duplicates")
+st.write(f"Number of duplicate rows in training data: {train_data.duplicated().sum()}")
+st.write(f"Number of duplicate rows in test data: {test_data.duplicated().sum()}")
+
+# Removing duplicates
+train_data.drop_duplicates(inplace=True)
+test_data.drop_duplicates(inplace=True)
+st.write("Duplicates have been removed.")
+
+# Feature engineering (if necessary)
+st.markdown("### Feature Engineering")
+st.write("No additional feature engineering was necessary for this dataset at this stage.")
+
+# Dropping unneeded columns
+st.markdown("### Dropping Unneeded Columns")
+st.write("All columns in the provided datasets are relevant, so none were dropped.")
+
+# Splitting the data into features and target
+X_train = train_data.drop('Y house price of unit area', axis=1)
+y_train = train_data['Y house price of unit area']
+
+# Display the shape of the data
+st.write(f"**Shape of X_train (features):** {X_train.shape}")
+st.write(f"**Shape of y_train (target):** {y_train.shape}")
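+
+# Optional sanity check (a minimal sketch): confirm that the cleaning above
+# left no missing values and that every feature is numeric, since the scaler
+# and regression models below assume an all-numeric matrix.
+assert X_train.isnull().sum().sum() == 0, 'NaNs remain after imputation'
+assert all(np.issubdtype(dtype, np.number) for dtype in X_train.dtypes), \
+    'non-numeric feature column present'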
+
+# 2. Exploratory Data Analysis (EDA)
+st.subheader('2. Exploratory Data Analysis (EDA)')
+
+# Basic statistics
+st.markdown("### Basic Statistics of Features")
+st.write(X_train.describe())
+
+# Distribution of the target variable
+st.markdown("### Distribution of Target Variable (House Price per Unit Area)")
+plt.figure(figsize=(8, 4))
+sns.histplot(y_train, kde=True)
+st.pyplot(plt.gcf())  # Render the current matplotlib figure
+
+# Pairplot to see relationships between features
+st.markdown("### Pairplot to Explore Relationships Between Features")
+sns.pairplot(train_data)
+st.pyplot(plt.gcf())  # Render the current matplotlib figure
+
+# Correlation heatmap
+st.markdown("### Correlation Heatmap")
+plt.figure(figsize=(10, 6))
+sns.heatmap(train_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
+st.pyplot(plt.gcf())  # Render the current matplotlib figure
+
+# Discussing insights from the heatmap
+st.markdown("""
+- There is a strong negative correlation between house age (`X2 house age`) and house price per unit area (`Y house price of unit area`), suggesting that newer houses tend to command higher prices.
+- The distance to the nearest MRT station (`X3 distance to the nearest MRT station`) is also negatively correlated with the house price, indicating that properties closer to transit are generally more expensive.
+- The number of convenience stores (`X4 number of convenience stores`) shows a moderate positive correlation with house prices, suggesting that properties with more nearby amenities are valued higher.
+""")
+
+# 3. Model Building and Comparison
+st.subheader('3. Model Building and Comparison')
+
+# Hold out 20% of the training data for validation
+X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
+
+# Scaling the features
+st.markdown("### Scaling the Features")
+st.write("Feature scaling puts all features on a comparable scale; Ridge and Lasso in particular penalize coefficient size and are therefore sensitive to feature scale.")
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train_split)
+X_val_scaled = scaler.transform(X_val)
+
+# Initialize models
+models = {
+    "Linear Regression": LinearRegression(),
+    "Ridge Regression": Ridge(alpha=1.0),
+    "Lasso Regression": Lasso(alpha=0.1)
+}
+
+# Train models and evaluate them
+st.markdown("### Training and Evaluating Multiple Models")
+
+results = {}
+for model_name, model in models.items():
+    model.fit(X_train_scaled, y_train_split)
+    y_val_pred = model.predict(X_val_scaled)
+
+    mse = mean_squared_error(y_val, y_val_pred)
+    r2 = r2_score(y_val, y_val_pred)
+
+    results[model_name] = {"MSE": mse, "R2": r2}
+
+    st.write(f"**{model_name}:**")
+    st.write(f"Mean Squared Error (MSE): {mse:.2f}")
+    st.write(f"R-squared (R2): {r2:.2f}")
+    st.write("---")
+
+# Comparing models
+st.subheader('4. Model Comparison')
+
+# Results DataFrame
+results_df = pd.DataFrame(results).T
+st.write("### Comparison of Model Performance")
+st.dataframe(results_df)
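+
+# Optional cross-validation check (a minimal sketch; cv=5 is an assumption):
+# the single 80/20 split above can flatter or punish a model by chance, so a
+# pipeline + cross_val_score comparison is less split-dependent.
+from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import make_pipeline
+for model_name, model in models.items():
+    cv_pipe = make_pipeline(StandardScaler(), model)
+    cv_r2 = cross_val_score(cv_pipe, X_train, y_train, cv=5, scoring='r2')
+    st.write(f"{model_name}: 5-fold CV R2 = {cv_r2.mean():.2f} +/- {cv_r2.std():.2f}")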
+
+# 5. Conclusion and Model Deployment
+st.subheader('5. Conclusion and Model Deployment')
+
+# Best model selection
+best_model_name = results_df['R2'].idxmax()
+best_model = models[best_model_name]
+
+st.write(f"**The best performing model is: {best_model_name}** with an R-squared value of {results_df.loc[best_model_name, 'R2']:.2f}")
+
+# Drop the target column from the test dataset if it exists
+if 'Predicted price' in test_data.columns:
+    test_features = test_data.drop('Predicted price', axis=1)
+else:
+    test_features = test_data
+
+# Predicting on test data with the best model
+test_data_scaled = scaler.transform(test_features)
+test_predictions = best_model.predict(test_data_scaled)
+
+st.markdown("### Predictions on Test Data with Best Model")
+st.write(test_predictions)
+
+st.write("# PLEASE REFRESH THE PAGE IF SOME PLOTS DO NOT SHOW.")
diff --git a/LR3/requirements.txt b/LR3/requirements.txt
new file mode 100644
index 0000000..bb6a5c4
--- /dev/null
+++ b/LR3/requirements.txt
@@ -0,0 +1,7 @@
+streamlit==1.35.0
+pandas==2.2.2
+numpy==1.26.4
+seaborn==0.13.2
+matplotlib==3.9.0
+scikit-learn==1.5.1
+statsmodels==0.14.2
diff --git a/TS3/Dockerfile b/TS3/Dockerfile
new file mode 100644
index 0000000..2affb03
--- /dev/null
+++ b/TS3/Dockerfile
@@ -0,0 +1,14 @@
+# slim (glibc) base: prophet and pmdarima don't publish musl wheels, so on
+# Alpine pip would typically have to build them from source
+FROM python:3.12.2-slim
+
+WORKDIR /app
+
+COPY . .
+
+RUN pip install -r requirements.txt
+
+# Streamlit serves on port 8501 by default
+EXPOSE 8501
+
+CMD ["streamlit", "run", "time.py"]
\ No newline at end of file
diff --git a/TS3/README.md b/TS3/README.md
new file mode 100644
index 0000000..1f99246
--- /dev/null
+++ b/TS3/README.md
@@ -0,0 +1,11 @@
+Run the following commands to open the app:
+
+docker pull ziyadabd/timeseries
+
+docker run --name timeseries-ziyad -p 80:8501 ziyadabd/timeseries
+
+Open http://localhost:80 in a browser.
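+
+To build the image locally instead of pulling (assuming you are in TS3/):
+
+docker build -t ziyadabd/timeseries .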
diff --git a/TS3/requirements.txt b/TS3/requirements.txt
new file mode 100644
index 0000000..f84ad0f
--- /dev/null
+++ b/TS3/requirements.txt
@@ -0,0 +1,9 @@
+streamlit==1.35.0
+pandas==2.2.2
+numpy==1.26.4
+seaborn==0.13.2
+matplotlib==3.9.0
+scikit-learn==1.5.1
+statsmodels==0.14.2
+prophet==1.1.5
+pmdarima==2.0.4
diff --git a/TS3/time.py b/TS3/time.py
new file mode 100644
index 0000000..aad878e
--- /dev/null
+++ b/TS3/time.py
@@ -0,0 +1,330 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+import statsmodels.api as sm
+from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
+import pmdarima as pm
+from prophet import Prophet
+import warnings
+warnings.filterwarnings('ignore')
+
+# Set Streamlit page configuration
+st.set_page_config(page_title='Coffee Sales Time Series Analysis', layout='wide')
+
+# Title of the application
+st.title('Coffee Sales Time Series Analysis')
+
+# Load data from CSV files in the same directory as the .py file
+train_data = pd.read_csv('Train Coffee Sales.csv')
+test_data = pd.read_csv('Test Coffee Sales.csv')
+
+# Display Data Overview
+st.header('Data Overview')
+
+col1, col2 = st.columns(2)
+
+with col1:
+    st.subheader('Training Data')
+    st.write(train_data.head())
+    st.write('Training Data Shape:', train_data.shape)
+
+with col2:
+    st.subheader('Test Data')
+    st.write(test_data.head())
+    st.write('Test Data Shape:', test_data.shape)
+
+# Data Preprocessing
+st.header('Data Preprocessing')
+st.write('Cleaning and preprocessing the data...')
+
+# Drop unnecessary columns
+columns_to_drop = ['card', 'datetime', 'cash_type', 'coffee_name']
+train_data.drop(columns=columns_to_drop, inplace=True)
+test_data.drop(columns=columns_to_drop, inplace=True)
+
+st.write(f'Dropped columns: {columns_to_drop}')
+
+# Convert date columns to datetime format
+train_data['date'] = pd.to_datetime(train_data['date'], format='%Y-%m-%d', errors='coerce')
+test_data['date'] = pd.to_datetime(test_data['date'], format='%Y-%m-%d', errors='coerce')
+
+# Handle missing values
+st.write('Handling missing values...')
+train_data.dropna(inplace=True)
+test_data.dropna(inplace=True)
+
+st.write('Missing values handled. Here are the data shapes after cleaning:')
+col1, col2 = st.columns(2)
+with col1:
+    st.write('Training Data Shape:', train_data.shape)
+with col2:
+    st.write('Test Data Shape:', test_data.shape)
+
+# Feature Engineering
+st.header('Feature Engineering')
+st.write('Deriving new features from the data...')
+
+for df in [train_data, test_data]:
+    # Extracting time-based features
+    df['year'] = df['date'].dt.year
+    df['month'] = df['date'].dt.month
+    df['day'] = df['date'].dt.day
+    df['day_of_week'] = df['date'].dt.dayofweek
+
+    # Sorting by date to ensure proper lagging
+    df.sort_values('date', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+
+# Creating lag features on training data
+train_data['lag1'] = train_data['money'].shift(1)
+train_data['lag7'] = train_data['money'].shift(7)
+train_data['rolling_mean_7'] = train_data['money'].rolling(window=7).mean()
+
+# Drop rows with NaN values resulting from the lag features
+train_data.dropna(inplace=True)
+
+st.write('Feature engineering complete. Here are the first few rows of the training data:')
+st.write(train_data.head())
+
+# Exploratory Data Analysis
+st.header('Exploratory Data Analysis')
+
+# Sales Over Time
+st.subheader('Sales Over Time')
+fig, ax = plt.subplots(figsize=(10, 6))
+ax.plot(train_data['date'], train_data['money'], label='Sales')
+ax.set_title('Sales Over Time')
+ax.set_xlabel('Date')
+ax.set_ylabel('Sales (Money)')
+ax.legend()
+st.pyplot(fig)
+
+# Sales Distribution by Month
+st.subheader('Sales Distribution by Month')
+fig, ax = plt.subplots(figsize=(10, 6))
+sns.countplot(data=train_data, x='month', ax=ax)  # coffee_name was dropped, so plot month instead
+ax.set_title('Sales Distribution by Month')
+ax.set_xlabel('Month')
+ax.set_ylabel('Count')
+plt.xticks(rotation=45)
+st.pyplot(fig)
+
+# ACF and PACF Plots
+st.subheader('Autocorrelation and Partial Autocorrelation Plots')
+fig, ax = plt.subplots(2, 1, figsize=(12, 8))
+
+# ACF Plot
+plot_acf(train_data['money'], ax=ax[0], lags=40)
+ax[0].set_title('Autocorrelation Function (ACF)')
+
+# PACF Plot
+plot_pacf(train_data['money'], ax=ax[1], lags=40)
+ax[1].set_title('Partial Autocorrelation Function (PACF)')
+
+plt.tight_layout()
+st.pyplot(fig)
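+
+# Optional stationarity check (a minimal sketch): an Augmented Dickey-Fuller
+# test complements the ACF/PACF plots above with a formal statistic; adfuller
+# ships with the statsmodels dependency already in requirements.txt.
+from statsmodels.tsa.stattools import adfuller
+adf_stat, adf_p = adfuller(train_data['money'])[:2]
+st.write(f'ADF statistic: {adf_stat:.2f}, p-value: {adf_p:.3f} (p < 0.05 suggests a stationary series)')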
+
+# Decomposition of the time series
+st.subheader('Time Series Decomposition')
+decomposition = sm.tsa.seasonal_decompose(train_data['money'], model='additive', period=7)
+fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(10, 8))
+
+# Plot the observed data
+decomposition.observed.plot(ax=ax1)
+ax1.set_ylabel('Observed')
+
+# Plot the trend component
+decomposition.trend.plot(ax=ax2)
+ax2.set_ylabel('Trend')
+
+# Plot the seasonal component
+decomposition.seasonal.plot(ax=ax3)
+ax3.set_ylabel('Seasonal')
+
+# Plot the residual component
+decomposition.resid.plot(ax=ax4)
+ax4.set_ylabel('Residual')
+
+plt.tight_layout()
+st.pyplot(fig)
+
+# Model Building and Forecasting
+st.header('Model Building and Forecasting')
+
+# Model selection
+model_option = st.selectbox('Select a model to build:',
+                            ('Prophet', 'ARIMA', 'SARIMA'))
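+
+# Optional baseline (a minimal sketch): a last-value naive forecast gives
+# context for the model metrics reported below; a model worth deploying
+# should at least beat it.
+naive_forecast = np.repeat(train_data['money'].iloc[-1], len(test_data))
+st.write(f"Naive (last value) baseline MSE: {mean_squared_error(test_data['money'], naive_forecast):.2f}")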
+
+if model_option in ['ARIMA', 'SARIMA']:
+    st.write(f'Building {model_option} model...')
+
+    # Determine if seasonal components are needed
+    if model_option == 'SARIMA':
+        seasonal = True
+        m = st.number_input('Enter the seasonal period (m):', min_value=1, max_value=365, value=12)
+    else:
+        seasonal = False
+        m = 1  # Non-seasonal
+
+    # Automatically determine the best parameters using auto_arima
+    try:
+        with st.spinner('Fitting the model...'):
+            model = pm.auto_arima(
+                train_data['money'],
+                start_p=1, start_q=1,
+                max_p=5, max_q=5,
+                d=None,  # Let auto_arima determine 'd'
+                seasonal=seasonal,
+                m=m if seasonal else 1,
+                start_P=0, start_Q=0,
+                max_P=2, max_Q=2,
+                D=1 if seasonal else 0,  # Seasonal differencing
+                trace=True,
+                error_action='ignore',
+                suppress_warnings=True,
+                stepwise=True
+            )
+
+        st.write('Best Model Parameters:')
+        st.write(f'Order: {model.order}')
+        if seasonal:
+            st.write(f'Seasonal Order: {model.seasonal_order}')
+        st.write(f'AIC: {model.aic()}')
+
+        # Forecasting
+        st.write('Generating forecasts...')
+        n_periods = len(test_data)
+        forecast, conf_int = model.predict(n_periods=n_periods, return_conf_int=True)
+
+        # Assign forecasts to test_data
+        forecast_series = pd.Series(forecast, index=test_data.index)
+        test_data[f'forecast_{model_option.lower()}'] = forecast_series
+
+        # Forward-fill any gaps in the forecast (pandas 2.x deprecates fillna(method='ffill'))
+        test_data[f'forecast_{model_option.lower()}'] = test_data[f'forecast_{model_option.lower()}'].ffill()
+
+        st.write(f'{model_option} Forecast')
+        st.write(test_data[['date', 'money', f'forecast_{model_option.lower()}']].head())
+
+        # Evaluation
+        st.header('Evaluation Metrics')
+        mse = mean_squared_error(test_data['money'], test_data[f'forecast_{model_option.lower()}'])
+        mae = mean_absolute_error(test_data['money'], test_data[f'forecast_{model_option.lower()}'])
+        r2 = r2_score(test_data['money'], test_data[f'forecast_{model_option.lower()}'])
+
+        st.write(f'Mean Squared Error (MSE): {mse:.2f}')
+        st.write(f'Mean Absolute Error (MAE): {mae:.2f}')
+        st.write(f'R² Score: {r2:.2f}')
+
+        # Forecast Visualization (historical, actual, and forecasted series together)
+        st.subheader('Forecast Visualization')
+        fig, ax = plt.subplots(figsize=(10, 6))
+
+        # Plot training data
+        ax.plot(train_data['date'], train_data['money'], label='Historical Data (Train)', color='blue')
+
+        # Plot test data (actual)
+        ax.plot(test_data['date'], test_data['money'], label='Actual Data (Test)', color='green')
+
+        # Plot forecasted data
+        ax.plot(test_data['date'], test_data[f'forecast_{model_option.lower()}'], label='Forecasted Data', color='red')
+
+        ax.fill_between(
+            test_data['date'],
+            conf_int[:, 0],
+            conf_int[:, 1],
+            color='pink',
+            alpha=0.3,
+            label='Confidence Interval'
+        )
+
+        ax.set_title(f'Historical vs Forecasted Sales ({model_option})')
+        ax.set_xlabel('Date')
+        ax.set_ylabel('Sales (Money)')
+        ax.legend()
+        st.pyplot(fig)
+
+    except Exception as e:
+        st.error(f'An error occurred while building the {model_option} model: {e}')
+
+elif model_option == 'Prophet':
+    st.write('Building Prophet model...')
+
+    try:
+        with st.spinner('Fitting the Prophet model...'):
+            # Prepare data for Prophet
+            prophet_train = train_data[['date', 'money']].rename(columns={'date': 'ds', 'money': 'y'})
+
+            prophet_model = Prophet()
+            prophet_model.fit(prophet_train)
+
+            # Create future dataframe
+            future = prophet_model.make_future_dataframe(periods=len(test_data), freq='D')
+            forecast = prophet_model.predict(future)
+
+            # Extract forecast for the test period
+            forecast_test = forecast.tail(len(test_data)).set_index(test_data.index)
+
+        # Assign forecasts to test_data
+        test_data['forecast_prophet'] = forecast_test['yhat'].values
+
+        st.write('Prophet Forecast')
+        st.write(test_data[['date', 'money', 'forecast_prophet']].head())
+
+        # Evaluation
+        st.header('Evaluation Metrics')
+        mse = mean_squared_error(test_data['money'], test_data['forecast_prophet'])
+        mae = mean_absolute_error(test_data['money'], test_data['forecast_prophet'])
+        r2 = r2_score(test_data['money'], test_data['forecast_prophet'])
+
+        st.write(f'Mean Squared Error (MSE): {mse:.2f}')
+        st.write(f'Mean Absolute Error (MAE): {mae:.2f}')
+        st.write(f'R² Score: {r2:.2f}')
+
+        # Forecast Visualization (historical, actual, and forecasted series together)
+        st.subheader('Forecast Visualization')
+        fig, ax = plt.subplots(figsize=(10, 6))
+
+        # Plot training data
+        ax.plot(train_data['date'], train_data['money'], label='Historical Data (Train)', color='blue')
+
+        # Plot test data (actual)
+        ax.plot(test_data['date'], test_data['money'], label='Actual Data (Test)', color='green')
+
+        # Plot forecasted data
+        ax.plot(test_data['date'], test_data['forecast_prophet'], label='Forecasted Data', color='red')
+
+        ax.set_title('Historical vs Forecasted Sales (Prophet)')
+        ax.set_xlabel('Date')
+        ax.set_ylabel('Sales (Money)')
+        ax.legend()
+        st.pyplot(fig)
+
+    except Exception as e:
+        st.error(f'An error occurred while building the Prophet model: {e}')
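+
+# Optional Prophet diagnostics (a minimal sketch): plot_components decomposes
+# the Prophet forecast into trend and weekly/yearly seasonality; guarded so it
+# only runs when the Prophet branch above actually produced a forecast.
+if model_option == 'Prophet':
+    try:
+        st.subheader('Prophet Forecast Components')
+        st.pyplot(prophet_model.plot_components(forecast))
+    except NameError:
+        pass  # model fitting failed above, so there is nothing to plot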