diff --git a/LR3/Dockerfile b/LR3/Dockerfile
new file mode 100644
index 0000000..24315dd
--- /dev/null
+++ b/LR3/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.11-slim
+WORKDIR /app
+COPY . .
+RUN pip install -r requirements.txt
+EXPOSE 8501
+CMD ["streamlit", "run", "regres.py"]
diff --git a/LR3/regres.py b/LR3/regres.py
new file mode 100644
index 0000000..203c3da
--- /dev/null
+++ b/LR3/regres.py
@@ -0,0 +1,162 @@
+import io
+
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.preprocessing import StandardScaler
+
+st.title("Real Estate Price Prediction")
+
+# Step 1: Data Loading
+train_file = st.file_uploader("Upload the Training Dataset", type="csv")
+test_file = st.file_uploader("Upload the Testing Dataset", type="csv")
+
+if train_file is not None and test_file is not None:
+    train_data = pd.read_csv(train_file)
+    test_data = pd.read_csv(test_file)
+
+    st.write("Training Data Preview:")
+    st.write(train_data.head())
+
+    st.write("Testing Data Preview:")
+    st.write(test_data.head())
+
+    # Step 2: Data Cleaning and Structuring
+    st.subheader("Data Cleaning and Structuring")
+
+    # Drop rows with missing values; for messier data, consider imputation instead
+    train_data.dropna(inplace=True)
+    test_data.dropna(inplace=True)
+
+    # Step 3: Exploratory Data Analysis (EDA)
+    st.subheader("Exploratory Data Analysis")
+
+    # df.info() prints to a buffer and returns None, so capture its output
+    # instead of passing the return value to st.write()
+    st.write("Dataset Information:")
+    buffer = io.StringIO()
+    train_data.info(buf=buffer)
+    st.text(buffer.getvalue())
+
+    st.write("Descriptive Statistics:")
+    st.write(train_data.describe())
+
+    # Distribution of the target variable
+    st.write("Distribution of House Prices per Unit Area:")
+    fig = plt.figure(figsize=(10, 6))
+    sns.histplot(train_data['Y house price of unit area'], kde=True)
+    plt.title('Distribution of House Prices per Unit Area')
+    st.pyplot(fig)
+
+    # Boxplot of the target variable
+    st.write("Boxplot of House Prices per Unit Area:")
+    fig = plt.figure(figsize=(10, 6))
+    sns.boxplot(x=train_data['Y house price of unit area'])
+    plt.title('Boxplot of House Prices per Unit Area')
+    st.pyplot(fig)
+
+    # Correlation matrix
+    st.write("Correlation Matrix:")
+    corr = train_data.corr(numeric_only=True)
+    fig = plt.figure(figsize=(12, 8))
+    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
+    plt.title('Correlation Matrix')
+    st.pyplot(fig)
+
+    # Pairplot to explore pairwise relationships
+    st.write("Pairplot of Features:")
+    pairplot = sns.pairplot(train_data)
+    st.pyplot(pairplot.figure)
+
+    # Distribution plots for each feature
+    st.write("Distribution of Features:")
+    features = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
+                'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
+    for feature in features:
+        fig = plt.figure(figsize=(10, 6))
+        sns.histplot(train_data[feature], kde=True)
+        plt.title(f'Distribution of {feature}')
+        st.pyplot(fig)
+
+    # Scatter plots for feature relationships with the target variable
+    st.write("Scatter Plots of Features vs. Target Variable:")
+    for feature in features:
+        fig = plt.figure(figsize=(10, 6))
+        sns.scatterplot(x=train_data[feature], y=train_data['Y house price of unit area'])
+        plt.title(f'{feature} vs. House Price per Unit Area')
+        st.pyplot(fig)
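+
+    # Optional sketch (an assumption, not part of the original pipeline):
+    # unit prices are often right-skewed, so a log transform of the target can
+    # help a linear model. Shown commented out, for reference only.
+    # y_log = np.log1p(train_data['Y house price of unit area'])
+    # sns.histplot(y_log, kde=True)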
+
+    # Step 4: Feature Engineering
+    st.subheader("Feature Engineering")
+
+    X_train = train_data[features]
+    y_train = train_data['Y house price of unit area']
+
+    X_test = test_data[features]
+    y_test = test_data['Y house price of unit area']
+
+    # Feature scaling
+    scaler = StandardScaler()
+    X_train = scaler.fit_transform(X_train)
+    X_test = scaler.transform(X_test)
+
+    # Step 5: Model Building
+    st.subheader("Model Building")
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+
+    # Step 6: Model Evaluation
+    st.subheader("Model Evaluation")
+    y_pred = model.predict(X_test)
+
+    mse = mean_squared_error(y_test, y_pred)
+    r2 = r2_score(y_test, y_pred)
+
+    st.write(f"Mean Squared Error: {mse:.2f}")
+    st.write(f"R-squared: {r2:.3f}")
+
+    # Display a comparison of the first few predictions vs. actual prices
+    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
+    st.write("Comparison of Actual and Predicted Prices:")
+    st.write(comparison.head())
+
+    # Step 7: Deployment - input data and predict
+    st.subheader("Predict Property Price")
+
+    # Use float inputs: transaction dates (e.g. 2013.250) and distances are not integers
+    transaction_date = st.number_input("Transaction Date", min_value=0.0)
+    house_age = st.number_input("House Age", min_value=0.0)
+    distance_to_mrt = st.number_input("Distance to MRT Station", min_value=0.0)
+    num_convenience_stores = st.number_input("Number of Convenience Stores", min_value=0, step=1)
+    latitude = st.number_input("Latitude", format="%.6f")
+    longitude = st.number_input("Longitude", format="%.6f")
+
+    input_data = pd.DataFrame({
+        'X1 transaction date': [transaction_date],
+        'X2 house age': [house_age],
+        'X3 distance to the nearest MRT station': [distance_to_mrt],
+        'X4 number of convenience stores': [num_convenience_stores],
+        'X5 latitude': [latitude],
+        'X6 longitude': [longitude]
+    })
+
+    input_data = scaler.transform(input_data)
+    prediction = model.predict(input_data)
+
+    st.write(f"Predicted House Price per Unit Area: {prediction[0]:.2f}")
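+
+    # Optional sketch (an assumption, not in the original app): persist the
+    # fitted model and scaler with joblib (bundled with scikit-learn) so the
+    # app could reuse them without retraining.
+    # import joblib
+    # joblib.dump(model, 'model.joblib'); joblib.dump(scaler, 'scaler.joblib')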
diff --git a/LR3/requirements.txt b/LR3/requirements.txt
new file mode 100644
index 0000000..7c6378e
--- /dev/null
+++ b/LR3/requirements.txt
@@ -0,0 +1,6 @@
+streamlit==1.24.0
+pandas==2.0.3
+numpy==1.25.1
+matplotlib==3.8.0
+seaborn==0.12.2
+scikit-learn==1.3.0
diff --git a/TS2/Dockerfile b/TS2/Dockerfile
new file mode 100644
index 0000000..b14c7e0
--- /dev/null
+++ b/TS2/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.11-slim
+WORKDIR /app
+COPY . .
+RUN pip install -r requirements.txt
+EXPOSE 8501
+CMD ["streamlit", "run", "test2.py"]
diff --git a/TS2/requirements.txt b/TS2/requirements.txt
new file mode 100644
index 0000000..c42e2f8
--- /dev/null
+++ b/TS2/requirements.txt
@@ -0,0 +1,7 @@
+pandas==2.0.3
+streamlit==1.24.0
+seaborn==0.12.2
+matplotlib==3.8.0
+statsmodels==0.14.0
+scikit-learn==1.3.0
+prophet
diff --git a/TS2/test2.py b/TS2/test2.py
new file mode 100644
index 0000000..65ebe1e
--- /dev/null
+++ b/TS2/test2.py
@@ -0,0 +1,223 @@
+import pandas as pd
+import streamlit as st
+import seaborn as sns
+import matplotlib.pyplot as plt
+from statsmodels.tsa.statespace.sarimax import SARIMAX
+from sklearn.metrics import mean_squared_error, r2_score
+from math import sqrt
+from prophet import Prophet
+
+class TimeSeries:
+    def __init__(self, train_df, stores_df, features_df):
+        self.train_df = train_df
+        self.stores_df = stores_df
+        self.features_df = features_df
+        self.merged_df = None
+
+    def explore_data(self):
+        """
+        Perform exploratory data analysis (EDA) on the provided datasets.
+        """
+        st.subheader("Training Data Summary")
+        st.write(self.train_df.describe())
+
+        st.subheader("Stores Data Summary")
+        st.write(self.stores_df.describe())
+
+        st.subheader("Features Data Summary")
+        st.write(self.features_df.describe())
+
+        # Check for missing values
+        st.subheader("Missing Values in Training Data")
+        st.write(self.train_df.isnull().sum())
+
+        st.subheader("Missing Values in Stores Data")
+        st.write(self.stores_df.isnull().sum())
+
+        st.subheader("Missing Values in Features Data")
+        st.write(self.features_df.isnull().sum())
+
+        # Correlation matrix (numeric columns only); create a fresh figure so
+        # successive plots do not draw over one another
+        st.subheader("Correlation Matrix of Training Data")
+        numeric_df = self.train_df.select_dtypes(include=['float64', 'int64'])
+        corr_matrix = numeric_df.corr()
+        fig = plt.figure(figsize=(10, 8))
+        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
+        st.pyplot(fig)
+
+        # Distribution of sales
+        st.subheader("Sales Distribution")
+        fig = plt.figure(figsize=(10, 6))
+        sns.histplot(self.train_df['Weekly_Sales'], kde=True, color='blue', bins=30)
+        st.pyplot(fig)
+
+        # Sales over time (parse dates so the x-axis is ordered chronologically)
+        st.subheader("Sales Distribution Over Time")
+        ts = self.train_df.copy()
+        ts['Date'] = pd.to_datetime(ts['Date'], errors='coerce')
+        fig = plt.figure(figsize=(10, 6))
+        sns.lineplot(data=ts, x='Date', y='Weekly_Sales')
+        st.pyplot(fig)
+
+        # Boxplot to identify outliers
+        st.subheader("Outliers in Weekly Sales")
+        fig = plt.figure(figsize=(10, 6))
+        sns.boxplot(x=self.train_df['Weekly_Sales'])
+        st.pyplot(fig)
+
+        # Pairplot for relationships between numerical features
+        st.subheader("Pairplot of Numerical Features")
+        pairplot = sns.pairplot(numeric_df)
+        st.pyplot(pairplot.figure)
+
+    def merge_data(self):
+        """
+        Merges the train DataFrame with the stores DataFrame and then with the
+        features DataFrame on the 'Store' and 'Date' columns.
+        """
+        # Strip stray whitespace only; title-casing column names would mangle
+        # names such as 'IsHoliday'
+        self.train_df.columns = self.train_df.columns.str.strip()
+        self.stores_df.columns = self.stores_df.columns.str.strip()
+        self.features_df.columns = self.features_df.columns.str.strip()
+
+        self.train_df['Date'] = pd.to_datetime(self.train_df['Date'], errors='coerce')
+        self.features_df['Date'] = pd.to_datetime(self.features_df['Date'], errors='coerce')
+
+        merged_store = pd.merge(self.train_df, self.stores_df, on='Store', how='left')
+        self.merged_df = pd.merge(merged_store, self.features_df, on=['Store', 'Date'], how='left')
+
+        st.subheader("Merged DataFrame Head")
+        st.dataframe(self.merged_df.head())
+
+    def preprocess_for_prophet(self):
+        """
+        Prepares the data for Prophet modeling by renaming columns to 'ds' and 'y'.
+        """
+        if self.merged_df is not None:
+            df = self.merged_df.groupby('Date')['Weekly_Sales'].sum().reset_index()
+            df.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'}, inplace=True)
+            return df
+        else:
+            st.error("DataFrames are not merged yet. Please merge the DataFrames first.")
+            return None
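+
+    # Possible extension (an assumption, not in the original): Prophet can take
+    # additional regressors such as fuel price or CPI from the features file,
+    # registered before fitting:
+    # model = Prophet()
+    # model.add_regressor('Fuel_Price')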
+
+    def fit_predict_prophet(self, train_df, test_df):
+        """
+        Fits a Prophet model on the training data and makes predictions on the test data.
+        """
+        prophet_model = Prophet()
+        prophet_model.fit(train_df)
+
+        future = test_df[['ds']].copy()
+        forecast = prophet_model.predict(future)
+
+        # Calculate evaluation metrics
+        y_true = test_df['y'].values
+        y_pred = forecast['yhat'].values
+
+        mse = mean_squared_error(y_true, y_pred)
+        rmse = sqrt(mse)
+        r2 = r2_score(y_true, y_pred)
+
+        st.subheader("Prophet Model Evaluation")
+        st.write(f"RMSE: {rmse:.2f}")
+        st.write(f"MSE: {mse:.2f}")
+        st.write(f"R²: {r2:.3f}")
+
+        # Plot the predictions
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.plot(test_df['ds'], y_true, label='Actual Sales', color='blue')
+        ax.plot(test_df['ds'], y_pred, label='Forecasted Sales', color='orange')
+        ax.set_title('Prophet Model - Actual vs Forecasted Sales')
+        ax.set_xlabel('Date')
+        ax.set_ylabel('Weekly Sales')
+        ax.legend()
+        st.pyplot(fig)
+
+    def fit_predict_sarima(self, train_df, test_df, order=(1, 1, 1), seasonal_order=(1, 1, 1, 52)):
+        """
+        Fits a SARIMA model on the training data and makes predictions on the test data.
+        """
+        train_series = train_df.set_index('ds')['y']
+        test_series = test_df.set_index('ds')['y']
+
+        sarima_model = SARIMAX(train_series, order=order, seasonal_order=seasonal_order)
+        sarima_result = sarima_model.fit(disp=False)
+
+        # Forecast as many steps as there are test observations
+        forecast = sarima_result.get_forecast(steps=len(test_series))
+        y_pred = forecast.predicted_mean
+        y_true = test_series
+
+        # Calculate evaluation metrics
+        mse = mean_squared_error(y_true, y_pred)
+        rmse = sqrt(mse)
+        r2 = r2_score(y_true, y_pred)
+
+        st.subheader("SARIMA Model Evaluation")
+        st.write(f"RMSE: {rmse:.2f}")
+        st.write(f"MSE: {mse:.2f}")
+        st.write(f"R²: {r2:.3f}")
+
+        # Plot the predictions
+        fig, ax = plt.subplots(figsize=(10, 6))
+        ax.plot(test_series.index, y_true, label='Actual Sales', color='blue')
+        ax.plot(test_series.index, y_pred, label='Forecasted Sales', color='orange')
+        ax.set_title('SARIMA Model - Actual vs Forecasted Sales')
+        ax.set_xlabel('Date')
+        ax.set_ylabel('Weekly Sales')
+        ax.legend()
+        st.pyplot(fig)
+
+def main():
+    st.title("Time Series Analysis and Forecasting")
+
+    # Upload data files
+    train_file = st.file_uploader("Upload Training Data CSV", type="csv")
+    stores_file = st.file_uploader("Upload Stores Data CSV", type="csv")
+    features_file = st.file_uploader("Upload Features Data CSV", type="csv")
+
+    if train_file and stores_file and features_file:
+        # Read the uploaded files into pandas DataFrames
+        train_df = pd.read_csv(train_file)
+        stores_df = pd.read_csv(stores_file)
+        features_df = pd.read_csv(features_file)
+
+        # Initialize the analysis object
+        ts_analysis = TimeSeries(train_df, stores_df, features_df)
+
+        # Perform data exploration
+        ts_analysis.explore_data()
+
+        # Merge and aggregate the data into a single weekly series
+        ts_analysis.merge_data()
+        prophet_df = ts_analysis.preprocess_for_prophet()
+
+        if prophet_df is not None:
+            # Hold out the most recent 20% of the series as a chronological
+            # test set instead of evaluating on the training data itself
+            split_idx = int(len(prophet_df) * 0.8)
+            train_prophet_df = prophet_df.iloc[:split_idx]
+            test_prophet_df = prophet_df.iloc[split_idx:]
+
+            # Fit and predict with Prophet
+            ts_analysis.fit_predict_prophet(train_prophet_df, test_prophet_df)
+
+            # Fit and predict with SARIMA
+            ts_analysis.fit_predict_sarima(train_prophet_df, test_prophet_df)
+
+if __name__ == "__main__":
+    main()
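+
+# To run this app locally (assuming the pinned dependencies are installed):
+#   streamlit run test2.py
+# Or via the Dockerfile in this directory:
+#   docker build -t ts2 . && docker run -p 8501:8501 ts2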