6 changes: 6 additions & 0 deletions LR3/Dockerfile
@@ -0,0 +1,6 @@
FROM python:3.11-slim
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 8501
CMD ["streamlit", "run", "regres.py", "--server.port=8501", "--server.address=0.0.0.0"]
145 changes: 145 additions & 0 deletions LR3/regres.py
@@ -0,0 +1,145 @@
import io

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

st.title("Real Estate Price Prediction")

# Step 1: Data Loading
train_file = st.file_uploader("Upload the Training Dataset", type="csv")
test_file = st.file_uploader("Upload the Testing Dataset", type="csv")

if train_file is not None and test_file is not None:
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    st.write("Training Data Preview:")
    st.write(train_data.head())

    st.write("Testing Data Preview:")
    st.write(test_data.head())

    # Step 2: Data Cleaning and Structuring
    st.subheader("Data Cleaning and Structuring")

    # Drop any rows with missing values (this dataset typically has none)
    train_data.dropna(inplace=True)
    test_data.dropna(inplace=True)
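    # If missing values do appear, dropping rows shrinks the data; a hedged
    # alternative (assuming all feature columns are numeric) is median
    # imputation, with the medians computed on the training data only:
    #   train_data.fillna(train_data.median(numeric_only=True), inplace=True)
    #   test_data.fillna(train_data.median(numeric_only=True), inplace=True)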

    # Step 3: Exploratory Data Analysis (EDA)
    st.subheader("Exploratory Data Analysis")

    # Basic information: DataFrame.info() prints to stdout and returns None,
    # so capture its output in a buffer before displaying it
    st.write("Dataset Information:")
    buffer = io.StringIO()
    train_data.info(buf=buffer)
    st.text(buffer.getvalue())

    st.write("Descriptive Statistics:")
    st.write(train_data.describe())

    # Distribution of the Target Variable
    st.write("Distribution of House Prices per Unit Area:")
    plt.figure(figsize=(10, 6))
    sns.histplot(train_data['Y house price of unit area'], kde=True)
    plt.title('Distribution of House Prices per Unit Area')
    st.pyplot(plt.gcf())

    # Boxplot of the Target Variable
    st.write("Boxplot of House Prices per Unit Area:")
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=train_data['Y house price of unit area'])
    plt.title('Boxplot of House Prices per Unit Area')
    st.pyplot(plt.gcf())

    # Correlation Matrix (numeric_only guards against stray text columns)
    st.write("Correlation Matrix:")
    corr = train_data.corr(numeric_only=True)
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    st.pyplot(plt.gcf())

    # Pairplot to explore pairwise relationships; pairplot builds its own
    # figure, so pass that figure to st.pyplot rather than the current one
    st.write("Pairplot of Features:")
    pair_grid = sns.pairplot(train_data)
    st.pyplot(pair_grid.figure)

    # Distribution plots for each feature
    st.write("Distribution of Features:")
    features = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
                'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
    for feature in features:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[feature], kde=True)
        plt.title(f'Distribution of {feature}')
        st.pyplot(plt.gcf())

    # Scatter plots for feature relationships with the target variable
    st.write("Scatter Plots of Features vs. Target Variable:")
    for feature in features:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=train_data[feature], y=train_data['Y house price of unit area'])
        plt.title(f'{feature} vs. House Price per Unit Area')
        st.pyplot(plt.gcf())

    # Step 4: Feature Engineering
    st.subheader("Feature Engineering")

    X_train = train_data[features]
    y_train = train_data['Y house price of unit area']

    X_test = test_data[features]
    y_test = test_data['Y house price of unit area']

    # Feature Scaling: fit on training data only, then apply to test data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
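    # Note: scaling does not change ordinary least-squares predictions, but it
    # keeps coefficients on a comparable scale; the fitted scaler is reused
    # below when transforming user input at prediction time.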

    # Step 5: Model Building
    st.subheader("Model Building")
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Step 6: Model Evaluation
    st.subheader("Model Evaluation")
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    st.write(f"Mean Squared Error: {mse:.2f}")
    st.write(f"R-squared: {r2:.3f}")

    # Display a comparison of the first few predictions vs. actual prices
    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    st.write("Comparison of Actual and Predicted Prices:")
    st.write(comparison.head())

    # Step 7: Deployment - Input data and predict
    st.subheader("Predict Property Price")

    # Use float bounds so fractional values (e.g. a transaction date of
    # 2013.25 or a house age of 13.3) can be entered
    transaction_date = st.number_input("Transaction Date", min_value=0.0)
    house_age = st.number_input("House Age", min_value=0.0)
    distance_to_mrt = st.number_input("Distance to MRT Station", min_value=0.0)
    num_convenience_stores = st.number_input("Number of Convenience Stores", min_value=0, step=1)
    latitude = st.number_input("Latitude", format="%.6f")
    longitude = st.number_input("Longitude", format="%.6f")

    input_data = pd.DataFrame({
        'X1 transaction date': [transaction_date],
        'X2 house age': [house_age],
        'X3 distance to the nearest MRT station': [distance_to_mrt],
        'X4 number of convenience stores': [num_convenience_stores],
        'X5 latitude': [latitude],
        'X6 longitude': [longitude]
    })

    input_data = scaler.transform(input_data)
    prediction = model.predict(input_data)

    st.write(f"Predicted House Price per Unit Area: {prediction[0]:.2f}")
6 changes: 6 additions & 0 deletions LR3/requirements.txt
@@ -0,0 +1,6 @@
streamlit==1.24.0
pandas==2.0.3
numpy==1.25.1
matplotlib==3.8.0
seaborn==0.12.2
scikit-learn==1.3.0
6 changes: 6 additions & 0 deletions TS2/Dockerfile
@@ -0,0 +1,6 @@
FROM python:3.11-slim
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 8501
CMD ["streamlit", "run", "test2.py", "--server.port=8501", "--server.address=0.0.0.0"]
7 changes: 7 additions & 0 deletions TS2/requirements.txt
@@ -0,0 +1,7 @@
pandas==2.0.3
streamlit==1.24.0
seaborn==0.12.2
matplotlib==3.8.0
statsmodels==0.14.0
scikit-learn==1.3.0
prophet
205 changes: 205 additions & 0 deletions TS2/test2.py
@@ -0,0 +1,205 @@
import pandas as pd
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from prophet import Prophet

class TimeSeries:
    def __init__(self, train_df, stores_df, features_df):
        self.train_df = train_df
        self.stores_df = stores_df
        self.features_df = features_df
        self.merged_df = None

    def explore_data(self):
        """
        Perform exploratory data analysis (EDA) on the provided datasets.
        """
        st.subheader("Training Data Summary")
        st.write(self.train_df.describe())

        st.subheader("Stores Data Summary")
        st.write(self.stores_df.describe())

        st.subheader("Features Data Summary")
        st.write(self.features_df.describe())

        # Check for missing values
        st.subheader("Missing Values in Training Data")
        st.write(self.train_df.isnull().sum())

        st.subheader("Missing Values in Stores Data")
        st.write(self.stores_df.isnull().sum())

        st.subheader("Missing Values in Features Data")
        st.write(self.features_df.isnull().sum())

        # Correlation matrix (numeric columns only). A fresh figure is created
        # before each chart so the plots do not draw over one another.
        st.subheader("Correlation Matrix of Training Data")
        numeric_df = self.train_df.select_dtypes(include=['float64', 'int64'])
        corr_matrix = numeric_df.corr()
        plt.figure(figsize=(12, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
        st.pyplot(plt.gcf())

        # Distribution of sales
        st.subheader("Sales Distribution")
        plt.figure(figsize=(10, 6))
        sns.histplot(self.train_df['Weekly_Sales'], kde=True, color='blue', bins=30)
        st.pyplot(plt.gcf())

        # Sales distribution over time
        st.subheader("Sales Distribution Over Time")
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=self.train_df, x='Date', y='Weekly_Sales')
        st.pyplot(plt.gcf())

        # Boxplot to identify outliers
        st.subheader("Outliers in Weekly Sales")
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=self.train_df['Weekly_Sales'])
        st.pyplot(plt.gcf())

        # Pairplot for relationships between numerical features; pairplot
        # builds its own figure, so pass that figure to st.pyplot
        st.subheader("Pairplot of Numerical Features")
        pair_grid = sns.pairplot(numeric_df)
        st.pyplot(pair_grid.figure)



    def merge_data(self):
        """
        Merges the train DataFrame with the stores DataFrame and then with the
        features DataFrame on the 'Store' and 'Date' columns.
        """
        self.train_df.columns = self.train_df.columns.str.strip().str.title()
        self.stores_df.columns = self.stores_df.columns.str.strip().str.title()
        self.features_df.columns = self.features_df.columns.str.strip().str.title()

        self.train_df['Date'] = pd.to_datetime(self.train_df['Date'], errors='coerce')
        self.features_df['Date'] = pd.to_datetime(self.features_df['Date'], errors='coerce')

        merged_store = pd.merge(self.train_df, self.stores_df, on='Store', how='left')
        self.merged_df = pd.merge(merged_store, self.features_df, on=['Store', 'Date'], how='left')

        st.subheader("Merged DataFrame Head")
        st.dataframe(self.merged_df.head())
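        # A left join keeps every training row; rows with no matching
        # Store/Date pair in the features table will carry NaNs, which is
        # worth checking before modeling, e.g.:
        #   st.write(self.merged_df.isnull().sum())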

    def preprocess_for_prophet(self):
        """
        Prepares the data for Prophet modeling by summing weekly sales per
        date and renaming the columns to 'ds' and 'y'.
        """
        if self.merged_df is not None:
            df = self.merged_df.groupby('Date')['Weekly_Sales'].sum().reset_index()
            df.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'}, inplace=True)
            return df
        else:
            st.error("DataFrames are not merged yet. Please merge the DataFrames first.")
            return None
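        # Note: this collapses all stores into one aggregate series. Prophet
        # can also use per-date covariates via Prophet.add_regressor() if they
        # are kept in the frame; this sketch models the aggregate only.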

    def fit_predict_prophet(self, train_df, test_df):
        """
        Fits a Prophet model on the training data and makes predictions on
        the test data.
        """
        prophet_model = Prophet()
        prophet_model.fit(train_df)

        future = test_df[['ds']].copy()
        forecast = prophet_model.predict(future)

        # Calculate evaluation metrics
        y_true = test_df['y'].values
        y_pred = forecast['yhat'].values

        mse = mean_squared_error(y_true, y_pred)
        rmse = sqrt(mse)
        r2 = r2_score(y_true, y_pred)

        st.subheader("Prophet Model Evaluation")
        st.write(f"RMSE: {rmse:.2f}")
        st.write(f"MSE: {mse:.2f}")
        st.write(f"R²: {r2:.3f}")

        # Plot the predictions
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(test_df['ds'], y_true, label='Actual Sales', color='blue')
        ax.plot(test_df['ds'], y_pred, label='Forecasted Sales', color='orange')
        ax.set_title('Prophet Model - Actual vs Forecasted Sales')
        ax.set_xlabel('Date')
        ax.set_ylabel('Weekly Sales')
        ax.legend()
        st.pyplot(fig)
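        # Prophet can also decompose the forecast into trend and seasonality
        # panels if desired:
        #   components_fig = prophet_model.plot_components(forecast)
        #   st.pyplot(components_fig)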

    def fit_predict_sarima(self, train_df, test_df, order=(1, 1, 1), seasonal_order=(1, 1, 1, 52)):
        """
        Fits a SARIMA model on the training data and makes predictions on the
        test data. The default seasonal period of 52 assumes weekly data with
        yearly seasonality.
        """
        train_series = train_df.set_index('ds')['y']
        test_series = test_df.set_index('ds')['y']

        sarima_model = SARIMAX(train_series, order=order, seasonal_order=seasonal_order)
        sarima_result = sarima_model.fit(disp=False)

        # Forecast over the test horizon
        forecast = sarima_result.get_forecast(steps=len(test_series))
        y_pred = forecast.predicted_mean
        y_true = test_series

        # Calculate evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        rmse = sqrt(mse)
        r2 = r2_score(y_true, y_pred)

        st.subheader("SARIMA Model Evaluation")
        st.write(f"RMSE: {rmse:.2f}")
        st.write(f"MSE: {mse:.2f}")
        st.write(f"R²: {r2:.3f}")

        # Plot the predictions
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(test_series.index, y_true, label='Actual Sales', color='blue')
        ax.plot(test_series.index, y_pred, label='Forecasted Sales', color='orange')
        ax.set_title('SARIMA Model - Actual vs Forecasted Sales')
        ax.set_xlabel('Date')
        ax.set_ylabel('Weekly Sales')
        ax.legend()
        st.pyplot(fig)
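        # Note: a seasonal period of 52 makes SARIMAX fitting slow on long
        # series, and the (1, 1, 1)(1, 1, 1, 52) orders here are untuned
        # defaults; a standard next step would be selecting orders by AIC.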

def main():
    st.title("Time Series Analysis and Forecasting")

    # Upload data files
    train_file = st.file_uploader("Upload Training Data CSV", type="csv")
    stores_file = st.file_uploader("Upload Stores Data CSV", type="csv")
    features_file = st.file_uploader("Upload Features Data CSV", type="csv")

    if train_file and stores_file and features_file:
        # Read the uploaded files into pandas DataFrames
        train_df = pd.read_csv(train_file)
        stores_df = pd.read_csv(stores_file)
        features_df = pd.read_csv(features_file)

        # Initialize analysis object
        ts_analysis = TimeSeries(train_df, stores_df, features_df)

        # Perform Data Exploration
        ts_analysis.explore_data()

        # Merge and preprocess the data
        ts_analysis.merge_data()
        prophet_df = ts_analysis.preprocess_for_prophet()

        if prophet_df is not None:
            # Hold out the final 20% of the series (chronologically) as the
            # test set, so both models are evaluated on unseen data rather
            # than on the same data they were trained on
            split_idx = int(len(prophet_df) * 0.8)
            train_prophet_df = prophet_df.iloc[:split_idx]
            test_prophet_df = prophet_df.iloc[split_idx:]

            # Fit and predict with Prophet
            ts_analysis.fit_predict_prophet(train_prophet_df, test_prophet_df)

            # Fit and predict with SARIMA
            ts_analysis.fit_predict_sarima(train_prophet_df, test_prophet_df)

if __name__ == "__main__":
    main()