6 changes: 6 additions & 0 deletions LR3/Dockerfile
@@ -0,0 +1,6 @@
FROM python:3.11-slim
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 8501
CMD ["streamlit", "run", "regres.py", "--server.port=8501", "--server.address=0.0.0.0"]
145 changes: 145 additions & 0 deletions LR3/regres.py
@@ -0,0 +1,145 @@
import io

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

st.title("Real Estate Price Prediction")

# Step 1: Data Loading
train_file = st.file_uploader("Upload the Training Dataset", type="csv")
test_file = st.file_uploader("Upload the Testing Dataset", type="csv")

if train_file is not None and test_file is not None:
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)

    st.write("Training Data Preview:")
    st.write(train_data.head())

    st.write("Testing Data Preview:")
    st.write(test_data.head())

    # Step 2: Data Cleaning and Structuring
    st.subheader("Data Cleaning and Structuring")

    # Drop any rows with missing values (this dataset typically has none)
    train_data.dropna(inplace=True)
    test_data.dropna(inplace=True)
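    # If missing values do appear, dropping rows shrinks the data; a hedged
    # alternative (assuming all feature columns are numeric) is median
    # imputation, with the medians computed on the training data only:
    #   train_data.fillna(train_data.median(numeric_only=True), inplace=True)
    #   test_data.fillna(train_data.median(numeric_only=True), inplace=True)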

    # Step 3: Exploratory Data Analysis (EDA)
    st.subheader("Exploratory Data Analysis")

    # Basic information: DataFrame.info() prints to stdout and returns None,
    # so capture its output in a buffer before displaying it
    st.write("Dataset Information:")
    buffer = io.StringIO()
    train_data.info(buf=buffer)
    st.text(buffer.getvalue())

    st.write("Descriptive Statistics:")
    st.write(train_data.describe())

    # Distribution of the Target Variable
    st.write("Distribution of House Prices per Unit Area:")
    plt.figure(figsize=(10, 6))
    sns.histplot(train_data['Y house price of unit area'], kde=True)
    plt.title('Distribution of House Prices per Unit Area')
    st.pyplot(plt.gcf())

    # Boxplot of the Target Variable
    st.write("Boxplot of House Prices per Unit Area:")
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=train_data['Y house price of unit area'])
    plt.title('Boxplot of House Prices per Unit Area')
    st.pyplot(plt.gcf())

    # Correlation Matrix (numeric_only guards against stray text columns)
    st.write("Correlation Matrix:")
    corr = train_data.corr(numeric_only=True)
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    st.pyplot(plt.gcf())

    # Pairplot to explore pairwise relationships; pairplot builds its own
    # figure, so pass that figure to st.pyplot rather than the current one
    st.write("Pairplot of Features:")
    pair_grid = sns.pairplot(train_data)
    st.pyplot(pair_grid.figure)

    # Distribution plots for each feature
    st.write("Distribution of Features:")
    features = ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station',
                'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
    for feature in features:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[feature], kde=True)
        plt.title(f'Distribution of {feature}')
        st.pyplot(plt.gcf())

    # Scatter plots for feature relationships with the target variable
    st.write("Scatter Plots of Features vs. Target Variable:")
    for feature in features:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=train_data[feature], y=train_data['Y house price of unit area'])
        plt.title(f'{feature} vs. House Price per Unit Area')
        st.pyplot(plt.gcf())

    # Step 4: Feature Engineering
    st.subheader("Feature Engineering")

    X_train = train_data[features]
    y_train = train_data['Y house price of unit area']

    X_test = test_data[features]
    y_test = test_data['Y house price of unit area']

    # Feature Scaling: fit on training data only, then apply to test data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
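    # Note: scaling does not change ordinary least-squares predictions, but it
    # keeps coefficients on a comparable scale; the fitted scaler is reused
    # below when transforming user input at prediction time.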

    # Step 5: Model Building
    st.subheader("Model Building")
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Step 6: Model Evaluation
    st.subheader("Model Evaluation")
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    st.write(f"Mean Squared Error: {mse:.2f}")
    st.write(f"R-squared: {r2:.3f}")

    # Display a comparison of the first few predictions vs. actual prices
    comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    st.write("Comparison of Actual and Predicted Prices:")
    st.write(comparison.head())

    # Step 7: Deployment - Input data and predict
    st.subheader("Predict Property Price")

    # Use float bounds so fractional values (e.g. a transaction date of
    # 2013.25 or a house age of 13.3) can be entered
    transaction_date = st.number_input("Transaction Date", min_value=0.0)
    house_age = st.number_input("House Age", min_value=0.0)
    distance_to_mrt = st.number_input("Distance to MRT Station", min_value=0.0)
    num_convenience_stores = st.number_input("Number of Convenience Stores", min_value=0, step=1)
    latitude = st.number_input("Latitude", format="%.6f")
    longitude = st.number_input("Longitude", format="%.6f")

    input_data = pd.DataFrame({
        'X1 transaction date': [transaction_date],
        'X2 house age': [house_age],
        'X3 distance to the nearest MRT station': [distance_to_mrt],
        'X4 number of convenience stores': [num_convenience_stores],
        'X5 latitude': [latitude],
        'X6 longitude': [longitude]
    })

    input_data = scaler.transform(input_data)
    prediction = model.predict(input_data)

    st.write(f"Predicted House Price per Unit Area: {prediction[0]:.2f}")
6 changes: 6 additions & 0 deletions LR3/requirements.txt
@@ -0,0 +1,6 @@
streamlit==1.24.0
pandas==2.0.3
numpy==1.25.1
matplotlib==3.8.0
seaborn==0.12.2
scikit-learn==1.3.0
6 changes: 6 additions & 0 deletions TS2/Dockerfile
@@ -0,0 +1,6 @@
FROM python:3.11-slim
WORKDIR /app
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
EXPOSE 8501
CMD ["streamlit", "run", "test2.py", "--server.port=8501", "--server.address=0.0.0.0"]
7 changes: 7 additions & 0 deletions TS2/requirements.txt
@@ -0,0 +1,7 @@
pandas==2.0.3
streamlit==1.24.0
seaborn==0.12.2
matplotlib==3.8.0
statsmodels==0.14.0
scikit-learn==1.3.0
prophet
205 changes: 205 additions & 0 deletions TS2/test2.py
@@ -0,0 +1,205 @@
import pandas as pd
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from prophet import Prophet

class TimeSeries:
    def __init__(self, train_df, stores_df, features_df):
        self.train_df = train_df
        self.stores_df = stores_df
        self.features_df = features_df
        self.merged_df = None

    def explore_data(self):
        """
        Perform exploratory data analysis (EDA) on the provided datasets.
        """
        st.subheader("Training Data Summary")
        st.write(self.train_df.describe())

        st.subheader("Stores Data Summary")
        st.write(self.stores_df.describe())

        st.subheader("Features Data Summary")
        st.write(self.features_df.describe())

        # Check for missing values
        st.subheader("Missing Values in Training Data")
        st.write(self.train_df.isnull().sum())

        st.subheader("Missing Values in Stores Data")
        st.write(self.stores_df.isnull().sum())

        st.subheader("Missing Values in Features Data")
        st.write(self.features_df.isnull().sum())

        # Correlation matrix (numeric columns only). A fresh figure is created
        # before each chart so the plots do not draw over one another.
        st.subheader("Correlation Matrix of Training Data")
        numeric_df = self.train_df.select_dtypes(include=['float64', 'int64'])
        corr_matrix = numeric_df.corr()
        plt.figure(figsize=(12, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
        st.pyplot(plt.gcf())

        # Distribution of sales
        st.subheader("Sales Distribution")
        plt.figure(figsize=(10, 6))
        sns.histplot(self.train_df['Weekly_Sales'], kde=True, color='blue', bins=30)
        st.pyplot(plt.gcf())

        # Sales distribution over time
        st.subheader("Sales Distribution Over Time")
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=self.train_df, x='Date', y='Weekly_Sales')
        st.pyplot(plt.gcf())

        # Boxplot to identify outliers
        st.subheader("Outliers in Weekly Sales")
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=self.train_df['Weekly_Sales'])
        st.pyplot(plt.gcf())

        # Pairplot for relationships between numerical features; pairplot
        # builds its own figure, so pass that figure to st.pyplot
        st.subheader("Pairplot of Numerical Features")
        pair_grid = sns.pairplot(numeric_df)
        st.pyplot(pair_grid.figure)



    def merge_data(self):
        """
        Merges the train DataFrame with the stores DataFrame and then with the
        features DataFrame on the 'Store' and 'Date' columns.
        """
        self.train_df.columns = self.train_df.columns.str.strip().str.title()
        self.stores_df.columns = self.stores_df.columns.str.strip().str.title()
        self.features_df.columns = self.features_df.columns.str.strip().str.title()

        self.train_df['Date'] = pd.to_datetime(self.train_df['Date'], errors='coerce')
        self.features_df['Date'] = pd.to_datetime(self.features_df['Date'], errors='coerce')

        merged_store = pd.merge(self.train_df, self.stores_df, on='Store', how='left')
        self.merged_df = pd.merge(merged_store, self.features_df, on=['Store', 'Date'], how='left')

        st.subheader("Merged DataFrame Head")
        st.dataframe(self.merged_df.head())
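        # A left join keeps every training row; rows with no matching
        # Store/Date pair in the features table will carry NaNs, which is
        # worth checking before modeling, e.g.:
        #   st.write(self.merged_df.isnull().sum())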

    def preprocess_for_prophet(self):
        """
        Prepares the data for Prophet modeling by summing weekly sales per
        date and renaming the columns to 'ds' and 'y'.
        """
        if self.merged_df is not None:
            df = self.merged_df.groupby('Date')['Weekly_Sales'].sum().reset_index()
            df.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'}, inplace=True)
            return df
        else:
            st.error("DataFrames are not merged yet. Please merge the DataFrames first.")
            return None
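        # Note: this collapses all stores into one aggregate series. Prophet
        # can also use per-date covariates via Prophet.add_regressor() if they
        # are kept in the frame; this sketch models the aggregate only.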

    def fit_predict_prophet(self, train_df, test_df):
        """
        Fits a Prophet model on the training data and makes predictions on
        the test data.
        """
        prophet_model = Prophet()
        prophet_model.fit(train_df)

        future = test_df[['ds']].copy()
        forecast = prophet_model.predict(future)

        # Calculate evaluation metrics
        y_true = test_df['y'].values
        y_pred = forecast['yhat'].values

        mse = mean_squared_error(y_true, y_pred)
        rmse = sqrt(mse)
        r2 = r2_score(y_true, y_pred)

        st.subheader("Prophet Model Evaluation")
        st.write(f"RMSE: {rmse:.2f}")
        st.write(f"MSE: {mse:.2f}")
        st.write(f"R²: {r2:.3f}")

        # Plot the predictions
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(test_df['ds'], y_true, label='Actual Sales', color='blue')
        ax.plot(test_df['ds'], y_pred, label='Forecasted Sales', color='orange')
        ax.set_title('Prophet Model - Actual vs Forecasted Sales')
        ax.set_xlabel('Date')
        ax.set_ylabel('Weekly Sales')
        ax.legend()
        st.pyplot(fig)
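        # Prophet can also decompose the forecast into trend and seasonality
        # panels if desired:
        #   components_fig = prophet_model.plot_components(forecast)
        #   st.pyplot(components_fig)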

    def fit_predict_sarima(self, train_df, test_df, order=(1, 1, 1), seasonal_order=(1, 1, 1, 52)):
        """
        Fits a SARIMA model on the training data and makes predictions on the
        test data. The default seasonal period of 52 assumes weekly data with
        yearly seasonality.
        """
        train_series = train_df.set_index('ds')['y']
        test_series = test_df.set_index('ds')['y']

        sarima_model = SARIMAX(train_series, order=order, seasonal_order=seasonal_order)
        sarima_result = sarima_model.fit(disp=False)

        # Forecast over the test horizon
        forecast = sarima_result.get_forecast(steps=len(test_series))
        y_pred = forecast.predicted_mean
        y_true = test_series

        # Calculate evaluation metrics
        mse = mean_squared_error(y_true, y_pred)
        rmse = sqrt(mse)
        r2 = r2_score(y_true, y_pred)

        st.subheader("SARIMA Model Evaluation")
        st.write(f"RMSE: {rmse:.2f}")
        st.write(f"MSE: {mse:.2f}")
        st.write(f"R²: {r2:.3f}")

        # Plot the predictions
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(test_series.index, y_true, label='Actual Sales', color='blue')
        ax.plot(test_series.index, y_pred, label='Forecasted Sales', color='orange')
        ax.set_title('SARIMA Model - Actual vs Forecasted Sales')
        ax.set_xlabel('Date')
        ax.set_ylabel('Weekly Sales')
        ax.legend()
        st.pyplot(fig)
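        # Note: a seasonal period of 52 makes SARIMAX fitting slow on long
        # series, and the (1, 1, 1)(1, 1, 1, 52) orders here are untuned
        # defaults; a standard next step would be selecting orders by AIC.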

def main():
    st.title("Time Series Analysis and Forecasting")

    # Upload data files
    train_file = st.file_uploader("Upload Training Data CSV", type="csv")
    stores_file = st.file_uploader("Upload Stores Data CSV", type="csv")
    features_file = st.file_uploader("Upload Features Data CSV", type="csv")

    if train_file and stores_file and features_file:
        # Read the uploaded files into pandas DataFrames
        train_df = pd.read_csv(train_file)
        stores_df = pd.read_csv(stores_file)
        features_df = pd.read_csv(features_file)

        # Initialize analysis object
        ts_analysis = TimeSeries(train_df, stores_df, features_df)

        # Perform Data Exploration
        ts_analysis.explore_data()

        # Merge and preprocess the data
        ts_analysis.merge_data()
        prophet_df = ts_analysis.preprocess_for_prophet()

        if prophet_df is not None:
            # Hold out the final 20% of the series (chronologically) as the
            # test set, so both models are evaluated on unseen data rather
            # than on the same data they were trained on
            split_idx = int(len(prophet_df) * 0.8)
            train_prophet_df = prophet_df.iloc[:split_idx]
            test_prophet_df = prophet_df.iloc[split_idx:]

            # Fit and predict with Prophet
            ts_analysis.fit_predict_prophet(train_prophet_df, test_prophet_df)

            # Fit and predict with SARIMA
            ts_analysis.fit_predict_sarima(train_prophet_df, test_prophet_df)

if __name__ == "__main__":
    main()