diff --git a/Evaluation/Dockerfile b/Evaluation/Dockerfile new file mode 100644 index 0000000..076bb8b --- /dev/null +++ b/Evaluation/Dockerfile @@ -0,0 +1,17 @@ +# Use an official Python runtime as a parent image +FROM python:3.9-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy the current directory contents into the container at /app +COPY . /app + +# Install any needed packages specified in requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Expose the port Streamlit runs on +EXPOSE 8501 + +# Command to run the Streamlit app +CMD ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] diff --git a/Evaluation/LR3 datasets/Test Real estate.csv b/Evaluation/LR3 datasets/Test Real estate.csv new file mode 100644 index 0000000..b2c7bff --- /dev/null +++ b/Evaluation/LR3 datasets/Test Real estate.csv @@ -0,0 +1,43 @@ +No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area +359,2013.167,1.1,193.5845,6,24.96571,121.54089,45.1 +351,2013.0,13.2,492.2313,5,24.96515,121.53737,42.3 +374,2013.083,0.0,274.0144,1,24.9748,121.53059,52.2 +400,2012.917,12.7,170.1289,1,24.97371,121.52984,37.3 +370,2012.667,20.2,2185.128,3,24.96322,121.51237,22.8 +73,2013.583,32.5,424.5442,8,24.97587,121.53913,36.3 +263,2012.917,15.9,289.3248,5,24.98203,121.54348,53.0 +141,2013.25,16.2,289.3248,5,24.98203,121.54348,51.4 +94,2012.917,31.9,1146.329,0,24.9492,121.53076,16.1 +71,2013.583,6.6,90.45606,9,24.97433,121.5431,59.0 +119,2013.5,25.3,1583.722,3,24.96622,121.51709,30.6 +132,2013.5,4.0,2147.376,3,24.96299,121.51284,30.7 +337,2012.833,5.1,1867.233,2,24.98407,121.51748,35.6 +56,2012.833,31.7,1160.632,0,24.94968,121.53009,13.7 +127,2013.083,38.6,804.6897,4,24.97838,121.53477,62.9 +377,2013.417,14.7,1717.193,2,24.96447,121.51649,30.5 +57,2013.417,33.6,371.2495,8,24.97254,121.54059,41.9 
+292,2012.833,3.4,56.47425,7,24.95744,121.53711,54.4 +366,2012.917,17.3,2261.432,4,24.96182,121.51222,29.5 +85,2013.083,15.1,383.2805,7,24.96735,121.54464,43.7 +117,2013.0,30.9,6396.283,1,24.94375,121.47883,12.2 +10,2013.417,17.9,1783.18,3,24.96731,121.51486,22.1 +375,2013.25,5.4,390.5684,5,24.97937,121.54245,49.5 +138,2013.5,13.6,319.0708,6,24.96495,121.54277,47.4 +321,2012.75,13.5,4197.349,0,24.93885,121.50383,18.6 +403,2012.833,12.7,187.4823,1,24.97388,121.52981,28.5 +232,2012.833,16.2,4074.736,0,24.94235,121.50357,14.7 +91,2012.833,0.0,274.0144,1,24.9748,121.53059,45.4 +95,2012.917,40.9,167.5989,5,24.9663,121.54026,41.0 +174,2013.083,41.3,401.8807,4,24.98326,121.5446,35.1 +31,2013.5,25.9,4519.69,0,24.94826,121.49587,22.1 +142,2013.333,5.1,1559.827,3,24.97213,121.51627,28.9 +105,2012.667,32.7,392.4459,6,24.96398,121.5425,30.5 +80,2013.0,18.0,1414.837,1,24.95182,121.54887,26.5 +34,2013.25,16.5,323.655,6,24.97841,121.54281,49.3 +291,2013.083,37.7,490.3446,0,24.97217,121.53471,37.0 +287,2012.917,5.9,90.45606,9,24.97433,121.5431,56.3 +410,2013.0,13.7,4082.015,0,24.94155,121.50381,15.4 +223,2013.583,30.6,431.1114,10,24.98123,121.53743,48.5 +362,2013.083,41.4,281.205,8,24.97345,121.54093,63.3 +16,2013.583,35.7,579.2083,2,24.9824,121.54619,50.5 +312,2013.167,21.3,537.7971,4,24.97425,121.53814,42.2 diff --git a/Evaluation/LR3 datasets/Train Real estate.csv b/Evaluation/LR3 datasets/Train Real estate.csv new file mode 100644 index 0000000..95ed994 --- /dev/null +++ b/Evaluation/LR3 datasets/Train Real estate.csv @@ -0,0 +1,373 @@ +No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area +1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9 +318,2012.75,15.6,752.7669,2,24.97795,121.53451,37.5 +273,2012.75,13.0,492.2313,5,24.96515,121.53737,40.5 +125,2012.917,9.9,279.1726,7,24.97528,121.54541,57.4 +272,2012.917,17.7,451.6419,8,24.96945,121.5449,26.5 
+185,2012.75,14.1,2615.465,0,24.95495,121.56174,21.8 +276,2012.667,1.5,23.38284,7,24.96772,121.54102,49.7 +361,2012.667,32.9,87.30222,10,24.983,121.54022,47.1 +226,2013.25,1.1,193.5845,6,24.96571,121.54089,49.0 +356,2013.25,3.8,383.8624,5,24.98085,121.54391,60.7 +365,2013.417,35.3,614.1394,7,24.97913,121.53666,33.1 +275,2013.167,27.5,394.0173,7,24.97305,121.53994,41.0 +176,2013.083,30.2,472.1745,3,24.97005,121.53758,36.5 +74,2013.167,13.8,4082.015,0,24.94155,121.50381,20.0 +43,2013.417,36.1,519.4617,5,24.96305,121.53758,34.7 +166,2012.917,13.7,1236.564,1,24.97694,121.55391,30.6 +78,2012.833,20.5,2185.128,3,24.96322,121.51237,25.6 +40,2013.167,16.2,289.3248,5,24.98203,121.54348,46.2 +402,2013.083,7.6,2175.03,3,24.96305,121.51254,27.7 +26,2013.083,29.3,1487.868,2,24.97542,121.51726,27.0 +156,2013.167,13.8,4082.015,0,24.94155,121.50381,15.6 +181,2012.667,26.9,4449.27,0,24.94898,121.49621,15.5 +348,2013.583,17.4,6488.021,1,24.95719,121.47353,11.2 +267,2013.25,17.8,1783.18,3,24.96731,121.51486,23.7 +23,2012.917,14.7,1360.139,1,24.95204,121.54842,24.6 +47,2013.417,21.7,463.9623,9,24.9703,121.54458,42.0 +79,2012.917,38.2,552.4371,2,24.97598,121.53381,29.8 +153,2013.333,12.0,1360.139,1,24.95204,121.54842,28.9 +83,2013.083,13.2,150.9347,7,24.96725,121.54252,48.1 +77,2013.583,35.9,616.4004,3,24.97723,121.53767,36.8 +58,2012.917,3.5,56.47425,7,24.95744,121.53711,53.5 +111,2013.083,8.1,104.8101,5,24.96674,121.54067,51.6 +25,2013.0,39.6,480.6977,4,24.97353,121.53885,38.8 +18,2012.75,17.7,350.8515,1,24.97544,121.53119,37.4 +239,2013.083,12.8,732.8528,0,24.97668,121.52518,40.6 +406,2012.667,23.0,130.9945,6,24.95663,121.53765,37.2 +32,2012.75,29.6,769.4034,7,24.98281,121.53408,25.0 +284,2013.417,33.5,1978.671,2,24.98674,121.51844,23.5 +158,2013.25,16.1,815.9314,4,24.97886,121.53464,35.6 +204,2012.667,15.6,289.3248,5,24.98203,121.54348,46.1 +209,2012.75,11.5,1360.139,1,24.95204,121.54842,26.2 +193,2013.167,43.8,57.58945,7,24.9675,121.54069,42.7 
+235,2013.25,8.0,2216.612,4,24.96007,121.51361,23.9 +6,2012.667,7.1,2175.03,3,24.96305,121.51254,32.1 +46,2013.083,36.6,488.8193,8,24.97015,121.54494,38.3 +246,2013.417,7.5,639.6198,5,24.97258,121.54814,40.8 +347,2013.417,13.2,1712.632,2,24.96412,121.5167,30.8 +196,2013.333,15.2,461.1016,5,24.95425,121.5399,34.6 +383,2013.0,16.3,3529.564,0,24.93207,121.51597,29.3 +401,2013.25,26.8,482.7581,5,24.97433,121.53863,35.5 +297,2012.75,12.5,1144.436,4,24.99176,121.53456,34.1 +4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8 +19,2013.417,16.9,368.1363,8,24.9675,121.54451,42.3 +251,2013.167,39.2,424.7132,7,24.97429,121.53917,30.0 +61,2013.417,11.0,1931.207,2,24.96365,121.51471,21.3 +391,2013.5,32.8,377.8302,9,24.97151,121.5435,38.6 +409,2013.417,18.5,2175.744,3,24.9633,121.51243,28.1 +64,2013.583,2.6,533.4762,4,24.97445,121.54765,55.0 +262,2013.167,16.2,2288.011,3,24.95885,121.51359,24.4 +298,2012.833,34.9,567.0349,4,24.97003,121.5458,28.5 +114,2013.333,14.8,393.2606,6,24.96172,121.53812,7.6 +249,2013.0,19.0,1009.235,0,24.96357,121.54951,22.3 +327,2013.5,4.1,56.47425,7,24.95744,121.53711,62.1 +411,2012.667,5.6,90.45606,9,24.97433,121.5431,50.0 +392,2013.583,6.2,1939.749,1,24.95155,121.55387,31.3 +353,2012.833,18.4,2674.961,3,24.96143,121.50827,25.7 +133,2013.167,26.6,482.7581,5,24.97433,121.53863,37.5 +238,2013.167,13.0,732.8528,0,24.97668,121.52518,39.0 +109,2013.417,31.4,592.5006,2,24.9726,121.53561,34.1 +102,2012.833,12.7,170.1289,1,24.97371,121.52984,32.9 +194,2013.417,9.7,421.479,5,24.98246,121.54477,49.3 +30,2013.083,7.1,451.2438,5,24.97563,121.54694,57.1 +308,2012.833,10.3,3079.89,0,24.9546,121.56627,24.7 +387,2012.833,0.0,185.4296,0,24.9711,121.5317,55.3 +306,2013.083,16.2,289.3248,5,24.98203,121.54348,55.0 +115,2012.667,30.6,143.8383,8,24.98155,121.54142,53.3 +248,2013.333,21.7,1055.067,0,24.96211,121.54928,23.1 +76,2013.5,12.3,1360.139,1,24.95204,121.54842,29.5 +269,2013.417,17.2,390.5684,5,24.97937,121.54245,40.1 +149,2013.5,16.4,3780.59,0,24.93293,121.51203,45.1 
+17,2013.25,0.0,292.9978,6,24.97744,121.54458,70.1 +67,2013.0,1.0,193.5845,6,24.96571,121.54089,50.7 +342,2013.0,13.0,750.0704,2,24.97371,121.54951,37.0 +8,2013.417,20.3,287.6025,6,24.98042,121.54228,46.7 +195,2013.5,15.2,3771.895,0,24.93363,121.51158,29.3 +20,2012.667,1.5,23.38284,7,24.96772,121.54102,47.7 +177,2012.833,13.9,4573.779,0,24.94867,121.49507,19.2 +368,2012.833,15.0,1828.319,2,24.96464,121.51531,20.9 +305,2013.417,20.0,1626.083,3,24.96622,121.51668,29.4 +199,2013.083,34.0,157.6052,7,24.96628,121.54196,39.1 +146,2012.917,2.1,451.2438,5,24.97563,121.54694,45.5 +266,2012.833,15.7,815.9314,4,24.97886,121.53464,38.1 +110,2013.583,4.0,2147.376,3,24.96299,121.51284,28.4 +203,2012.917,38.3,642.6985,3,24.97559,121.53713,31.5 +197,2013.0,22.8,707.9067,2,24.981,121.54713,36.6 +397,2012.667,37.1,918.6357,1,24.97198,121.55063,31.9 +240,2013.5,18.1,837.7233,0,24.96334,121.54767,29.7 +84,2012.917,25.3,2707.392,3,24.96056,121.50831,17.7 +169,2013.083,27.6,515.1122,5,24.96299,121.5432,37.4 +324,2013.417,28.6,197.1338,6,24.97631,121.54436,42.5 +75,2012.917,6.8,379.5575,10,24.98343,121.53762,54.4 +384,2012.667,29.1,506.1144,4,24.97845,121.53889,40.3 +120,2013.5,16.6,289.3248,5,24.98203,121.54348,59.6 +210,2012.833,34.8,175.6294,8,24.97347,121.54271,40.9 +220,2012.75,29.3,529.7771,8,24.98102,121.53655,40.2 +154,2013.25,6.5,376.1709,6,24.95418,121.53713,40.9 +178,2013.083,33.0,181.0766,9,24.97697,121.54262,42.0 +388,2013.25,16.2,2103.555,3,24.96042,121.51462,25.6 +372,2013.5,4.1,312.8963,5,24.95591,121.53956,51.7 +37,2012.917,14.7,1935.009,2,24.96386,121.51458,22.9 +140,2012.667,12.9,492.2313,5,24.96515,121.53737,42.5 +339,2012.917,31.5,258.186,9,24.96867,121.54331,36.3 +173,2013.583,6.6,90.45606,9,24.97433,121.5431,58.1 +212,2013.083,0.0,274.0144,1,24.9748,121.53059,43.5 +333,2013.167,39.8,617.7134,2,24.97577,121.53475,39.6 +60,2013.083,13.3,336.0532,5,24.95776,121.53438,42.4 +112,2013.583,33.3,196.6172,7,24.97701,121.54224,39.4 
+285,2012.917,15.0,383.2805,7,24.96735,121.54464,34.4 +326,2013.083,36.6,488.8193,8,24.97015,121.54494,38.1 +7,2012.667,34.5,623.4731,7,24.97933,121.53642,40.3 +300,2013.167,33.2,121.7262,10,24.98178,121.54059,46.1 +286,2013.167,30.1,718.2937,3,24.97509,121.53644,55.3 +159,2013.0,11.6,390.5684,5,24.97937,121.54245,39.4 +151,2013.25,35.8,170.7311,7,24.96719,121.54269,48.5 +11,2013.083,34.8,405.2134,1,24.97349,121.53372,41.4 +332,2013.333,25.6,4519.69,0,24.94826,121.49587,15.6 +104,2012.75,0.0,208.3905,6,24.95618,121.53844,45.7 +82,2013.0,30.8,377.7956,6,24.96427,121.53964,36.8 +279,2012.75,0.0,208.3905,6,24.95618,121.53844,44.0 +259,2013.417,0.0,292.9978,6,24.97744,121.54458,63.3 +168,2013.417,28.2,330.0854,8,24.97408,121.54011,43.4 +389,2013.5,10.4,2251.938,4,24.95957,121.51353,27.3 +224,2013.25,9.1,1402.016,0,24.98569,121.5276,42.3 +230,2013.583,31.0,1156.412,0,24.9489,121.53095,19.0 +182,2013.167,11.6,201.8939,8,24.98489,121.54121,55.9 +228,2012.917,32.4,265.0609,8,24.98059,121.53986,40.2 +90,2013.5,23.0,3947.945,0,24.94783,121.50243,25.3 +390,2013.25,40.9,122.3619,8,24.96756,121.5423,67.7 +164,2013.5,8.5,104.8101,5,24.96674,121.54067,55.5 +148,2012.75,3.2,489.8821,8,24.97017,121.54494,43.2 +247,2013.417,16.4,389.8219,6,24.96412,121.54273,40.6 +93,2012.917,20.6,2469.645,4,24.96108,121.51046,21.8 +70,2012.833,12.5,561.9845,5,24.98746,121.54391,42.0 +124,2013.417,0.0,185.4296,0,24.9711,121.5317,45.5 +97,2013.417,6.4,90.45606,9,24.97433,121.5431,59.5 +144,2013.5,13.6,492.2313,5,24.96515,121.53737,40.1 +363,2013.417,17.1,967.4,4,24.98872,121.53408,40.0 +98,2013.083,28.4,617.4424,3,24.97746,121.53299,34.6 +250,2012.833,18.0,6306.153,1,24.95743,121.47516,15.0 +69,2013.417,30.4,464.223,6,24.97964,121.53805,36.2 +24,2013.083,10.1,279.1726,7,24.97528,121.54541,47.9 +38,2013.167,12.0,1360.139,1,24.95204,121.54842,25.3 +145,2013.083,11.9,1360.139,1,24.95204,121.54842,28.4 +123,2013.25,31.5,414.9476,4,24.98199,121.54464,32.5 
+229,2013.417,11.9,3171.329,0,25.00115,121.51776,46.6 +183,2013.5,13.5,2147.376,3,24.96299,121.51284,23.6 +68,2013.5,8.5,104.8101,5,24.96674,121.54067,56.8 +341,2013.333,33.6,270.8895,0,24.97281,121.53265,42.9 +245,2013.083,4.8,1559.827,3,24.97213,121.51627,21.7 +395,2013.5,32.6,4136.271,1,24.95544,121.4963,24.7 +385,2012.75,16.1,4066.587,0,24.94297,121.50342,12.9 +330,2013.0,13.6,4197.349,0,24.93885,121.50383,19.2 +211,2013.5,5.2,390.5684,5,24.97937,121.54245,52.2 +126,2013.167,1.1,193.5845,6,24.96571,121.54089,48.6 +147,2012.75,0.0,185.4296,0,24.9711,121.5317,52.2 +290,2013.333,13.9,289.3248,5,24.98203,121.54348,44.5 +87,2012.833,1.8,1455.798,1,24.9512,121.549,27.0 +413,2013.0,8.1,104.8101,5,24.96674,121.54067,52.5 +221,2013.333,37.2,186.5101,9,24.97703,121.54265,78.3 +350,2012.75,7.8,104.8101,5,24.96674,121.54067,47.0 +328,2013.417,3.5,757.3377,3,24.97538,121.54971,36.7 +357,2012.833,10.3,211.4473,1,24.97417,121.52999,45.3 +315,2013.25,3.7,577.9615,6,24.97201,121.54722,41.6 +184,2013.5,17.0,4082.015,0,24.94155,121.50381,18.8 +219,2013.417,13.6,492.2313,5,24.96515,121.53737,43.8 +295,2013.5,26.4,335.5273,6,24.9796,121.5414,38.1 +334,2012.75,7.8,104.8101,5,24.96674,121.54067,38.4 +393,2013.083,42.7,443.802,6,24.97927,121.53874,35.3 +130,2013.417,38.5,216.8329,7,24.98086,121.54162,41.0 +39,2012.667,3.1,577.9615,6,24.97201,121.54722,47.7 +12,2013.333,6.3,90.45606,9,24.97433,121.5431,58.1 +281,2013.25,2.3,184.3302,6,24.96581,121.54086,45.4 +186,2012.75,31.4,1447.286,3,24.97285,121.5173,21.5 +282,2013.333,4.7,387.7721,9,24.98118,121.53788,44.8 +113,2013.417,9.9,2102.427,3,24.96044,121.51462,23.1 +180,2013.083,14.0,438.8513,1,24.97493,121.5273,42.6 +302,2012.75,38.0,461.7848,0,24.97229,121.53445,35.7 +367,2012.75,14.2,1801.544,1,24.95153,121.55254,24.8 +338,2012.833,31.3,600.8604,5,24.96871,121.54651,30.9 +118,2013.0,13.6,4197.349,0,24.93885,121.50383,13.0 +382,2013.417,8.0,132.5469,9,24.98298,121.53981,47.3 +288,2013.0,19.2,461.1016,5,24.95425,121.5399,32.9 
+399,2013.417,14.7,1717.193,2,24.96447,121.51649,23.0 +299,2013.333,16.7,4082.015,0,24.94155,121.50381,16.7 +234,2013.333,39.7,333.3679,9,24.98016,121.53932,32.4 +376,2013.25,21.7,1157.988,0,24.96165,121.55011,23.8 +256,2013.417,31.5,5512.038,1,24.95095,121.48458,17.4 +165,2012.833,0.0,185.4296,0,24.9711,121.5317,55.2 +355,2013.417,12.2,1360.139,1,24.95204,121.54842,30.1 +137,2012.75,11.4,390.5684,5,24.97937,121.54245,46.8 +155,2013.5,16.9,4066.587,0,24.94297,121.50342,20.7 +200,2013.417,18.2,451.6419,8,24.96945,121.5449,31.6 +198,2013.25,34.4,126.7286,8,24.96881,121.54089,48.2 +407,2013.167,1.9,372.1386,7,24.97293,121.54026,40.5 +3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3 +233,2012.917,27.1,4412.765,1,24.95032,121.49587,17.4 +205,2013.0,18.0,1414.837,1,24.95182,121.54887,26.6 +116,2013.083,20.6,737.9161,2,24.98092,121.54739,46.4 +254,2012.667,30.4,1735.595,2,24.96464,121.51623,25.9 +322,2012.917,16.9,964.7496,4,24.98872,121.53411,37.7 +27,2012.667,3.1,383.8624,5,24.98085,121.54391,56.2 +121,2013.167,13.3,492.2313,5,24.96515,121.53737,31.3 +319,2013.333,7.1,379.5575,10,24.98343,121.53762,49.8 +225,2013.333,34.5,324.9419,6,24.97814,121.5417,46.0 +335,2012.917,30.0,1013.341,5,24.99006,121.5346,22.8 +278,2013.417,21.2,2185.128,3,24.96322,121.51237,27.7 +128,2013.25,3.8,383.8624,5,24.98085,121.54391,55.0 +343,2012.667,5.7,90.45606,9,24.97433,121.5431,53.5 +108,2013.333,12.2,1360.139,1,24.95204,121.54842,26.6 +396,2012.917,21.2,512.5487,4,24.974,121.53842,42.5 +134,2012.833,18.0,373.3937,8,24.9866,121.54082,39.5 +303,2013.5,16.5,2288.011,3,24.95885,121.51359,23.2 +45,2013.583,2.7,533.4762,4,24.97445,121.54765,53.9 +381,2013.333,14.1,289.3248,5,24.98203,121.54348,53.3 +66,2013.417,40.1,123.7429,8,24.97635,121.54329,44.3 +86,2012.75,0.0,338.9679,9,24.96853,121.54413,50.8 +243,2012.833,2.0,2077.39,3,24.96357,121.51329,33.4 +187,2013.167,20.9,2185.128,3,24.96322,121.51237,25.7 +160,2012.667,15.5,815.9314,4,24.97886,121.53464,37.4 
+13,2012.917,13.0,492.2313,5,24.96515,121.53737,39.3 +36,2013.5,13.9,4079.418,0,25.01459,121.51816,27.3 +29,2013.5,19.2,557.478,4,24.97419,121.53797,47.0 +171,2013.333,24.0,4527.687,0,24.94741,121.49628,14.4 +143,2013.417,19.8,640.6071,5,24.97017,121.54647,37.5 +404,2012.667,30.9,161.942,9,24.98353,121.53966,39.7 +313,2013.583,35.4,318.5292,9,24.97071,121.54069,78.0 +222,2013.333,9.0,1402.016,0,24.98569,121.5276,38.5 +96,2012.917,8.0,104.8101,5,24.96674,121.54067,51.8 +52,2013.083,31.3,1758.406,1,24.95402,121.55282,20.7 +241,2013.083,11.0,1712.632,2,24.96412,121.5167,28.8 +311,2013.583,16.4,1643.499,2,24.95394,121.55174,24.7 +317,2013.25,13.3,250.631,7,24.96606,121.54297,42.0 +179,2013.5,13.1,1144.436,4,24.99176,121.53456,36.7 +42,2013.5,16.8,4066.587,0,24.94297,121.50342,18.2 +323,2013.0,12.9,187.4823,1,24.97388,121.52981,33.1 +207,2013.25,22.2,379.5575,10,24.98343,121.53762,44.0 +283,2012.917,2.0,1455.798,1,24.9512,121.549,25.6 +255,2012.667,1.1,329.9747,5,24.98254,121.54395,51.8 +394,2013.0,16.9,967.4,4,24.98872,121.53408,40.3 +5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1 +257,2012.667,14.6,339.2289,1,24.97519,121.53151,26.5 +371,2012.75,15.9,289.3248,5,24.98203,121.54348,42.1 +101,2013.5,17.5,964.7496,4,24.98872,121.53411,38.2 +227,2013.0,16.5,4082.015,0,24.94155,121.50381,12.8 +354,2013.5,4.1,2147.376,3,24.96299,121.51284,31.3 +214,2013.083,6.2,90.45606,9,24.97433,121.5431,58.0 +172,2013.083,3.6,383.8624,5,24.98085,121.54391,58.8 +99,2013.417,16.4,289.3248,5,24.98203,121.54348,51.0 +293,2013.083,17.5,395.6747,5,24.95674,121.534,24.5 +216,2013.333,19.2,383.7129,8,24.972,121.54477,48.1 +62,2013.5,5.3,259.6607,6,24.97585,121.54516,63.2 +48,2013.583,35.9,640.7391,3,24.97563,121.53715,61.5 +33,2012.75,37.9,488.5727,1,24.97349,121.53451,34.2 +268,2012.833,34.7,482.7581,5,24.97433,121.53863,41.1 +201,2013.417,17.4,995.7554,0,24.96305,121.54915,25.5 +369,2013.417,18.2,350.8515,1,24.97544,121.53119,43.1 +28,2013.25,10.4,276.449,5,24.95593,121.53913,33.6 
+325,2012.667,12.4,1712.632,2,24.96412,121.5167,31.3 +231,2013.5,4.0,2147.376,3,24.96299,121.51284,33.4 +261,2013.25,17.0,1485.097,4,24.97073,121.517,30.7 +289,2013.583,16.6,323.6912,6,24.97841,121.5428,51.0 +163,2012.75,16.0,4066.587,0,24.94297,121.50342,11.6 +352,2012.833,4.0,2180.245,3,24.96324,121.51241,28.6 +139,2013.167,10.0,942.4664,0,24.97843,121.52406,43.5 +63,2012.917,17.2,2175.877,3,24.96303,121.51254,27.7 +136,2012.917,18.9,1009.235,0,24.96357,121.54951,20.8 +129,2013.083,41.3,124.9912,6,24.96674,121.54039,60.7 +9,2013.5,31.7,5512.038,1,24.95095,121.48458,18.8 +65,2013.333,17.5,995.7554,0,24.96305,121.54915,25.3 +301,2013.083,2.5,156.2442,4,24.96696,121.53992,36.9 +15,2013.5,13.2,1164.838,4,24.99156,121.53406,34.3 +157,2013.583,30.7,1264.73,0,24.94883,121.52954,18.3 +41,2013.0,13.6,4082.015,0,24.94155,121.50381,15.9 +380,2013.333,0.0,292.9978,6,24.97744,121.54458,69.7 +336,2013.583,27.3,337.6016,6,24.96431,121.54063,36.5 +217,2013.25,37.8,590.9292,1,24.97153,121.53559,39.7 +280,2013.417,2.6,1554.25,3,24.97026,121.51642,31.1 +358,2013.417,0.0,338.9679,9,24.96853,121.54413,44.9 +237,2013.167,3.6,373.8389,10,24.98322,121.53765,61.9 +208,2013.083,38.5,665.0636,3,24.97503,121.53692,34.2 +213,2013.333,17.6,1805.665,2,24.98672,121.52091,31.1 +296,2013.167,18.2,2179.59,3,24.96299,121.51252,21.8 +378,2013.333,3.9,49.66105,8,24.95836,121.53756,56.8 +252,2012.917,31.7,1159.454,0,24.9496,121.53018,13.8 +405,2013.333,16.4,289.3248,5,24.98203,121.54348,41.2 +379,2013.333,37.3,587.8877,8,24.97077,121.54634,37.4 +304,2013.5,38.3,439.7105,0,24.97161,121.53423,38.4 +270,2013.0,17.6,837.7233,0,24.96334,121.54767,23.0 +202,2013.417,13.1,561.9845,5,24.98746,121.54391,45.9 +162,2013.417,19.2,616.4004,3,24.97723,121.53767,39.6 +44,2012.75,34.4,512.7871,6,24.98748,121.54301,34.1 +218,2012.917,28.0,372.6242,6,24.97838,121.54119,40.8 +191,2013.5,35.3,616.5735,8,24.97945,121.53642,42.3 +310,2013.25,30.3,1264.73,0,24.94883,121.52954,19.1 
+260,2013.083,17.7,837.7233,0,24.96334,121.54767,28.8 +106,2012.833,0.0,292.9978,6,24.97744,121.54458,71.0 +54,2013.083,13.3,492.2313,5,24.96515,121.53737,38.9 +2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2 +50,2012.667,29.4,4510.359,1,24.94925,121.49542,13.2 +81,2013.5,11.8,533.4762,4,24.97445,121.54765,40.3 +206,2013.083,12.8,1449.722,3,24.97289,121.51728,21.4 +35,2012.75,15.4,205.367,7,24.98419,121.54243,55.1 +264,2013.417,3.9,2147.376,3,24.96299,121.51284,31.7 +92,2013.25,9.1,1402.016,0,24.98569,121.5276,43.2 +340,2013.333,1.7,329.9747,5,24.98254,121.54395,50.4 +53,2013.583,32.1,1438.579,3,24.97419,121.5175,27.0 +346,2012.667,0.0,185.4296,0,24.9711,121.5317,37.9 +265,2013.167,32.6,493.657,7,24.96968,121.54522,40.6 +242,2013.5,13.7,250.631,7,24.96606,121.54297,41.4 +14,2012.667,20.4,2469.645,4,24.96108,121.51046,23.8 +316,2013.083,15.6,1756.411,2,24.9832,121.51812,27.3 +89,2012.917,8.9,1406.43,0,24.98573,121.52758,48.0 +274,2013.417,13.2,170.1289,1,24.97371,121.52984,29.3 +167,2013.417,0.0,292.9978,6,24.97744,121.54458,73.6 +329,2012.833,15.9,1497.713,3,24.97003,121.51696,23.6 +408,2013.0,5.2,2408.993,0,24.95505,121.55964,22.3 +135,2012.667,33.4,186.9686,6,24.96604,121.54211,42.2 +307,2013.5,14.4,169.9803,1,24.97369,121.52979,50.2 +398,2013.417,13.1,1164.838,4,24.99156,121.53406,32.2 +320,2013.25,34.6,272.6783,5,24.95562,121.53872,26.9 +244,2013.417,32.8,204.1705,8,24.98236,121.53923,48.2 +55,2013.083,16.1,289.3248,5,24.98203,121.54348,51.7 +364,2013.5,32.3,109.9455,10,24.98182,121.54086,48.0 +51,2013.417,21.7,512.5487,4,24.974,121.53842,44.2 +175,2013.417,4.3,432.0385,7,24.9805,121.53778,45.2 +190,2012.917,16.3,4066.587,0,24.94297,121.50342,20.5 +412,2013.25,18.8,390.9696,7,24.97923,121.53986,40.6 +188,2013.0,8.9,3078.176,0,24.95464,121.56627,22.0 +170,2013.417,8.4,1962.628,1,24.95468,121.55481,23.5 +59,2013.5,30.3,4510.359,1,24.94925,121.49542,22.6 +49,2013.417,24.2,4605.749,0,24.94684,121.49578,13.4 +345,2013.5,34.6,3085.17,0,24.998,121.5155,41.2 
+236,2012.75,12.9,250.631,7,24.96606,121.54297,39.3 +253,2012.833,5.9,90.45606,9,24.97433,121.5431,52.7 +22,2013.417,10.5,279.1726,7,24.97528,121.54541,51.6 +314,2013.333,8.3,104.8101,5,24.96674,121.54067,42.8 +161,2012.917,3.5,49.66105,8,24.95836,121.53756,57.8 +277,2013.0,19.1,461.1016,5,24.95425,121.5399,34.0 +192,2013.167,13.2,750.0704,2,24.97371,121.54951,37.8 +386,2013.0,18.3,82.88643,10,24.983,121.54026,46.6 +294,2012.667,12.6,383.2805,7,24.96735,121.54464,42.5 +344,2013.0,33.5,563.2854,8,24.98223,121.53597,46.6 +258,2013.25,17.3,444.1334,1,24.97501,121.5273,43.9 +309,2013.417,16.4,289.3248,5,24.98203,121.54348,53.0 +150,2012.667,34.9,179.4538,8,24.97349,121.54245,39.7 +131,2013.25,29.6,535.527,8,24.98092,121.53653,37.5 +152,2013.5,4.9,387.7721,9,24.98118,121.53788,44.7 +360,2013.5,5.6,2408.993,0,24.95505,121.55964,24.7 +100,2013.417,6.4,90.45606,9,24.97433,121.5431,62.2 +373,2013.0,33.9,157.6052,7,24.96628,121.54196,41.5 +88,2013.583,16.9,4066.587,0,24.94297,121.50342,18.3 +331,2013.083,32.0,1156.777,0,24.94935,121.53046,12.8 +215,2013.583,18.1,1783.18,3,24.96731,121.51486,20.9 +122,2013.5,13.6,492.2313,5,24.96515,121.53737,48.0 +414,2013.5,6.5,90.45606,9,24.97433,121.5431,63.9 +21,2013.417,4.5,2275.877,3,24.96314,121.51151,29.3 +189,2012.917,34.8,190.0392,8,24.97707,121.54312,44.3 +72,2013.083,35.5,640.7391,3,24.97563,121.53715,40.8 +107,2013.083,17.2,189.5181,8,24.97707,121.54308,47.1 +271,2013.333,10.8,252.5822,1,24.9746,121.53046,117.5 +349,2012.833,4.6,259.6607,6,24.97585,121.54516,53.7 +103,2013.083,1.1,193.5845,6,24.96571,121.54089,54.4 diff --git a/Evaluation/README.md b/Evaluation/README.md new file mode 100644 index 0000000..9e2999d --- /dev/null +++ b/Evaluation/README.md @@ -0,0 +1,23 @@ +# Data Science Evaluation Application + +This repository contains a Streamlit-based web application developed as part of a data science evaluation task. The application includes two primary tasks: + +1. 
**Real Estate Price Prediction**: A linear regression model is used to predict real estate prices based on features such as transaction date, house age, distance to the nearest MRT station, number of convenience stores, latitude, and longitude. + +2. **Time Series Analysis of Household Power Consumption**: A time series model is used to analyze and forecast household power consumption data. + +## Summary + +The application is designed to allow users to upload their datasets, perform exploratory data analysis (EDA) with interactive visualizations, build and evaluate models, and compare the models' predictions with actual values. + +### Key Features: +- **Upload Dataset**: Users can upload their datasets (for the real estate task, separate training and test CSV files), which are then processed by the application for analysis. +- **EDA and Visualization**: The application provides interactive visualizations using Altair to help users gain insights from their data. +- **Model Building and Evaluation**: For the real estate task, Linear, Ridge, and Lasso regression models with degree-2 polynomial features are compared on a validation split, and the model with the lowest validation RMSE is refit and evaluated on the test set. For the time series task, a suitable time series model is selected and evaluated. + +### Installation and Usage + +1. 
**Clone the Repository**: + ```bash + git clone https://github.com/your-username/Data-Science-Evaluation.git + cd Data-Science-Evaluation diff --git a/Evaluation/TS1 datasets/TS1 (1).zip b/Evaluation/TS1 datasets/TS1 (1).zip new file mode 100644 index 0000000..f78c773 Binary files /dev/null and b/Evaluation/TS1 datasets/TS1 (1).zip differ diff --git a/Evaluation/lr3.py b/Evaluation/lr3.py new file mode 100644 index 0000000..1b929d7 --- /dev/null +++ b/Evaluation/lr3.py @@ -0,0 +1,297 @@ +import warnings +warnings.filterwarnings("ignore") + +import streamlit as st +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import altair as alt +from sklearn.model_selection import train_test_split, GridSearchCV +from sklearn.preprocessing import StandardScaler, PolynomialFeatures +from sklearn.linear_model import LinearRegression, Ridge, Lasso +from sklearn.metrics import mean_squared_error, r2_score +from sklearn.pipeline import Pipeline +import joblib + + +def main(): + # Title and Description + st.title("Linear Regression for Real Estate Price Prediction") + st.write(""" + Welcome to the Real Estate Price Prediction application. This tool is designed to help you explore real estate data, perform in-depth data analysis, and build predictive models to forecast house prices. In this evaluation task, we will walk through the process of data cleaning, exploratory data analysis (EDA), model building, and making predictions with detailed explanations and insights. + """) + + # Data Upload Section + st.header("1. Upload Your Dataset") + st.write(""" + In this section, you'll upload your training and test datasets. The application will automatically clean and prepare the data for analysis. 
+ """) + train_file = st.file_uploader("Upload the training dataset (CSV)", type=["csv"]) + test_file = st.file_uploader("Upload the test dataset (CSV)", type=["csv"]) + + if train_file is not None and test_file is not None: + # Read the datasets + train_data = pd.read_csv(train_file) + test_data = pd.read_csv(test_file) + + # Display the first few rows of the data + st.subheader("Training Data Overview") + st.write(""" + Below is a preview of the training data that will be used to build the model. We will analyze the features, clean the data, and prepare it for modeling. + """) + st.write(train_data.head()) + + st.subheader("Test Data Overview") + st.write(""" + Below is a preview of the test data that will be used to evaluate the model's performance. We will ensure this data is consistent with the training data. + """) + st.write(test_data.head()) + + # Display the column names + st.subheader("Column Names in Training Data") + st.write(train_data.columns) + + st.subheader("Column Names in Test Data") + st.write(test_data.columns) + + # Identify numeric and non-numeric columns + numeric_columns = train_data.select_dtypes(include=[np.number]).columns.tolist() + non_numeric_columns = train_data.select_dtypes(exclude=[np.number]).columns.tolist() + + st.subheader("Numeric Columns in Training Data") + st.write(numeric_columns) + + st.subheader("Non-Numeric Columns in Training Data") + st.write(non_numeric_columns) + + # Data Cleaning and Feature Engineering + st.header("2. Data Cleaning and Feature Engineering") + st.write(""" + In this section, we undertake several crucial steps to prepare the data for effective modeling. Proper data cleaning and feature engineering are fundamental to building a robust predictive model. Heres what well do: + - **Handle Missing Values**: Missing data can introduce bias or inaccuracies in the model. We fill missing values with the mean of the respective columns to maintain the integrity of the dataset. 
+ - **Drop Non-Numeric Columns**: Non-numeric columns are excluded from the analysis at this stage to focus on features that directly contribute to the numerical prediction of house prices. This simplification ensures that the model can be trained efficiently. + - **Scale Features**: Scaling the numeric features standardizes the range of independent variables or features of data. This step is essential for algorithms that calculate distances between data points, such as in regression models. It ensures that all features contribute equally to the model. + """) + + # Handle non-numeric columns (For now, we'll drop them) + if non_numeric_columns: + st.write(f"**Dropped Non-Numeric Columns:** {non_numeric_columns}") + train_data = train_data.drop(columns=non_numeric_columns) + test_data = test_data.drop(columns=non_numeric_columns) + + # Handle missing values + train_data.fillna(train_data.mean(), inplace=True) + test_data.fillna(test_data.mean(), inplace=True) + + # Feature Scaling + scaler = StandardScaler() + train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns]) + test_data[numeric_columns] = scaler.transform(test_data[numeric_columns]) + + st.write(""" + **Data Cleaning and Feature Engineering completed.** The following steps have been successfully applied: + - Missing values have been handled to ensure no gaps in the data. + - Non-numeric columns have been dropped, allowing us to focus on the numerical aspects of the dataset. + - All features have been scaled, ensuring they are on a common scale, which is crucial for the accuracy and performance of our regression model. + + The data is now pre-processed and ready for the next stage: Exploratory Data Analysis (EDA). This preparation sets a solid foundation for building a reliable and accurate predictive model. + """) + + # Exploratory Data Analysis (EDA) + st.header("3. 
Exploratory Data Analysis (EDA)") + st.write(""" + In this section, we explore the relationships and distributions within the dataset. Understanding these patterns helps in making informed decisions during model building. + """) + + # Interactive Correlation Heatmap + st.subheader("Correlation Heatmap") + st.write(""" + The correlation heatmap below shows the relationships between numeric features. A high correlation (close to 1 or -1) between features can indicate multicollinearity, which we need to address in the modeling stage. + """) + corr_matrix = pd.DataFrame(train_data, columns=numeric_columns).corr().stack().reset_index() + corr_matrix.columns = ['Feature 1', 'Feature 2', 'Correlation'] + + heatmap = alt.Chart(corr_matrix).mark_rect().encode( + x='Feature 1:O', + y='Feature 2:O', + color=alt.Color('Correlation:Q', scale=alt.Scale(scheme='blueorange')), + tooltip=['Feature 1', 'Feature 2', 'Correlation'] + ).properties( + width=600, + height=600 + ) + st.altair_chart(heatmap, use_container_width=True) + + # Interactive Scatter Plots + st.subheader("Pairwise Scatter Plots") + st.write(""" + These scatter plots illustrate the relationships between each feature and the target variable, 'Y house price of unit area'. Analyzing these plots helps us understand which features are most influential in predicting house prices. 
+ """) + for feature in numeric_columns: + scatter_plot = alt.Chart(train_data).mark_circle(size=60).encode( + x=alt.X(feature, scale=alt.Scale(zero=False)), + y=alt.Y('Y house price of unit area', scale=alt.Scale(zero=False)), + tooltip=[feature, 'Y house price of unit area'] + ).interactive().properties( + title=f'Scatter plot of {feature} vs Y house price of unit area', + width=600, + height=400 + ) + st.altair_chart(scatter_plot, use_container_width=True) + + # Interactive Histogram + st.subheader("Distribution of Target Variable") + st.write(""" + The histogram below shows the distribution of the target variable, 'Y house price of unit area'. This analysis helps us understand the range and skewness of house prices in the dataset. + """) + hist = alt.Chart(train_data).mark_bar().encode( + alt.X('Y house price of unit area:Q', bin=True), + y='count()', + tooltip=['count()'] + ).properties( + title='Distribution of Y house price of unit area', + width=600, + height=400 + ).interactive() + st.altair_chart(hist, use_container_width=True) + + # Interactive Box Plots + st.subheader("Box Plots of Numeric Features") + st.write(""" + The box plots below help in identifying the spread and outliers in the data. Outliers can sometimes distort model predictions and might need to be treated separately. + """) + for feature in numeric_columns: + box_plot = alt.Chart(train_data).mark_boxplot().encode( + x=alt.X('Y house price of unit area:Q'), + y=alt.Y(feature + ':Q'), + tooltip=[feature, 'Y house price of unit area'] + ).properties( + title=f'Box plot of Y house price of unit area by {feature}', + width=600, + height=400 + ) + st.altair_chart(box_plot, use_container_width=True) + + # Model Building + st.header("4. Model Building") + st.write(""" + In this section, we build and evaluate different linear models: standard Linear Regression, Lasso Regression, and Ridge Regression. 
These models are chosen to explore how regularization techniques (Lasso and Ridge) affect the model's performance. + """) + + X_train = train_data.drop(columns=['Y house price of unit area']) + y_train = train_data['Y house price of unit area'] + + X_test = test_data.drop(columns=['Y house price of unit area']) + y_test = test_data['Y house price of unit area'] + + X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42) + + # Create Polynomial Features + poly = PolynomialFeatures(degree=2, include_bias=False) + + # Models to Evaluate + models = { + "Linear Regression": LinearRegression(), + "Ridge Regression": Ridge(), + "Lasso Regression": Lasso() + } + + # Evaluate each model + results = {} + for name, model in models.items(): + pipeline = Pipeline([ + ('poly_features', poly), + ('regression', model) + ]) + + pipeline.fit(X_train_split, y_train_split) + y_val_pred = pipeline.predict(X_val) + mse = mean_squared_error(y_val, y_val_pred) + rmse = np.sqrt(mse) + r2 = r2_score(y_val, y_val_pred) + + results[name] = { + "RMSE": rmse, + "R-squared": r2 + } + + # Display the results + st.subheader("Model Evaluation Results") + for model_name, metrics in results.items(): + st.write(f"**{model_name}:**") + st.write(f"- Validation RMSE: {metrics['RMSE']:.4f}") + st.write(f"- Validation R-squared: {metrics['R-squared']:.4f}") + + # Choose the best model based on RMSE + best_model_name = min(results, key=lambda k: results[k]["RMSE"]) + best_model = models[best_model_name] + pipeline = Pipeline([ + ('poly_features', poly), + ('regression', best_model) + ]) + pipeline.fit(X_train, y_train) + + # Save the best model + joblib.dump(pipeline, 'trained_linear_model.pkl') + + # Prediction on the test set + y_test_pred = pipeline.predict(X_test) + test_mse = mean_squared_error(y_test, y_test_pred) + test_rmse = np.sqrt(test_mse) + test_r2 = r2_score(y_test, y_test_pred) + + st.write(f"**Test Results for {best_model_name}:**") + 
st.write(f"- Test RMSE: {test_rmse:.4f}") + st.write(f"- Test R-squared: {test_r2:.4f}") + + # Actual vs Predicted + st.header("5. Actual vs Predicted") + st.write(""" + The following chart compares the actual house prices to the predicted prices on the test set. This comparison allows us to visually assess the accuracy of our model. A perfect model would have all points lying on the 45-degree line, indicating that the predicted values match the actual values exactly. + """) + + actual_vs_predicted = pd.DataFrame({ + 'Actual': y_test, + 'Predicted': y_test_pred + }) + + scatter_actual_vs_predicted = alt.Chart(actual_vs_predicted).mark_circle(size=60).encode( + x=alt.X('Actual', scale=alt.Scale(zero=False)), + y=alt.Y('Predicted', scale=alt.Scale(zero=False)), + tooltip=['Actual', 'Predicted'] + ).interactive().properties( + title='Actual vs Predicted House Prices', + width=600, + height=400 + ) + + st.altair_chart(scatter_actual_vs_predicted, use_container_width=True) + + st.write(""" + The scatter plot above shows the relationship between actual and predicted house prices. The closer the points are to the diagonal line, the better the model's predictions. Deviations from this line indicate discrepancies between the actual and predicted values, which can be further analyzed to improve model performance. + """) + + # Prediction Section + st.header("6. Predict House Price") + st.write(""" + Use the inputs below to predict the house price per unit area based on the trained model. This feature allows you to experiment with different inputs and see how the model responds. 
+    """)
+
+    house_age = st.number_input("House Age", min_value=0.0, max_value=100.0)  # float bounds: ages such as 13.2 occur in the data
+    distance_to_mrt = st.number_input("Distance to MRT Station", min_value=0.0)  # float bound: distances are fractional metres
+    convenience_stores = st.number_input("Number of Convenience Stores", min_value=0)
+    latitude = st.number_input("Latitude")
+    longitude = st.number_input("Longitude")
+
+    if st.button("Predict House Price"):
+        features = scaler.transform([[house_age, distance_to_mrt, convenience_stores, latitude, longitude]])  # pipeline applies its own poly_features step; NOTE(review): scaler is defined outside this view
+        prediction = pipeline.predict(features)
+        st.write(f"**Predicted House Price per Unit Area:** {prediction[0]:.2f}")
+
+    st.write("""
+    This section allows you to predict house prices using the model trained earlier. By inputting the relevant features (house age, distance to the nearest MRT station, number of convenience stores nearby, latitude, and longitude), the model will estimate the price per unit area of the house.
+    """)
+
+if __name__ == "__main__":
+    main()
diff --git a/Evaluation/requirements.txt b/Evaluation/requirements.txt
new file mode 100644
index 0000000..a2eed95
--- /dev/null
+++ b/Evaluation/requirements.txt
@@ -0,0 +1,9 @@
+streamlit
+pandas
+numpy
+matplotlib
+seaborn
+altair
+scikit-learn
+statsmodels
+joblib
\ No newline at end of file
diff --git a/Evaluation/streamlit_app.py b/Evaluation/streamlit_app.py
new file mode 100644
index 0000000..025eca1
--- /dev/null
+++ b/Evaluation/streamlit_app.py
@@ -0,0 +1,30 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import streamlit as st
+
+
+# Import the contents of the first app (lr3.py)
+import lr3  # Assuming lr3.py has a main() function
+
+# Import the contents of the second app (ts1.py)
+import ts1  # Assuming ts1.py has a main() function
+
+# Main application title
+st.title("Evaluation Application: Linear Regression & Time Series")
+
+# Create tabs for each of the applications
+tab1, tab2 = st.tabs(["Linear Regression", "Time Series"])
+
+# Linear Regression Tab
+with tab1:
+    st.header("Linear Regression Analysis")
+    
lr3.main()  # Call the main function from lr3.py
+
+# Time Series Tab
+with tab2:
+    st.header("Time Series Analysis")
+    ts1.main()  # Call the main function from ts1.py
+
+# Run the combined app with the following command in your terminal:
+# streamlit run streamlit_app.py
diff --git a/Evaluation/ts1.py b/Evaluation/ts1.py
new file mode 100644
index 0000000..249296f
--- /dev/null
+++ b/Evaluation/ts1.py
@@ -0,0 +1,373 @@
+import warnings
+warnings.filterwarnings("ignore")
+
+import pandas as pd
+import numpy as np
+import altair as alt
+import seaborn as sns
+import matplotlib.pyplot as plt
+import streamlit as st
+from statsmodels.tsa.seasonal import seasonal_decompose
+from pandas.plotting import lag_plot
+from statsmodels.tsa.statespace.sarimax import SARIMAX
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+import joblib
+
+
+# Cached data loading with resampling.
+# st.cache_data supersedes the deprecated st.cache and replays the st.write calls on cache hits.
+@st.cache_data
+def load_data(file, resample_freq='H'):
+    """Read a ';'-delimited power-consumption file and resample it.
+
+    'Date' and 'Time' are merged into one datetime index, every other column is
+    coerced to numeric (unparseable entries become NaN), and the result is
+    averaged over `resample_freq` bins.
+    """
+    df = pd.read_csv(file, delimiter=';', parse_dates=[['Date', 'Time']], dayfirst=True)
+    st.write("Original Columns:", df.columns.tolist())
+
+    df.rename(columns={
+        'Date_Time': 'Datetime',
+        'Unnamed: 0': 'Index',
+        'Global_active_power': 'Global Active Power',
+        'Global_reactive_power': 'Global Reactive Power',
+        'Voltage': 'Voltage',
+        'Global_intensity': 'Global Intensity',
+        'Sub_metering_1': 'Sub Metering 1',
+        'Sub_metering_2': 'Sub Metering 2',
+        'Sub_metering_3': 'Sub Metering 3'
+    }, inplace=True)
+
+    st.write("Columns after renaming:", df.columns.tolist())
+
+    for col in df.columns:
+        if col != 'Datetime':
+            df[col] = pd.to_numeric(df[col], errors='coerce')
+
+    df.set_index('Datetime', inplace=True)
+    df_resampled = df.resample(resample_freq).mean()
+
+    return df_resampled
+
+
+# Handle missing values
+def handle_missing_values(df):
+    """Forward-fill, then back-fill gaps; drop any rows that are still incomplete."""
+    # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form.
+    return df.ffill().bfill().dropna()
+
+
+# Names of the numeric columns, i.e. the ones the EDA charts can plot
+def _numeric_columns(df):
+    return df.select_dtypes(include='number').columns.tolist()
+
+
+# Train SARIMA model
+def train_sarima_model(train_data):
+    """Fit a SARIMA(1,1,1)x(1,1,1,24) model on the 'Global Active Power' column.
+
+    Renders the explanation and the fit summary, saves the fitted results to
+    'sarima_model.pkl', and returns them.
+    """
+    st.subheader("Model Training: SARIMA Model")
+    st.write("### Why SARIMA?")
+    st.write("""
+    The Seasonal AutoRegressive Integrated Moving Average (SARIMA) model is chosen for this analysis due to its ability
+    to capture both non-seasonal and seasonal components in time series data. Household electricity consumption data
+    typically exhibits daily and weekly seasonal patterns, making SARIMA an ideal choice. The model is specified
+    with both non-seasonal (p, d, q) and seasonal (P, D, Q, m) components to account for these patterns.
+    """)
+
+    model = SARIMAX(train_data['Global Active Power'],
+                    order=(1, 1, 1),
+                    seasonal_order=(1, 1, 1, 24),
+                    enforce_stationarity=False,
+                    enforce_invertibility=False)
+
+    model_fit = model.fit(disp=False)
+
+    st.write("### Model Summary")
+    st.write("""
+    The following table provides a summary of the SARIMA model's coefficients and statistical metrics. This includes the
+    ARIMA parameters (p, d, q), seasonal parameters (P, D, Q), and other diagnostics such as the AIC (Akaike Information Criterion),
+    which helps in evaluating model fit.
+    """)
+    st.write(model_fit.summary())
+
+    joblib.dump(model_fit, 'sarima_model.pkl')
+
+    return model_fit
+
+
+# Predict using SARIMA model
+def predict_sarima_model(model_fit, start, end):
+    """Return the fitted model's predictions for positions `start`..`end` (inclusive)."""
+    st.subheader("Prediction Results")
+    st.write("""
+    After training the SARIMA model, predictions are made on the test data. The following plot compares the actual
+    electricity consumption against the model's predictions.
+    """)
+    predictions = model_fit.predict(start=start, end=end, dynamic=False)
+    return predictions
+
+
+# Evaluate model predictions
+def evaluate_model(test_data, predictions):
+    """Report MAE/MSE/RMSE for `predictions` and plot them against the actual values."""
+    st.subheader("Model Evaluation")
+    st.write("""
+    To assess the accuracy of the SARIMA model, the following evaluation metrics are calculated:
+
+    - **Mean Absolute Error (MAE):** Measures the average magnitude of the errors in a set of predictions, without considering their direction.
+    - **Mean Squared Error (MSE):** Measures the average of the squares of the errors, giving more weight to larger errors.
+    - **Root Mean Squared Error (RMSE):** The square root of the MSE, providing an error metric in the same units as the original data.
+    """)
+
+    mae = mean_absolute_error(test_data['Global Active Power'], predictions)
+    mse = mean_squared_error(test_data['Global Active Power'], predictions)
+    rmse = np.sqrt(mse)
+
+    st.write(f"**Mean Absolute Error (MAE):** {mae:.4f}")
+    st.write(f"**Mean Squared Error (MSE):** {mse:.4f}")
+    st.write(f"**Root Mean Squared Error (RMSE):** {rmse:.4f}")
+
+    # Plot actual vs predicted values on an explicit figure: handing the
+    # pyplot module to st.pyplot relies on matplotlib's global figure state.
+    st.write("### Actual vs Predicted Power Consumption")
+    fig, ax = plt.subplots(figsize=(10, 6))
+    ax.plot(test_data.index, test_data['Global Active Power'], label='Actual', color='blue')
+    ax.plot(test_data.index, predictions, label='Predicted', color='red')
+    ax.set_title('Actual vs Predicted Global Active Power')
+    ax.set_xlabel('Datetime')
+    ax.set_ylabel('Global Active Power (kilowatts)')
+    ax.legend()
+    ax.grid(True)
+    st.pyplot(fig)
+    plt.close(fig)
+
+    st.write("""
+    As observed, the SARIMA model effectively captures the trend and seasonality in the power consumption data, as shown
+    by the close alignment between the actual and predicted values. This indicates that the model is well-suited for
+    forecasting household electricity consumption.
+    """)
+
+
+# Function to perform exploratory data analysis (EDA)
+def perform_eda(df):
+    """Render descriptive statistics and the optional, checkbox-gated EDA charts for `df`."""
+    st.header("Exploratory Data Analysis (EDA)")
+    st.write("""
+    Before diving into modeling, it's crucial to understand the data through exploratory data analysis (EDA).
+    The following sections provide insights into the statistical properties, distributions, and correlations in the data.
+    """)
+
+    st.subheader("Descriptive Statistics")
+    st.write("""
+    The table below provides a summary of the key statistics for the dataset, including the mean, standard deviation,
+    and percentiles. This gives an overview of the data distribution.
+    """)
+    st.write(df.describe())
+
+    st.write("### Data Columns Overview")
+    st.write("Columns available for EDA:", df.columns.tolist())
+
+    numeric_cols = _numeric_columns(df)
+    # Altair needs the datetime index as a regular column; build that frame once.
+    chart_df = df.reset_index()
+
+    if st.checkbox("Show Histograms"):
+        st.subheader("Histograms")
+        st.write("""
+        The histograms below show the distribution of each numerical variable in the dataset. This helps identify
+        potential skewness, outliers, or anomalies in the data.
+        """)
+        for col in numeric_cols:
+            hist = alt.Chart(chart_df).mark_bar().encode(
+                alt.X(col, bin=alt.Bin(maxbins=50), title=f"Distribution of {col}"),
+                alt.Y('count()', title='Frequency'),
+                tooltip=[col]
+            ).properties(
+                width=600,
+                height=400
+            ).configure_mark(
+                color='#1f77b4'
+            )
+            st.altair_chart(hist)
+
+    if st.checkbox("Show Box Plots"):
+        st.subheader("Box Plots")
+        st.write("""
+        Box plots are useful for identifying outliers and understanding the spread of the data. The plots below show
+        the quartiles and median for each numerical variable.
+        """)
+        for col in numeric_cols:
+            box = alt.Chart(chart_df).mark_boxplot().encode(
+                alt.Y(col, title=f"Box Plot of {col}"),
+                tooltip=[col]
+            ).properties(
+                width=600,
+                height=400
+            )
+            st.altair_chart(box)
+
+    if st.checkbox("Show Density Plots"):
+        st.subheader("Density Plots")
+        st.write("""
+        Density plots are used to visualize the distribution of the data. Unlike histograms, density plots provide a
+        smoothed curve, making it easier to identify the underlying distribution shape.
+        """)
+        for col in numeric_cols:
+            density = alt.Chart(chart_df).transform_density(
+                col,
+                as_=[col, 'density'],
+            ).mark_area(color='#ff7f0e').encode(
+                x=alt.X(col, title=f"Density Plot of {col}"),
+                y=alt.Y('density:Q', title='Density'),
+                tooltip=[col]
+            ).properties(
+                width=600,
+                height=400
+            )
+            st.altair_chart(density)
+
+    if st.checkbox("Show Scatter Plot Matrix"):
+        st.subheader("Scatter Plot Matrix")
+        st.write("""
+        The scatter plot matrix provides pairwise scatter plots between numerical variables, helping to identify
+        potential relationships or correlations between them.
+        """)
+        # DataFrame.sample raises when asked for more rows than exist (e.g. a short daily-resampled upload).
+        sample_size = min(100, len(df))
+        scatter_matrix = sns.pairplot(df.sample(sample_size), diag_kind='kde')
+        st.pyplot(scatter_matrix)
+
+    if st.checkbox("Show Time Series Plots"):
+        st.subheader("Time Series Plots")
+        st.write("""
+        The time series plots below show how each variable changes over time. This is particularly useful for identifying
+        trends, seasonality, and anomalies in the data.
+        """)
+        for col in numeric_cols:
+            line = alt.Chart(chart_df).mark_line(color='#2ca02c').encode(
+                x='Datetime:T',
+                y=alt.Y(col, title=f"Time Series of {col}"),
+                tooltip=['Datetime', col]
+            ).properties(
+                width=700,
+                height=400
+            )
+            st.altair_chart(line)
+
+    if st.checkbox("Show Lag Plot"):
+        st.subheader("Lag Plot")
+        st.write("""
+        Lag plots are useful for identifying patterns or autocorrelations in time series data. A linear pattern suggests
+        that the data is highly autocorrelated.
+        """)
+        lag_plot_fig, ax = plt.subplots(figsize=(6, 4))
+        lag_plot(df['Global Active Power'], ax=ax)
+        ax.set_title('Lag Plot of Global Active Power')
+        st.pyplot(lag_plot_fig)
+
+    if st.checkbox("Show Seasonal Decomposition"):
+        st.subheader("Seasonal Decomposition")
+        st.write("""
+        Seasonal decomposition allows us to break down the time series into its individual components: trend, seasonality,
+        and residuals. This helps in understanding the underlying patterns in the data.
+        """)
+        # seasonal_decompose rejects NaNs; fill a separate copy so the other charts keep the raw data.
+        filled = handle_missing_values(df)
+
+        decomposed = seasonal_decompose(filled['Global Active Power'], model='additive')
+        fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(10, 8))
+        decomposed.observed.plot(ax=ax1, title='Observed')
+        decomposed.trend.plot(ax=ax2, title='Trend')
+        decomposed.seasonal.plot(ax=ax3, title='Seasonal')
+        decomposed.resid.plot(ax=ax4, title='Residual')
+        fig.tight_layout()
+        st.pyplot(fig)
+
+    if st.checkbox("Show Correlation Matrix"):
+        st.subheader("Correlation Matrix")
+        st.write("""
+        The correlation matrix shows the pairwise correlations between numerical variables. The heatmap visualization
+        makes it easy to identify strong positive or negative correlations.
+        """)
+        corr_matrix = df.corr().reset_index().melt('index')
+        corr_matrix.columns = ['Variable 1', 'Variable 2', 'Correlation']
+        heatmap = alt.Chart(corr_matrix).mark_rect().encode(
+            x='Variable 1:O',
+            y='Variable 2:O',
+            color=alt.Color('Correlation:Q', scale=alt.Scale(scheme='blueorange')),
+            tooltip=['Variable 1', 'Variable 2', 'Correlation']
+        ).properties(
+            width=600,
+            height=600
+        ).configure_axis(
+            grid=False
+        ).configure_view(
+            strokeWidth=0
+        )
+        st.altair_chart(heatmap)
+
+
+# Main function
+def main():
+    """Streamlit entry point: upload the data, run the EDA, fit SARIMA, and evaluate it."""
+    st.title('Electric Power Consumption Data Analysis and Prediction')
+    st.write("""
+    This application provides a comprehensive analysis of household electric power consumption data.
+    The analysis includes Exploratory Data Analysis (EDA), model training using SARIMA, and evaluation of the model's
+    predictive performance.
+    """)
+
+    train_file = st.file_uploader("Upload Training Data", type=["txt"], key="train")
+    test_file = st.file_uploader("Upload Test Data", type=["txt"], key="test")
+
+    resample_option = st.selectbox(
+        "Resample Data By:",
+        ('Hourly', 'Daily')
+    )
+
+    resample_freq = 'H' if resample_option == 'Hourly' else 'D'
+
+    if train_file and test_file:
+        train_data = load_data(train_file, resample_freq)
+        test_data = load_data(test_file, resample_freq)
+
+        if train_data is not None and test_data is not None:
+            st.write(f"### Training Data Overview (Resampled to {resample_option})")
+            st.write("""
+            The table below shows the first few rows of the training data after resampling. This gives a quick glimpse into the
+            structure and content of the data that will be used for model training.
+            """)
+            st.write(train_data.head())
+
+            st.write(f"### Test Data Overview (Resampled to {resample_option})")
+            st.write("""
+            The table below shows the first few rows of the test data after resampling. This data will be used to evaluate
+            the model's performance.
+            """)
+            st.write(test_data.head())
+
+            # Perform EDA before training the model
+            perform_eda(train_data)
+
+            train_data = handle_missing_values(train_data)
+            test_data = handle_missing_values(test_data)
+
+            # Train SARIMA model
+            model_fit = train_sarima_model(train_data)
+
+            # Predict on the test set
+            start = len(train_data)
+            end = start + len(test_data) - 1
+            predictions = predict_sarima_model(model_fit, start, end)
+
+            # Evaluate model
+            evaluate_model(test_data, predictions)
+
+
+if __name__ == "__main__":
+    main()