From d170518dd74236885427c0b3a414a49c82482be8 Mon Sep 17 00:00:00 2001
From: Alan Thankachan
Date: Tue, 30 Sep 2025 19:31:43 -0700
Subject: [PATCH] Updated assignment-1

---
 02_activities/assignments/assignment_1.ipynb | 141 +++++++++++++++++--
 1 file changed, 127 insertions(+), 14 deletions(-)

diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 45cfc9cd7..f9f856b4f 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -26,17 +26,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The dotenv extension is already loaded. To reload it, use:\n",
+      "  %reload_ext dotenv\n"
+     ]
+    }
+   ],
    "source": [
     "# Write your code below.\n",
-    "\n"
+    "%load_ext dotenv\n",
+    "%dotenv\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -55,14 +65,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 20,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PRICE_DATA path: ../../05_src/data/prices/\n",
+      "Number of parquet files found: 2858\n",
+      "First few parquet files: ['../../05_src/data/prices\\\\ACES\\\\ACES_2016\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2018\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2018\\\\part.1.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2019\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2019\\\\part.1.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2020\\\\part.0.parquet', '../../05_src/data/prices\\\\ACES\\\\ACES_2020\\\\part.1.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2001\\\\part.0.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2001\\\\part.1.parquet', '../../05_src/data/prices\\\\ACN\\\\ACN_2002\\\\part.0.parquet']\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "from glob import glob\n",
     "\n",
     "# Write your code below.\n",
+    "\n",
+    "# Load the environment variable PRICE_DATA\n",
+    "price_data_dir = os.getenv(\"PRICE_DATA\")\n",
+    "print(\"PRICE_DATA path:\", price_data_dir)\n",
+    "\n",
+    "# Use glob to find the path to all parquet files in the PRICE_DATA directory\n",
+    "parquet_files = glob(os.path.join(price_data_dir, \"**/*.parquet\"), recursive=True)\n",
+    "\n",
+    "# Number of parquet files found\n",
+    "print(f\"Number of parquet files found: {len(parquet_files)}\")\n",
+    "\n",
+    "# First few parquet files\n",
+    "print(\"First few parquet files:\", parquet_files[:10])\n",
     "\n"
    ]
   },
@@ -88,11 +121,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 21,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\2477076351.py:7: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
+      "  Before: .apply(func)\n",
+      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
+      "  or:     .apply(func, meta=('x', 'f8'))            for series result\n",
+      "  ddf['Close_lag1'] = ddf.groupby('ticker')['Close'].shift(1)\n",
+      "C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\2477076351.py:8: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
+      "  Before: .apply(func)\n",
+      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
+      "  or:     .apply(func, meta=('x', 'f8'))            for series result\n",
+      "  ddf['Adj_Close_lag1'] = ddf.groupby('ticker')['Adj Close'].shift(1)\n"
+     ]
+    }
+   ],
    "source": [
     "# Write your code below.\n",
+    "\n",
+    "# Load all the parquet files into a single Dask DataFrame\n",
+    "ddf = dd.read_parquet(parquet_files)\n",
+    "\n",
+    "# Add lags for variables Close and Adj Close\n",
+    "ddf['Close_lag1'] = ddf.groupby('ticker')['Close'].shift(1)\n",
+    "ddf['Adj_Close_lag1'] = ddf.groupby('ticker')['Adj Close'].shift(1)\n",
+    "\n",
+    "# Add returns based on Close\n",
+    "ddf['returns'] = (ddf['Close'] / ddf['Close_lag1']) - 1\n",
+    "\n",
+    "# Add hi_lo_range\n",
+    "ddf['hi_lo_range'] = ddf['High'] - ddf['Low']\n",
+    "\n",
+    "# Assign the results to dd_feat\n",
+    "dd_feat = ddf\n",
+    "\n",
     "\n"
    ]
   },
@@ -108,11 +175,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\alanm\\AppData\\Local\\Temp\\ipykernel_8760\\857855713.py:7: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.\n",
+      "  Before: .apply(func)\n",
+      "  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result\n",
+      "  or:     .apply(func, meta=('x', 'f8'))            for series result\n",
+      "  df_feat['returns_mavg_10'] = df_feat.groupby('ticker')['returns'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())\n"
+     ]
+    }
+   ],
    "source": [
     "# Write your code below.\n",
+    "\n",
+    "# Convert the Dask DataFrame to a pandas DataFrame\n",
+    "df_feat = dd_feat.compute()\n",
+    "\n",
+    "# Add a new feature: 10-day moving average of returns per ticker\n",
+    "df_feat['returns_mavg_10'] = df_feat.groupby('ticker')['returns'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())\n",
+    "\n",
     "\n"
    ]
   },
@@ -122,8 +208,35 @@
    "source": [
     "Please comment:\n",
     "\n",
-    "+ Was it necessary to convert to pandas to calculate the moving average return?\n",
-    "+ Would it have been better to do it in Dask? Why?\n",
+    "+ Was it necessary to convert to pandas to calculate the moving average return?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "No. Dask can compute a groupwise rolling mean directly, but converting to pandas made the rolling-window logic much simpler to write and debug.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "+ Would it have been better to do it in Dask? Why?\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Yes, for very large datasets: Dask computes partitions in parallel and never loads the full DataFrame into memory at once, so it scales better than pandas, whereas calling .compute() first materializes everything in memory.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "\n",
     "(1 pt)"
    ]
@@ -165,7 +278,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "env",
+   "display_name": "dsi_participant",
    "language": "python",
    "name": "python3"
   },
@@ -179,7 +292,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.9.19"
   }
  },
  "nbformat": 4,
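
[Editor's note, not part of the patch] The `meta` UserWarnings recorded in the outputs above, and the dd_feat.compute() round-trip, can both be handled inside Dask. Below is a minimal sketch that reuses the patch's own names (ddf and the ticker / Close / Adj Close / returns columns); the meta dtypes and the assumption that rows are date-sorted within each ticker are mine, not the author's:

    # Sketch only: continues from the patch's ddf. Dtypes in `meta` are assumptions.

    # Declaring `meta` up front tells Dask the output name/dtype,
    # which silences the "`meta` is not specified" warnings above.
    ddf['Close_lag1'] = ddf.groupby('ticker')['Close'].shift(1, meta=('Close', 'f8'))
    ddf['Adj_Close_lag1'] = ddf.groupby('ticker')['Adj Close'].shift(1, meta=('Adj Close', 'f8'))
    ddf['returns'] = (ddf['Close'] / ddf['Close_lag1']) - 1

    # The 10-day moving average can stay lazy: groupby-apply runs the
    # pandas rolling mean per ticker (this shuffles data between partitions).
    returns_mavg_10 = ddf.groupby('ticker')['returns'].apply(
        lambda s: s.rolling(window=10, min_periods=1).mean(),
        meta=('returns', 'f8'),
    )

Nothing is read or computed until .compute() (or a write such as to_parquet), so the full DataFrame never has to fit in memory at once, which is the trade-off the new markdown answers describe.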