diff --git a/.gitignore b/.gitignore index e9c2ba607..78773820b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,9 @@ .vscode/ *.zip *.csv +*.xls +*.xlsx +*.xlsm *.parquet *.log **/__pycache__/* diff --git a/01_materials/labs/01_setup.ipynb b/01_materials/labs/01_setup.ipynb index bb48e562e..92feacca8 100644 --- a/01_materials/labs/01_setup.ipynb +++ b/01_materials/labs/01_setup.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Production 1: Setting Up A Repo" + "# Production 1: Getting Started" ] }, { @@ -13,8 +13,9 @@ "source": [ "## Introduction\n", "\n", - "+ Working with code in production is hard. Rarely we will have a chance to work on a greenfield development and will get a chance to define all of its specifications.\n", - "+ Sometimes, we may be offered the option of scraping a system and starting from scratch. This option should be considered carefully and, most of the time, rejected.\n", + "+ Working with code in production is hard. \n", + "+ We will rarely have the chance to work on a greenfield development and will be able to define all its specifications. Most of the time, we will work in collaborative environments and use code to produce outputs and to communicate with colleagues (and ourselves in the future).\n", + "+ Sometimes, we may be offered the option to scrap a system and start from scratch. 
This option should be considered carefully and, most of the time, rejected.\n", "+ Working with legacy code will be the norm:\n", "\n", " - Legacy code includes our own code.\n", @@ -25,7 +26,7 @@ "\n", "## Software Entropy\n", "\n", - "+ Software entroy is the natural evolution of code towards chaos.\n", + "+ Software entropy is the natural evolution of code towards chaos.\n", "+ Messy code is a natural consequence of change:\n", "\n", " - Requirements change.\n", @@ -43,9 +44,14 @@ " - Testing and CI/CD.\n", " - Documentation.\n", "\n", + "## Technical Debt\n", + "\n", "+ *Technical debt* is future work that is owed to fix issues with the current codebase.\n", "+ Technical debt has principal and interest: complexity spreads and what was a simple *duct tape* solution becomes the source of complexity in downstream consumers.\n", - "+ ML systems are complex: they involve many components and the interaction among those components determines the behaviour of the system. Adding additional complexity by using poor software development practices can be avoided.\n", + "\n", + "## Complexity of ML Systems\n", + "\n", + "+ ML systems are complex: they involve many components, and the interaction among those components determines the behaviour of the system. Adding additional complexity by using poor software development practices can be avoided.\n", "+ Building ML Systems is most of the time a team sport. Our tools should be designed for collaboration." ] }, @@ -53,14 +59,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# A Reference Architecture" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## What are we building?\n", + "# Reference Architecture\n", "\n", "+ [Agrawal and others (2019)](https://arxiv.org/abs/1909.00084) propose the reference architecture below.\n", "\n", @@ -69,7 +68,7 @@ "\n", "\n", "\n", - "+ Through the course, we will write the code in Python for the different components of this architecture. 
\n" + "+ Throughout the course, we will write Python code for the different components of this architecture." ] }, { @@ -78,12 +77,10 @@ "source": [ "# Source Control\n", "\n", - "\n", - "\n", - "## Git and Github\n", + "## Git and GitHub\n", "\n", "+ Git is a version control system that lets you manage and keep track of your source code history.\n", - "+ If you have not done so, please get an account on [Github](https://github.com/) and setup SSH authentication:\n", + "+ If you have not done so, please get an account on [Github](https://github.com/) and set up SSH authentication:\n", "\n", " - Check for [existing SSH keys](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/checking-for-existing-ssh-keys).\n", " - If needed, create an [SSH Key](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent).\n", @@ -103,18 +100,18 @@ "+ Commit early and commit often.\n", "+ Use meaningful commits:\n", "\n", - " - The drawback of commiting very frequently is that there will be incomplete commits, errors and stepbacks in the commit messages. Commit messages include: \"Committing before switching to another task\", \"Oops\", \"Undoing previous idea\", \"Fire alarm\", etc.\n", + " - The drawback of committing very frequently is that there will be incomplete commits, errors, and stepbacks in the commit messages. Commit messages include: \"Committing before switching to another task\", \"Oops\", \"Undoing previous idea\", \"Fire alarm\", etc.\n", " - In Pull Requests, squash commits and write meaningful messages. 
\n", "\n", "+ Apply a branch strategy.\n", - "+ Submit clean pull requests: verify that latest branch is merged and review conflicts.\n", + "+ Submit clean pull requests: verify that the latest branch has been merged and review any conflicts.\n", "\n", "## Commit Messages\n", "\n", - "+ Clear commit messages help document your code and allow you to trace the reaoning behind design decisions. \n", + "+ Clear commit messages help document your code and allow you to trace the reasoning behind design decisions. \n", "+ A few guidelines for writing commit messages:\n", "\n", - " - Use markdown: Github interprets commit messages as markdown.\n", + " - Use markdown: GitHub interprets commit messages as markdown.\n", " - First line is a subject:\n", "\n", " * No period at the end.\n", @@ -141,7 +138,7 @@ "## Branching Strategies\n", "\n", "+ When working standalone or in a team, you should consider your [branching strategy](https://www.atlassian.com/agile/software-development/branching).\n", - "+ A branching strategy is a way to organize the progression of code in your repo. \n", + "+ A branching strategy is a way to organise the progression of code in your repo. \n", "+ In [trunk-based branching strategy](https://www.atlassian.com/continuous-delivery/continuous-integration/trunk-based-development), each developer works based on the *trunk* or *main* branch. (Ryaboy, 2021)]\n", "\n", "
\n", @@ -171,7 +168,7 @@ "\n", " - From the source control menu, one can easily stage files, commit, and push/pull to origin.\n", "\n", - " - Other commands can be accessed via the command pallete (`Ctrl + Shift + P`). For instance, one can select or create a new branch using the option *Git: Checkout to*." + " - Other commands can be accessed via the command palette (`Ctrl + Shift + P`). For instance, one can select or create a new branch using the option *Git: Checkout to*." ] }, { @@ -182,23 +179,21 @@ "\n", "+ There are many reasons to control our development environment, including version numbers for Python and all the libraries that we are using:\n", "\n", - " - Reproducibility: we want to be able to reproduce our process in a production environment with as little change as possible. \n", - " - Backup and archiving: saving our work in a way that can be used in the future, despite Python and libraries evolving.\n", - " - Collaboration: work with colleagues on different portions of the code involves everyone having a standard platform to run the codebase.\n", + " - Reproducibility: We want to be able to reproduce our process in a production environment with as few changes as possible. \n", + " - Backup and archiving: Saving our work in a way that can be used in the future, despite Python and libraries evolving.\n", + " - Collaboration: Working with colleagues on different portions of the code involves everyone having a standard platform to run the codebase.\n", "\n", - "+ We can achieve the objectives above in many ways, including vritualizing our environments, packaging our code in containers, and using virtual machines, among others.\n", - "+ Most of the time, creating a virtual environment will be part of the initial development setup. This vritual environment will help us *freeze* the python version and some version libraries. 
" + "+ We can achieve the objectives above in many ways, including virtualising our environments, packaging our code in containers, and using virtual machines, among others.\n", + "+ Most of the time, creating a virtual environment will be part of the initial development setup. This vritual environment will help us *freeze* the python version and some version libraries." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Setting up the environment\n", + "## Setting Up the Environment with uv\n", "\n", - "### uv\n", - "\n", - "+ [uv](https://docs.astral.sh/uv/) is a fast command line tool for Python package and environment management. It combines the roles of `venv` and `pip`.\n", + "+ [uv](https://docs.astral.sh/uv/) is a fast command-line tool for Python package and environment management. It combines the roles of `venv` and `pip`.\n", "+ From the terminal, create a virtual environment with: `uv venv --python `. For example, `uv venv production-env --python 3.11` creates a new environment called `production-env` using Python 3.11.\n", "+ Activate the environment with:\n", " - macOS/Linux: `source /bin/activate`\n", @@ -208,7 +203,9 @@ " - Verify uv installation: `uv --version`\n", " - Add a new package to the environment: `uv add --active`\n", " - Install all required packages for the project: `uv sync --active`\n", - " - Create a lockfile of exact package versions: `uv lock`" + " - Create a lockfile of exact package versions: `uv lock`\n", + "\n", + "+ You can find more detailed instructions in [setup.md](../../SETUP.md)." ] }, { @@ -219,27 +216,22 @@ "\n", "+ We will use Python's logging module and will provision our standard loggers through our first module.\n", "+ The module is located in `./05_src/utils/logger.py`.\n", - "+ Our notebooks will need to add `../05_src/` to their path and load environment variables from `../05_src/.env`. Notice that these paths are based on the notebook's location. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "+ Our notebooks will need to add `../05_src/` to their path and load environment variables from `../05_src/.env`. Notice that these paths are based on the notebook's location. \n", + "\n", "### Logger highlights\n", "\n", "A few highlights about `./05_src/utils/logger.py`:\n", "\n", "+ This logger has two handlers: \n", "\n", - " - A `FileHandler` that will save logs to files that are datetime index.\n", + " - A `FileHandler` that will save logs to files that are a datetime index.\n", " - A `StreamHandler` handler that outputs messages to the stdout.\n", "\n", "+ Each logger can set its own format. \n", "+ The log directory and log level are obtained from the environment.\n", "+ According to the [Advanced Logging Tutorial](https://docs.python.org/2/howto/logging.html#logging-advanced-tutorial): \n", "\n", - " >\"A good convention to use when naming loggers is to use a module-level logger, in each module which uses logging, named as follows: \n", + " >\"A good convention to use when naming loggers is to use a module-level logger, in each module that uses logging, named as follows: \n", " >\n", " >`logger = logging.get_logger(__name__)`.\n", " >\n", @@ -269,8 +261,14 @@ "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", "import sys\n", - "sys.path.append(\"../../05_src\")" + "\n", + "notebook_dir = Path.cwd()\n", + "src_path = (notebook_dir / \"../../05_src\").resolve()\n", + "\n", + "if str(src_path) not in sys.path:\n", + " sys.path.insert(0, str(src_path)) # insert(0) gives it priority" ] }, { @@ -288,117 +286,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Using Docker to Set Up Experiment Tracking\n", - "\n", - "+ For our work, we need an environment that resembles the production environment as closely as possible. \n", - "+ One way to achieve this is to use containers and containerized application. 
\n", - "+ Without going into the details, you can think of a container as software that encapsulates the key features of an operating system, a programming language, and the application code.\n", - "+ Containers are meant to be portable across operating systems: a container will work the same regardless if the underlying Docker application is installed in a Windows, Linux or Mac machine.\n", - "+ Containers are not Virtual Machines.\n", - "+ Docker is a popular application that implement containers.\n", - "\n", - "## What is Docker?\n", - "\n", - "+ From product documentation:\n", - "\n", - "> Docker is an open platform for developing, shipping, and running applications. Docker enables you to separate your applications from your infrastructure so you can deliver software quickly. With Docker, you can manage your infrastructure in the same ways you manage your applications. By taking advantage of Docker's methodologies for shipping, testing, and deploying code, you can significantly reduce the delay between writing code and running it in production.\n", - "\n", - "## General Procedure\n", - "\n", - "+ To setup services using containers, we will do the following:\n", - "\n", - "1. Download an image from [Docker Hub](https://hub.docker.com/) or equivalent image repository.\n", - "2. If required, set up a volume to [persist data](https://docs.docker.com/guides/walkthroughs/persist-data/).\n", - "3. Redirect ports as needed.\n", - "4. Start the container.\n", - "\n", - "In our course, we will setup the following services:\n", - "\n", - "+ MLFlow: an experiment tracking system. 
MLFlow requires two backends: a database and an object store.\n", - "+ PostgreSQL: a database management system.\n", - "+ MinIO: an object store that resembles S3 buckets in AWS.\n", - "\n", - "## Starting the Containers\n", - "\n", - "+ To run the process above, first navigate to the `./05_src/experiment_tracking/` folder.\n", - "+ The first time that you set up the containers, you will need to build the MLFlow image. You can build the required image with `docker compose build`. \n", - "+ After building a local image for MLFlow, run `docker compose up -d`. \n", - "+ The flag `-d` indicates that we will do a headless run. \n", - "+ Notice that the containers are set to always restart. You can remove the option or turn the containers off manually. Be aware that if you leave this option on, the containers will run any time your Docker desktop restarts.\n", + "# A Few Remarks\n", "\n", - "## Stopping the Containers\n", + "## On Jupyter Notebooks\n", "\n", - "+ To stop the containers use (from `./05_src/db/`): `docker compose stop`.\n", - "+ Alternatively, you can bring all images down including their volumes with: `docker compose down -v`. \n", - "\n", - " - The `-v` flag removes volumes. \n", - " - It is the best option when you are do not need the data any more because **it will delete the data in your DB **. \n", - "\n", - "\n", - "## Connecting to the MLFlow UI\n", - "\n", - "+ MLFlow offers a convenient interface that can be accessed via [http://localhost:5001](http://localhost:5001).\n", - "\n", - "
\n", - "\n", - "\n", - "## Connecting to PgAdmin\n", - "\n", - "+ PgAdmin4 is management software for PostgreSQL Server.\n", - "+ You can open the local implementation by navigating to [http://localhost:5051](http://localhost:5051/). You will find a screen like the one below.\n", - "\n", - "
\n", - "\n", - "+ Login using the credentials specified in the file `./05_src/experiment_tracking/.env`. Notice there are two sets of credentials, use the ones for PgAdmin4. After authentication, you will see a screen like the one below.\n", - "\n", - "
\n", - "\n", - "+ Click on \"Add New Server\":\n", - "\n", - " - In the *General* Tab, under Name enter: localhost. \n", - " - Under the *Connection* Tab, use Host name *postgres* (this is the name of the service in the docker compose file). \n", - " - Username and password are the ones found in the `./05_src/experiment_tracking/.env` file.\n", - "\n", - "\n", - "## Connect to MinIO\n", - "\n", - "+ The interface for MinIO can be reached via [http://localhost:9001](http://localhost:9001)\n", - "+ The credentials can be found in the `./05_src/experiment_tracking/.env` file.\n", - "\n", - "
\n", - "\n", - "\n", - "## Learn More\n", + "+ Jupyter Notebooks are great for drafting code, fast experimentation, demos, documentation, and some prototypes.\n", + "+ They are not ideal for production code or experiment tracking.\n", "\n", - "+ Containers and containerization are topics well beyond the scope of this course. However, we will use containerized applications to help us implement certain patterns. \n", - "+ If you are interested in Docker, a good place to start is the [Official Docker Guides](https://docs.docker.com/get-started/overview/). " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# On Jupyter Notebooks\n", + "## On Copilot and other AI Code Generators\n", "\n", - "+ Jupyter Notebooks are great for drafting code, fast experimentation, demos, documentation, and some prototypes.\n", - "+ They are not great for production code and not great for experiment tracking." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A Note about Copilot\n", + "+ AI-assisted coding is a reality, and developers are incorporating it into their day-to-day activities.\n", + "+ This technology will allow you to solve questions, resolve syntax issues, and bring new ideas. However, you may want to consider a few items:\n", "\n", - "+ AI-assisted coding is a reality. I would like your opinions about the use of this technology.\n", - "+ I will start the course with Copilot on, but if it becomes too distracting, I will be happy to turn it off. \n", - "+ Copilot is a nice tool, but it is not for everyone. If you are starting to code or are trying to level up, I recommend that you leave AI assistants (Copilot, ChatGPT, etc.) for later." 
+ " - You are still responsible for your code: understand what the code assistant has proposed and make appropriate changes.\n", + " - System architecture is important, and generative AI may induce architectural decisions that may impact system performance significantly.\n", + " - If you are starting out as a developer, give yourself a chance to make mistakes and learn by trial and error. Code assistants can help you when you get stuck, but experimentation is great for learning.\n", + " - Not all models are the same: some perform better than others." ] } ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -412,7 +321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/02_data_engineering.ipynb b/01_materials/labs/02_data_engineering.ipynb index ec48f374a..16c49bd4d 100644 --- a/01_materials/labs/02_data_engineering.ipynb +++ b/01_materials/labs/02_data_engineering.ipynb @@ -21,12 +21,12 @@ "\n", "* Build a data pipeline that downloads price data from the internet, stores it locally, transforms it into return data, and stores the feature set.\n", " - Getting the data.\n", - " - Schemas and index in dask.\n", + " - Schemas and index in Dask.\n", "\n", "* Explore the parquet format.\n", " - Reading and writing parquet files.\n", " - Read datasets that are stored in distributed files.\n", - " - Discuss dask vs pandas as a small example of big vs small data.\n", + " - Discuss Dask vs. Pandas as a small example of big vs small data.\n", " \n", "* Discuss the use of environment variables for settings.\n", "* Discuss how to use Jupyter notebooks and source code concurrently. 
\n", @@ -35,19 +35,23 @@ "## About the Data\n", "\n", "+ We will download the prices for a list of stocks.\n", - "+ The source is Yahoo Finance and we will use the API provided by the library yfinance.\n", + "+ The source is Yahoo Finance, and the data, along with its description, is available via [Kaggle](https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset).\n", "\n", "\n", "## Medallion Architecture\n", "\n", "+ The architecture that we are thinking about is called Medallion by [DataBricks](https://www.databricks.com/glossary/medallion-architecture). It is an ELT type of thinking, although our data is well-structured.\n", "\n", - "![Medallion Architecture (DataBicks)](./images/02_medallion_architecture.png)\n", + "
\n", + "\n", + "
\n", "\n", "+ In our case, we would like to optimize the number of times that we download data from the internet. \n", - "+ Ultimately, we will build a pipeline manager class that will help us control the process of obtaining and transforming our data.\n", + "+ Ultimately, we will build a pipeline manager class to control the process of obtaining and transforming our data.\n", "\n", - "![](./images/02_target_pipeline_manager.png)" + "
\n", + "\n", + "
\n" ] }, { @@ -56,14 +60,17 @@ "source": [ "# Download Data\n", "\n", - "Download the [Stock Market Dataset from Kaggle](https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset). Note that you may be required to register for a free account.\n", + "Download the [Stock Market Dataset from Kaggle](https://www.kaggle.com/datasets/jacksoncrow/stock-market-dataset). Note that you may be required to register for a free account. Alternatively, download the file from [this location](https://drive.google.com/drive/folders/1AA4gapDLpI194TGce1bY25sd91Km-tU3?usp=drive_link).\n", "\n", - "Extract all files into the directory: `./05_src/data/prices_csv/`\n", - "\n", - "Your folder structure should include the following paths:\n", - "\n", - "+ `05_src/data/prices_csv/etfs`\n", - "+ `05_src/data/prices_csv/stocks`\n" + "+ Extract stock prices (not ETFs) into the directory: `./05_src/data/prices_csv/`. \n", + "+ To be clear, your folder structure should include the path `05_src/data/prices_csv/stocks`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The command `%run update_path.py` runs a local script that adds the repository's `./05_src/` directory to the Notebook's kernel path. This way, we can use our modules within the notebook." ] }, { @@ -72,34 +79,54 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "import os\n", - "import sys\n", - "from glob import glob\n", - "\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", - "\n", - "from utils.logger import get_logger\n", - "_logs = get_logger(__name__)" + "%run update_path.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A few things to notice in the code chunk above:\n", + "To load the historical price data for stocks and ETFs, use the code below. Notice the following:\n", + "\n", + "+ Libraries are ordered from high-level to low-level libraries from the package manager. Local modules are imported at the end. 
\n", + "+ The function `get_logger()` is called with `__name__` as recommended by [Python's documentation](https://docs.python.org/2/howto/logging.html#logging-advanced-tutorial).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "import os\n", + "import pandas as pd\n", + "import random\n", "\n", - "+ Libraries are ordered from high-level to low-level libraries from the package manager (pip in this case, but could be uv, poetry, etc.)\n", - "+ The command `sys.path.append(\"../05_src/)` will add the `../05_src/` directory to the path in the Notebook's kernel. This way, we can use our modules as part of the notebook.\n", - "+ Local modules are imported at the end. \n", - "+ The function `get_logger()` is called with `__name__` as recommended by the documentation." + "from utils.logger import get_logger\n", + "_logs = get_logger(__name__)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now, to load the historical price data for stocks and ETFs, we could use:" + "+ The [`glob` module](https://docs.python.org/3/library/glob.html) is used for finding path names that match specified patterns using Unix shell-style rules.\n", + "\n", + "+ Notice that the module `glob` contains a function called `glob`; therefore, we used `from glob import glob` above.\n", + "\n", + "+ The path in which we are searching for our csv files is produced by joining two strings:\n", + "\n", + "\n", + "\n", + " - The value of the environment variable 'SRC_DIR' that we obtain with `os.getenv('SRC_DIR')` (this variable points to ./05_src).\n", + "\n", + " - Another string given by \"data/prices_csv/stocks/*.csv\".\n", + "\n", + " - Both strings are combined into an OS-consistent path using `os.path.join(...)`.\n", + "\n", + "+ After we know the location of all our files, we sample a subset of them." 
] }, { @@ -108,13 +135,35 @@ "metadata": {}, "outputs": [], "source": [ - "import random\n", - "\n", "stock_files = glob(os.path.join(os.getenv('SRC_DIR'), \"data/prices_csv/stocks/*.csv\"))\n", + "_logs.info(f'Found {len(stock_files)} stock price files.')\n", "\n", "random.seed(42)\n", - "stock_files = random.sample(stock_files, 60)\n", + "n_sample = 60\n", + "stock_files = random.sample(stock_files, n_sample)\n", + "_logs.info(f'Sampled {n_sample} stock price files for processing. The files are: {stock_files }')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the sampled files into dataframes and concatenate them:\n", "\n", + "+ Start with an empty list.\n", + "+ Read each file into a dataframe and [`append()` it to the list](https://docs.python.org/3/tutorial/datastructures.html#more-on-lists). Notice that `append()` is an in-place operation (it does not return a list, it modifies the list in place).\n", + "+ Finally, we concatenate all dataframes along the vertical axis (`axis=0`) using [`pd.concat()`](https://pandas.pydata.org/docs/user_guide/merging.html#concat). \n", + "+ Notice that we do not concatenate each time that we load a dataframe. According to [Panda's documentation](https://pandas.pydata.org/docs/user_guide/merging.html#concat): \n", + "\n", + "> \"`concat()` makes a full copy of the data, and iteratively reusing `concat()` can create unnecessary copies. 
Collect all DataFrame or Series objects in a list before using `concat()`.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "dt_list = []\n", "for s_file in stock_files:\n", " _logs.info(f\"Reading file: {s_file}\")\n", @@ -124,15 +173,14 @@ " Date = lambda x: pd.to_datetime(x['Date'])\n", " )\n", " dt_list.append(dt)\n", - "stock_prices = pd.concat(dt_list, axis = 0, ignore_index = True)\n", - "\n" + "stock_prices = pd.concat(dt_list, axis = 0, ignore_index = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Verify the structure of the `stock_prices` data:" + "Verify the structure of the `stock_prices` data using the [`info()` dataframe method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html):" ] }, { @@ -148,7 +196,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can subset our ticker data set using standard indexing techniques. A good reference for this type of data manipulation is Panda's [Documentation](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-and-selecting-data) and [Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook-selection)." + "We can subset our ticker data set using standard indexing techniques. Good references for this type of data manipulation are:\n", + "\n", + "+ [Panda's Documentation](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-and-selecting-data). \n", + "+ [Panda's Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#cookbook-selection)." ] }, { @@ -173,18 +224,12 @@ "metadata": {}, "source": [ "# Storing Data in CSV\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "\n", "+ We have some data. 
How do we store it?\n", - "+ We can compare two options, CSV and Parqruet, by measuring their performance:\n", + "+ We can compare two options, CSV and Parquet, by measuring their performance:\n", "\n", - " - Time to save.\n", - " - Space required." + " - Time to save: We will measure time by using the `time` library.\n", + " - Space required on drive: We will use the custom function below." ] }, { @@ -206,13 +251,15 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import time\n", - "import shutil" + "Load the specification of a temporary directory from the environment and create a subdirectory in it called \"csv\":\n", + "\n", + "+ Use `os.getenv(\"TEMP_DATA\")` to obtain the desired location of the temporary folder from an environment variable.\n", + "+ If the subdirectory exists, delete it using `shutil.rmtree()`; the flag `ignore_errors=True` helps us in case the subdirectory does not exist (for instance, in the first run).\n", + "+ Create a directory with path given by `csv_dir` using `os.makedirs()`; the flag `exist_ok=True` indicates that if the directory already exists, then the function will do nothing.\n", + "+ Finally, create the stock price file location, `stock_csv`, which will be used to create the csv file." ] }, { @@ -221,11 +268,21 @@ "metadata": {}, "outputs": [], "source": [ + "import shutil\n", + "\n", "temp = os.getenv(\"TEMP_DATA\")\n", "csv_dir = os.path.join(temp, \"csv\")\n", + "\n", "shutil.rmtree(csv_dir, ignore_errors=True)\n", - "stock_csv = os.path.join(csv_dir, \"stock_px.csv\")\n", - "os.makedirs(csv_dir, exist_ok=True)" + "os.makedirs(csv_dir, exist_ok=True)\n", + "stock_csv = os.path.join(csv_dir, \"stock_px.csv\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save the concatenated dataframe to a CSV file. We measure the time elapsed by storing the start and end times, then we calculate their difference in seconds." 
] }, { @@ -234,6 +291,7 @@ "metadata": {}, "outputs": [], "source": [ + "import time\n", "\n", "start = time.time()\n", "stock_prices.to_csv(stock_csv, index = False)\n", @@ -249,11 +307,22 @@ "source": [ "## Save Data to Parquet\n", "\n", - "### Dask \n", + "### Notes on Dask \n", "\n", - "We can work with with large data sets and parquet files. In fact, recent versions of pandas support pyarrow data types and future versions will require a pyarrow backend. The pyarrow library is an interface between Python and the Appache Arrow project. The [parquet data format](https://parquet.apache.org/) and [Arrow](https://arrow.apache.org/docs/python/parquet.html) are projects of the Apache Foundation.\n", + "We could use Pandas to save the data directly into Parquet files. However, we will use a different approach by applying the [Dask framework](https://www.dask.org/). Dask provides functionality for working with datasets that do not fit in memory and parallelization to speed up computation. A few notes on Dask and Pandas:\n", "\n", - "However, Dask is much more than an interface to Arrow: Dask provides parallel and distributed computing on pandas-like dataframes. It is also relatively easy to use, bridging a gap between pandas and Spark. " + "- Pandas, Parquet, and Arrow:\n", + "\n", + " + We can work with large datasets and Parquet files in Pandas, but we will generally be limited by the amount of data that can fit in our computer's memory.\n", + " + Pandas can write Parquet files using a PyArrow backend. In fact, recent versions of Pandas support PyArrow data types, and future versions will require a PyArrow backend. \n", + " + The PyArrow library is an interface between Python and the Apache Arrow project. 
In particular, the [Parquet data format](https://parquet.apache.org/) and [Arrow](https://arrow.apache.org/docs/python/parquet.html) are Apache projects.\n", + "\n", + "- Dask \n", + "\n", + " + Dask is much more than an interface to Arrow: Dask provides parallel and distributed computing on Pandas-like dataframes. \n", + " + Dask is also relatively easy to use as it mimics Pandas' API.\n", + " + Dask allows us to work with larger datasets than Pandas. In a sense, it is an intermediate step between Pandas and big-data frameworks like Spark (or Databricks).\n", + " + If you are familiar with Pandas, a good introduction is [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html)." ] }, { @@ -289,40 +358,83 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Parquet files and Dask Dataframes\n", + "### Pandas, Dask, and Parquet\n", + "\n", + "The distinction of Pandas Dataframes, Dask Dataframes, and Parquet files is important:\n", + "\n", + "+ Pandas dataframes combine the functionality of [NumPy](https://numpy.org/) (efficient vector operations, especially vector algebra) with a concise data manipulation framework that allows us to create columns of different data types (NumPy only allows single-type matrices), database-like operations (such as filtering rows, subsetting columns, and joining different dataframes).\n", + "+ Dask dataframes extend the functionality of Pandas dataframes beyond the confines of available memory and implement parallelized operations, among other benefits.\n", + "+ Parquet files are a file format. Parquet files can be created by Pandas and Dask, but Dask offers a superior interface. Parquet files are immutable: once written, they cannot be modified.\n", + "+ Parquet and Dask are not the same: Parquet is a file format that can be accessed by many applications and programming languages (Python, R, Power BI, etc.), while Dask is a Python package for working with large datasets using distributed computation." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dask is Powerful, but not Infallible\n", + "\n", + "It is tempting to think of Dask as a super-Pandas, but each package has its advantages and disadvantages. \n", + "\n", + "+ Dask is not good at everything (see [Dask DataFrames Best Practices](https://docs.dask.org/en/stable/dataframe-best-practices.html)). \n", + "+ A useful and somewhat idiosyncratic comparison of various data manipulation frameworks is shown below (from [DataFrames at Scale Comparison: TPC-H](https://docs.coiled.io/blog/tpch.html#when-to-use-duckdb-vs-polars-vs-dask-vs-spark)) \n", + "\n", + "Concept | Spark | Dask | DuckDB | Polars\n", + "---------------------|-------|------|--------|--------\n", + "Fast Locally | ❌ | 🤔 | ✅ | ✅\n", + "Fast on Cloud (1 TB) | ✅ | ✅ | ✅ | ❌\n", + "Fast on Cloud (10 TB)| ❌ | ✅ | ✅ | ❌\n", + "Scales Out | ✅ | ✅ | ❌ | ❌\n", + "SQL | ✅ | 🤔 | ✅ | 🤔\n", + "More than SQL | ✅ | ✅ | ❌ | ✅\n", + "Sensible Defaults | ❌ | ✅ | ✅ | ✅\n", + "\n", + "\n", + "### Dask Best Practices\n", + "\n", + "Parallelism brings extra complexity and overhead. Here are a few ideas to help you decide when to use Dask (from [Dask's Best Practices](https://docs.dask.org/en/stable/best-practices.html)):\n", + "\n", + "#### Small is Better\n", "\n", - "+ Parquet files are immutable: once written, they cannot be modified.\n", - "+ Dask DataFrames are a useful implementation to manipulate data stored in parquets.\n", - "+ Parquet and Dask are not the same: parquet is a file format that can be accessed by many applications and programming languages (Python, R, PowerBI, etc.), while Dask is a package in Python to work with large datasets using distributed computation.\n", - "+ **Dask is not for everything** (see [Dask DataFrames Best Practices](https://docs.dask.org/en/stable/dataframe-best-practices.html)). \n", + "+ Start small: if possible, use Pandas. 
Also, try to reduce your data using aggregation, then use Pandas.\n", + "+ More generally, NumPy, Pandas, and Scikit-Learn may have faster functions for what you need. Consult the relevant documentation, experiment, and/or consult with a colleague or expert.\n", "\n", - " - Consider cases suchas small to large joins, where the small dataframe fits in memory, but the large one does not. \n", - " - If possible, use pandas: reduce, then use pandas.\n", - " - Pandas performance tips apply to Dask.\n", - " - Use the index: it is beneficial to have a well-defined index in Dask DataFrames, as it may speed up searching (filtering) the data. A one-dimensional index is allowed.\n", - " - Avoid (or minimize) full-data shuffling: indexing is an expensive operations. \n", - " - Some joins are more expensive than others. \n", + "#### Index with Care\n", "\n", - " * Not expensive:\n", + "+ Use the index: it is beneficial to have a well-defined index in Dask DataFrames, as it may speed up searching (filtering) the data. A one-dimensional index is allowed.\n", + "+ Minimize full-data shuffling as much as possible: indexing is an expensive operation. \n", "\n", - " - Join a Dask DataFrame with a pandas DataFrame.\n", - " - Join a Dask DataFrame with another Dask DataFrame of a single partition.\n", - " - Join Dask DataFrames along their indexes.\n", + "### Consider the Cost of Joins\n", "\n", - " * Expensive:\n", + "+ Consider cases such as small-to-large joins, where the small dataframe fits in memory, but the large one does not. The small dataframe can be Pandas, while the larger one is a Dask dataframe.\n", + "+ Some joins are more expensive than others. 
\n", "\n", - " - Join Dask DataFrames along columns that are not their index.\n" + " * Not expensive:\n", + "\n", + " - Join a Dask DataFrame with a Pandas DataFrame.\n", + " - Join a Dask DataFrame with another Dask DataFrame of a single partition.\n", + " - Join Dask DataFrames along their indexes.\n", + "\n", + " * Expensive:\n", + "\n", + " - Join Dask DataFrames along columns that are not their index." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# How do we store prices?\n", + "# How Do We Store Prices?\n", "\n", - "+ We can store our data as a single blob. This can be difficult to maintain, especially because parquet files are immutable.\n", - "+ Strategy: organize data files by ticker and date. Update only latest month.\n", - "\n" + "+ We can store our data as a single blob. \n", + "\n", + " - This can be difficult to maintain, especially because parquet files are immutable.\n", + " - Using a single file, we would need to recreate the complete file any time that we update it.\n", + "\n", + "+ An alternative strategy is to organize data files by ticker and date: \n", + "\n", + " - We can create one file per ticker and month (or any other suitable frequency). \n", + " - Under this approach, we would only need to recreate the latest month's file at any update. 
" ] }, { @@ -354,12 +466,22 @@ "outputs": [], "source": [ "for ticker in stock_prices['ticker'].unique():\n", + " # Filter data for ticker\n", + " # Notice that these are Pandas dataframes\n", + " _logs.info(f'Processing ticker: {ticker}')\n", " ticker_dt = stock_prices[stock_prices['ticker'] == ticker]\n", " ticker_dt = ticker_dt.assign(Year = ticker_dt.Date.dt.year)\n", " for yr in ticker_dt['Year'].unique():\n", + " _logs.info(f'Processing year {yr} for ticker {ticker}.')\n", + " # Filter data for year and convert to Dask dataframe\n", " yr_dd = dd.from_pandas(ticker_dt[ticker_dt['Year'] == yr],2)\n", + " \n", + " # Define path and create directories if not exist\n", " yr_path = os.path.join(PRICE_DATA, ticker, f\"{ticker}_{yr}\")\n", " os.makedirs(os.path.dirname(yr_path), exist_ok=True)\n", + " _logs.info(f'Writing data to path: {yr_path}')\n", + "\n", + " # Write to Parquet\n", " yr_dd.to_parquet(yr_path, engine = \"pyarrow\")\n", " " ] @@ -370,15 +492,17 @@ "source": [ "Why would we want to store data this way?\n", "\n", - "+ Easier to maintain. We do not update old data, only recent data.\n", - "+ We can also access all files as follows." + "+ Data files are easier to maintain. We do not update old data, only recent data or the most recent \"delta\".\n", + "+ Parquet files, as long as they maintain a consistent schema, can all be read jointly. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Load, Transform and Save " + "# Load, Transform, and Save\n", + "\n", + "In this section, we will load the Parquet files we generated, transform the data, and save the resulting dataset." ] }, { @@ -389,7 +513,7 @@ "\n", "+ Parquet files can be read individually or as a collection.\n", "+ `dd.read_parquet()` can take a list (collection) of files as input.\n", - "+ Use `glob` to get the collection of files." + "+ Use `glob` to obtain the collection of files." 
] }, { @@ -401,6 +525,8 @@ "from glob import glob\n", "\n", "parquet_files = glob(os.path.join(PRICE_DATA, \"**/*.parquet\"), recursive = True)\n", + "_logs.info(f'Found {len(parquet_files)} parquet files for reading back into Dask.')\n", + "\n", "dd_px = dd.read_parquet(parquet_files).set_index(\"ticker\")" ] }, @@ -411,8 +537,18 @@ "## Transform\n", "\n", "+ This transformation step will create a *Features* data set. In our case, features will be stock returns (we obtained prices).\n", - "+ Dask dataframes work like pandas dataframes: in particular, we can perform groupby and apply operations.\n", - "+ Notice the use of [an anonymous (lambda) function](https://realpython.com/python-lambda/) in the apply statement." + "+ Dask dataframes work similarly to Pandas dataframes: in particular, we can perform groupby and apply operations.\n", + "+ Notice the use of [an anonymous (lambda) function](https://realpython.com/python-lambda/) in the apply statement.\n", + "\n", + "In the code below, the following operation occurs:\n", + "\n", + "+ Start with a Dask dataframe, `dd_px`.\n", + "+ Group the rows of this dataframe by the variable `ticker`, i.e., each group will contain the observations that pertain only to one ticker at a time. The `group_key` parameter controls whether an index entry is added with the value of the grouping variable (`ticker` in this case); if we made `group_keys=True`, we would have a duplicate `ticker` column.\n", + "+ For each group defined by a `ticker`, `apply()` the following calculation:\n", + "\n", + " - Sort the values by `Date` in ascending order.\n", + " - Assign a new variable called `Close_lag_1` by shifting the position of the closing price (`Close`) by one position. \n", + " - Define the schema of the resulting dataframe. If we omit this specification, we would get a warning; however, the simplicity of the calculation ensures that Dask can determine the resulting schema." 
] }, { @@ -421,9 +557,40 @@ "metadata": {}, "outputs": [], "source": [ - "dd_shift = dd_px.groupby('ticker', group_keys=False).apply(\n", - " lambda x: x.assign(Close_lag_1 = x['Close'].shift(1))\n", - ")" + "dd_px" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd_shift = (\n", + " dd_px\n", + " .groupby('ticker', group_keys=False)\n", + " .apply(\n", + " lambda x: x.sort_values('Date', ascending = True)\n", + " .assign(Close_lag_1 = x['Close'].shift(1)), \n", + " meta = pd.DataFrame(data ={'Date': 'datetime64[ns]',\n", + " 'Open': 'f8',\n", + " 'High': 'f8',\n", + " 'Low': 'f8',\n", + " 'Close': 'f8',\n", + " 'Adj Close': 'f8',\n", + " 'Volume': 'i8',\n", + " 'source': 'object',\n", + " 'Year': 'int32',\n", + " 'Close_lag_1': 'f8'},\n", + " index = pd.Index([], dtype=pd.StringDtype(), name='ticker'))\n", + " ))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, using the dataframe that we created above, we can now `assign` the `Returns` variable to the entire dataset." ] }, { @@ -437,6 +604,13 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Question: How do we know that we are not (erroneously) combining the last price of a ticker with the first price of the next ticker?" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -459,8 +633,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "+ Dask is a lazy execution framework: commands will not execute until they are required. \n", - "+ To trigger an execution in dask use `.compute()`." + "+ Dask is a lazy execution framework: commands will not execute until the computation is required. \n", + "+ To trigger an execution in dask use `.compute()` or execute a command that requires the actual values (for example, write to Parquet or SQL)." 
] }, { @@ -478,9 +652,19 @@ "source": [ "## Save\n", "\n", - "+ Apply transformations to calculate daily returns\n", - "+ Store the enriched data, the silver dataset, in a new directory.\n", - "+ Should we keep the same namespace? All columns?" + "With our transformed data, we can now save the new feature to a Parquet file. We will need to answer the following questions depending on our context, setup, and available resources:\n", + "\n", + "+ Should we keep the same namespace? \n", + "+ Should we save all columns?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dd_rets" ] }, { @@ -493,7 +677,22 @@ "FEATURES_DATA = os.getenv(\"FEATURES_DATA\")\n", "if os.path.exists(FEATURES_DATA):\n", " shutil.rmtree(FEATURES_DATA)\n", - "dd_rets.to_parquet(FEATURES_DATA, overwrite = True)" + "dd_rets.to_parquet(FEATURES_DATA, \n", + " overwrite = True, \n", + " schema={\n", + " 'Date': 'timestamp[ns]',\n", + " 'Open': 'float64',\n", + " 'High': 'float64',\n", + " 'Low': 'float64',\n", + " 'Close': 'float64',\n", + " 'Adj Close': 'float64',\n", + " 'Volume': 'int64',\n", + " 'source': 'string',\n", + " 'Year': 'int32',\n", + " 'Close_lag_1': 'float64',\n", + " 'Returns': 'float64',\n", + " 'ticker': 'large_string'\n", + " })" ] }, { @@ -574,7 +773,7 @@ ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -588,7 +787,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/03a_sampling.ipynb b/01_materials/labs/03a_sampling.ipynb index 44c5cf302..c4b9e98be 100644 --- a/01_materials/labs/03a_sampling.ipynb +++ b/01_materials/labs/03a_sampling.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "In this notebook, we 
will demonstrate various sampling methods in Pandas and Dask. To illustrate the methods, we use a dataset on the [annual number of objects launched into space from Our World in Data](https://ourworldindata.org/grapher/yearly-number-of-objects-launched-into-outer-space) and hosted in [Tidy Tuesday's Repository](https://github.com/rfordatascience/tidytuesday/blob/main/data/2024/2024-04-23/readme.md)." + ] + }, { "cell_type": "code", "execution_count": null, @@ -8,9 +17,17 @@ "source": [ "%load_ext dotenv\n", "%dotenv \n", - "import os\n", - "import sys\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", + "%run update_path.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "from glob import glob\n", "from utils.logger import get_logger\n", "_logs = get_logger(__name__)" ] @@ -21,11 +38,8 @@ "metadata": {}, "outputs": [], "source": [ - "import dask.dataframe as dd\n", "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "from glob import glob" + "outer_space_dt = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-04-23/outer_space_objects.csv')" ] }, { @@ -34,25 +48,27 @@ "metadata": {}, "outputs": [], "source": [ - "ft_dir = os.getenv(\"FEATURES_DATA\")\n", - "ft_glob = glob(os.path.join(ft_dir, '**/*.parquet'), \n", - " recursive = True)\n", - "df = dd.read_parquet(ft_glob).compute().reset_index()" + "outer_space_dt.info()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "# Sampling in Python" + "idx = outer_space_dt['Year'] >= 2020\n", + "idx &= outer_space_dt['Entity'] != 'World'\n", + "outer_space_dt = outer_space_dt[idx]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "+ There are different packages that allow sampling.\n", - "+ A practical approach is to use pandas/Dask sampling methods." 
+ "# Sampling in Python\n", + "\n", + "There are different packages that allow sampling. A practical approach is to use pandas/Dask sampling methods." ] }, { @@ -61,9 +77,9 @@ "source": [ "## Random Sampling\n", "\n", - "+ Sample n rows from a dataframe with [`df.sample()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).\n", + "Sample n rows from a dataframe with [`df.sample()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html).\n", "\n", - "```\n", + "```python\n", "DataFrame.sample(\n", " n=None, frac=None, replace=False, weights=None, \n", " random_state=None, axis=None, ignore_index=False\n", @@ -77,7 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.sample(n = 5)" + "outer_space_dt.sample(n = 10, random_state = 42)" ] }, { @@ -88,17 +104,19 @@ "source": [ "import random\n", "random.seed(42)\n", - "sample_tickers = random.sample(df['ticker'].unique().tolist(), 30)\n", - "df = df[df['ticker'].isin(sample_tickers)]\n", - "simple_sample_dt = df.sample(frac = 0.1)\n", - "simple_sample_dt.shape, df.shape" + "frac = 0.5\n", + "\n", + "simple_sample_dt = outer_space_dt.sample(frac = frac)\n", + "simple_sample_dt.shape, outer_space_dt.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Look at the distribution of tickers." + "## Stratified Sampling\n", + "\n", + "Use `groupby()` and `.sample()` for stratified sampling." 
] }, { @@ -107,7 +125,8 @@ "metadata": {}, "outputs": [], "source": [ - "df['ticker'].value_counts().plot(kind='bar')" + "strat_sample_dt=outer_space_dt.groupby('Entity').sample(frac=frac, random_state=42)\n", + "strat_sample_dt.shape, outer_space_dt.shape" ] }, { @@ -116,26 +135,34 @@ "metadata": {}, "outputs": [], "source": [ - "simple_sample_dt['ticker'].value_counts().plot(kind='bar')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stratified Sampling\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", "\n", - "+ Use `groupby()` and `.sample()` for stratified sampling." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "strat_sample_dt = df.groupby(['ticker']).sample(frac = 0.1)\n", - "strat_sample_dt['ticker'].value_counts().plot(kind='bar')" + "# Prepare data for comparison\n", + "df_orig = outer_space_dt['Entity'].value_counts().reset_index()\n", + "df_orig.columns = ['Entity', 'count']\n", + "df_orig['sample_type'] = 'Original'\n", + "\n", + "df_simple = simple_sample_dt['Entity'].value_counts().reset_index()\n", + "df_simple.columns = ['Entity', 'count']\n", + "df_simple['sample_type'] = 'Simple Random'\n", + "\n", + "df_strat = strat_sample_dt['Entity'].value_counts().reset_index()\n", + "df_strat.columns = ['Entity', 'count']\n", + "df_strat['sample_type'] = 'Stratified'\n", + "\n", + "# Combine all data\n", + "combined_df = pd.concat([df_orig, df_simple, df_strat])\n", + "\n", + "# Create faceted plot\n", + "sns.set_style(\"whitegrid\")\n", + "g = sns.catplot(data=combined_df, x='Entity', y='count', col='sample_type', \n", + " kind='bar', height=5, aspect=1, palette='Set2')\n", + "g.set_xticklabels(rotation=90, ha='right', fontsize=5)\n", + "g.set_titles(\"{col_name}\")\n", + "plt.tight_layout()\n", + "plt.show()" ] }, { @@ -144,7 +171,15 @@ "source": [ "# Sampling in Dask\n", "\n", - "+ Stratified sampling in 
`dask` can be achieved with `groupby().apply()` and a lambda function." + "Stratified sampling in Dask works somewhat differently. The code below will raise a Key Error (the \"key\" *sample* is not found).\n", + "\n", + "```python\n", + "strat_sample_dd = (dd_dt.groupby('Entity', group_keys=False)\n", + " .sample(frac = frac)\n", + " .compute())\n", + "```\n", + "\n", + "However, stratified sampling in Dask can be done with `groupby().apply()` and a lambda function." ] }, { @@ -153,19 +188,19 @@ "metadata": {}, "outputs": [], "source": [ - "dd_dt = dd.read_parquet(ft_glob)\n", + "dd_dt = dd.from_pandas(outer_space_dt, npartitions=4)\n", + "\n", "strat_sample_dd = (dd_dt\n", - " .groupby('ticker', group_keys=False)\n", - " .apply(lambda x: x.sample(frac = 0.1))\n", + " .groupby('Entity', group_keys=False)\n", + " .apply(lambda x: x.sample(frac = frac))\n", " .compute()\n", - " .reset_index())\n", - "strat_sample_dd[strat_sample_dd['ticker'].isin(sample_tickers)]['ticker'].value_counts().plot(kind='bar')" + " .reset_index())\n" ] } ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -179,7 +214,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/03b_pipeline.ipynb b/01_materials/labs/03b_pipeline.ipynb index 0cb641651..a025adc22 100644 --- a/01_materials/labs/03b_pipeline.ipynb +++ b/01_materials/labs/03b_pipeline.ipynb @@ -4,7 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# An initial training pipeline" + "# Introduction to Model Pipelines\n", + "\n", + "In this notebook, we implement a few simple pipelines using Scikit-Learn. To illustrate the procedures, we will use the dataset on [Wine Quality available in the UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/186/wine+quality)." 
] }, { @@ -15,17 +17,33 @@ "source": [ "%load_ext dotenv\n", "%dotenv \n", - "import os\n", - "import sys\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", - "import dask.dataframe as dd\n", + "%run update_path.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the [ucimlrepo](https://pypi.org/project/ucimlrepo/) package to easily access the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "import pandas as pd\n", - "import numpy as np\n", + "from ucimlrepo import fetch_ucirepo \n", "\n", - "from glob import glob\n", - "ft_dir = os.getenv(\"FEATURES_DATA\")\n", - "ft_glob = glob(ft_dir+'/*.parquet')\n", - "df = dd.read_parquet(ft_glob).compute().reset_index().dropna()\n" + "wine_quality = fetch_ucirepo(id=186)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the object `wine_quality`, which contains useful information about the dataset. For example, use `wine_quality.keys()` to obtain the object's keys. 
" ] }, { @@ -34,14 +52,69 @@ "metadata": {}, "outputs": [], "source": [ - "cat_file = os.path.join(\n", - " os.getenv(\"PRICE_CSV_DATA\"), \n", - " 'symbols_valid_meta.csv'\n", - ")\n", - "cat_df = (pd.read_csv(cat_file)\n", - " .rename(columns = {'Symbol': 'ticker'})[['ticker', 'Listing Exchange', 'Market Category']]\n", - " )\n", - "df = df.merge(cat_df, on = 'ticker', how = 'left')" + "wine_quality.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wine_quality.data.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dt = wine_quality.data.original\n", + "X = dt.drop(columns=['quality'])\n", + "Y = 1*(dt[['quality']] >= 7).values.ravel()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "sns.pairplot(X, diag_kind='hist', plot_kws={'alpha': 0.5})\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(8, 5))\n", + "sns.histplot(data=dt, x='quality', \n", + " bins=range(dt['quality'].min(), dt['quality'].max() + 2), \n", + " edgecolor='black', kde=False, discrete=True, multiple='stack')\n", + "plt.xlabel('Quality')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Distribution of Wine Quality')\n", + "plt.show()" ] }, { @@ -50,10 +123,9 @@ "source": [ "## Preprocessing\n", "\n", - "+ Previously, we produced a features data set.\n", - "+ Most times, one or more [preprocessing steps](https://scikit-learn.org/stable/modules/preprocessing.html#) steps will be applied to 
data.\n", - "+ The most practical way to apply them is by arranging them in `Pipeline` objects, wchich are sequential transformations applied to data. \n", - "+ It is convenient for us to label these transformations and there is a standard way of doing so.\n" + "+ While building machine learning models, we will apply one or more [preprocessing or feature engineering steps](https://scikit-learn.org/stable/modules/preprocessing.html#) to the data.\n", + "+ The most practical way to do this is by arranging the preprocessing steps in `Pipeline` objects, which are sequential transformations applied to data. After preprocessing, we pass the data to our model. \n", + "+ It is convenient for us to label these transformations, and there is a standard way of doing so." ] }, { @@ -62,19 +134,18 @@ "source": [ "## Transformations\n", "\n", - "+ Transformations are classes that implement `fit` and `transform` methods.\n", + "Transformations are classes that implement `fit` and `transform` methods.\n", "\n", "### StandardScaler\n", "\n", - "+ For example, transform a numerical variable by standardizing it.\n", - "- Standardization is removing the mean value of the feature and scale it by dividing non-constant features by their standard deviation.\n", + "For example, transform a numerical variable by standardizing it. 
Standardization removes the mean of each feature and scales it by dividing each non-constant feature by its standard deviation.\n", "\n", "$$\n", "z = \\frac{x-\\mu}{\\sigma}\n", "$$\n", "\n", "\n", - "+ Using [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html), one can do the following:" + "Using [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html), one can do the following:" ] }, { @@ -83,7 +154,16 @@ "metadata": {}, "outputs": [], "source": [ - "df.columns" + "# Create a StandardScaler object\n", + "from sklearn.preprocessing import StandardScaler\n", + "std_scaler = StandardScaler()\n", + "\n", + "# Select only numeric features for scaling\n", + "numeric_features = X.select_dtypes(include=['int64', 'float64']).columns\n", + "X_num = X[numeric_features]\n", + "\n", + "# Fit the StandardScaler object with the returns data\n", + "std_scaler.fit(X_num)" ] }, { @@ -92,18 +172,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = (df.assign(\n", - " returns = lambda x: x['Close']/x['Close_lag_1'] - 1, \n", - " positive_return = lambda x: 1.0*(x['returns'] > 0),\n", - " hi_lo = lambda x: x['High'] - x['Low'],\n", - " op_cl = lambda x: x['Close'] - x['Open']\n", - " ).groupby(['ticker'], group_keys=False).apply(\n", - " lambda x: x.assign(target = x['positive_return'].shift(-1))\n", - " )\n", - " .reset_index(drop=True)\n", - " .dropna(subset = ['target'])\n", - " )\n", - "df" + "X_num.mean()" ] }, { @@ -112,14 +181,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a StandardScaler object\n", - "\n", - "from sklearn.preprocessing import StandardScaler\n", - "std_scaler = StandardScaler()\n", - "\n", + "# Transform the returns data using the fitted scaler\n", "\n", - "# Fit the StandardScaler object with the returns data\n", - "std_scaler.fit(returns)" + "scaled_X = std_scaler.transform(X_num)\n", + "scaled_X_df = pd.DataFrame(scaled_X, 
columns=X_num.columns)" ] }, { @@ -128,11 +193,25 @@ "metadata": {}, "outputs": [], "source": [ - "# Transform the returns data using the fitted scaler\n", - "\n", - "scaled_returns_np = std_scaler.transform(returns)\n", - "scaled_returns = pd.DataFrame(scaled_returns_np, columns=returns.columns)\n", - "scaled_returns.describe()" + "scaled_X_df.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_num.std()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "scaled_X_df.std()" ] }, { @@ -141,7 +220,7 @@ "source": [ "### OneHotEncoder\n", "\n", - "+ Categorical features can be encoded as numerical values using `OneHotEncoder`." + "Categorical features can be encoded as numerical values using `OneHotEncoder`." ] }, { @@ -150,19 +229,20 @@ "metadata": {}, "outputs": [], "source": [ - "df['Listing Exchange'].value_counts().plot(kind = 'bar')" + "X['color'].value_counts().plot(kind = 'bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "+ Use [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to encode a categorical variable as numerical.\n", - "+ Important parameters:\n", + "Use [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) to encode a categorical variable as numerical.\n", + "\n", + "Some key parameters are:\n", "\n", - " - `categories` allows you to specify the categories to work with.\n", - " - `drop`: we can drop the `'first'` value (dummy encoding) or `'if_binary'`, a convenience setting for binary values.\n", - " - `handle_unknown` allows three options, `'error'`, `'ignore'`, and `'infrequent_if_exist'`, depending on what we want to do with new values." 
+ "- `categories` allows you to specify the categories to work with.\n", + "- `drop`: we can drop the `'first'` value (dummy encoding) or `'if_binary'`, a convenience setting for binary values.\n", + "- `handle_unknown` allows three options, `'error'`, `'ignore'`, and `'infrequent_if_exist'`, depending on what we want to do with new values." ] }, { @@ -172,8 +252,8 @@ "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", - "onehot = OneHotEncoder()\n", - "onehot.fit(df[['Listing Exchange']])" + "onehot = OneHotEncoder(drop='if_binary')\n", + "onehot.fit(X[['color']])" ] }, { @@ -182,7 +262,7 @@ "metadata": {}, "outputs": [], "source": [ - "listing_enc = onehot.transform(df[['Listing Exchange']])\n", + "listing_enc = onehot.transform(X[['color']])\n", "listing_enc.toarray()" ] }, @@ -193,12 +273,12 @@ "# Pipelines\n", "\n", "+ It is impractical and costly to manipulate data \"by hand\". \n", - "+ To manage data preprocessing steps within the cross-validation process use `Pipeline` objects.\n", + "+ To manage data preprocessing steps within the cross-validation process, use `Pipeline` objects.\n", "+ A [`Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) object allows us to sequentially apply transformation steps and, if required, a predictor.\n", "+ `Pipeline` objects compose transforms, i.e., classes that implement `transform` and `fit` methods.\n", "+ The purpose of `Pipeline` objects is to ensemble transforms and predictors to be used in cross-validation.\n", "+ A `Pipeline` is defined by a list of tuples.\n", - "+ Each tuple is composed of `(\"name\", )`, the name of the step and the `` function of our chosing." + "+ Each tuple is composed of `(\"name\", )`, the name of the step, and the `` function of our choosing." 
] }, { @@ -209,7 +289,7 @@ "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, log_loss, cohen_kappa_score, f1_score\n" ] @@ -222,8 +302,8 @@ "source": [ "pipe1 = Pipeline(\n", " [\n", - " ('onehot', OneHotEncoder(handle_unknown='ignore')),\n", - " ('knn', DecisionTreeClassifier(criterion = 'entropy', max_depth=3))\n", + " ('scaler', StandardScaler()),\n", + " ('logistic', LogisticRegression())\n", "\n", " ]\n", ")\n", @@ -236,9 +316,8 @@ "metadata": {}, "outputs": [], "source": [ - "X0 = df[['Listing Exchange', 'Market Category']]\n", - "Y0 = df['target']\n", - "X0_train, X0_test, Y0_train, Y0_test = train_test_split(X0, Y0, test_size=0.2, random_state=42)\n", + "\n", + "X0_train, X0_test, Y0_train, Y0_test = train_test_split(X_num, Y, test_size=0.2, random_state=42)\n", "\n", "pipe1.fit(X0_train, Y0_train)" ] @@ -286,8 +365,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "+ The model does not show great performance, but the pipeline shows results. \n", - "+ Below, we expand the pipeline to include more variables, and further we will work with more robust model selection pipelines." + "Below, we expand the pipeline to include more variables, and further, we will work with more robust model selection pipelines." 
] }, { @@ -306,28 +384,22 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.compose import ColumnTransformer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "from sklearn.compose import ColumnTransformer\n", + "\n", "transformer = ColumnTransformer(\n", " transformers=[\n", - " ('numeric_transfomer', StandardScaler(), ['returns', 'Volume', 'op_cl', 'hi_lo'] ),\n", - " ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist'), ['Listing Exchange', 'Market Category']), \n", + " ('num_transform', StandardScaler(), X_num.columns.values ),\n", + " ('cat_transform', OneHotEncoder(handle_unknown='infrequent_if_exist', drop='if_binary'), ['color']), \n", " ], remainder='drop'\n", ")\n", "\n", "pipe = Pipeline(\n", " [\n", " ('preproc', transformer), \n", - " ('decisiontree', DecisionTreeClassifier(criterion = 'entropy', max_depth=3))\n", + " ('logistic', LogisticRegression(l1_ratio=1.0))\n", " ]\n", - ")" + ")\n", + "pipe" ] }, { @@ -339,9 +411,9 @@ "The model selection process is an iterative process in which :\n", "\n", "+ Select schema and load data.\n", - "+ Define a pipeline and its (hyper) parameters.\n", + "+ Define the pipeline and its (hyper)parameters.\n", "\n", - " - Use ColumnTransformers to transform numeric and cateogrical variables.\n", + " - Use ColumnTransformers to transform numeric and categorical variables.\n", " - Hyperparameters can be defined independently of code. \n", "\n", "+ Implement a splitting strategy. 
\n", @@ -361,13 +433,13 @@ "source": [ "## Training, Validation, Testing Split\n", "\n", - "+ The first spliting strategy is to use a training, validation, and test set.\n", - "+ Training set will be used to fit the model.\n", - "+ Validation set is used to evaluate hyperparameter choice.\n", + "+ The first splitting strategy is to use a training, validation, and test set.\n", + "+ The training set will be used to fit the model.\n", + "+ The validation set is used to evaluate hyperparameter choices.\n", "+ Testing set is used to evaluate performance on data the model has not yet seen.\n", - "+ In this case we want to compare two models: \n", + "+ In this case, we want to compare two models: \n", "\n", - " - Decision Tree with 3 minumum samples per leaf.\n", + " - Decision Tree with 3 minimum samples per leaf.\n", " - Decision Tree with 10 minimum samples per leaf.\n", "\n", "![](./images/03b_train_validate_test.png)" @@ -381,13 +453,7 @@ "\n", "+ One can obtain the parameters of a pipeline with `pipe.get_params()`.\n", "+ We can set any parameter of a pipeline with `pipe.set_parames(**kwargs)`. \n", - "+ The input `**kwargs` is a dictionary of the params to be modified. Params of the steps are labeled with the name of the step followed by `__` and the name of the parameter." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "+ The input `**kwargs` is a dictionary of the params to be modified. Params of the steps are labeled with the name of the step followed by `__` and the name of the parameter.\n", "+ There are a few steps that we will repeat: \n", "\n", " - Fit the candidate model on training data.\n", @@ -395,7 +461,7 @@ " - Compute training and test performance metrics.\n", " - Return.\n", "\n", - "+ We encapsulate this procedure in a function. " + "+ We encapsulate this procedure in a function. 
\n" ] }, { @@ -429,10 +495,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Schema\n", - "X = df[['returns', 'op_cl', 'hi_lo', 'Volume', 'Listing Exchange', 'Market Category']]\n", - "Y = df['target']\n", - "\n", "# Split the data\n", "X_rest, X_test, Y_rest, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n", "X_train, X_validate, Y_train, Y_validate = train_test_split(X_rest, Y_rest, test_size=0.2, random_state=42)\n" @@ -444,10 +506,11 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "# Evaluate hyperparameter configuration 2\n", - "pipe_d3 = pipe.set_params(**{'decisiontree__max_depth': 3})\n", - "res_d3 = evaluate_model(pipe_d3, X_train, Y_train, X_validate, Y_validate)\n", - "res_d3" + "pipe_1 = pipe.set_params(**{'logistic__C': 0.0001})\n", + "res_1 = evaluate_model(pipe_1, X_train, Y_train, X_validate, Y_validate)\n", + "res_1" ] }, { @@ -457,9 +520,9 @@ "outputs": [], "source": [ "# Evaluate hyperparameter configuration 2\n", - "pipe_d15 = pipe.set_params(**{'decisiontree__max_depth':15})\n", - "res_d15 = evaluate_model(pipe_d15, X_train, Y_train, X_validate, Y_validate)\n", - "res_d15" + "pipe_2 = pipe.set_params(**{'logistic__C': 1.0})\n", + "res_2 = evaluate_model(pipe_2, X_train, Y_train, X_validate, Y_validate)\n", + "res_2" ] }, { @@ -479,7 +542,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "From [Scikit's Documentation ](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance), the diagram below shows the data divisions and folds during the cross-validation process." + "From [Scikit's Documentation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance), the diagram below shows the data divisions and folds used during cross-validation." 
] }, { @@ -493,7 +556,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are two functions that can be used for [calculating cross-validation performance scores](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance): `cross_val_score()` and `cross_validate()`. The first function, [`cross_val_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score), is a convenience function to get quick perfromance calculations. We will discuss `cross_validate()` as it offers advantages over `cross_val_score()`." + "There are two functions that can be used for [calculating cross-validation performance scores](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance): `cross_val_score()` and `cross_validate()`. The first function, [`cross_val_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score), is a convenience function to get quick performance calculations. We will discuss `cross_validate()`, which offers advantages over `cross_val_score()`." ] }, { @@ -506,7 +569,7 @@ "+ There are two advantages of using this function. From [Scikit's documentation](https://scikit-learn.org/stable/modules/cross_validation.html#the-cross-validate-function-and-multiple-metric-evaluation):\n", "\n", ">- It allows specifying multiple metrics for evaluation.\n", - ">- It returns a dict containing fit-times, score-times (and optionally training scores, fitted estimators, train-test split indices) in addition to the test score.\n" + ">- It returns a dict containing fit-times, score-times (and optionally training scores, fitted estimators, train-test split indices) in addition to the test score." 
] }, { @@ -516,8 +579,10 @@ "outputs": [], "source": [ "from sklearn.model_selection import cross_validate\n", - "scoring = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc', 'neg_log_loss', 'neg_brier_score']\n", - "d3_dict = cross_validate(pipe_d3, X, Y, cv=5, scoring = scoring, return_train_score = True)" + "\n", + "scoring = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc', 'neg_log_loss']\n", + "\n", + "dict_1 = cross_validate(pipe_1, X, Y, cv=5, scoring = scoring, return_train_score = True)" ] }, { @@ -533,7 +598,7 @@ "metadata": {}, "outputs": [], "source": [ - "pd.DataFrame(d3_dict)" + "pd.DataFrame(dict_1)" ] }, { @@ -542,8 +607,8 @@ "metadata": {}, "outputs": [], "source": [ - "d15_dict = cross_validate(pipe_d15, X, Y, cv=5, scoring = scoring, return_train_score = True)\n", - "pd.DataFrame(d15_dict)" + "dict_2 = cross_validate(pipe_2, X, Y, cv=5, scoring = scoring, return_train_score = True)\n", + "pd.DataFrame(dict_2)" ] }, { @@ -552,9 +617,18 @@ "source": [ "# About Performance\n", "\n", - "+ Notice that in order to acquire information about our model and continue development, we are spending resources: time, electricity, equipment use, etc. As well, we are generating data and binary objects that implement our models (fitted `Pipeline` objects, for example).\n", + "+ Notice that in order to acquire information about our model and continue development, we are spending resources: time, electricity, equipment use, etc. We are also generating data and binary objects that implement our models (e.g., fitted `Pipeline` objects).\n", "+ For certain applications, operating performance (latency or `'score_time'`) may be as important or more important than predictive performance metrics. \n", - "+ Every experiment throws important information and we can log them, as well as run them systematically." + "+ Every experiment throws important information, and we can log them, as well as run them systematically." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(dict_1).mean()" ] }, { @@ -563,13 +637,13 @@ "metadata": {}, "outputs": [], "source": [ - "pd.DataFrame(d15_dict).mean()" + "pd.DataFrame(dict_2).mean()" ] } ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -583,7 +657,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/04_0_docker.md b/01_materials/labs/04_0_docker.md new file mode 100644 index 000000000..ed8b7f185 --- /dev/null +++ b/01_materials/labs/04_0_docker.md @@ -0,0 +1,84 @@ +# Using Docker to Set Up Experiment Tracking + ++ For our work, we need an environment that closely resembles the production environment. ++ One way to achieve this is to use containers and containerized applications. ++ Without going into the details, you can think of a container as software that encapsulates the key features of an operating system, a programming language, and the application code. ++ Containers are meant to be portable across operating systems: a container will work the same regardless of whether the underlying Docker application is installed on a Windows, Linux, or Mac machine. ++ Containers are not Virtual Machines. ++ Docker is a popular containerization platform. + +## What is Docker? + ++ From product documentation: + +> Docker is an open platform for developing, shipping, and running applications. Docker enables you to separate your applications from your infrastructure, allowing you to deliver software quickly. With Docker, you can manage your infrastructure in the same ways you manage your applications. 
By leveraging Docker's methodologies for shipping, testing, and deploying code, you can significantly reduce the time between writing code and running it in production. + +## General Procedure + ++ To set up services using containers, we will do the following: + +1. Download an image from [Docker Hub](https://hub.docker.com/) or an equivalent image repository. +2. If required, set up a volume to [persist data](https://docs.docker.com/guides/walkthroughs/persist-data/). +3. Redirect ports as needed. +4. Start the container. + +In our course, we will set up the following services: + ++ MLFlow: an experiment tracking system. MLFlow requires two backends: a database and an object store. ++ PostgreSQL: a database management system. ++ MinIO: an object store that resembles S3 buckets in AWS. + +## Starting the Containers + ++ To run the process above, first navigate to the `./05_src/experiment_tracking/` folder. ++ The first time that you set up the containers, you will need to build the MLFlow image. You can build the required image with `docker compose build`. ++ After building a local image for MLFlow, run `docker compose up -d`. ++ The flag `-d` indicates that we will do a headless run. ++ Notice that the containers are set to always restart. You can remove the option or turn the containers off manually. Be aware that if you leave this option on, the containers will run whenever Docker Desktop restarts. + +## Stopping the Containers + ++ To stop the containers use (from `./05_src/db/`): `docker compose stop`. ++ Alternatively, you can bring all images down, including their volumes, with: `docker compose down -v`. + + - The `-v` flag removes volumes. + - It is the best option when you do not need the data any more because **it will delete the data in your DB **. + + +## Connecting to the MLFlow UI + ++ MLFlow provides a convenient interface accessible at [http://localhost:5001](http://localhost:5001). + +
+ + +## Connecting to PgAdmin + ++ PgAdmin4 is management software for PostgreSQL Server. ++ You can open the local implementation by navigating to [http://localhost:5051](http://localhost:5051/). You will find a screen like the one below. + +
+ ++ Login using the credentials specified in the file `./05_src/experiment_tracking/.env`. Notice there are two sets of credentials; use the ones for PgAdmin4. After authentication, you will see a screen like the one below. + +
+ ++ Click on "Add New Server": + + - In the *General* Tab, under Name enter: localhost. + - Under the *Connection* Tab, use Host name *postgres* (this is the name of the service in the docker compose file). + - Username and password are the ones found in the `./05_src/experiment_tracking/.env` file. + + +## Connect to MinIO + ++ The interface for MinIO can be reached via [http://localhost:9001](http://localhost:9001) ++ The credentials can be found in the `./05_src/experiment_tracking/.env` file. + +
+ + +## Learn More + ++ Containers and containerization are topics well beyond the scope of this course. However, we will use containerized applications to help us implement certain patterns. ++ If you are interested in Docker, a good place to start is the [Official Docker Guides](https://docs.docker.com/get-started/overview/). diff --git a/01_materials/labs/04_transforms.ipynb b/01_materials/labs/04_transforms.ipynb index 58200c0ef..614061fa2 100644 --- a/01_materials/labs/04_transforms.ipynb +++ b/01_materials/labs/04_transforms.ipynb @@ -7,7 +7,7 @@ "# Feature Engineering\n", "\n", "+ Feature engineering is to transform the data in such a way that the information content is easily exposed to the model.\n", - "+ This statement can mean many things and highly depends on what exactly is \"the model\".0\n", + "+ This statement can mean many things and highly depends on what exactly is \"the model\".\n", "+ As we have seen, we are using many tools in combination to manipulate data. Thus far, we have encountered pandas, Dask, and sklearn in this course, but there are many more (PySpark, SQL, DAX, M, R, etc.)\n", "+ It is important to discuss which tools are the right ones, specifically in the context of data leakage." ] @@ -16,21 +16,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Transform using pandas/Dask/SQL or sklearn?\n", - "\n", - "+ Depending on the perspective, the answer could be neither, pandas, or sklearn:\n", - "\n", - " - Neither: \n", - " * Most join and filtering should be done closer to the source using a database or parquet/Dask operation. 
\n", - " * Map-Reduce and Group-by-Aggregate (\"data warehousing\") operations.\n", - " * Indexing and reshuffling.\n", - " - Pandas, Dask, or PySpark: \n", - " * Renames tasks.\n", - " * Use python libraries like pandas, Dask, or pySpark to add contemporaneous feature, time-series manipulation (for example, adding lags), parallel computation (using Dask or pySpark).\n", - " * Do not use these libraries for sample-dependent features.\n", - " - Use sklearn, pytorch:\n", - " * Use python libraries like sklearn or pytorch to add features that are sample-dependent like scaling and normalization, one-hot encoding, tokenization, and vectorization.\n", - " * Model-depdenent transformations: PCA, embeddings, iterative/knn imputation, etc.\n", + "## Transform using Pandas, Dask, SQL, or Scikit-Learn?\n", + "\n", + "+ Most joins and filtering should be done closer to the source, such as a database, Spark, or Databricks.\n", + "+ Use data manipulation tools like Pandas, Dask, or PySpark: \n", + " * Rename columns.\n", + " * Column transforms that do not require sampling.\n", + " * Time-series manipulation such as adding lags and contemporaneous features.\n", + " * Parallel computation.\n", + "+ Use ML pipelines with sklearn or PyTorch:\n", + " * Add features that are sample-dependent like scaling and normalization, one-hot encoding, tokenization, and vectorization.\n", + " * Model-dependent transformations: PCA, embeddings, iterative/knn imputation, etc.\n", + "\n", + "+ Decisions must be guided by optimization criteria (time and resources) while avoiding data leakage." 
] }, @@ -40,7 +37,7 @@ "source": [ "## Example Transforms in sklearn\n", "\n", - "The list below is found in [Scikit's Documentation](https://scikit-learn.org/stable/modules/preprocessing.html), which also includes convenience interfaces for the classes below.\n", + "The list below is from [Scikit's Documentation](https://scikit-learn.org/stable/modules/preprocessing.html), which also includes convenience interfaces for the classes listed below.\n", "\n", "Work with categorical variables:\n", "\n", @@ -69,7 +66,7 @@ "+ `preprocessing.PowerTransformer([method, ...])`: Apply a power transform featurewise to make data more Gaussian-like.\n", "+ `preprocessing.QuantileTransformer(*[, ...])`: Transform features using quantiles information.\n", "+ `preprocessing.SplineTransformer([n_knots, ...])`: Generate univariate B-spline bases for features.\n", - "+ `preprocessing.TargetEncoder([categories, ...])`: Target Encoder for regression and classification targets.\n" + "+ `preprocessing.TargetEncoder([categories, ...])`: Target Encoder for regression and classification targets." ] }, { @@ -104,7 +101,7 @@ "\n", "+ We are looking for informative features: their contribution to prediction is valuable.\n", "+ We prefer parsimonious models.\n", - "+ We want to retain evidence of our work and afford reproducibility. " + "+ We want to retain evidence of our work and ensure reproducibility." 
] }, { @@ -136,10 +133,9 @@ "# Load environment variables\n", "%load_ext dotenv\n", "%dotenv \n", - "# Add src to path\n", + "%run update_path.py\n", + "\n", "import os\n", - "import sys\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", "\n", "# Standard libraries\n", "import pandas as pd\n", @@ -180,10 +176,6 @@ " 'NumberOfTime60-89DaysPastDueNotWorse': 'num_60_89_days_late',\n", " 'NumberOfDependents': 'num_dependents'\n", " }\n", - ").assign(\n", - " high_debt_ratio = lambda x: (x['debt_ratio'] > 1)*1,\n", - " missing_monthly_income = lambda x: x['monthly_income'].isna()*1,\n", - " missing_num_dependents = lambda x: x['num_dependents'].isna()*1, \n", ")" ] }, @@ -193,7 +185,7 @@ "source": [ "## Manual Solution\n", "\n", - "+ To get deeper insights into the task, first approach it manually." + "To get some insights into the task, first approach it manually." ] }, { @@ -207,16 +199,12 @@ "from sklearn.preprocessing import StandardScaler, PowerTransformer\n", "from sklearn.impute import SimpleImputer, KNNImputer\n", "from sklearn.model_selection import train_test_split, cross_validate\n", - "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.linear_model import LogisticRegression\n", "\n", "num_cols = ['revolving_unsecured_line_utilization', 'age',\n", " 'num_30_59_days_late', 'debt_ratio', 'monthly_income',\n", " 'num_open_credit_loans', 'num_90_days_late', 'num_real_estate_loans',\n", - " 'num_60_89_days_late', 'num_dependents', \n", - " # Although expressed as numbers, these columns are boolean:\n", - " # 'high_debt_ratio',\n", - " # 'missing_monthly_income', \n", - " # 'missing_num_dependents' \n", + " 'num_60_89_days_late', 'num_dependents'\n", " ]\n", "\n", "pipe_num_simple = Pipeline([\n", @@ -230,7 +218,7 @@ "\n", "pipe_simple = Pipeline([\n", " ('preprocess', ctransform_simple),\n", - " ('model', GaussianNB())\n", + " ('model', LogisticRegression())\n", "])\n", "pipe_simple\n" ] @@ -239,7 +227,7 @@ "cell_type": "markdown", "metadata": {}, 
"source": [ - "## Cross-validation of simple pipeline" + "## Cross-Validation of Simple Pipeline" ] }, { @@ -272,7 +260,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "On average, we obtain a log-loss of about 0.362." + "On average, we obtain a log-loss of about 0.23." ] }, { @@ -290,11 +278,9 @@ "source": [ "## Alternative Pipeline\n", "\n", - "+ The pipeline below is more complex:\n", - "\n", - " - Treat selected numericals using [Yeo-Johnson transformation](https://feature-engine.trainindata.com/en/latest/user_guide/transformation/YeoJohnsonTransformer.html).\n", - " - Treat other numericals with scaling only.\n", - " - Do not treat booleans." + "- The pipeline below is more complex.\n", + "- Treat selected numericals using [Yeo-Johnson transformation](https://feature-engine.trainindata.com/en/latest/user_guide/transformation/YeoJohnsonTransformer.html).\n", + "- Treat other numericals with scaling only." ] }, { @@ -328,7 +314,7 @@ "\n", "pipe_yj = Pipeline([\n", " ('preprocess', ctramsform_yj),\n", - " ('clf', GaussianNB())\n", + " ('clf', LogisticRegression())\n", "])\n", "pipe_yj" ] @@ -348,7 +334,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We obtained a greater loss of 0.443, therefore the additional feature is not profitable." + "We obtained a loss of 0.22, therefore the additional feature enhances performance." ] }, { @@ -369,7 +355,7 @@ "+ We are currently evaluating two feature engineering procedures using the same classifier. \n", "\n", " - However, feature engineering is classifier-dependent: each classifier is a specialized tool to learn a certain type of hypothesis. 
\n", - " - Different classifiers will benefit from different type of engineered features (see, for example, [Khun and Silge's recommendations on TMWR.org](https://www.tmwr.org/pre-proc-table)).\n", + " - Different classifiers will benefit from different types of engineered features (see, for example, [Khun and Silge's recommendations on TMWR.org](https://www.tmwr.org/pre-proc-table)).\n", "\n", "+ We are producing data from our experiments.\n", "\n", @@ -380,7 +366,7 @@ "+ We modify code to produce experiments:\n", "\n", " - Our experiment results will be a function of our algorithm's logic, its implementation (code), and our data.\n", - " - Code tracking is doen with Git.\n", + " - Code tracking is done with Git.\n", " - Data tracking is in development.\n", "\n", "**It is generally a good idea to use software for experiment tracking once you move out of the Proof of Concept stage.** Some solutions include:\n", @@ -396,17 +382,17 @@ "source": [ "# MLFlow\n", "\n", - "+ MLFlow is a software tool that automates taks related to experiment tracking:\n", + "+ MLFlow is a software tool that automates tasks related to experiment tracking:\n", "\n", " - Keep track of experiment parameters.\n", - " - Save configuration+s for individual experiment runs in files or databases.\n", - " - Store models and other artifacts to an object store.\n", + " - Save configurations for individual experiment runs in files or databases.\n", + " - Store models and other artifacts in an object store.\n", "\n", "+ A few features that may be useful:\n", "\n", - " - Keep track of code and artifacts associated with experiment.\n", + " - Keep track of code and artifacts associated with the experiment.\n", " - Store experiment run times and system characteristics.\n", - " - Work with different backend stores (\"[Observers](https://mlflow.org/docs/latest/tracking/backend-stores)\").\n" + " - Work with different backend stores (\"[Observers](https://mlflow.org/docs/latest/tracking/backend-stores)\")." 
] }, { @@ -429,7 +415,7 @@ ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -443,7 +429,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/05_hyperparams.ipynb b/01_materials/labs/05_hyperparams.ipynb index 4c0f909b0..abdec2719 100644 --- a/01_materials/labs/05_hyperparams.ipynb +++ b/01_materials/labs/05_hyperparams.ipynb @@ -22,14 +22,15 @@ "\n", "The diagram below, taken from Scikit Learn's documentation, shows the procedure that we will follow:\n", "\n", - "![](./images/05_grid_search_workflow.png)\n", + "
\n", + "\n", + "
\n", "\n", - "\n", - "+ System requriements:\n", + "+ System requirements:\n", " \n", " - Automation: the system should operate automatically with the least amount of supervision. \n", - " - Replicability: changes to code and (arguably) data should be logged and controled. Randomness should also be controlled (random seeds, etc.)\n", - " - Persistence: persist results for later analysis.\n" + " - Replicability: changes to code and (arguably) data should be logged and controlled. Randomness should also be controlled (e.g., random seeds).\n", + " - Persistence: persist results for later analysis." ] }, { @@ -39,8 +40,8 @@ "## What is a Hyperparameter?\n", "\n", "+ Generally speaking, hyperparameters are parameters that control the learning process: regularization weights, learning rate, entropy/gini metrics, etc. \n", - "+ Hyperparameters will drive the behaviour and performance of a model. Model selection is intimately related with hyperparameter tuning. \n", - "+ Selection critieria are based on performance evaluation and, to get better performance estimates, we use cross-validation." + "+ Hyperparameters determine a model's behaviour and performance. Model selection is intimately related to hyperparameter tuning. \n", + "+ Selection criteria are based on performance evaluation. To get better performance estimates, we use cross-validation." ] }, { @@ -50,7 +51,7 @@ "## Searching the Hyperparameter Grid\n", "\n", "+ To address the automation requirement, we could use `GridSearchCV()`, which is a self-contained function for performing a Grid Search over a hyperparameter space.\n", - "+ To \"Search the Hyperparameter Grid\" exhaustively means that we will consider all possible combination of hyperparameter values in the search space and evaluate the model using those hyperparams. 
For example, if we have two parameters that we are exploring, kernel (takes values \"rbf\" and \"poly\") and C (takes values 1.0 and 0.5), then this grid would be the combinations:\n", + "+ To \"Search the Hyperparameter Grid\" exhaustively means that we will consider all possible combinations of hyperparameter values in the search space and evaluate the model using those hyperparameters. For example, if we have two parameters that we are exploring, kernel (takes values \"rbf\" and \"poly\") and C (takes values 1.0 and 0.5), then this grid would be the combinations:\n", "\n", " + (rbf, 1.0)\n", " + (rbf, 0.5)\n", @@ -77,9 +78,9 @@ "source": [ "%load_ext dotenv\n", "%dotenv \n", + "%run update_path.py\n", + "\n", "import os\n", - "import sys\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", @@ -107,10 +108,6 @@ " 'NumberOfTime60-89DaysPastDueNotWorse': 'num_60_89_days_late',\n", " 'NumberOfDependents': 'num_dependents'\n", " }\n", - ").assign(\n", - " high_debt_ratio = lambda x: (x['debt_ratio'] > 1)*1,\n", - " missing_monthly_income = lambda x: x['monthly_income'].isna()*1,\n", - " missing_num_dependents = lambda x: x['num_dependents'].isna()*1, \n", ")" ] }, @@ -123,7 +120,7 @@ "+ Preprocessing steps.\n", "+ Logistic Regression classifier.\n", "\n", - "We will explore the hyperparameter sapce by evaluating different regularization strategies and parameters." + "We will explore the hyperparameter space by evaluating different regularization strategies and parameters." 
] }, { @@ -150,11 +147,7 @@ "num_cols = ['revolving_unsecured_line_utilization', 'age',\n", " 'num_30_59_days_late', 'debt_ratio', 'monthly_income',\n", " 'num_open_credit_loans', 'num_90_days_late', 'num_real_estate_loans',\n", - " 'num_60_89_days_late', 'num_dependents', \n", - " # Although expressed as numbers, these columns are boolean:\n", - " # 'high_debt_ratio',\n", - " # 'missing_monthly_income', \n", - " # 'missing_num_dependents' \n", + " 'num_60_89_days_late', 'num_dependents'\n", " ]\n", "\n", "pipe_num_simple = Pipeline([\n", @@ -215,7 +208,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To perform the Grid Search we need to define a parameter grid:\n", + "To perform the Grid Search, we need to define a parameter grid:\n", "\n", "- A parameter grid defines all of the combinations of parameters that we need to explore.\n", "- The function `GridSearchCV()` performs an exhaustive search of parameter combinations.\n", @@ -232,9 +225,10 @@ "outputs": [], "source": [ "param_grid = {\n", - " 'clf__C': [0.01, 0.5, 1.0],\n", - " 'clf__penalty': ['l1', 'l2'],\n", - " 'clf__solver': ['liblinear'],\n", + " 'clf__C': [0.01, 0.1, 1, 10, 100],\n", + " 'clf__l1_ratio': [0, 1],\n", + " 'clf__penalty': ['elasticnet'],\n", + " 'clf__solver': ['saga'],\n", " }" ] }, @@ -283,7 +277,7 @@ "res.columns\n", "\n", "res[['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',\n", - " 'param_clf__C', 'param_clf__penalty', 'param_clf__solver', 'params',\n", + " 'param_clf__C', 'param_clf__l1_ratio', 'param_clf__solver', 'params',\n", " 'mean_test_neg_log_loss',\n", " 'std_test_neg_log_loss', 'rank_test_neg_log_loss']].sort_values('rank_test_neg_log_loss')" ] @@ -305,19 +299,19 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "grid_cv.best_estimator_" + "The best-performing classifier (pipeline) trained on the complete training set is:" ] }, { - "cell_type": "markdown", + 
"cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "The best-performing classifier (pipeline) trained on the complete training set is:" + "grid_cv.best_estimator_" ] }, { @@ -330,8 +324,8 @@ "+ The plan:\n", "\n", " - Create a model ingredient to obtain the classifier object.\n", - " - Create experiment param grids to organize our parameter grids.\n", - " - Schedule the experiments.\n" + " - Create experiment parameter grids to organize our parameter grids.\n", + " - Schedule the experiments." ] }, { @@ -340,9 +334,9 @@ "source": [ "Explore the code in `./05_src/exp__logistic_simple.py` and `./05_src/exp__logistic_grid_search.py`:\n", "\n", - "+ `exp__logistic_simple.py` implements a single experiment run in MLFlow, i.e., a single set of parameters will be trained and evaluated by the code.\n", + "+ `exp__logistic_simple.py` runs a single experiment in MLFlow; i.e., a single set of parameters is trained and evaluated by the code.\n", "+ `exp__logistic_grid_search.py` runs through a series of tests (one test given by a parametrization of the model pipeline). Each run is recorded independently as a parent run.\n", - "+ Also notice that we have pulled the data component of the experiment to a module of its own." + "+ Also, notice that we have pulled the data component of the experiment to a module of its own." 
] }, { @@ -356,13 +350,13 @@ "```\n", "cd src # if required\n", "python -m credit.exp__logistic_grid_search.py\n", - "```\n" + "```" ] } ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -376,7 +370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/06_explainability.ipynb b/01_materials/labs/06_explainability.ipynb index abe10782f..bc6f0c10f 100644 --- a/01_materials/labs/06_explainability.ipynb +++ b/01_materials/labs/06_explainability.ipynb @@ -8,9 +8,9 @@ "source": [ "%load_ext dotenv\n", "%dotenv \n", + "%run update_path.py\n", + "\n", "import os\n", - "import sys\n", - "sys.path.append(os.getenv('SRC_DIR'))\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", @@ -341,7 +341,7 @@ ], "metadata": { "kernelspec": { - "display_name": "dsi_participant", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -355,7 +355,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.21" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/07_distribution_shifts.ipynb b/01_materials/labs/07_distribution_shifts.ipynb index e9c4af185..1e24ccd2c 100644 --- a/01_materials/labs/07_distribution_shifts.ipynb +++ b/01_materials/labs/07_distribution_shifts.ipynb @@ -18,9 +18,9 @@ "source": [ "%load_ext dotenv\n", "%dotenv ../05_src/.env\n", - "import sys\n", - "sys.path.append(\"../05_src\")\n", - "from logger import get_logger\n", + "%run update_path.py\n", + "\n", + "from utils.logger import get_logger\n", "_logs = get_logger(__name__)" ] }, @@ -314,7 +314,7 @@ ], "metadata": { "kernelspec": { - "display_name": "env", + "display_name": "production-env (3.11.13)", "language": "python", "name": "python3" }, @@ -328,7 +328,7 
@@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.13" } }, "nbformat": 4, diff --git a/01_materials/labs/images/01_trunk_based_development.png b/01_materials/labs/images/01_trunk_based_development.png index a6981b618..d6fa2deaf 100644 Binary files a/01_materials/labs/images/01_trunk_based_development.png and b/01_materials/labs/images/01_trunk_based_development.png differ diff --git a/01_materials/labs/update_path.py b/01_materials/labs/update_path.py new file mode 100644 index 000000000..f4e301e5f --- /dev/null +++ b/01_materials/labs/update_path.py @@ -0,0 +1,8 @@ +from pathlib import Path +import sys + +notebook_dir = Path.cwd() +src_path = (notebook_dir / "../../05_src").resolve() + +if str(src_path) not in sys.path: + sys.path.insert(0, str(src_path)) # insert(0) gives it priority \ No newline at end of file diff --git a/01_materials/slides/01_introduction.pdf b/01_materials/slides/01_introduction.pdf index a352bb57e..ed6f302b4 100644 Binary files a/01_materials/slides/01_introduction.pdf and b/01_materials/slides/01_introduction.pdf differ diff --git a/01_materials/slides/02_data_engineering.pdf b/01_materials/slides/02_data_engineering.pdf index 7019ed30f..94abd5b19 100644 Binary files a/01_materials/slides/02_data_engineering.pdf and b/01_materials/slides/02_data_engineering.pdf differ diff --git a/01_materials/slides/03_training.pdf b/01_materials/slides/03_training.pdf index b7a88f826..fdcc8f8c5 100644 Binary files a/01_materials/slides/03_training.pdf and b/01_materials/slides/03_training.pdf differ diff --git a/01_materials/slides/04_feature_engineering.pdf b/01_materials/slides/04_feature_engineering.pdf index 2677b05a8..bad110ee2 100644 Binary files a/01_materials/slides/04_feature_engineering.pdf and b/01_materials/slides/04_feature_engineering.pdf differ diff --git a/01_materials/slides/05_model_development.pdf b/01_materials/slides/05_model_development.pdf index 
5deaf9ad8..10388a446 100644 Binary files a/01_materials/slides/05_model_development.pdf and b/01_materials/slides/05_model_development.pdf differ diff --git a/01_materials/slides/06_deployment.pdf b/01_materials/slides/06_deployment.pdf index 53c89679c..36eb308c2 100644 Binary files a/01_materials/slides/06_deployment.pdf and b/01_materials/slides/06_deployment.pdf differ diff --git a/01_materials/slides/07_monitoring.pdf b/01_materials/slides/07_monitoring.pdf index f038d38ce..581063927 100644 Binary files a/01_materials/slides/07_monitoring.pdf and b/01_materials/slides/07_monitoring.pdf differ diff --git a/01_materials/slides/08_infra_and_org.pdf b/01_materials/slides/08_infra_and_org.pdf index 985629d70..6d405c5f8 100644 Binary files a/01_materials/slides/08_infra_and_org.pdf and b/01_materials/slides/08_infra_and_org.pdf differ diff --git a/03_instructional_team/markdown_slides/01_introduction.md b/03_instructional_team/markdown_slides/01_introduction.md index 6b06a9c70..f44d8061a 100644 --- a/03_instructional_team/markdown_slides/01_introduction.md +++ b/03_instructional_team/markdown_slides/01_introduction.md @@ -34,7 +34,7 @@ $ echo "Data Sciences Institute" - **1.3 Project Setup**     - Introduction.     - Repo File Structure. -    - Git, authorization, and production pipelines. +    - Git, authorisation, and production pipelines.     - VS Code and Git.     - Python virtual environments.     - Branching Strategies. @@ -101,11 +101,9 @@ ML is a collection of methods that allow a computer to: ## When to Use ML? - A business problem is not the same as an ML problem. - -    - Generally, a business will be concerned with profit maximization (directly or indirectly): increasing sales, cutting costs, enhancing customer satisfaction, reducing churn, increasing time on the website, etc. -    - The objective of an ML method is to enhance the performance of the task, given more data. 
-    - Optimising ML performance metrics does not automatically translate to optimizing business performance. - + - The objective of a business is generally concerned with profit maximisation: increasing sales, cutting costs, enhancing customer satisfaction, reducing churn, increasing time on the website, etc. + - The objective of an ML method is to enhance the performance of the task, given more data. +- Optimising ML performance metrics does not automatically translate to optimising business performance. - Some of the most popular business applications of ML are in areas where business and ML performance overlap: fraud detection, recommender systems, etc. --- @@ -150,7 +148,7 @@ ML is a collection of methods that allow a computer to: ### Unseen data - Unseen data shares patterns with the training data. -- The learning method generalizes reasonably well on testing data. +- The learning method generalises reasonably well on testing data. ### It is repetitive @@ -232,7 +230,7 @@ ML is a collection of methods that allow a computer to: ### Computational priorities during model development         - Training is the bottleneck. -- Throughput, the number of cases processed, should be maximized. +- Throughput, the number of cases processed, should be maximised. --- @@ -240,7 +238,7 @@ ML is a collection of methods that allow a computer to: ### Computational priorities in production - Fast inference is desirable. -- Latency, the time between when a query is received and when it is addressed, should be minimized. +- Latency, the time between when a query is received and when it is addressed, should be minimised. - Latency is usually measured using percentiles of time elapsed (e.g., 99th percentile should be below X ms.) ![bg contain right:40%](./images/01_latency_throughput.png) @@ -286,7 +284,7 @@ ML is a collection of methods that allow a computer to: ## Designing Data-Intensive Applications - Many applications today are data-intensive instead of compute-intensive. 
-    - The limit factor is data and not computation. +    - The limiting factor is data and not computation.     - Concerns: the amount of data, the complexity of data, and the speed at which it changes. - ML Systems tend to be embedded in data-intensive applications. - (Kleppmann, 2017) @@ -344,8 +342,8 @@ Have things changed that much? (Huyen, 2022) and [CRISP-DM (c. 1999)](https://ww     - Regression.     - A regression model can be framed as a classification model and vice versa. -    - Regression to classification: apply quantization. -    - Classification to regression: predict the likelihood of class. +    - Regression to classification: apply quantisation. +    - Classification to regression: predict the likelihood of a class.     --- @@ -374,7 +372,7 @@ Have things changed that much? (Huyen, 2022) and [CRISP-DM (c. 1999)](https://ww ## Objective Functions (1/2) -- ML requires an objective function to guide the learning process through optimization. +- ML requires an objective function to guide the learning process through optimisation. - In the context of ML:         - Regression tasks generally employ error or accuracy metrics: Root Mean Square Error (RMSE) or Mean Absolute Error (MAE). @@ -390,7 +388,7 @@ Have things changed that much? (Huyen, 2022) and [CRISP-DM (c. 1999)](https://ww H(y, p)=-\frac{1}{N}\sum_{i=1}^{n}\left(y_i ln(\hat{p}_{i}) +(1-y_{i})ln(1-\hat{p}_{i})\right) $$ -- Formulation is related to maximum likelihood: minimizing negative log-likelihood is the "same" as minimizing log loss. +- Formulation is related to maximum likelihood: minimising negative log-likelihood is the "same" as minimising log loss. --- @@ -445,4 +443,4 @@ Loss = -(1*ln(0.1)) = 2.3026 - Olah, C. "Conv Nets: A Modular Perspective." (2014) [URL](https://colah.github.io/posts/2014-07-Conv-Nets-Modular/) - Sculley, D. et al. "Hidden technical debt in machine learning systems." Advances in neural information processing systems 28 (2015). -- Wirth, R. and J. Hipp. 
"CRISP-DM: Towards a standard process model for data mining." Proceedings of the 4th international conference on the practical applications of knowledge discovery and data mining. Vol. 1. (2000). +- Wirth, R. and J. Hipp. "CRISP-DM: Towards a standard process model for data mining." Proceedings of the 4th international conference on the practical applications of knowledge discovery and data mining. Vol. 1. (2000). \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/02_data_engineering.md b/03_instructional_team/markdown_slides/02_data_engineering.md index 680d30ab9..28e9af7f2 100644 --- a/03_instructional_team/markdown_slides/02_data_engineering.md +++ b/03_instructional_team/markdown_slides/02_data_engineering.md @@ -74,9 +74,9 @@ $ echo "Data Sciences Institute" - Different data sources have different characteristics. - User input data: -    - Data that is explicitly input by users. -    - Text, images, videos, files, etc. -    - Prone to error: text too long, too short, incomplete, unexpected data types, etc. + - Data that is explicitly input by users. + - Text, images, videos, files, etc. + - Prone to error: text too long, too short, incomplete, unexpected data types, etc. --- @@ -84,19 +84,19 @@ $ echo "Data Sciences Institute" - System-generated data: -    - Logs, performance metrics, and other system outputs. -    - Generally, well-formatted and can grow rapidly. -    + - Logs, performance metrics, and other system outputs. + - Generally, well-formatted and can grow rapidly. + - Databases generated by (internal) services and enterprise applications: -    - Many times, structured data. -    - Varying degrees of data quality. + - Many times, structured data. + - Varying degrees of data quality. - Third-party data: -    - Data collected from the public when the public is not a customer of the collecting organization. -    - Price databases, news aggregators, etc. 
+ - Data collected from the public when the public is not a customer of the collecting organization. + - Price databases, news aggregators, etc. --- @@ -109,10 +109,10 @@ $ echo "Data Sciences Institute" - Data storage is a fundamental component in any ML system: -    - Store raw input data. -    - Store pre-computed features. -    - Store model performance metrics and other model-related information. -    - Store logs for monitoring and debugging. + - Store raw input data. + - Store pre-computed features. + - Store model performance metrics and other model-related information. + - Store logs for monitoring and debugging. - A sequence of operations that read from one or multiple storage types combined with data transformation procedures to create *pipelines*. @@ -123,9 +123,9 @@ $ echo "Data Sciences Institute" - Selecting the right data format for storing can be beneficial in terms of performance and costs. - *Data serialization* is converting a data structure or object state into a format that can be stored, transmitted, and reconstructed later. - Data formats can be: -    - Text or binary-based. -    - Human-readable. -    - Row-major or column-major. + - Text or binary-based. + - Human-readable. + - Row-major or column-major. 
--- @@ -133,12 +133,12 @@ $ echo "Data Sciences Institute" |Format |Binary/Text    |Human-readable |Example use cases| |-------|---------------|---------------|-----------------| -|JSON   |Text           |Yes            |Everywhere| -|CSV    |Text           |Yes            |Everywhere| -|Parquet|Binary         |No             |Hadoop, Amazon Redshift| -|Avro   |Binary primary |No             |Hadoop| -|Protobuf|Binary primary|No             |Google, TensorFlow (TFRecord)| -|Pickle |Binary         |No             |Python, PyTorch serialization| +|JSON   |Text   |Yes    |Everywhere| +|CSV    |Text   |Yes    |Everywhere| +|Parquet|Binary |No |Hadoop, Amazon Redshift| +|Avro   |Binary primary |No |Hadoop| +|Protobuf|Binary primary|No |Google, TensorFlow (TFRecord)| +|Pickle |Binary |No |Python, PyTorch serialization| --- @@ -223,8 +223,8 @@ We can also represent the data with less structure: - Non-text file formats are called *binary*. - Binary files are more compact: -    - To store the number 1000000 would require seven characters or 7 bytes (at one character per byte). -    - To store 1000000 as int32 would require 32 bits or 4 bytes. + - To store the number 1000000 would require seven characters or 7 bytes (at one character per byte). + - To store 1000000 as int32 would require 32 bits or 4 bytes. --- @@ -266,8 +266,8 @@ We can also represent the data with less structure: - Normalization is determining how much redundancy exists in a table and reducing it, as required. - The goals of normalization are to: -    - Be able to characterize the level of redundancy in a relational schema. -    - Provide mechanisms for transforming schemas to remove redundancy + - Be able to characterize the level of redundancy in a relational schema. + - Provide mechanisms for transforming schemas to remove redundancy - Generally, we want to minimize the redundancy of primary and foreign keys. - One disadvantage of normalizing data is that it becomes spread out in different tables. 
@@ -297,8 +297,8 @@ We can also represent the data with less structure: - No SQL, started as a negation of SQL, but it is now generally understood as "Not Only SQL". - No SQL models can be of two types: -    - Document model. -    - Graph model. + - Document model. + - Graph model. --- @@ -364,8 +364,8 @@ Document1: harry_potter.json - Graph models enable network or graph metrics: -    - Node metrics like centrality measures: degree, eigen, betweenness. -    - Graph-level features: cliques, clusters, modularity. + - Node metrics like centrality measures: degree, eigen, and betweenness. + - Graph-level features: cliques, clusters, modularity. - Graph databases may bundle other features (visualization). @@ -407,7 +407,7 @@ Document1: harry_potter.json |Schema clearly defined |Data does not need to follow a schema| |Easy to search and analyze |Fast arrival| |Can only handle data with a specific schema |Can handle data from any source| -|Schema changes will cause a lot of troubles |No need to worry about schema changes (yet), as the worry is shifted to the downstream applications that use this data| +|Schema changes will cause significant trouble. |No need to worry about schema changes (yet), as the worry is shifted to the downstream applications that use this data| |Stored in data warehouses |Stored in data lakes| --- @@ -432,10 +432,10 @@ Document1: harry_potter.json - Transactional databases are designed to maintain low latency and high availability. - Transactional databases usually offer ACID guarantees: -    - Atomicity: all steps in a transaction are completed successfully as a group. If one step fails, all fail. -    - Consistency: all transactions coming through must follow predefined rules. -    - Isolation: two transactions happen at the same time as if they were isolated. Two users accessing the same data will not change it at the same time. 
-    - Durability: once a transaction has been committed, it will remain committed even in the case of system failure. + - Atomicity: all steps in a transaction are completed successfully as a group. If one step fails, all fail. + - Consistency: all transactions coming through must follow predefined rules. + - Isolation: Two transactions happen at the same time as if they were isolated. Two users accessing the same data will not change it simultaneously. + - Durability: once a transaction has been committed, it will remain committed even in the case of system failure. - Some transactional databases do not offer ACID, but BASE: "Basically Available, Soft state, and Eventual consistency." (Kleppmann, 2017) @@ -466,15 +466,15 @@ OLTP and OLAP are terms falling out of use, since the divide is somewhat outdate ## ETL: Extract, Transform, and Load -ETL is the process of extracting data from one or several sources, transforming it to the shape that an application or model requires it, and loading it to a desired destination. -    +ETL is the process of extracting data from one or several sources, transforming it into the shape that an application or model requires, and loading it into a desired destination. + - Extract the data from all data sources, including validating and rejecting data that does not meet requirements. Notify sources of rejected data. - Transform the data through different operations: join, filter, standardization, etc. - Load is deciding how and how often to load the transformed data into the destination (a file, a database, or a data warehouse). - Schema on read forces app developers to determine the schema in advance. -- Data acquisition grows rapidly and storage is inexpensive. +- Data acquisition grows rapidly, and storage is inexpensive. - Some companies invested in a store-all-the-data strategy. 
@@ -545,9 +545,9 @@ Three ways of passing data: ![bg left:40% w:400](./images/02_data_flows_1.png) -- Process 1 writes to DB, Process 2 reads from the same DB. +- Process 1 writes to the DB, Process 2 reads from the same DB. - Both processes require access to the same database. -- Database access can be slow, which may not be suitable for apps with strict latency requirements such as consumer-facing applications. +- Database access can be slow, which may not be suitable for apps with strict latency requirements, such as consumer-facing applications. @@ -601,7 +601,7 @@ Three ways of passing data: - A piece of data broadcast to a real-time transport is called an event.   - This architecture is called event-driven. -- The real-time transport is sometimes called event bus. +- The real-time transport is sometimes called an event bus. - Request-driven architecture works well for systems that rely more on app logic than data. - Event-driven architecture works better for data-intensive systems. @@ -657,8 +657,8 @@ In the message queue model: ## Stream Processing - Stream processing is performing computation on streaming data coming from real-time transports. -- Computation can also be started periodically, but the periods are generally shorter. Computation could also be started when the need arises. -- Streaming processing is performed on rapid-changing variables known as dynamic features (for example, average metric in past 5 minutes). +- Computation can also be started periodically, but the periods are generally shorter. Computation could also be started when needed. +- Streaming processing is performed on rapidly changing variables known as dynamic features (for example, average metric in the past 5 minutes). - Example products: Apache Flink, KSQL, and Spark Streaming. @@ -673,4 +673,4 @@ In the message queue model: - Agrawal, A. et al. "Cloudy with high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). 
- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2022). - Financial Stability Board (FSB). "Artificial intelligence and machine learning in financial services" (2017). [URL](https://www.fsb.org/2017/11/artificial-intelligence-and-machine-learning-in-financial-service/) -- Kleppmann, M. "Designing data-intensive applications: The big ideas behind reliable, scalable, and maintainable systems." O'Reilly Media, Inc. (2017). +- Kleppmann, M. "Designing data-intensive applications: The big ideas behind reliable, scalable, and maintainable systems." O'Reilly Media, Inc. (2017). \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/03_training.md b/03_instructional_team/markdown_slides/03_training.md index 9c2b009ed..2969cbed7 100644 --- a/03_instructional_team/markdown_slides/03_training.md +++ b/03_instructional_team/markdown_slides/03_training.md @@ -24,7 +24,7 @@ $ echo "Data Sciences Institute" - Sampling in Python. - An initial training pipeline. -- Modularizing the training pipeline. +- Modularising the training pipeline. - Decoupling settings, parameters, data, code, and results. --- @@ -44,7 +44,7 @@ $ echo "Data Sciences Institute" ![h:450px](./images/03_flock_ref_arhitecture_highlighted_3.png) -
Agrawal et al (2019)
+
Agrawal et al. (2019)
--- @@ -137,7 +137,7 @@ Generally, selecting data to train ML methods using this family of sampling meth + Assume the data contains 25% red samples and 75% blue samples. + We know the actual distribution is closer to 50% red and 50% blue. - + We can apply red weights that are three times higher than blue weights. + + We can apply red weights that are three times as high as blue weights. --- @@ -162,7 +162,7 @@ Generally, selecting data to train ML methods using this family of sampling meth - Put the first k elements into the reservoir. - For each incoming nth element, generate a random number i such that 1 ≤ i ≤ n. - - If 1 ≤ i ≤ k: replace the ith element in the reservoir with the nth element. Else, do nothing. + - If 1 ≤ i ≤ k: replace the ith element in the reservoir with the nth element. Otherwise, do nothing. - Each incoming nth element has a k/n probability of being in the reservoir. @@ -348,7 +348,7 @@ To handle class imbalance: ## Class Probabilities Carry Information - A greater Area Under the ROC Curve (AUC ROC) indicates a better model: AUC ROC can be interpreted as the probability that the classifier ranks a randomly chosen positive instance above a randomly chosen negative one. -- AUC ROC measures the ranking order of a model's prediction: it is useful when costs are unavailable and class distributions are unknown. +- AUC ROC measures the ranking order of a model's prediction: it is useful when costs are unavailable, and class distributions are unknown. ![bg right:40% w:400](./images/03_roc_curve_comparison.png) @@ -364,7 +364,7 @@ To handle class imbalance: $$ -- Formulation is related to maximum likelihood: minimizing negative log-likelihood is the "same" as minimizing log loss. +- Formulation is related to maximum likelihood: minimising negative log-likelihood is the "same" as minimising log loss. 
--- @@ -377,7 +377,7 @@ To handle class imbalance: Loss = -(1*ln(0.9)) = 0.1054 ``` -- If the model is unsure and predicted 0.5, then +- If the model is unsure and predicts 0.5, then ``` Loss = -(1*ln(0.5)) = 0.6931 @@ -393,7 +393,7 @@ Loss = -(1*ln(0.1)) = 2.3026 ## Class Weights -- Some models can optimize a cost or loss function that differentiates for specific types of errors. +- Some models can optimise a cost or loss function that differentiates for specific types of errors. - In some instances, one can assume that misclassifying minority events (false negatives) is more costly than incorrectly predicting non-events (false positives). - Relative cost or class weights can be determined by @@ -418,7 +418,7 @@ $$ ## SMOTE and ADASYN -- SMOTE: Synthetic Minority Oversampling TEchnique +- SMOTE: Synthetic Minority Oversampling Technique   - Creates new instances based on random linear combinations of existing cases. - ADASYN: Adaptive Synthetic Sampling Method   - Similar to SMOTE, but new instances are generated based on density. @@ -441,4 +441,4 @@ $$ ## References - Agrawal, A. et al. "Cloudy with high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). -- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2022). +- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2022). 
\ No newline at end of file diff --git a/03_instructional_team/markdown_slides/04_feature_engineering.md b/03_instructional_team/markdown_slides/04_feature_engineering.md index 955993943..74957c6e0 100644 --- a/03_instructional_team/markdown_slides/04_feature_engineering.md +++ b/03_instructional_team/markdown_slides/04_feature_engineering.md @@ -6,8 +6,8 @@ paginate: true @@ -22,7 +22,7 @@ $ echo "Data Sciences Institute" **4.1 Feature Engineering** -  + - Common Operations - Data Leakage - Feature Importance @@ -50,7 +50,7 @@ $ echo "Data Sciences Institute" ## The Flock Reference Architecture ![h:450px center](./images/04_flock_ref_arhitecture_highlighted_4.png) -
Agrawal et al (2019)
+
Agrawal et al. (2019)
--- @@ -90,9 +90,9 @@ Feature engineering can include: - Missing values are a common occurrence in production data. - Missing values can be of three types: -  - Missing Not At Random (MNAR): a value is missing because of the actual value itself. -  - Missing At Random (MAR): a value is missing not due to the value itself but to another observed variable. -  - Missing Completely At Random (MCAR): There is no pattern in missing values. + - Missing Not At Random (MNAR): a value is missing because of the actual value itself. + - Missing At Random (MAR): a value is missing not due to the value itself but to another observed variable. + - Missing Completely At Random (MCAR): There is no pattern in missing values. --- @@ -100,10 +100,10 @@ Feature engineering can include: - The simplest way to remove missing values is deletion. - *Column deletion* -  - Remove variables with excessive missing values, but be cautious of potentially losing valuable information and compromising model performance. + - Remove variables with excessive missing values, but be cautious of potentially losing valuable information and compromising model performance. - *Row deletion* -  - Remove samples with missing values, but only effective for MCAR with a few missing values. -  - Drawbacks: Doesn't work for MNAR data and can create biases by removing rows. + - Remove samples with missing values, but only effective for MCAR with a few missing values. + - Drawbacks: Doesn't work for MNAR data and can create biases by removing rows. --- @@ -112,12 +112,12 @@ Feature engineering can include: - Impute missing values using default values: missing strings, filled with "". - Use a statistic like mean, median, or mode: fill the missing temperature with the mean temperature for the time of day within a specific window. - Domain specific: if prices are liquid, use the last available price. 
-  + --- # Imputation -- Model-based: if two variables are correlated and one of them has missing values, model the relationship and use model results for imputation. +- Model-based: if two variables are correlated and one of them has missing values, model the relationship and use the model results for imputation. - Flag imputed missing values. - Avoid filling missing values with possible (fixed) values. Example: Missing number of children should not be filled with 0, a possible value. @@ -126,7 +126,7 @@ Feature engineering can include: # Scaling - The Objective is to obtain values of similar magnitude. -- Scaling makes variables a "standard size". It benefits algorithms that are scale-sensitive and generally does not hurt algorithms that are scale-insensitive. +- Scaling makes variables a "standard size". It benefits scale-sensitive algorithms and generally does not hurt scale-insensitive algorithms. - There is little downside to scaling features, in general. - Warning: scaling is a common source of data leakage. - Scaling requires global statistics that may be expensive to calculate. @@ -188,8 +188,8 @@ $$ - With dummy variables, if the original variable contained *C* levels, then we will get *C-1* levels by default. - For instance, our example had five levels (one per weekday), but the resulting dummy representation only has four. -  - We can back out the fifth value since we know that when all four values are 0, the fifth value should be 1. -  - This avoids an undesirable situation for certain methods called colinearity. + - We can back out the fifth value since we know that when all four values are 0, the fifth value should be 1. + - This avoids an undesirable situation for certain methods called colinearity. - Collinearity occurs when one variable can be obtained as a linear function of others. - Colinearity is a form of observing information redundancy. @@ -208,8 +208,8 @@ $$ - Categories are not static: categories change over time. 
-  - Categories were not represented in the training data. -  - New categories may appear over time. + - Categories were not represented in the training data. + - New categories may appear over time. - It is generally a good idea to consider the category UNKNOWN. --- @@ -218,9 +218,9 @@ $$ - In some cases, UNKNOWN labels may refer to samples that do not belong together: two new brands may not target the same market, new products, new IP addresses, new user accounts, etc. - One solution is the hashing trick: -  - Use a hash function to generate a hash for every category. -  - The hashed value will become the index of the category. -  - Some collisions may occur, but the overloading of the UNKNOWN category is reduced. + - Use a hash function to generate a hash for every category. + - The hashed value will become the index of the category. + - Some collisions may occur, but the overloading of the UNKNOWN category is reduced. --- @@ -240,7 +240,7 @@ $$ # Multivariate Transformations -Some transformations may include more complex formulations or the results of models that we use to pre-process the data. +Some transformations may involve more complex formulations or the results of models we use to preprocess the data. - Principal Components Analysis - Discriminant Analysis - Embeddings @@ -248,7 +248,7 @@ Some transformations may include more complex formulations or the results of mod --- ## Principal Components Analysis: -  + - Principal Components Analysis (PCA) is a change of base such that orthogonal directions of maximum variation are used. - Compute PC Scores of a group of variables in the data and keep only the first n (up to a percent of variability explained). - Reduces redundant (highly correlated) information. 
@@ -267,9 +267,9 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ## Multivariate Transformations - Other transformations: -  - Discriminant Analysis Score: linear discriminant analysis produces a projection that maximizes linear separability. -  - Distance to cluster centroids. -  + - Discriminant Analysis Score: linear discriminant analysis produces a projection that maximizes linear separability. + - Distance to cluster centroids. + --- ## Embeddings (1/2) @@ -277,9 +277,9 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ![bg left:50% w:500](./images/04_word_embeddings.png) -- Training NN is computationally intensive and time-consuming. +- Training an NN is computationally intensive and time-consuming. - Assume that an NN has been trained. -- Pre-trained NN are available. +- Pre-trained NNs are available. - Models can be trained on general language (news articles, Wikipedia, etc.) and specialized language (legal, medical, etc.) corpora. --- @@ -312,7 +312,7 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ### Splitting time-correlated data randomly instead of by time -- In many cases, we are dealing with time series data: the date-time in which data is generated affects its label distribution. +- In many cases, we are dealing with time series data: the date-time at which data is generated affects its label distribution. - Ex: stock prices. - Solution: split data by time instead of random sampling whenever possible (ex., time-series cross-validation). @@ -350,7 +350,7 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ## Data Leakage: Common Causes (5/6) ### Group leakage -- Similar to duplication, where a group of examples have strongly correlated labels but are divided into different splits. +- Similar to duplication, where a group of examples has strongly correlated labels but are divided into different splits. 
- Example: in object detection, several pictures are taken a few seconds apart and are almost identical. --- @@ -367,10 +367,10 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ## Detecting Data Leakage - Measure the predictive power of each feature. -  - Investigate unusually high readings. -  - Investigate data generation and whether we can derive a valid explanation. + - Investigate unusually high readings. + - Investigate data generation and whether we can derive a valid explanation. - Perform ablation studies (remove one feature at a time) to measure how important a feature or set of features is to your model. -  - If removing a feature causes the model's performance to deteriorate significantly, investigate why that feature is so important. + - If removing a feature causes the model's performance to deteriorate significantly, investigate why that feature is so important. - Pay attention to new features added to the model. --- @@ -497,8 +497,8 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis ## Limitations of Post-Hoc Explainability Methods: SHAP Values - Difficult to interpret: -  - Incorrect: the Shapley value of a feature value is the difference of the predicted value after removing the feature from the model training. -  - Correct: given the current set of feature values, the contribution of a feature value to the difference between the actual prediction and the mean prediction is the estimated Shapley value. + - Incorrect: the Shapley value of a feature value is the difference of the predicted value after removing the feature from the model training. + - Correct: given the current set of feature values, the contribution of a feature value to the difference between the actual prediction and the mean prediction is the estimated Shapley value. - Method (non-Tree implementation) is computationally expensive. - SHAP (non-Tree) ignores feature dependence. 
@@ -513,5 +513,5 @@ Image Source: [Devopedia.com](https://devopedia.org/principal-component-analysis - Agrawal, A. et al. "Cloudy with high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). - Gilyadov, J (2017). Word2Vec Explained. [URL](https://israelg99.github.io/2017-03-23-Word2Vec-Explained/) - Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2022). -- Lunderberg and Lee. A Unified Approach to INterpreting Model Predictions. Advances in Nueral INformation Processing Systems 30 (NIPS 2017). [GitHub Repository](https://github.com/shap/shap?tab=readme-ov-file) -- Molnar, C. Interpretable Machine Learning. (2023) [URL](https://christophm.github.io/interpretable-ml-book/) +- Lunderberg and Lee. A Unified Approach to Interpreting Model Predictions. Advances in Neural Information Processing Systems 30 (NIPS 2017). [GitHub Repository](https://github.com/shap/shap?tab=readme-ov-file) +- Molnar, C. Interpretable Machine Learning. (2023) [URL](https://christophm.github.io/interpretable-ml-book/) \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/05_model_development.md b/03_instructional_team/markdown_slides/05_model_development.md index 9209fc2c2..748b9df4a 100644 --- a/03_instructional_team/markdown_slides/05_model_development.md +++ b/03_instructional_team/markdown_slides/05_model_development.md @@ -55,7 +55,7 @@ $ echo "Data Sciences Institute" ## The Flock Reference Architecture ![h:470px center](./images/05_flock_ref_architecture_highlighted_5.png) -
Agrawal et al (2019)
+
Agrawal et al. (2019)
--- @@ -107,7 +107,7 @@ Linear Discriminant Analysis (LDA): Quadratic Discriminant Analysis (QDA): -+ Classes do not share covariance matrix. ++ Classes do not share a covariance matrix. ![bg contain right:45%](./images/05_qda.png) @@ -135,7 +135,7 @@ $$ ## Unregularized Decision Trees + Divide-and-conquer strategy: segment data based on an attribute such that information gain is maximized. -+ Information can be measured with Gini coefficient or entropy. ++ Information can be measured with the Gini coefficient or entropy. + Fully expanded decision trees often contain unnecessary structure. ![bg contain right:45%](./images/05_decision_tree_no_reg.png) @@ -146,7 +146,7 @@ $$ ## Regularized Decision Trees + Pre-pruning: during training, decide which branches to stop developing. -+ Post-pruning: subtree replacement involves training a full tree, then decide if a branch can be substituted by a leaf node. ++ Post-pruning: subtree replacement involves training a full tree, then deciding if a branch can be substituted by a leaf node. + Constrain tree depth or number of examples in leaf nodes (hyperparameters). @@ -159,7 +159,7 @@ $$ + Bagging = Bootstrap + Aggregation ### Bootstrap -+ Create subsets of data by sampling with replacement; train decision trees on each subset. ++ Create data subsets by sampling with replacement; train decision trees on each subset. ### Aggregation @@ -195,7 +195,7 @@ $$ ## Support Vector Machines (2/2) -SVM use linear models to implement non-linear boundaries by performing a non-linear mapping of inputs: +SVMs use linear models to implement non-linear boundaries by performing a non-linear mapping of inputs: - Polynomial - Radial Basis Function - Sigmoid @@ -231,9 +231,9 @@ SVM use linear models to implement non-linear boundaries by performing a non-lin ### Avoid the state-of-the-art trap - Researchers evaluate models in academic settings: if a model is state-of-the-art, it performs better than existing models on some static dataset. 
-- It is essential to remain up to date but solve the problem first. +- It is essential to remain up to date, but solve the problem first. - Start with the simplest models -- Simple is better than complex: easier to deploy, easier to understand, and serve as a baseline. +- Simple is better than complex: easier to deploy, easier to understand, and serves as a baseline. - Easier to deploy: speeds up the experimentation cycle. - Easier to understand: adds complexity as needed. - Baseline: simple models serve as a starting comparison point for model development. @@ -285,7 +285,7 @@ State of the Art Model Performance on ImageNet c.2023 (paperswithcode.com) Understand your model's assumptions: - Every model comes with its assumptions. -- Prediction assumption: every model that aims to predict an output Y from an input X assumes that it is possible to predict Y based on X. +- Prediction assumption: Every model that aims to predict an output Y from an input X assumes that it is possible to predict Y based on X. - Independent and Identically Distributed: neural nets assume that examples are independent and identically distributed. --- @@ -302,7 +302,7 @@ Understand your model's assumptions: ## The Wisdom of the Crowds -> “Aggregating the judgment of many consistently beats the accuracy of the average member of the group, and is often as startlingly accurate […] In fact, in any group there are likely to be individuals who beat the group. But those bull’s-eye guesses typically say more about the power of luck […] than about the skill of the guesser. That becomes clear when the exercise is repeated many times.” (Tetlock and Gardner, 2015) +> “Aggregating the judgment of many consistently beats the accuracy of the average member of the group, and is often as startlingly accurate […] In fact, in any group, there are likely to be individuals who beat the group. But those bull’s-eye guesses typically say more about the power of luck […] than about the skill of the guesser. 
That becomes clear when the exercise is repeated many times.” (Tetlock and Gardner, 2015) --- @@ -432,14 +432,14 @@ Understand your model's assumptions: - Theoretical constraints: model assumptions are not met. For example, use a linear model when decision boundaries are not linear. - Poor implementation: The model may be a good fit, but implementation has errors. -- Poor choice of hyperparameters: with the same model, one set of hyperparameters can give better results than others. +- Poor choice of hyperparameters: With the same model, one set of hyperparameters can give better results than others. --- ## Debugging: Why ML Models Fail - Data problems: noise and dirty data are everywhere. Additionally, poor implementation of data flows can lead to data problems. -- Poor choice of features: Too many features may cause overfitting or data leakage. Too few features might lack predictive power to allow for making good predictions. +- Poor choice of features: Too many features may cause overfitting or data leakage. Too few features might lack the predictive power to make good predictions. - Some debugging approaches:     - Start simple and gradually add more components.     - Overfit a single batch. @@ -487,7 +487,7 @@ Understand your model's assumptions: ### Model calibration or conformal prediction methods - Idea: If the forecast has a 70% chance of rain, then 70% of the time this forecast was made, it actually rained. -- Prediction scores are often normalized to values between 0 and 1. It is tempting to think of them as probabilities, but they are not necessarily so. +- Prediction scores are often normalized to values between 0 and 1. It is tempting to think of them as probabilities, but they need not be. - Use conformal prediction methods to calibrate prediction scores. - Confidence measurement: show only predictions where the model is confident. - Slice-based evaluation: model performance is different in subsets of data. 
@@ -503,4 +503,4 @@ Understand your model's assumptions: - Agrawal, A. et al. "Cloudy with a high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). - Domingos, Pedro. "A few useful things to know about machine learning." Communications of the ACM 55, no. 10 (2012): 78-87. - Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2022). -- Tetlock and Gardner. Superforecasting: The art and science of prediction. Random House, 2016. +- Tetlock and Gardner. Superforecasting: The art and science of prediction. Random House, 2016. \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/06_deployment.md b/03_instructional_team/markdown_slides/06_deployment.md index a86217f07..25fe92d6b 100644 --- a/03_instructional_team/markdown_slides/06_deployment.md +++ b/03_instructional_team/markdown_slides/06_deployment.md @@ -18,7 +18,7 @@ $ echo "Data Sciences Institute" ## Agenda **6.1 Model Deployment and Prediction Service** -    + - ML Deployment Myths and Anti-Patterns - Batch Prediction vs Online Prediction @@ -80,9 +80,9 @@ $ echo "Data Sciences Institute" - Model performance decays over time. - Deployments should be easy: -    - The development environment should resemble the production environment as closely as possible. -    - Infrastructure should be easier to rebuild than to repair. -    - Small incremental and frequent changes. + - The development environment should resemble the production environment as closely as possible. + - Infrastructure should be easier to rebuild than to repair. + - Small incremental and frequent changes. --- @@ -98,7 +98,7 @@ $ echo "Data Sciences Institute" ### 4. Most ML engineers don't need to worry about scale - Scale means different things to different applications. -- Number of users, availability, speed or volume of data. +- Number of users, availability, speed, or volume of data. 
--- @@ -145,4 +145,4 @@ Three types of model prediction or inference service: # References - Agrawal, A. et al. "Cloudy with a high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). -- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2021). +- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2021). \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/07_monitoring.md b/03_instructional_team/markdown_slides/07_monitoring.md index 1ac0da1af..c9b6dc9bf 100644 --- a/03_instructional_team/markdown_slides/07_monitoring.md +++ b/03_instructional_team/markdown_slides/07_monitoring.md @@ -94,7 +94,7 @@ A failure happens when one or more expectations of the system are not met: - A key assumption is that training and unseen data come from the same distribution. - When we say that a model *learns* from data, we are saying that the model learns the distribution of the training data to use this information on unseen data. -- When predictions on unseen data are satisfactory, we say the model "generalizes to unseen data". +- When predictions on unseen data are satisfactory, we say the model "generalizes to unseen data. - The test data used in the model development phase and the cross-validation are *estimates* of the error in unseen (production) data. - Reasons for difference:     - Data collection, encoding, and instrumentation. @@ -181,7 +181,7 @@ $$     - $P(X)$ changes.     - $P(Y|X)$ does not change. - Widely studied distribution shifts. -- Covariate is an independent variable that can influence the outcome of a statistical trial but it is not of direct interest. +- A covariate is an independent variable that can influence the outcome of a statistical trial, but it is not of direct interest. - Example: while predicting house prices as a function of location, a covariate is square footage. 
--- @@ -272,7 +272,7 @@ Causes: ### Classes of metrics to monitor - Operational metrics: - + Convey the health of the system. Operational metrics are related to the network, machine, and application. + + Convey the system's health. Operational metrics relate to the network, machines, and applications. + Ex.: Latency, throughput, prediction requests per unit of time, percentage of successful predictions, CPU/GPU utilization, memory use, etc.     - ML-specific metrics: Model performance, predictions, features, and raw inputs. @@ -333,4 +333,4 @@ Causes: # References - Agrawal, A. et al. "Cloudy with a high chance of DBMS: A 10-year prediction for Enterprise-Grade ML." arXiv preprint arXiv:1909.00084 (2019). -- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2021). +- Huyen, Chip. "Designing machine learning systems." O'Reilly Media, Inc.(2021). \ No newline at end of file diff --git a/03_instructional_team/markdown_slides/08_infra_and_org.md b/03_instructional_team/markdown_slides/08_infra_and_org.md index eeacf58e8..108192f0b 100644 --- a/03_instructional_team/markdown_slides/08_infra_and_org.md +++ b/03_instructional_team/markdown_slides/08_infra_and_org.md @@ -4,7 +4,7 @@ theme: dsi_certificates_theme paginate: true --- -# Production: Infrastructure and Organization +# Production: Infrastructure and Organisation ```code $ echo "Data Sciences Institute" @@ -56,8 +56,8 @@ $ echo "Data Sciences Institute" # Storage and Compute - ML systems require and produce a lot of data. -- Storage layer can be HDD or SDD, but can also be blob (binary large object) storage. -- Over the last decade, storage has been commoditized in the cloud. +- Storage layer can be HDD or SSD, but can also be blob (binary large object) storage. +- Over the last decade, storage has been commoditised in the cloud. 
--- @@ -67,12 +67,12 @@ $ echo "Data Sciences Institute" - Compute can be permanent or ephemeral: - Training has spiky compute requirements that tend to be ephemeral. - DB will require some compute to operate and, generally, this compute is permanent. -- Compute and storage can scale: cloud infrastructure is attractive for its elasticity (it grows with needs.) -- Compute must have access to storage, therefore, it is important to consider the cost of data transmission. +- Compute and storage can scale: cloud infrastructure is attractive for its elasticity (it grows with needs) +- Compute must have access to storage; therefore, it is important to consider the cost of data transmission. --- -# Development Environment +# Development Environment (1/2) - Where ML engineers write code, run experiments, and interact with the production environment. - Consists of IDE, versioning, and CI/CD. @@ -80,28 +80,28 @@ $ echo "Data Sciences Institute" --- -# Development Environment +# Development Environment (2/2) - Versioning is fundamental for ML System implementation. - Dev environment should be built for CI/CD: - Automated testing. - Continuous integration. - - Andon Cord: capability to revert to latest working verison of system. -- Dev Environment should ressemble the production environment as closely as possible. + - Andon Cord: capability to revert to the latest working version of the system. +- Dev Environment should resemble the production environment as closely as possible. --- # Resource Management - In terrestrial data centres, storage and compute are finite. -- With cloud infrastructure, storage and compute are elastic, but they are charged by utilization. +- With cloud infrastructure, storage and compute are elastic, but they are charged by utilisation. - Two key characteristics to consider: - Repetitiveness. - Dependencies. 
![bg right:50% w:500](./images/08_dag_example.png) - + --- @@ -109,10 +109,10 @@ $ echo "Data Sciences Institute" --- -# Roles, Tasks, and Skills +# Roles, Tasks, and Skills (1/4) - CDO/DS Leader: - - Bridges the gap between business and datas science. + - Bridges the gap between business and data science. - Defines the vision and technical lead. - Skills: leadership, design thinking, data science/ML, domain experience. - Data engineer: @@ -122,46 +122,46 @@ $ echo "Data Sciences Institute" --- -# Roles, Tasks, and Skills +# Roles, Tasks, and Skills (2/4) - Analyst: - - Collects, cleans, transforms data. + - Collects, cleans, and transforms data. - Interprets analytical results, reports and communicates. - Skills: R, Python, SQL, BI Tools. -- Visualization Engineer +- Visualisation Engineer - Makes sense of data and analysis output by showing it in the right context. - Articulate business problems and display solutions with data. - Skills: design thinking, BI Tools, presentation and writing. --- -# Roles, Tasks, and Skills (cont.) +# Roles, Tasks, and Skills (3/4) - Data Scientist - Solves business tasks using ML and data. - Data preparation, training, and evaluating models. - Skills: R, Python, modelling, data manipulation. - ML Engineer - - Combines software engineering and modeling to implement data intensive products. + - Combines software engineering and modelling to implement data-intensive products. - Deploys models into production and at scale. - Python, Spark, Julia, MLOps, DevOps, CI/CD. --- -# Roles, Tasks, and Skills +# Roles, Tasks, and Skills (4/4) - Subject Matter Expert - - Applies rigorous methods developed in area of expertise. + - Applies rigorous methods developed in the area of expertise. - Help decision-makers come to conclusions safely beyond ML models. - Ex: Statistician, Actuary, Econometrician, Physicist, Epidemiologist - Model validation - Independently validate models, including their interpretation. - Perform technical testing. 
- - Skills: similar to data scientis/SME. + - Skills: similar to a data scientist/SME. --- -# Where to Focus Our Efforts? +# Where to Focus Our Efforts? (1/2) ![bg left:50% w:500](./images/08_areas_of_focus.png) @@ -170,12 +170,12 @@ $ echo "Data Sciences Institute" Start with the data: - Mature proprietary solutions have stronger support for data management. -- Providing complete and useable thrid-party solutions is non-trivial. +- Providing complete and usable third-party solutions is non-trivial. - There is no data analysis without data. --- -# Where to Focus Our Efforts? +# Where to Focus Our Efforts? (2/2) ![bg left:50% w:500](./images/08_areas_of_focus.png) diff --git a/05_src/stock_prices/data_manager.py b/05_src/stock_prices/data_manager.py index b742a5b14..9fabc83a1 100644 --- a/05_src/stock_prices/data_manager.py +++ b/05_src/stock_prices/data_manager.py @@ -100,12 +100,15 @@ def save_by_year(price_dt, out_dir): ''' _logs.info(f'Saving data by year') for ticker in price_dt['ticker'].unique(): + _logs.info(f'Processing ticker: {ticker}') ticker_dt = price_dt[price_dt['ticker'] == ticker] ticker_dt = ticker_dt.assign(Year = ticker_dt.Date.dt.year) for yr in ticker_dt['Year'].unique(): + _logs.info(f'Processing year {yr} for ticker {ticker}.') yr_dd = dd.from_pandas(ticker_dt[ticker_dt['Year'] == yr],2) yr_path = os.path.join(out_dir, ticker, f"{ticker}_{yr}") os.makedirs(os.path.dirname(yr_path), exist_ok=True) + _logs.info(f'Writing data to path: {yr_path}') yr_dd.to_parquet(yr_path, engine = "pyarrow") @@ -150,23 +153,29 @@ def create_features(self): _logs.info(f'Creating features') _logs.debug(f'Columns in price data {self.price_dd.columns}') price_dd = self.price_dd - features = (price_dd.groupby('ticker', group_keys=False) - .apply( - lambda x: x.assign( - Close_lag_1 = x['Close'].shift(1)) - )) - self.features = features - - def create_target(self, target_name = 'positive_return', target_window = 1): - ''' - Create target variable. 
- ''' + features = ( + price_dd + .groupby('ticker', group_keys=False) + .apply( + lambda x: x.sort_values('Date', ascending = True) + .assign(Close_lag_1 = x['Close'].shift(1)), + meta = pd.DataFrame(data ={'Date': 'datetime64[ns]', + 'Open': 'f8', + 'High': 'f8', + 'Low': 'f8', + 'Close': 'f8', + 'Adj Close': 'f8', + 'Volume': 'i8', + 'source': 'object', + 'Year': 'int32', + 'Close_lag_1': 'f8'}, + index = pd.Index([], dtype=pd.StringDtype(), name='ticker')) + )) + dd_returns = features.assign( + Returns = lambda x: x['Close']/x['Close_lag_1'] - 1 + ) + self.features = dd_returns - _logs.info(f'Creating target') - self.features = (self.features.groupby('ticker', group_keys=False).apply( - lambda x: x.sort_values('Date').assign( - target = lambda x: x[target_name].shift(-target_window) - ))) def save_features(self): ''' @@ -174,10 +183,23 @@ def save_features(self): ''' _logs.info(f'Saving features to {self.features_path}') _logs.debug(f'Features columns {self.features.columns}') - self.features.to_parquet( - self.features_path, - write_index = True, - overwrite = True) + self.features.to_parquet(FEATURES_DATA, + overwrite = True, + write_index = True, + schema={ + 'Date': 'timestamp[ns]', + 'Open': 'float64', + 'High': 'float64', + 'Low': 'float64', + 'Close': 'float64', + 'Adj Close': 'float64', + 'Volume': 'int64', + 'source': 'string', + 'Year': 'int32', + 'Close_lag_1': 'float64', + 'Returns': 'float64', + 'ticker': 'large_string' + }) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Download and process stock price data.') diff --git a/pyproject.toml b/pyproject.toml index 8f0e36725..376a7f6a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ "pyarrow>=21.0.0", "python-dotenv>=1.1.1", "scikit-learn>=1.7.2", + "seaborn>=0.13.2", + "ucimlrepo>=0.0.7", "utils>=1.0.2", "yfinance>=0.2.65", ] diff --git a/uv.lock b/uv.lock index 1494f8c09..c300ff5d0 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version 
= 1 -revision = 2 +revision = 3 requires-python = ">=3.11" resolution-markers = [ "python_full_version >= '3.12'", @@ -730,6 +730,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/8e/abdd3f14d735b2929290a018ecf133c901be4874b858dd1c604b9319f064/greenlet-3.2.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2523e5246274f54fdadbce8494458a2ebdcdbc7b802318466ac5606d3cded1f8", size = 587684, upload-time = "2025-08-07T13:18:25.164Z" }, { url = "https://files.pythonhosted.org/packages/5d/65/deb2a69c3e5996439b0176f6651e0052542bb6c8f8ec2e3fba97c9768805/greenlet-3.2.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1987de92fec508535687fb807a5cea1560f6196285a4cde35c100b8cd632cc52", size = 1116647, upload-time = "2025-08-07T13:42:38.655Z" }, { url = "https://files.pythonhosted.org/packages/3f/cc/b07000438a29ac5cfb2194bfc128151d52f333cee74dd7dfe3fb733fc16c/greenlet-3.2.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:55e9c5affaa6775e2c6b67659f3a71684de4c549b3dd9afca3bc773533d284fa", size = 1142073, upload-time = "2025-08-07T13:18:21.737Z" }, + { url = "https://files.pythonhosted.org/packages/67/24/28a5b2fa42d12b3d7e5614145f0bd89714c34c08be6aabe39c14dd52db34/greenlet-3.2.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9c6de1940a7d828635fbd254d69db79e54619f165ee7ce32fda763a9cb6a58c", size = 1548385, upload-time = "2025-11-04T12:42:11.067Z" }, + { url = "https://files.pythonhosted.org/packages/6a/05/03f2f0bdd0b0ff9a4f7b99333d57b53a7709c27723ec8123056b084e69cd/greenlet-3.2.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03c5136e7be905045160b1b9fdca93dd6727b180feeafda6818e6496434ed8c5", size = 1613329, upload-time = "2025-11-04T12:42:12.928Z" }, { url = "https://files.pythonhosted.org/packages/d8/0f/30aef242fcab550b0b3520b8e3561156857c94288f0332a79928c31a52cf/greenlet-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:9c40adce87eaa9ddb593ccb0fa6a07caf34015a29bf8d344811665b573138db9", size = 299100, 
upload-time = "2025-08-07T13:44:12.287Z" }, { url = "https://files.pythonhosted.org/packages/44/69/9b804adb5fd0671f367781560eb5eb586c4d495277c93bde4307b9e28068/greenlet-3.2.4-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:3b67ca49f54cede0186854a008109d6ee71f66bd57bb36abd6d0a0267b540cdd", size = 274079, upload-time = "2025-08-07T13:15:45.033Z" }, { url = "https://files.pythonhosted.org/packages/46/e9/d2a80c99f19a153eff70bc451ab78615583b8dac0754cfb942223d2c1a0d/greenlet-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ddf9164e7a5b08e9d22511526865780a576f19ddd00d62f8a665949327fde8bb", size = 640997, upload-time = "2025-08-07T13:42:56.234Z" }, @@ -739,6 +741,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + 
{ url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -748,6 +752,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = 
"https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -755,6 +761,8 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1574,6 +1582,8 @@ dependencies = [ { name = "pyarrow" }, { name = 
"python-dotenv" }, { name = "scikit-learn" }, + { name = "seaborn" }, + { name = "ucimlrepo" }, { name = "utils" }, { name = "yfinance" }, ] @@ -1592,6 +1602,8 @@ requires-dist = [ { name = "pyarrow", specifier = ">=21.0.0" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "scikit-learn", specifier = ">=1.7.2" }, + { name = "seaborn", specifier = ">=0.13.2" }, + { name = "ucimlrepo", specifier = ">=0.0.7" }, { name = "utils", specifier = ">=1.0.2" }, { name = "yfinance", specifier = ">=0.2.65" }, ] @@ -2098,6 +2110,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/30/2f9a5243008f76dfc5dee9a53dfb939d9b31e16ce4bd4f2e628bfc5d89d2/scipy-1.16.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d2a4472c231328d4de38d5f1f68fdd6d28a615138f842580a8a321b5845cf779", size = 26448374, upload-time = "2025-09-11T17:45:03.45Z" }, ] +[[package]] +name = "seaborn" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib" }, + { name = "numpy" }, + { name = "pandas" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, +] + [[package]] name = "setuptools" version = "80.9.0" @@ -2301,6 +2327,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = 
"2025-03-23T13:54:41.845Z" }, ] +[[package]] +name = "ucimlrepo" +version = "0.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "pandas" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/87/7c/f5a400cc99a5365d153609ebf803084f78b4638b0f7925aa31d9abb62b8e/ucimlrepo-0.0.7.tar.gz", hash = "sha256:4cff3f9e814367dd60956da999ace473197237b9fce4c07e9a689e77b4ffb59a", size = 9369, upload-time = "2024-05-21T06:06:41.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/07/1252560194df2b4fad1cb3c46081b948331c63eb1bb0b97620d508d12a53/ucimlrepo-0.0.7-py3-none-any.whl", hash = "sha256:0a5ce7e21d7ec850a0da4427c47f9dd96fcc6532f1c7e95dcec63eeb40f08026", size = 8041, upload-time = "2024-05-21T06:06:39.826Z" }, +] + [[package]] name = "urllib3" version = "2.5.0"