diff --git a/examples/tools/hunting-anomalies/README.md b/examples/tools/hunting-anomalies/README.md
new file mode 100644
index 00000000..4b36f1b5
--- /dev/null
+++ b/examples/tools/hunting-anomalies/README.md
@@ -0,0 +1,49 @@
+# Hunting Anomalies in the Stock Market
+
+This repository contains all the necessary scripts and data directories used in the [Hunting Anomalies in the Stock Market](https://polygon.io/blog/hunting-anomalies-in-stock-market/) tutorial, hosted on Polygon.io's blog. The tutorial demonstrates how to detect statistical anomalies in historical US stock market data through a comprehensive workflow that involves downloading data, building a lookup table, querying for anomalies, and visualizing them through a web interface.
+
+### Prerequisites
+
+- Python 3.8+
+- Access to Polygon.io's historical data via Flat Files
+- An active Polygon.io API key, obtainable by signing up for a Stocks paid plan
+
+### Repository Contents
+
+- `README.md`: This file, outlining setup and execution instructions.
+- `aggregates_day`: Directory where downloaded CSV data files are stored.
+- `build-lookup-table.py`: Python script to build a lookup table from the historical data.
+- `query-lookup-table.py`: Python script to query the lookup table for anomalies.
+- `gui-lookup-table.py`: Python script for a browser-based interface to explore anomalies visually.
+
+### Running the Tutorial
+
+1. **Ensure Python 3.8+ is installed:** Check your Python version and install the required libraries, `polygon-api-client` and `pandas` (`pickle` and `argparse` ship with Python).
+
+2. **Set up your API key:** Make sure you have an active paid Polygon.io Stocks subscription for accessing Flat Files. Set your API key in your environment or directly in the scripts where required.
+
+3. **Download Historical Data:** Use the MinIO client to download historical stock market data. Adjust the commands and paths based on the data you are interested in.
+   ```bash
+   mc alias set s3polygon https://files.polygon.io YOUR_ACCESS_KEY YOUR_SECRET_KEY
+   mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/08/ ./aggregates_day/
+   mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/09/ ./aggregates_day/
+   mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/10/ ./aggregates_day/
+   gunzip ./aggregates_day/*.gz
+   ```
+
+4. **Build the Lookup Table:** This script processes the downloaded data and builds a lookup table, saving it as `lookup_table.pkl`.
+   ```bash
+   python build-lookup-table.py
+   ```
+
+5. **Query Anomalies:** Replace `2024-10-18` with the date you want to analyze for anomalies.
+   ```bash
+   python query-lookup-table.py 2024-10-18
+   ```
+
+6. **Run the GUI:** Start the server, then open `http://localhost:8888` in your browser to explore the anomalies visually.
+   ```bash
+   python gui-lookup-table.py
+   ```
+
+For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market/).
diff --git a/examples/tools/hunting-anomalies/aggregates_day/README.md b/examples/tools/hunting-anomalies/aggregates_day/README.md
new file mode 100644
index 00000000..a0ade480
--- /dev/null
+++ b/examples/tools/hunting-anomalies/aggregates_day/README.md
@@ -0,0 +1 @@
+Download flat files into this directory.
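Between steps 3 and 4 of the README it can help to sanity-check that the downloaded flat files actually contain the columns the build script reads (`ticker`, `window_start`, `transactions`, `close`). Below is a minimal sketch, not part of the diff itself; the file name is only an example, so point it at whichever CSV landed in `aggregates_day/` after `gunzip`.

```python
import pandas as pd

# Peek at one downloaded daily aggregates file and confirm it has the columns
# build-lookup-table.py relies on. The file name here is an assumption; use
# any CSV that ended up in ./aggregates_day/ after decompression.
df = pd.read_csv("./aggregates_day/2024-10-18.csv")

expected = {"ticker", "window_start", "transactions", "close"}
missing = expected - set(df.columns)
if missing:
    raise SystemExit(f"Missing expected columns: {missing}")

# window_start is a nanosecond epoch timestamp; convert it to a calendar date.
df["date"] = pd.to_datetime(df["window_start"], unit="ns").dt.date
print(df[["ticker", "date", "transactions", "close"]].head())
```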
diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py
new file mode 100644
index 00000000..16abca2d
--- /dev/null
+++ b/examples/tools/hunting-anomalies/build-lookup-table.py
@@ -0,0 +1,91 @@
+import os
+import pandas as pd  # type: ignore
+from collections import defaultdict
+import pickle
+import json
+from typing import DefaultDict, Dict, Any, BinaryIO
+
+# Directory containing the daily CSV files
+data_dir = "./aggregates_day/"
+
+# Initialize a dictionary to hold trades data
+trades_data = defaultdict(list)
+
+# List all CSV files in the directory
+files = sorted([f for f in os.listdir(data_dir) if f.endswith(".csv")])
+
+print("Starting to process files...")
+
+# Process each file (assuming files are named in order)
+for file in files:
+    print(f"Processing {file}")
+    file_path = os.path.join(data_dir, file)
+    df = pd.read_csv(file_path)
+    # For each stock, store the date and relevant data
+    for _, row in df.iterrows():
+        ticker = row["ticker"]
+        date = pd.to_datetime(row["window_start"], unit="ns").date()
+        trades = row["transactions"]
+        close_price = row["close"]  # Ensure 'close' column exists in your CSV
+        trades_data[ticker].append(
+            {"date": date, "trades": trades, "close_price": close_price}
+        )
+
+print("Finished processing files.")
+print("Building lookup table...")
+
+# Now, build the lookup table with rolling averages and percentage price change
+lookup_table: DefaultDict[str, Dict[str, Any]] = defaultdict(
+    dict
+)  # Nested dict: ticker -> date -> stats
+
+for ticker, records in trades_data.items():
+    # Convert records to DataFrame
+    df_ticker = pd.DataFrame(records)
+    # Sort records by date
+    df_ticker.sort_values("date", inplace=True)
+    df_ticker.set_index("date", inplace=True)
+
+    # Calculate the percentage change in close_price
+    df_ticker["price_diff"] = (
+        df_ticker["close_price"].pct_change() * 100
+    )  # Multiply by 100 for percentage
+
+    # Shift trades to exclude the current day from rolling calculations
+    df_ticker["trades_shifted"] = df_ticker["trades"].shift(1)
+    # Calculate rolling average and standard deviation over the previous 5 days
+    df_ticker["avg_trades"] = df_ticker["trades_shifted"].rolling(window=5).mean()
+    df_ticker["std_trades"] = df_ticker["trades_shifted"].rolling(window=5).std()
+    # Store the data in the lookup table
+    for date, row in df_ticker.iterrows():
+        # Convert date to string for JSON serialization
+        date_str = date.strftime("%Y-%m-%d")
+        # Ensure rolling stats are available
+        if pd.notnull(row["avg_trades"]) and pd.notnull(row["std_trades"]):
+            lookup_table[ticker][date_str] = {
+                "trades": row["trades"],
+                "close_price": row["close_price"],
+                "price_diff": row["price_diff"],
+                "avg_trades": row["avg_trades"],
+                "std_trades": row["std_trades"],
+            }
+        else:
+            # Store data without rolling stats if not enough data points
+            lookup_table[ticker][date_str] = {
+                "trades": row["trades"],
+                "close_price": row["close_price"],
+                "price_diff": row["price_diff"],
+                "avg_trades": None,
+                "std_trades": None,
+            }
+
+print("Lookup table built successfully.")
+
+# Convert defaultdict to regular dict for JSON serialization
+lookup_table_dict = {k: v for k, v in lookup_table.items()}
+
+# Save the lookup table to a file for later use
+with open("lookup_table.pkl", "wb") as f:  # type: BinaryIO
+    pickle.dump(lookup_table_dict, f)
+
+print("Lookup table saved to 'lookup_table.pkl'.")
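The `query-lookup-table.py` script referenced in the README is not included in this diff hunk. As a rough sketch of how the pickled table can be scanned from the command line, the snippet below flags tickers whose trade count sits well above their trailing 5-day average, using the per-date keys written by `build-lookup-table.py` above (`trades`, `avg_trades`, `std_trades`, `close_price`, `price_diff`). The z-score threshold of 3 is an arbitrary choice for illustration; the real script may differ.

```python
import argparse
import pickle

# Date to scan is passed on the command line, e.g. `python query-sketch.py 2024-10-18`.
parser = argparse.ArgumentParser(description="Scan the lookup table for unusual trading activity.")
parser.add_argument("date", help="Date to scan, formatted YYYY-MM-DD")
args = parser.parse_args()

# Load the table produced by build-lookup-table.py: ticker -> date -> stats.
with open("lookup_table.pkl", "rb") as f:
    lookup_table = pickle.load(f)

anomalies = []
for ticker, days in lookup_table.items():
    stats = days.get(args.date)
    # Skip tickers with no data for the date or without a full rolling baseline.
    if not stats or stats["avg_trades"] is None or stats["std_trades"] is None:
        continue
    if stats["std_trades"] > 0:
        # Z-score: how many standard deviations the day's trade count sits
        # above the trailing 5-day average.
        z_score = (stats["trades"] - stats["avg_trades"]) / stats["std_trades"]
        if z_score > 3:  # threshold is an assumption made for this sketch
            anomalies.append((z_score, ticker, stats))

# Most unusual tickers first.
for z_score, ticker, stats in sorted(anomalies, key=lambda x: x[0], reverse=True):
    print(
        f"{ticker}: z={z_score:.2f}, trades={stats['trades']}, "
        f"close={stats['close_price']:.2f}, change={stats['price_diff']:.2f}%"
    )
```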
diff --git a/examples/tools/hunting-anomalies/gui-lookup-table.py b/examples/tools/hunting-anomalies/gui-lookup-table.py
new file mode 100644
index 00000000..df58746c
--- /dev/null
+++ b/examples/tools/hunting-anomalies/gui-lookup-table.py
@@ -0,0 +1,302 @@
+import os
+import pickle
+import json
+from datetime import datetime
+from polygon import RESTClient
+from polygon.rest.models import Agg
+import http.server
+import socketserver
+import traceback
+from urllib.parse import urlparse, parse_qs
+
+PORT = 8888
+
+# Load the lookup_table
+with open("lookup_table.pkl", "rb") as f:
+    lookup_table = pickle.load(f)
+
+
+class handler(http.server.SimpleHTTPRequestHandler):
+    def do_GET(self):
+        # Parse the path and query parameters
+        parsed_path = urlparse(self.path)
+        path = parsed_path.path
+        query_params = parse_qs(parsed_path.query)
+
+        if path == "/":
+            # Handle the root path
+            # Get the date parameter if provided
+            date_param = query_params.get("date", [None])[0]
+
+            # Get all dates from the lookup table
+            all_dates = set()
+            for ticker_data in lookup_table.values():
+                all_dates.update(ticker_data.keys())
+            all_dates = sorted(all_dates)
+
+            # If date is None, get the latest date from the lookup table
+            if date_param is None:
+                if all_dates:
+                    latest_date = max(all_dates)
+                else:
+                    self.send_response(200)
+                    self.send_header("Content-type", "text/html")
+                    self.end_headers()
+                    html_content = (
" + if prev_date: + html_content += 'Previous Date '.format( + prev_date + ) + if next_date: + html_content += 'Next Date '.format(next_date) + html_content += "
" + # Display the anomalies in a table + html_content += ( + 'Ticker | " + html_content += "Trades | " + html_content += "Avg Trades | " + html_content += "Std Dev | " + html_content += "Z-score | " + html_content += "Close Price | " + html_content += "Price Diff | " + html_content += "Chart | " + html_content += "
---|---|---|---|---|---|---|---|
{} | ".format(anomaly["ticker"]) + html_content += "{} | ".format(anomaly["trades"]) + html_content += "{:.2f} | ".format(anomaly["avg_trades"]) + html_content += "{:.2f} | ".format(anomaly["std_trades"]) + html_content += "{:.2f} | ".format(anomaly["z_score"]) + html_content += "{:.2f} | ".format(anomaly["close_price"]) + html_content += "{:.2f} | ".format(anomaly["price_diff"]) + # Add a link to the chart + html_content += ( + 'View Chart | '.format( + anomaly["ticker"], latest_date + ) + ) + html_content += "