From f6a96b1bd32b6281d17f6eff8f6f48b52ffc081d Mon Sep 17 00:00:00 2001
From: justinpolygon <123573436+justinpolygon@users.noreply.github.com>
Date: Mon, 4 Nov 2024 13:34:39 -0800
Subject: [PATCH 1/5] Add Hunting Anomalies in the Stock Market scripts
---
examples/tools/hunting-anomalies/README.md | 50 ++++
.../aggregates_day/README.md | 1 +
.../hunting-anomalies/build-lookup-table.py | 94 ++++++
.../hunting-anomalies/gui-lookup-table.py | 270 ++++++++++++++++++
.../hunting-anomalies/query-lookup-table.py | 63 ++++
5 files changed, 478 insertions(+)
create mode 100644 examples/tools/hunting-anomalies/README.md
create mode 100644 examples/tools/hunting-anomalies/aggregates_day/README.md
create mode 100644 examples/tools/hunting-anomalies/build-lookup-table.py
create mode 100644 examples/tools/hunting-anomalies/gui-lookup-table.py
create mode 100644 examples/tools/hunting-anomalies/query-lookup-table.py
diff --git a/examples/tools/hunting-anomalies/README.md b/examples/tools/hunting-anomalies/README.md
new file mode 100644
index 00000000..f7bc9aa2
--- /dev/null
+++ b/examples/tools/hunting-anomalies/README.md
@@ -0,0 +1,50 @@
+# Hunting Anomalies in the Stock Market
+
+This repository contains all the necessary scripts and data directories used in the [Hunting Anomalies in the Stock Market](https://polygon.io/blog/hunting-anomalies-in-stock-market/) tutorial, hosted on Polygon.io's blog. The tutorial demonstrates how to detect statistical anomalies in historical US stock market data through a comprehensive workflow that involves downloading data, building a lookup table, querying for anomalies, and visualizing them through a web interface.
+
+### Prerequisites
+
+- Python 3.8+
+- Access to Polygon.io's historical data via Flat Files
+- An active Polygon.io API key, obtainable by signing up for a Stocks paid plan
+
+### Repository Contents
+
+- `README.md`: This file, outlining setup and execution instructions.
+- `aggregates_day`: Directory where downloaded CSV data files are stored.
+- `build-lookup-table.py`: Python script to build a lookup table from the historical data.
+- `query-lookup-table.py`: Python script to query the lookup table for anomalies.
+- `gui-lookup-table.py`: Python script for a browser-based interface to explore anomalies visually.
+
+### Running the Tutorial
+
+1. **Ensure Python 3.8+ is installed:** Check your Python version and install the required third-party libraries, polygon-api-client and pandas (pickle and argparse ship with the Python standard library). A sample install command is shown after this list.
+
+2. **Set up your API key:** Make sure you have an active Polygon.io Stocks paid plan, which is required to access Flat Files. Set your API key in your environment (see the example after this list) or directly in the scripts where required.
+
+3. **Download Historical Data:** Use the MinIO client to download historical stock market data:
+ ```bash
+ mc alias set s3polygon https://files.polygon.io YOUR_ACCESS_KEY YOUR_SECRET_KEY
+ mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/08/ ./aggregates_day/
+ mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/09/ ./aggregates_day/
+ mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/10/ ./aggregates_day/
+ gunzip ./aggregates_day/*.gz
+ ```
+ Adjust the commands and paths based on the data you're interested in.
+
+4. **Build the Lookup Table:** This script processes the downloaded data and builds a lookup table, saving it as `lookup_table.pkl` (a JSON copy is also written to `lookup_table.json`).
+ ```bash
+ python build-lookup-table.py
+ ```
+
+5. **Query Anomalies:** Replace `2024-10-18` with the date you want to analyze for anomalies.
+ ```bash
+ python query-lookup-table.py 2024-10-18
+ ```
+
+6. **Run the GUI:** Start the server, then open `http://localhost:8888` in your browser to explore the anomalies visually.
+ ```bash
+ python gui-lookup-table.py
+ ```
+
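+As a quick reference for steps 1 and 2, a typical setup might look like this (a minimal sketch: the `pip` invocation and the `POLYGON_API_KEY` environment variable name are assumptions, so adjust them to your environment and to how you provide the key to the scripts):
+
+```bash
+# Install the third-party dependencies (pickle and argparse ship with Python)
+pip install polygon-api-client pandas
+
+# Make your API key available to the scripts
+export POLYGON_API_KEY="YOUR_API_KEY"
+```
+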
+For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market).
diff --git a/examples/tools/hunting-anomalies/aggregates_day/README.md b/examples/tools/hunting-anomalies/aggregates_day/README.md
new file mode 100644
index 00000000..a0ade480
--- /dev/null
+++ b/examples/tools/hunting-anomalies/aggregates_day/README.md
@@ -0,0 +1 @@
+Download flat files into this directory. After running the `mc cp` and `gunzip` commands from the main README, it should contain one uncompressed CSV file of daily aggregates per trading day.
diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py
new file mode 100644
index 00000000..c173d58d
--- /dev/null
+++ b/examples/tools/hunting-anomalies/build-lookup-table.py
@@ -0,0 +1,94 @@
+import os
+import pandas as pd
+from collections import defaultdict
+import pickle
+import json
+
+# Directory containing the daily CSV files
+data_dir = './aggregates_day/'
+
+# Initialize a dictionary to hold trades data
+trades_data = defaultdict(list)
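+# trades_data maps each ticker to a list of {'date', 'trades', 'close_price'}
+# records, one entry per trading day processed below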
+
+# List all CSV files in the directory
+files = sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')])
+
+print("Starting to process files...")
+
+# Process each file (assuming files are named in order)
+for file in files:
+ print(f"Processing {file}")
+ file_path = os.path.join(data_dir, file)
+ df = pd.read_csv(file_path)
+ # For each stock, store the date and relevant data
+ for _, row in df.iterrows():
+ ticker = row['ticker']
+ date = pd.to_datetime(row['window_start'], unit='ns').date()
+ trades = row['transactions']
+ close_price = row['close'] # Ensure 'close' column exists in your CSV
+ trades_data[ticker].append({
+ 'date': date,
+ 'trades': trades,
+ 'close_price': close_price
+ })
+
+print("Finished processing files.")
+print("Building lookup table...")
+
+# Now, build the lookup table with rolling averages and percentage price change
+lookup_table = defaultdict(dict) # Nested dict: ticker -> date -> stats
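+# Example of the final shape (the ticker and date shown are illustrative):
+#   lookup_table['AAPL']['2024-10-18'] -> {'trades': ..., 'close_price': ...,
+#                                          'price_diff': ..., 'avg_trades': ..., 'std_trades': ...}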
+
+for ticker, records in trades_data.items():
+ # Convert records to DataFrame
+ df_ticker = pd.DataFrame(records)
+ # Sort records by date
+ df_ticker.sort_values('date', inplace=True)
+ df_ticker.set_index('date', inplace=True)
+
+ # Calculate the percentage change in close_price
+ df_ticker['price_diff'] = df_ticker['close_price'].pct_change() * 100 # Multiply by 100 for percentage
+
+ # Shift trades to exclude the current day from rolling calculations
+ df_ticker['trades_shifted'] = df_ticker['trades'].shift(1)
+ # Calculate rolling average and standard deviation over the previous 5 days
+ df_ticker['avg_trades'] = df_ticker['trades_shifted'].rolling(window=5).mean()
+ df_ticker['std_trades'] = df_ticker['trades_shifted'].rolling(window=5).std()
+ # Store the data in the lookup table
+ for date, row in df_ticker.iterrows():
+ # Convert date to string for JSON serialization
+ date_str = date.strftime('%Y-%m-%d')
+ # Ensure rolling stats are available
+ if pd.notnull(row['avg_trades']) and pd.notnull(row['std_trades']):
+ lookup_table[ticker][date_str] = {
+ 'trades': row['trades'],
+ 'close_price': row['close_price'],
+ 'price_diff': row['price_diff'],
+ 'avg_trades': row['avg_trades'],
+ 'std_trades': row['std_trades']
+ }
+ else:
+ # Store data without rolling stats if not enough data points
+ lookup_table[ticker][date_str] = {
+ 'trades': row['trades'],
+ 'close_price': row['close_price'],
+ 'price_diff': row['price_diff'],
+ 'avg_trades': None,
+ 'std_trades': None
+ }
+
+print("Lookup table built successfully.")
+
+# Convert defaultdict to regular dict for JSON serialization
+lookup_table = {k: v for k, v in lookup_table.items()}
+
+# Save the lookup table to a JSON file
+with open('lookup_table.json', 'w') as f:
+ json.dump(lookup_table, f, indent=4)
+
+print("Lookup table saved to 'lookup_table.json'.")
+
+# Save the lookup table to a file for later use
+with open('lookup_table.pkl', 'wb') as f:
+ pickle.dump(lookup_table, f)
+
+print("Lookup table saved to 'lookup_table.pkl'.")
diff --git a/examples/tools/hunting-anomalies/gui-lookup-table.py b/examples/tools/hunting-anomalies/gui-lookup-table.py
new file mode 100644
index 00000000..ee2fc43b
--- /dev/null
+++ b/examples/tools/hunting-anomalies/gui-lookup-table.py
@@ -0,0 +1,270 @@
+import os
+import pickle
+import json
+from datetime import datetime
+from polygon import RESTClient
+from polygon.rest.models import Agg
+import http.server
+import socketserver
+import traceback
+from urllib.parse import urlparse, parse_qs
+
+PORT = 8888
+
+# Load the lookup_table
+with open('lookup_table.pkl', 'rb') as f:
+ lookup_table = pickle.load(f)
+
+class handler(http.server.SimpleHTTPRequestHandler):
+ def do_GET(self):
+ # Parse the path and query parameters
+ parsed_path = urlparse(self.path)
+ path = parsed_path.path
+ query_params = parse_qs(parsed_path.query)
+
+ if path == '/':
+ # Handle the root path
+ # Get the date parameter if provided
+ date_param = query_params.get('date', [None])[0]
+
+ # Get all dates from the lookup table
+ all_dates = set()
+ for ticker_data in lookup_table.values():
+ all_dates.update(ticker_data.keys())
+ all_dates = sorted(all_dates)
+
+ # If date is None, get the latest date from the lookup table
+ if date_param is None:
+ if all_dates:
+ latest_date = max(all_dates)
+ else:
+ self.send_response(200)
+ self.send_header("Content-type", "text/html")
+ self.end_headers()
+                    html_content = '<html><body><h1>No data available.</h1></body></html>'
+ self.wfile.write(html_content.encode())
+ return
+ else:
+ latest_date = date_param
+
+ # Ensure latest_date is in all_dates
+ if latest_date not in all_dates:
+ # Handle the case where the provided date is invalid
+ self.send_response(400)
+ self.send_header("Content-type", "text/html")
+ self.end_headers()
+                error_html = f'<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>'
+ self.wfile.write(error_html.encode())
+ return
+
+ # Now, get the anomalies for the latest_date
+ anomalies = []
+ for ticker, date_data in lookup_table.items():
+ if latest_date in date_data:
+ data = date_data[latest_date]
+ trades = data['trades']
+ avg_trades = data['avg_trades']
+ std_trades = data['std_trades']
+ if (
+ avg_trades is not None and
+ std_trades is not None and
+ std_trades > 0
+ ):
+ z_score = (trades - avg_trades) / std_trades
+ threshold_multiplier = 3 # Adjust as needed
+ if z_score > threshold_multiplier:
+ anomalies.append({
+ 'ticker': ticker,
+ 'date': latest_date,
+ 'trades': trades,
+ 'avg_trades': avg_trades,
+ 'std_trades': std_trades,
+ 'z_score': z_score,
+ 'close_price': data['close_price'],
+ 'price_diff': data['price_diff']
+ })
+ # Sort anomalies by trades in descending order
+ anomalies.sort(key=lambda x: x['trades'], reverse=True)
+ # Generate the HTML to display the anomalies
+ self.send_response(200)
+ self.send_header("Content-type", "text/html")
+ self.end_headers()
+ # Build the HTML content
+                html_content = '<html><body><h1>Anomalies for {}</h1>'.format(latest_date)
+                # (The table markup and the /chart detail handler are truncated in this
+                # patch excerpt; only the tail of that handler's error path survives below.)
+                error_html = '{}'.format(str(e))
+ self.wfile.write(error_html.encode())
+ else:
+ # Serve files from the current directory
+ super().do_GET()
+
+def run_server():
+ with socketserver.TCPServer(("", PORT), handler) as httpd:
+ print("serving at port", PORT)
+ try:
+ httpd.serve_forever()
+ except KeyboardInterrupt:
+ print("\nExiting gracefully...")
+ httpd.shutdown()
+ httpd.server_close()
+
+if __name__ == '__main__':
+ run_server()
diff --git a/examples/tools/hunting-anomalies/query-lookup-table.py b/examples/tools/hunting-anomalies/query-lookup-table.py
new file mode 100644
index 00000000..4037a031
--- /dev/null
+++ b/examples/tools/hunting-anomalies/query-lookup-table.py
@@ -0,0 +1,63 @@
+import pickle
+import argparse
+
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description='Anomaly Detection Script')
+parser.add_argument('date', type=str, help='Target date in YYYY-MM-DD format')
+args = parser.parse_args()
+
+# Load the lookup_table
+with open('lookup_table.pkl', 'rb') as f:
+ lookup_table = pickle.load(f)
+
+# Threshold for considering an anomaly (e.g., 3 standard deviations)
+threshold_multiplier = 3
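+# A day is flagged when its trade count exceeds the trailing 5-day average
+# by more than threshold_multiplier standard deviations (i.e., z-score > 3)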
+
+# Date for which we want to find anomalies
+target_date_str = args.date
+
+# List to store anomalies
+anomalies = []
+
+# Iterate over all tickers in the lookup table
+for ticker, date_data in lookup_table.items():
+ if target_date_str in date_data:
+ data = date_data[target_date_str]
+ trades = data['trades']
+ avg_trades = data['avg_trades']
+ std_trades = data['std_trades']
+ if (
+ avg_trades is not None and
+ std_trades is not None and
+ std_trades > 0
+ ):
+ z_score = (trades - avg_trades) / std_trades
+ if z_score > threshold_multiplier:
+ anomalies.append({
+ 'ticker': ticker,
+ 'date': target_date_str,
+ 'trades': trades,
+ 'avg_trades': avg_trades,
+ 'std_trades': std_trades,
+ 'z_score': z_score,
+ 'close_price': data['close_price'],
+ 'price_diff': data['price_diff']
+ })
+
+# Sort anomalies by trades in descending order
+anomalies.sort(key=lambda x: x['trades'], reverse=True)
+
+# Print the anomalies with aligned columns
+print(f"\nAnomalies Found for {target_date_str}:\n")
+print(f"{'Ticker':<10}{'Trades':>10}{'Avg Trades':>15}{'Std Dev':>10}{'Z-score':>10}{'Close Price':>12}{'Price Diff':>12}")
+print("-" * 91)
+for anomaly in anomalies:
+ print(
+ f"{anomaly['ticker']:<10}"
+ f"{anomaly['trades']:>10.0f}"
+ f"{anomaly['avg_trades']:>15.2f}"
+ f"{anomaly['std_trades']:>10.2f}"
+ f"{anomaly['z_score']:>10.2f}"
+ f"{anomaly['close_price']:>12.2f}"
+ f"{anomaly['price_diff']:>12.2f}"
+ )
From ce3a0f1b601007c6cf8b3ea671acec3abd6a1e65 Mon Sep 17 00:00:00 2001
From: justinpolygon <123573436+justinpolygon@users.noreply.github.com>
Date: Mon, 4 Nov 2024 13:37:02 -0800
Subject: [PATCH 2/5] Fix lint
---
.../hunting-anomalies/build-lookup-table.py | 62 +++----
.../hunting-anomalies/gui-lookup-table.py | 162 +++++++++++-------
.../hunting-anomalies/query-lookup-table.py | 46 ++---
3 files changed, 151 insertions(+), 119 deletions(-)
diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py
index c173d58d..a2de6ca8 100644
--- a/examples/tools/hunting-anomalies/build-lookup-table.py
+++ b/examples/tools/hunting-anomalies/build-lookup-table.py
@@ -5,13 +5,13 @@
import json
# Directory containing the daily CSV files
-data_dir = './aggregates_day/'
+data_dir = "./aggregates_day/"
# Initialize a dictionary to hold trades data
trades_data = defaultdict(list)
# List all CSV files in the directory
-files = sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')])
+files = sorted([f for f in os.listdir(data_dir) if f.endswith(".csv")])
print("Starting to process files...")
@@ -22,15 +22,13 @@
df = pd.read_csv(file_path)
# For each stock, store the date and relevant data
for _, row in df.iterrows():
- ticker = row['ticker']
- date = pd.to_datetime(row['window_start'], unit='ns').date()
- trades = row['transactions']
- close_price = row['close'] # Ensure 'close' column exists in your CSV
- trades_data[ticker].append({
- 'date': date,
- 'trades': trades,
- 'close_price': close_price
- })
+ ticker = row["ticker"]
+ date = pd.to_datetime(row["window_start"], unit="ns").date()
+ trades = row["transactions"]
+ close_price = row["close"] # Ensure 'close' column exists in your CSV
+ trades_data[ticker].append(
+ {"date": date, "trades": trades, "close_price": close_price}
+ )
print("Finished processing files.")
print("Building lookup table...")
@@ -42,38 +40,40 @@
# Convert records to DataFrame
df_ticker = pd.DataFrame(records)
# Sort records by date
- df_ticker.sort_values('date', inplace=True)
- df_ticker.set_index('date', inplace=True)
+ df_ticker.sort_values("date", inplace=True)
+ df_ticker.set_index("date", inplace=True)
# Calculate the percentage change in close_price
- df_ticker['price_diff'] = df_ticker['close_price'].pct_change() * 100 # Multiply by 100 for percentage
+ df_ticker["price_diff"] = (
+ df_ticker["close_price"].pct_change() * 100
+ ) # Multiply by 100 for percentage
# Shift trades to exclude the current day from rolling calculations
- df_ticker['trades_shifted'] = df_ticker['trades'].shift(1)
+ df_ticker["trades_shifted"] = df_ticker["trades"].shift(1)
# Calculate rolling average and standard deviation over the previous 5 days
- df_ticker['avg_trades'] = df_ticker['trades_shifted'].rolling(window=5).mean()
- df_ticker['std_trades'] = df_ticker['trades_shifted'].rolling(window=5).std()
+ df_ticker["avg_trades"] = df_ticker["trades_shifted"].rolling(window=5).mean()
+ df_ticker["std_trades"] = df_ticker["trades_shifted"].rolling(window=5).std()
# Store the data in the lookup table
for date, row in df_ticker.iterrows():
# Convert date to string for JSON serialization
- date_str = date.strftime('%Y-%m-%d')
+ date_str = date.strftime("%Y-%m-%d")
# Ensure rolling stats are available
- if pd.notnull(row['avg_trades']) and pd.notnull(row['std_trades']):
+ if pd.notnull(row["avg_trades"]) and pd.notnull(row["std_trades"]):
lookup_table[ticker][date_str] = {
- 'trades': row['trades'],
- 'close_price': row['close_price'],
- 'price_diff': row['price_diff'],
- 'avg_trades': row['avg_trades'],
- 'std_trades': row['std_trades']
+ "trades": row["trades"],
+ "close_price": row["close_price"],
+ "price_diff": row["price_diff"],
+ "avg_trades": row["avg_trades"],
+ "std_trades": row["std_trades"],
}
else:
# Store data without rolling stats if not enough data points
lookup_table[ticker][date_str] = {
- 'trades': row['trades'],
- 'close_price': row['close_price'],
- 'price_diff': row['price_diff'],
- 'avg_trades': None,
- 'std_trades': None
+ "trades": row["trades"],
+ "close_price": row["close_price"],
+ "price_diff": row["price_diff"],
+ "avg_trades": None,
+ "std_trades": None,
}
print("Lookup table built successfully.")
@@ -82,13 +82,13 @@
lookup_table = {k: v for k, v in lookup_table.items()}
# Save the lookup table to a JSON file
-with open('lookup_table.json', 'w') as f:
+with open("lookup_table.json", "w") as f:
json.dump(lookup_table, f, indent=4)
print("Lookup table saved to 'lookup_table.json'.")
# Save the lookup table to a file for later use
-with open('lookup_table.pkl', 'wb') as f:
+with open("lookup_table.pkl", "wb") as f:
pickle.dump(lookup_table, f)
print("Lookup table saved to 'lookup_table.pkl'.")
diff --git a/examples/tools/hunting-anomalies/gui-lookup-table.py b/examples/tools/hunting-anomalies/gui-lookup-table.py
index ee2fc43b..df58746c 100644
--- a/examples/tools/hunting-anomalies/gui-lookup-table.py
+++ b/examples/tools/hunting-anomalies/gui-lookup-table.py
@@ -12,27 +12,28 @@
PORT = 8888
# Load the lookup_table
-with open('lookup_table.pkl', 'rb') as f:
+with open("lookup_table.pkl", "rb") as f:
lookup_table = pickle.load(f)
+
class handler(http.server.SimpleHTTPRequestHandler):
def do_GET(self):
# Parse the path and query parameters
parsed_path = urlparse(self.path)
path = parsed_path.path
query_params = parse_qs(parsed_path.query)
-
- if path == '/':
+
+ if path == "/":
# Handle the root path
# Get the date parameter if provided
- date_param = query_params.get('date', [None])[0]
-
+ date_param = query_params.get("date", [None])[0]
+
# Get all dates from the lookup table
all_dates = set()
for ticker_data in lookup_table.values():
all_dates.update(ticker_data.keys())
all_dates = sorted(all_dates)
-
+
# If date is None, get the latest date from the lookup table
if date_param is None:
if all_dates:
@@ -41,109 +42,131 @@ def do_GET(self):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
-                    html_content = '<html><body><h1>No data available.</h1></body></html>'
+                    html_content = (
+                        "<html><body><h1>No data available.</h1></body></html>"
+                    )
self.wfile.write(html_content.encode())
return
else:
latest_date = date_param
-
+
# Ensure latest_date is in all_dates
if latest_date not in all_dates:
# Handle the case where the provided date is invalid
self.send_response(400)
self.send_header("Content-type", "text/html")
self.end_headers()
-                error_html = f'<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>'
+                error_html = f"<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>"
self.wfile.write(error_html.encode())
return
-
+
# Now, get the anomalies for the latest_date
anomalies = []
for ticker, date_data in lookup_table.items():
if latest_date in date_data:
data = date_data[latest_date]
- trades = data['trades']
- avg_trades = data['avg_trades']
- std_trades = data['std_trades']
+ trades = data["trades"]
+ avg_trades = data["avg_trades"]
+ std_trades = data["std_trades"]
if (
- avg_trades is not None and
- std_trades is not None and
- std_trades > 0
+ avg_trades is not None
+ and std_trades is not None
+ and std_trades > 0
):
z_score = (trades - avg_trades) / std_trades
threshold_multiplier = 3 # Adjust as needed
if z_score > threshold_multiplier:
- anomalies.append({
- 'ticker': ticker,
- 'date': latest_date,
- 'trades': trades,
- 'avg_trades': avg_trades,
- 'std_trades': std_trades,
- 'z_score': z_score,
- 'close_price': data['close_price'],
- 'price_diff': data['price_diff']
- })
+ anomalies.append(
+ {
+ "ticker": ticker,
+ "date": latest_date,
+ "trades": trades,
+ "avg_trades": avg_trades,
+ "std_trades": std_trades,
+ "z_score": z_score,
+ "close_price": data["close_price"],
+ "price_diff": data["price_diff"],
+ }
+ )
# Sort anomalies by trades in descending order
- anomalies.sort(key=lambda x: x['trades'], reverse=True)
+ anomalies.sort(key=lambda x: x["trades"], reverse=True)
# Generate the HTML to display the anomalies
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
# Build the HTML content
-                html_content = '<html><body><h1>Anomalies for {}</h1>'.format(latest_date)
- html_content += '