From f6a96b1bd32b6281d17f6eff8f6f48b52ffc081d Mon Sep 17 00:00:00 2001 From: justinpolygon <123573436+justinpolygon@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:34:39 -0800 Subject: [PATCH 1/5] Add Hunting Anomalies in the Stock Market scripts --- examples/tools/hunting-anomalies/README.md | 50 ++++ .../aggregates_day/README.md | 1 + .../hunting-anomalies/build-lookup-table.py | 94 ++++++ .../hunting-anomalies/gui-lookup-table.py | 270 ++++++++++++++++++ .../hunting-anomalies/query-lookup-table.py | 63 ++++ 5 files changed, 478 insertions(+) create mode 100644 examples/tools/hunting-anomalies/README.md create mode 100644 examples/tools/hunting-anomalies/aggregates_day/README.md create mode 100644 examples/tools/hunting-anomalies/build-lookup-table.py create mode 100644 examples/tools/hunting-anomalies/gui-lookup-table.py create mode 100644 examples/tools/hunting-anomalies/query-lookup-table.py diff --git a/examples/tools/hunting-anomalies/README.md b/examples/tools/hunting-anomalies/README.md new file mode 100644 index 00000000..f7bc9aa2 --- /dev/null +++ b/examples/tools/hunting-anomalies/README.md @@ -0,0 +1,50 @@ +# Hunting Anomalies in the Stock Market + +This repository contains all the necessary scripts and data directories used in the [Hunting Anomalies in the Stock Market](https://polygon.io/blog/hunting-anomalies-in-stock-market/) tutorial, hosted on Polygon.io's blog. The tutorial demonstrates how to detect statistical anomalies in historical US stock market data through a comprehensive workflow that involves downloading data, building a lookup table, querying for anomalies, and visualizing them through a web interface. + +### Prerequisites + +- Python 3.8+ +- Access to Polygon.io's historical data via Flat Files +- An active Polygon.io API key, obtainable by signing up for a Stocks paid plan + +### Repository Contents + +- `README.md`: This file, outlining setup and execution instructions. +- `aggregates_day`: Directory where downloaded CSV data files are stored. +- `build-lookup-table.py`: Python script to build a lookup table from the historical data. +- `query-lookup-table.py`: Python script to query the lookup table for anomalies. +- `gui-lookup-table.py`: Python script for a browser-based interface to explore anomalies visually. + +### Running the Tutorial + +1. **Ensure Python 3.8+ is installed:** Check your Python version and ensure all required libraries (polygon-api-client, pandas, pickle, and argparse) are installed. + +2. **Set up your API key:** Make sure you have an active paid Polygon.io Stock subscription for accessing Flat Files. Set up your API key in your environment or directly in the scripts where required. + +3. **Download Historical Data:** Use the MinIO client to download historical stock market data: + ```bash + mc alias set s3polygon https://files.polygon.io YOUR_ACCESS_KEY YOUR_SECRET_KEY + mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/08/ ./aggregates_day/ + mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/09/ ./aggregates_day/ + mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/10/ ./aggregates_day/ + gunzip ./aggregates_day/*.gz + ``` + Adjust the commands and paths based on the data you're interested in. + +4. **Build the Lookup Table:** This script processes the downloaded data and builds a lookup table, saving it as `lookup_table.pkl`. + ```bash + python build-lookup-table.py + ``` + +5. **Query Anomalies:** Replace `2024-10-18` with the date you want to analyze for anomalies. 
+ ```bash + python query-lookup-table.py 2024-10-18 + ``` + +6. **Run the GUI:** Access the web interface at `http://localhost:8888` to explore the anomalies visually. + ```bash + python gui-lookup-table.py + ``` + +For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market). diff --git a/examples/tools/hunting-anomalies/aggregates_day/README.md b/examples/tools/hunting-anomalies/aggregates_day/README.md new file mode 100644 index 00000000..a0ade480 --- /dev/null +++ b/examples/tools/hunting-anomalies/aggregates_day/README.md @@ -0,0 +1 @@ +Download flat files into here. diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py new file mode 100644 index 00000000..c173d58d --- /dev/null +++ b/examples/tools/hunting-anomalies/build-lookup-table.py @@ -0,0 +1,94 @@ +import os +import pandas as pd +from collections import defaultdict +import pickle +import json + +# Directory containing the daily CSV files +data_dir = './aggregates_day/' + +# Initialize a dictionary to hold trades data +trades_data = defaultdict(list) + +# List all CSV files in the directory +files = sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')]) + +print("Starting to process files...") + +# Process each file (assuming files are named in order) +for file in files: + print(f"Processing {file}") + file_path = os.path.join(data_dir, file) + df = pd.read_csv(file_path) + # For each stock, store the date and relevant data + for _, row in df.iterrows(): + ticker = row['ticker'] + date = pd.to_datetime(row['window_start'], unit='ns').date() + trades = row['transactions'] + close_price = row['close'] # Ensure 'close' column exists in your CSV + trades_data[ticker].append({ + 'date': date, + 'trades': trades, + 'close_price': close_price + }) + +print("Finished processing files.") +print("Building lookup table...") + +# Now, build the lookup table with rolling averages and percentage price change +lookup_table = defaultdict(dict) # Nested dict: ticker -> date -> stats + +for ticker, records in trades_data.items(): + # Convert records to DataFrame + df_ticker = pd.DataFrame(records) + # Sort records by date + df_ticker.sort_values('date', inplace=True) + df_ticker.set_index('date', inplace=True) + + # Calculate the percentage change in close_price + df_ticker['price_diff'] = df_ticker['close_price'].pct_change() * 100 # Multiply by 100 for percentage + + # Shift trades to exclude the current day from rolling calculations + df_ticker['trades_shifted'] = df_ticker['trades'].shift(1) + # Calculate rolling average and standard deviation over the previous 5 days + df_ticker['avg_trades'] = df_ticker['trades_shifted'].rolling(window=5).mean() + df_ticker['std_trades'] = df_ticker['trades_shifted'].rolling(window=5).std() + # Store the data in the lookup table + for date, row in df_ticker.iterrows(): + # Convert date to string for JSON serialization + date_str = date.strftime('%Y-%m-%d') + # Ensure rolling stats are available + if pd.notnull(row['avg_trades']) and pd.notnull(row['std_trades']): + lookup_table[ticker][date_str] = { + 'trades': row['trades'], + 'close_price': row['close_price'], + 'price_diff': row['price_diff'], + 'avg_trades': row['avg_trades'], + 'std_trades': row['std_trades'] + } + else: + # Store data without rolling stats if not enough data points + 
lookup_table[ticker][date_str] = { + 'trades': row['trades'], + 'close_price': row['close_price'], + 'price_diff': row['price_diff'], + 'avg_trades': None, + 'std_trades': None + } + +print("Lookup table built successfully.") + +# Convert defaultdict to regular dict for JSON serialization +lookup_table = {k: v for k, v in lookup_table.items()} + +# Save the lookup table to a JSON file +with open('lookup_table.json', 'w') as f: + json.dump(lookup_table, f, indent=4) + +print("Lookup table saved to 'lookup_table.json'.") + +# Save the lookup table to a file for later use +with open('lookup_table.pkl', 'wb') as f: + pickle.dump(lookup_table, f) + +print("Lookup table saved to 'lookup_table.pkl'.") diff --git a/examples/tools/hunting-anomalies/gui-lookup-table.py b/examples/tools/hunting-anomalies/gui-lookup-table.py new file mode 100644 index 00000000..ee2fc43b --- /dev/null +++ b/examples/tools/hunting-anomalies/gui-lookup-table.py @@ -0,0 +1,270 @@ +import os +import pickle +import json +from datetime import datetime +from polygon import RESTClient +from polygon.rest.models import Agg +import http.server +import socketserver +import traceback +from urllib.parse import urlparse, parse_qs + +PORT = 8888 + +# Load the lookup_table +with open('lookup_table.pkl', 'rb') as f: + lookup_table = pickle.load(f) + +class handler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): + # Parse the path and query parameters + parsed_path = urlparse(self.path) + path = parsed_path.path + query_params = parse_qs(parsed_path.query) + + if path == '/': + # Handle the root path + # Get the date parameter if provided + date_param = query_params.get('date', [None])[0] + + # Get all dates from the lookup table + all_dates = set() + for ticker_data in lookup_table.values(): + all_dates.update(ticker_data.keys()) + all_dates = sorted(all_dates) + + # If date is None, get the latest date from the lookup table + if date_param is None: + if all_dates: + latest_date = max(all_dates) + else: + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + html_content = '

<html><body><h1>No data available.</h1></body></html>
' + self.wfile.write(html_content.encode()) + return + else: + latest_date = date_param + + # Ensure latest_date is in all_dates + if latest_date not in all_dates: + # Handle the case where the provided date is invalid + self.send_response(400) + self.send_header("Content-type", "text/html") + self.end_headers() + error_html = f'

<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>
' + self.wfile.write(error_html.encode()) + return + + # Now, get the anomalies for the latest_date + anomalies = [] + for ticker, date_data in lookup_table.items(): + if latest_date in date_data: + data = date_data[latest_date] + trades = data['trades'] + avg_trades = data['avg_trades'] + std_trades = data['std_trades'] + if ( + avg_trades is not None and + std_trades is not None and + std_trades > 0 + ): + z_score = (trades - avg_trades) / std_trades + threshold_multiplier = 3 # Adjust as needed + if z_score > threshold_multiplier: + anomalies.append({ + 'ticker': ticker, + 'date': latest_date, + 'trades': trades, + 'avg_trades': avg_trades, + 'std_trades': std_trades, + 'z_score': z_score, + 'close_price': data['close_price'], + 'price_diff': data['price_diff'] + }) + # Sort anomalies by trades in descending order + anomalies.sort(key=lambda x: x['trades'], reverse=True) + # Generate the HTML to display the anomalies + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + # Build the HTML content + html_content = 'Anomalies for {}'.format(latest_date) + html_content += '

<h1>Anomalies for {}</h1>
'.format(latest_date) + # Add navigation links (prev and next dates) + current_index = all_dates.index(latest_date) + prev_date = all_dates[current_index - 1] if current_index > 0 else None + next_date = all_dates[current_index + 1] if current_index < len(all_dates) - 1 else None + html_content += '

' + if prev_date: + html_content += 'Previous Date '.format(prev_date) + if next_date: + html_content += 'Next Date '.format(next_date) + html_content += '

' + # Display the anomalies in a table + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + html_content += '' + for anomaly in anomalies: + html_content += '' + html_content += ''.format(anomaly['ticker']) + html_content += ''.format(anomaly['trades']) + html_content += ''.format(anomaly['avg_trades']) + html_content += ''.format(anomaly['std_trades']) + html_content += ''.format(anomaly['z_score']) + html_content += ''.format(anomaly['close_price']) + html_content += ''.format(anomaly['price_diff']) + # Add a link to the chart + html_content += ''.format(anomaly['ticker'], latest_date) + html_content += '' + html_content += '
TickerTradesAvg TradesStd DevZ-scoreClose PricePrice DiffChart
{}{}{:.2f}{:.2f}{:.2f}{:.2f}{:.2f}View Chart
' + html_content += '
' + self.wfile.write(html_content.encode()) + elif path == '/chart': + # Handle the chart page + # Get 'ticker' and 'date' from query parameters + ticker = query_params.get('ticker', [None])[0] + date = query_params.get('date', [None])[0] + if ticker is None or date is None: + # Return an error page + self.send_response(400) + self.send_header("Content-type", "text/html") + self.end_headers() + error_html = '

<html><body><h1>Error: Missing ticker or date parameter</h1></body></html>
' + self.wfile.write(error_html.encode()) + else: + # Fetch minute aggregates for the ticker and date + client = RESTClient(trace=True) # POLYGON_API_KEY environment variable is used + try: + aggs = [] + date_from = date + date_to = date + for a in client.list_aggs( + ticker, + 1, + "minute", + date_from, + date_to, + limit=50000, + ): + aggs.append(a) + # Prepare data for the chart + data = [] + for agg in aggs: + if isinstance(agg, Agg) and isinstance(agg.timestamp, int): + new_record = [ + agg.timestamp, + agg.open, + agg.high, + agg.low, + agg.close + ] + data.append(new_record) + # Generate the HTML for the chart page + chart_html = """ + + + + + + + + + + + + +
+ +
+ + + """ % (json.dumps(data), ticker, date, ticker) + self.send_response(200) + self.send_header("Content-type", "text/html") + self.send_header('Access-Control-Allow-Origin', '*') + self.end_headers() + self.wfile.write(chart_html.encode()) + except Exception as e: + # Handle exceptions + self.send_response(500) + self.send_header("Content-type", "text/html") + self.end_headers() + error_html = '

<html><body><h1>Error fetching data: {}</h1></body></html>
'.format(str(e)) + self.wfile.write(error_html.encode()) + else: + # Serve files from the current directory + super().do_GET() + +def run_server(): + with socketserver.TCPServer(("", PORT), handler) as httpd: + print("serving at port", PORT) + try: + httpd.serve_forever() + except KeyboardInterrupt: + print("\nExiting gracefully...") + httpd.shutdown() + httpd.server_close() + +if __name__ == '__main__': + run_server() diff --git a/examples/tools/hunting-anomalies/query-lookup-table.py b/examples/tools/hunting-anomalies/query-lookup-table.py new file mode 100644 index 00000000..4037a031 --- /dev/null +++ b/examples/tools/hunting-anomalies/query-lookup-table.py @@ -0,0 +1,63 @@ +import pickle +import argparse + +# Parse command-line arguments +parser = argparse.ArgumentParser(description='Anomaly Detection Script') +parser.add_argument('date', type=str, help='Target date in YYYY-MM-DD format') +args = parser.parse_args() + +# Load the lookup_table +with open('lookup_table.pkl', 'rb') as f: + lookup_table = pickle.load(f) + +# Threshold for considering an anomaly (e.g., 3 standard deviations) +threshold_multiplier = 3 + +# Date for which we want to find anomalies +target_date_str = args.date + +# List to store anomalies +anomalies = [] + +# Iterate over all tickers in the lookup table +for ticker, date_data in lookup_table.items(): + if target_date_str in date_data: + data = date_data[target_date_str] + trades = data['trades'] + avg_trades = data['avg_trades'] + std_trades = data['std_trades'] + if ( + avg_trades is not None and + std_trades is not None and + std_trades > 0 + ): + z_score = (trades - avg_trades) / std_trades + if z_score > threshold_multiplier: + anomalies.append({ + 'ticker': ticker, + 'date': target_date_str, + 'trades': trades, + 'avg_trades': avg_trades, + 'std_trades': std_trades, + 'z_score': z_score, + 'close_price': data['close_price'], + 'price_diff': data['price_diff'] + }) + +# Sort anomalies by trades in descending order +anomalies.sort(key=lambda x: x['trades'], reverse=True) + +# Print the anomalies with aligned columns +print(f"\nAnomalies Found for {target_date_str}:\n") +print(f"{'Ticker':<10}{'Trades':>10}{'Avg Trades':>15}{'Std Dev':>10}{'Z-score':>10}{'Close Price':>12}{'Price Diff':>12}") +print("-" * 91) +for anomaly in anomalies: + print( + f"{anomaly['ticker']:<10}" + f"{anomaly['trades']:>10.0f}" + f"{anomaly['avg_trades']:>15.2f}" + f"{anomaly['std_trades']:>10.2f}" + f"{anomaly['z_score']:>10.2f}" + f"{anomaly['close_price']:>12.2f}" + f"{anomaly['price_diff']:>12.2f}" + ) From ce3a0f1b601007c6cf8b3ea671acec3abd6a1e65 Mon Sep 17 00:00:00 2001 From: justinpolygon <123573436+justinpolygon@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:37:02 -0800 Subject: [PATCH 2/5] Fix lint --- .../hunting-anomalies/build-lookup-table.py | 62 +++---- .../hunting-anomalies/gui-lookup-table.py | 162 +++++++++++------- .../hunting-anomalies/query-lookup-table.py | 46 ++--- 3 files changed, 151 insertions(+), 119 deletions(-) diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py index c173d58d..a2de6ca8 100644 --- a/examples/tools/hunting-anomalies/build-lookup-table.py +++ b/examples/tools/hunting-anomalies/build-lookup-table.py @@ -5,13 +5,13 @@ import json # Directory containing the daily CSV files -data_dir = './aggregates_day/' +data_dir = "./aggregates_day/" # Initialize a dictionary to hold trades data trades_data = defaultdict(list) # List all CSV files in the directory -files = 
sorted([f for f in os.listdir(data_dir) if f.endswith('.csv')]) +files = sorted([f for f in os.listdir(data_dir) if f.endswith(".csv")]) print("Starting to process files...") @@ -22,15 +22,13 @@ df = pd.read_csv(file_path) # For each stock, store the date and relevant data for _, row in df.iterrows(): - ticker = row['ticker'] - date = pd.to_datetime(row['window_start'], unit='ns').date() - trades = row['transactions'] - close_price = row['close'] # Ensure 'close' column exists in your CSV - trades_data[ticker].append({ - 'date': date, - 'trades': trades, - 'close_price': close_price - }) + ticker = row["ticker"] + date = pd.to_datetime(row["window_start"], unit="ns").date() + trades = row["transactions"] + close_price = row["close"] # Ensure 'close' column exists in your CSV + trades_data[ticker].append( + {"date": date, "trades": trades, "close_price": close_price} + ) print("Finished processing files.") print("Building lookup table...") @@ -42,38 +40,40 @@ # Convert records to DataFrame df_ticker = pd.DataFrame(records) # Sort records by date - df_ticker.sort_values('date', inplace=True) - df_ticker.set_index('date', inplace=True) + df_ticker.sort_values("date", inplace=True) + df_ticker.set_index("date", inplace=True) # Calculate the percentage change in close_price - df_ticker['price_diff'] = df_ticker['close_price'].pct_change() * 100 # Multiply by 100 for percentage + df_ticker["price_diff"] = ( + df_ticker["close_price"].pct_change() * 100 + ) # Multiply by 100 for percentage # Shift trades to exclude the current day from rolling calculations - df_ticker['trades_shifted'] = df_ticker['trades'].shift(1) + df_ticker["trades_shifted"] = df_ticker["trades"].shift(1) # Calculate rolling average and standard deviation over the previous 5 days - df_ticker['avg_trades'] = df_ticker['trades_shifted'].rolling(window=5).mean() - df_ticker['std_trades'] = df_ticker['trades_shifted'].rolling(window=5).std() + df_ticker["avg_trades"] = df_ticker["trades_shifted"].rolling(window=5).mean() + df_ticker["std_trades"] = df_ticker["trades_shifted"].rolling(window=5).std() # Store the data in the lookup table for date, row in df_ticker.iterrows(): # Convert date to string for JSON serialization - date_str = date.strftime('%Y-%m-%d') + date_str = date.strftime("%Y-%m-%d") # Ensure rolling stats are available - if pd.notnull(row['avg_trades']) and pd.notnull(row['std_trades']): + if pd.notnull(row["avg_trades"]) and pd.notnull(row["std_trades"]): lookup_table[ticker][date_str] = { - 'trades': row['trades'], - 'close_price': row['close_price'], - 'price_diff': row['price_diff'], - 'avg_trades': row['avg_trades'], - 'std_trades': row['std_trades'] + "trades": row["trades"], + "close_price": row["close_price"], + "price_diff": row["price_diff"], + "avg_trades": row["avg_trades"], + "std_trades": row["std_trades"], } else: # Store data without rolling stats if not enough data points lookup_table[ticker][date_str] = { - 'trades': row['trades'], - 'close_price': row['close_price'], - 'price_diff': row['price_diff'], - 'avg_trades': None, - 'std_trades': None + "trades": row["trades"], + "close_price": row["close_price"], + "price_diff": row["price_diff"], + "avg_trades": None, + "std_trades": None, } print("Lookup table built successfully.") @@ -82,13 +82,13 @@ lookup_table = {k: v for k, v in lookup_table.items()} # Save the lookup table to a JSON file -with open('lookup_table.json', 'w') as f: +with open("lookup_table.json", "w") as f: json.dump(lookup_table, f, indent=4) print("Lookup table saved to 
'lookup_table.json'.") # Save the lookup table to a file for later use -with open('lookup_table.pkl', 'wb') as f: +with open("lookup_table.pkl", "wb") as f: pickle.dump(lookup_table, f) print("Lookup table saved to 'lookup_table.pkl'.") diff --git a/examples/tools/hunting-anomalies/gui-lookup-table.py b/examples/tools/hunting-anomalies/gui-lookup-table.py index ee2fc43b..df58746c 100644 --- a/examples/tools/hunting-anomalies/gui-lookup-table.py +++ b/examples/tools/hunting-anomalies/gui-lookup-table.py @@ -12,27 +12,28 @@ PORT = 8888 # Load the lookup_table -with open('lookup_table.pkl', 'rb') as f: +with open("lookup_table.pkl", "rb") as f: lookup_table = pickle.load(f) + class handler(http.server.SimpleHTTPRequestHandler): def do_GET(self): # Parse the path and query parameters parsed_path = urlparse(self.path) path = parsed_path.path query_params = parse_qs(parsed_path.query) - - if path == '/': + + if path == "/": # Handle the root path # Get the date parameter if provided - date_param = query_params.get('date', [None])[0] - + date_param = query_params.get("date", [None])[0] + # Get all dates from the lookup table all_dates = set() for ticker_data in lookup_table.values(): all_dates.update(ticker_data.keys()) all_dates = sorted(all_dates) - + # If date is None, get the latest date from the lookup table if date_param is None: if all_dates: @@ -41,109 +42,131 @@ def do_GET(self): self.send_response(200) self.send_header("Content-type", "text/html") self.end_headers() - html_content = '

<html><body><h1>No data available.</h1></body></html>
' + html_content = ( + "

<html><body><h1>No data available.</h1></body></html>
" + ) self.wfile.write(html_content.encode()) return else: latest_date = date_param - + # Ensure latest_date is in all_dates if latest_date not in all_dates: # Handle the case where the provided date is invalid self.send_response(400) self.send_header("Content-type", "text/html") self.end_headers() - error_html = f'

<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>
' + error_html = f"

<html><body><h1>Error: No data available for date {latest_date}</h1></body></html>
" self.wfile.write(error_html.encode()) return - + # Now, get the anomalies for the latest_date anomalies = [] for ticker, date_data in lookup_table.items(): if latest_date in date_data: data = date_data[latest_date] - trades = data['trades'] - avg_trades = data['avg_trades'] - std_trades = data['std_trades'] + trades = data["trades"] + avg_trades = data["avg_trades"] + std_trades = data["std_trades"] if ( - avg_trades is not None and - std_trades is not None and - std_trades > 0 + avg_trades is not None + and std_trades is not None + and std_trades > 0 ): z_score = (trades - avg_trades) / std_trades threshold_multiplier = 3 # Adjust as needed if z_score > threshold_multiplier: - anomalies.append({ - 'ticker': ticker, - 'date': latest_date, - 'trades': trades, - 'avg_trades': avg_trades, - 'std_trades': std_trades, - 'z_score': z_score, - 'close_price': data['close_price'], - 'price_diff': data['price_diff'] - }) + anomalies.append( + { + "ticker": ticker, + "date": latest_date, + "trades": trades, + "avg_trades": avg_trades, + "std_trades": std_trades, + "z_score": z_score, + "close_price": data["close_price"], + "price_diff": data["price_diff"], + } + ) # Sort anomalies by trades in descending order - anomalies.sort(key=lambda x: x['trades'], reverse=True) + anomalies.sort(key=lambda x: x["trades"], reverse=True) # Generate the HTML to display the anomalies self.send_response(200) self.send_header("Content-type", "text/html") self.end_headers() # Build the HTML content - html_content = 'Anomalies for {}'.format(latest_date) - html_content += '

<h1>Anomalies for {}</h1>
'.format(latest_date) + html_content = 'Anomalies for {}'.format( + latest_date + ) + html_content += '

<h1>Anomalies for {}</h1>
'.format( + latest_date + ) # Add navigation links (prev and next dates) current_index = all_dates.index(latest_date) prev_date = all_dates[current_index - 1] if current_index > 0 else None - next_date = all_dates[current_index + 1] if current_index < len(all_dates) - 1 else None - html_content += '

' + next_date = ( + all_dates[current_index + 1] + if current_index < len(all_dates) - 1 + else None + ) + html_content += "

" if prev_date: - html_content += 'Previous Date '.format(prev_date) + html_content += 'Previous Date '.format( + prev_date + ) if next_date: html_content += 'Next Date '.format(next_date) - html_content += '

' + html_content += "

" # Display the anomalies in a table - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' - html_content += '' + html_content += ( + '
TickerTradesAvg TradesStd DevZ-scoreClose PricePrice DiffChart
' + ) + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" + html_content += "" for anomaly in anomalies: - html_content += '' - html_content += ''.format(anomaly['ticker']) - html_content += ''.format(anomaly['trades']) - html_content += ''.format(anomaly['avg_trades']) - html_content += ''.format(anomaly['std_trades']) - html_content += ''.format(anomaly['z_score']) - html_content += ''.format(anomaly['close_price']) - html_content += ''.format(anomaly['price_diff']) + html_content += "" + html_content += "".format(anomaly["ticker"]) + html_content += "".format(anomaly["trades"]) + html_content += "".format(anomaly["avg_trades"]) + html_content += "".format(anomaly["std_trades"]) + html_content += "".format(anomaly["z_score"]) + html_content += "".format(anomaly["close_price"]) + html_content += "".format(anomaly["price_diff"]) # Add a link to the chart - html_content += ''.format(anomaly['ticker'], latest_date) - html_content += '' + html_content += ( + ''.format( + anomaly["ticker"], latest_date + ) + ) + html_content += "" html_content += '
TickerTradesAvg TradesStd DevZ-scoreClose PricePrice DiffChart
{}{}{:.2f}{:.2f}{:.2f}{:.2f}{:.2f}
{}{}{:.2f}{:.2f}{:.2f}{:.2f}{:.2f}View Chart
View Chart
' - html_content += '
' + html_content += "
" self.wfile.write(html_content.encode()) - elif path == '/chart': + elif path == "/chart": # Handle the chart page # Get 'ticker' and 'date' from query parameters - ticker = query_params.get('ticker', [None])[0] - date = query_params.get('date', [None])[0] + ticker = query_params.get("ticker", [None])[0] + date = query_params.get("date", [None])[0] if ticker is None or date is None: # Return an error page self.send_response(400) self.send_header("Content-type", "text/html") self.end_headers() - error_html = '

<html><body><h1>Error: Missing ticker or date parameter</h1></body></html>
' + error_html = "

<html><body><h1>Error: Missing ticker or date parameter</h1></body></html>
" self.wfile.write(error_html.encode()) else: # Fetch minute aggregates for the ticker and date - client = RESTClient(trace=True) # POLYGON_API_KEY environment variable is used + client = RESTClient( + trace=True + ) # POLYGON_API_KEY environment variable is used try: aggs = [] date_from = date @@ -166,7 +189,7 @@ def do_GET(self): agg.open, agg.high, agg.low, - agg.close + agg.close, ] data.append(new_record) # Generate the HTML for the chart page @@ -239,10 +262,15 @@ def do_GET(self): - """ % (json.dumps(data), ticker, date, ticker) + """ % ( + json.dumps(data), + ticker, + date, + ticker, + ) self.send_response(200) self.send_header("Content-type", "text/html") - self.send_header('Access-Control-Allow-Origin', '*') + self.send_header("Access-Control-Allow-Origin", "*") self.end_headers() self.wfile.write(chart_html.encode()) except Exception as e: @@ -250,12 +278,15 @@ def do_GET(self): self.send_response(500) self.send_header("Content-type", "text/html") self.end_headers() - error_html = '

<html><body><h1>Error fetching data: {}</h1></body></html>
'.format(str(e)) + error_html = "

<html><body><h1>Error fetching data: {}</h1></body></html>
".format( + str(e) + ) self.wfile.write(error_html.encode()) else: # Serve files from the current directory super().do_GET() + def run_server(): with socketserver.TCPServer(("", PORT), handler) as httpd: print("serving at port", PORT) @@ -266,5 +297,6 @@ def run_server(): httpd.shutdown() httpd.server_close() -if __name__ == '__main__': + +if __name__ == "__main__": run_server() diff --git a/examples/tools/hunting-anomalies/query-lookup-table.py b/examples/tools/hunting-anomalies/query-lookup-table.py index 4037a031..38bb86cf 100644 --- a/examples/tools/hunting-anomalies/query-lookup-table.py +++ b/examples/tools/hunting-anomalies/query-lookup-table.py @@ -2,12 +2,12 @@ import argparse # Parse command-line arguments -parser = argparse.ArgumentParser(description='Anomaly Detection Script') -parser.add_argument('date', type=str, help='Target date in YYYY-MM-DD format') +parser = argparse.ArgumentParser(description="Anomaly Detection Script") +parser.add_argument("date", type=str, help="Target date in YYYY-MM-DD format") args = parser.parse_args() # Load the lookup_table -with open('lookup_table.pkl', 'rb') as f: +with open("lookup_table.pkl", "rb") as f: lookup_table = pickle.load(f) # Threshold for considering an anomaly (e.g., 3 standard deviations) @@ -23,33 +23,33 @@ for ticker, date_data in lookup_table.items(): if target_date_str in date_data: data = date_data[target_date_str] - trades = data['trades'] - avg_trades = data['avg_trades'] - std_trades = data['std_trades'] - if ( - avg_trades is not None and - std_trades is not None and - std_trades > 0 - ): + trades = data["trades"] + avg_trades = data["avg_trades"] + std_trades = data["std_trades"] + if avg_trades is not None and std_trades is not None and std_trades > 0: z_score = (trades - avg_trades) / std_trades if z_score > threshold_multiplier: - anomalies.append({ - 'ticker': ticker, - 'date': target_date_str, - 'trades': trades, - 'avg_trades': avg_trades, - 'std_trades': std_trades, - 'z_score': z_score, - 'close_price': data['close_price'], - 'price_diff': data['price_diff'] - }) + anomalies.append( + { + "ticker": ticker, + "date": target_date_str, + "trades": trades, + "avg_trades": avg_trades, + "std_trades": std_trades, + "z_score": z_score, + "close_price": data["close_price"], + "price_diff": data["price_diff"], + } + ) # Sort anomalies by trades in descending order -anomalies.sort(key=lambda x: x['trades'], reverse=True) +anomalies.sort(key=lambda x: x["trades"], reverse=True) # Print the anomalies with aligned columns print(f"\nAnomalies Found for {target_date_str}:\n") -print(f"{'Ticker':<10}{'Trades':>10}{'Avg Trades':>15}{'Std Dev':>10}{'Z-score':>10}{'Close Price':>12}{'Price Diff':>12}") +print( + f"{'Ticker':<10}{'Trades':>10}{'Avg Trades':>15}{'Std Dev':>10}{'Z-score':>10}{'Close Price':>12}{'Price Diff':>12}" +) print("-" * 91) for anomaly in anomalies: print( From c873d63f8a398deaa4bb5af704dd1d969b250474 Mon Sep 17 00:00:00 2001 From: justinpolygon <123573436+justinpolygon@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:44:46 -0800 Subject: [PATCH 3/5] Ignore type and fix typo --- examples/tools/hunting-anomalies/README.md | 5 ++--- examples/tools/hunting-anomalies/build-lookup-table.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/tools/hunting-anomalies/README.md b/examples/tools/hunting-anomalies/README.md index f7bc9aa2..4b36f1b5 100644 --- a/examples/tools/hunting-anomalies/README.md +++ b/examples/tools/hunting-anomalies/README.md @@ -22,7 +22,7 @@ This repository 
contains all the necessary scripts and data directories used in 2. **Set up your API key:** Make sure you have an active paid Polygon.io Stock subscription for accessing Flat Files. Set up your API key in your environment or directly in the scripts where required. -3. **Download Historical Data:** Use the MinIO client to download historical stock market data: +3. **Download Historical Data:** Use the MinIO client to download historical stock market data. Adjust the commands and paths based on the data you are interested in. ```bash mc alias set s3polygon https://files.polygon.io YOUR_ACCESS_KEY YOUR_SECRET_KEY mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/08/ ./aggregates_day/ @@ -30,7 +30,6 @@ This repository contains all the necessary scripts and data directories used in mc cp --recursive s3polygon/flatfiles/us_stocks_sip/day_aggs_v1/2024/10/ ./aggregates_day/ gunzip ./aggregates_day/*.gz ``` - Adjust the commands and paths based on the data you're interested in. 4. **Build the Lookup Table:** This script processes the downloaded data and builds a lookup table, saving it as `lookup_table.pkl`. ```bash @@ -47,4 +46,4 @@ This repository contains all the necessary scripts and data directories used in python gui-lookup-table.py ``` -For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market). +For a complete step-by-step guide on each phase of the anomaly detection process, including additional configurations and troubleshooting, refer to the detailed [tutorial on our blog](https://polygon.io/blog/hunting-anomalies-in-stock-market/). diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py index a2de6ca8..84b88f3d 100644 --- a/examples/tools/hunting-anomalies/build-lookup-table.py +++ b/examples/tools/hunting-anomalies/build-lookup-table.py @@ -1,5 +1,5 @@ import os -import pandas as pd +import pandas as pd # type: ignore from collections import defaultdict import pickle import json From 8c35454db9256e5e41882634ed373eafc357346c Mon Sep 17 00:00:00 2001 From: justinpolygon <123573436+justinpolygon@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:52:17 -0800 Subject: [PATCH 4/5] Fix type def from linter --- .../tools/hunting-anomalies/build-lookup-table.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py index 84b88f3d..7576c131 100644 --- a/examples/tools/hunting-anomalies/build-lookup-table.py +++ b/examples/tools/hunting-anomalies/build-lookup-table.py @@ -1,8 +1,9 @@ import os -import pandas as pd # type: ignore +import pandas as pd # type: ignore from collections import defaultdict import pickle import json +from typing import DefaultDict, Dict, Any, BinaryIO # Directory containing the daily CSV files data_dir = "./aggregates_day/" @@ -34,7 +35,9 @@ print("Building lookup table...") # Now, build the lookup table with rolling averages and percentage price change -lookup_table = defaultdict(dict) # Nested dict: ticker -> date -> stats +lookup_table: DefaultDict[str, Dict[str, Any]] = defaultdict( + dict +) # Nested dict: ticker -> date -> stats for ticker, records in trades_data.items(): # Convert records to DataFrame @@ -79,16 +82,16 @@ print("Lookup table built 
successfully.") # Convert defaultdict to regular dict for JSON serialization -lookup_table = {k: v for k, v in lookup_table.items()} +lookup_table_dict = {k: v for k, v in lookup_table.items()} # Save the lookup table to a JSON file with open("lookup_table.json", "w") as f: - json.dump(lookup_table, f, indent=4) + json.dump(lookup_table_dict, f, indent=4) print("Lookup table saved to 'lookup_table.json'.") # Save the lookup table to a file for later use -with open("lookup_table.pkl", "wb") as f: - pickle.dump(lookup_table, f) +with open("lookup_table.pkl", "wb") as f: # type: BinaryIO + pickle.dump(lookup_table_dict, f) print("Lookup table saved to 'lookup_table.pkl'.") From 8bd08f84d84045daf45325d2c258d37259815229 Mon Sep 17 00:00:00 2001 From: justinpolygon <123573436+justinpolygon@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:55:16 -0800 Subject: [PATCH 5/5] Removed json dump --- examples/tools/hunting-anomalies/build-lookup-table.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/tools/hunting-anomalies/build-lookup-table.py b/examples/tools/hunting-anomalies/build-lookup-table.py index 7576c131..16abca2d 100644 --- a/examples/tools/hunting-anomalies/build-lookup-table.py +++ b/examples/tools/hunting-anomalies/build-lookup-table.py @@ -84,12 +84,6 @@ # Convert defaultdict to regular dict for JSON serialization lookup_table_dict = {k: v for k, v in lookup_table.items()} -# Save the lookup table to a JSON file -with open("lookup_table.json", "w") as f: - json.dump(lookup_table_dict, f, indent=4) - -print("Lookup table saved to 'lookup_table.json'.") - # Save the lookup table to a file for later use with open("lookup_table.pkl", "wb") as f: # type: BinaryIO pickle.dump(lookup_table_dict, f)
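
After the final patch, `build-lookup-table.py` writes only `lookup_table.pkl`. As a quick sanity check of that artifact, the sketch below loads the pickle and recomputes the z-score for one ticker and date, mirroring the threshold logic used in `query-lookup-table.py` and `gui-lookup-table.py`. It is illustrative only: the ticker `AAPL` is a placeholder, the date reuses the example date from the README, and the pickle file is assumed to be in the working directory.

```python
import pickle

# Load the lookup table produced by build-lookup-table.py
# (nested dict: ticker -> "YYYY-MM-DD" -> stats).
with open("lookup_table.pkl", "rb") as f:
    lookup_table = pickle.load(f)

ticker = "AAPL"      # placeholder ticker
date = "2024-10-18"  # example date from the README

stats = lookup_table.get(ticker, {}).get(date)
if stats and stats["avg_trades"] is not None and stats["std_trades"]:
    # Same criterion as the query/GUI scripts: z-score of the day's trade
    # count against the rolling 5-day stats of the preceding days.
    z_score = (stats["trades"] - stats["avg_trades"]) / stats["std_trades"]
    print(f"{ticker} {date}: trades={stats['trades']}, z-score={z_score:.2f}")
else:
    print("No entry (or not enough history) for that ticker/date.")
```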