Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added data/parquet/.gitkeep
Empty file.
163 changes: 163 additions & 0 deletions notebooks/data_load_and_explore.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "845bdca6",
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"# filename: data_load_and_explore.ipynb\n",
"\n",
"# --- 1. Imports and Setup ---\n",
"# Standard library\n",
"import os\n",
"\n",
"# Third-party: plotting and data handling\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"# --- Plotting Configuration ---\n",
"# Render figures inline in the notebook output, then apply the\n",
"# 'seaborn-v0_8-darkgrid' matplotlib style sheet to every figure created below.\n",
"%matplotlib inline\n",
"plt.style.use(\"seaborn-v0_8-darkgrid\")\n",
"print(\"Libraries imported and plotting configured.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63ce9338",
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"# --- 2. Configuration ---\n",
"# The Parquet files are not fetched automatically: they have to be copied by hand\n",
"# into this project's own 'data/parquet' directory before the notebook is run.\n",
"\n",
"# Name of the exact file to analyze.\n",
"# << COPY THE PARQUET FILE HERE AND UPDATE THE FILENAME >>\n",
"DATA_FILENAME = \"EURUSD_M15_2024-09-14_to_2025-09-14.parquet\"\n",
"\n",
"# The project root is taken to be the parent of the current working directory\n",
"# (presumably the kernel is started from the 'notebooks' folder - verify if paths look wrong).\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
"data_dir = os.path.join(project_root, \"data\", \"parquet\")\n",
"\n",
"# Full path handed to the loader in the next cell\n",
"file_path = os.path.join(data_dir, DATA_FILENAME)\n",
"\n",
"print(f\"Attempting to load data from local path: {file_path}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "483c6732",
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"# --- 3. Data Loading & Initial Inspection ---\n",
"# Only the read itself is guarded. After printing a hint the error is re-raised so\n",
"# that \"Run All\" stops here rather than failing later with a NameError on `df`\n",
"# (or silently reusing a stale `df` left in the kernel by an earlier run).\n",
"try:\n",
"    df = pd.read_parquet(file_path)\n",
"except FileNotFoundError:\n",
"    print(f\"ERROR: File not found at '{file_path}'.\")\n",
"    print(\n",
"        \"Please ensure the 'py-mt5-trader' project has been run, the Parquet file has been \"\n",
"        f\"copied into '{os.path.dirname(file_path)}', and the filename is correct.\"\n",
"    )\n",
"    raise\n",
"except Exception as e:\n",
"    print(f\"An error occurred while reading the file: {e}\")\n",
"    raise\n",
"\n",
"print(\"Parquet file loaded successfully!\")\n",
"\n",
"print(\"\\n--- DataFrame Info ---\")\n",
"# .info() prints the columns, their dtypes and the non-null counts\n",
"df.info()\n",
"\n",
"print(f\"\\nShape of the data: {df.shape[0]} rows, {df.shape[1]} columns\")\n",
"\n",
"print(\"\\n--- First 5 Rows (Head) ---\")\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5726d7f",
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"# --- 4. Data Preprocessing ---\n",
"# For time series analysis, the 'Time' column should be the index.\n",
"\n",
"print(\"Preprocessing data...\")\n",
"# Set the 'Time' column as the DataFrame index. The guard keeps this cell re-runnable:\n",
"# after the first run 'Time' is already the index and no longer a column, so an\n",
"# unconditional set_index('Time') would raise KeyError on the second execution.\n",
"if 'Time' in df.columns:\n",
"    df = df.set_index('Time')\n",
"elif df.index.name != 'Time':\n",
"    raise KeyError(\"Expected a 'Time' column (or an index named 'Time') in the loaded data.\")\n",
"\n",
"# Verify that the index is a DatetimeIndex. The timezone check below reads `.tz`,\n",
"# which only exists on datetime indexes, so fail early with a clear message instead.\n",
"print(f\"Index type: {type(df.index)}\")\n",
"if not isinstance(df.index, pd.DatetimeIndex):\n",
"    raise TypeError(\n",
"        f\"'Time' index has dtype {df.index.dtype}, not datetime; \"\n",
"        \"convert it (e.g. with pd.to_datetime) before continuing.\"\n",
"    )\n",
"print(f\"Is the index timezone-aware? {'Yes' if df.index.tz is not None else 'No'}\")\n",
"\n",
"# Check for duplicate timestamps in the index\n",
"duplicate_count = df.index.duplicated().sum()\n",
"if duplicate_count > 0:\n",
"    print(f\"\\nWarning: Found {duplicate_count} duplicate timestamps in the index. Consider handling them.\")\n",
"else:\n",
"    print(\"\\nNo duplicate timestamps found in the index. Good.\")\n",
"\n",
"print(\"\\nPreprocessing complete. DataFrame is ready for analysis.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1af3037",
"metadata": {},
"outputs": [],
"source": [
"# %%\n",
"# --- 5. Exploratory Data Analysis (EDA) ---\n",
"\n",
"# 1. Plot the closing price to get a feel for the overall trend\n",
"print(\"Plotting closing price...\")\n",
"fig, ax = plt.subplots(figsize=(15, 7))\n",
"df['Close'].plot(ax=ax, title=f'{DATA_FILENAME} - Closing Price')\n",
"ax.set_ylabel('Price')\n",
"plt.show()\n",
"\n",
"# 2. Calculate and plot the distribution of returns\n",
"print(\"\\nAnalyzing price returns...\")\n",
"# Percentage change between consecutive bars. fill_method=None avoids the deprecated\n",
"# implicit forward-fill (FutureWarning on pandas >= 2.1), which would also turn any\n",
"# missing close into an artificial 0% return; such gaps are dropped instead.\n",
"returns = df['Close'].pct_change(fill_method=None).dropna()\n",
"\n",
"# Plot a histogram of the returns (with a KDE overlay)\n",
"fig, ax = plt.subplots(figsize=(12, 6))\n",
"sns.histplot(returns, bins=100, kde=True, ax=ax)\n",
"ax.set_title('Distribution of Price Returns')\n",
"ax.set_xlabel('Return')\n",
"ax.set_ylabel('Frequency')\n",
"plt.show()\n",
"\n",
"print(f\"Average Return: {returns.mean():.6f}\")\n",
"print(f\"Standard Deviation (Volatility): {returns.std():.6f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "strategy-optimizer_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}