softwaredevelop · softwaredevelop · Sep 15, 2025 · Sep 15, 2025
diff --git a/data/parquet/.gitkeep b/data/parquet/.gitkeep
diff --git a/notebooks/data_load_and_explore.ipynb b/notebooks/data_load_and_explore.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845bdca6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# filename: data_load_and_explore.ipynb\n",
+    "\n",
+    "# --- 1. Imports and Setup ---\n",
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "\n",
+    "# --- Plotting Configuration ---\n",
+    "%matplotlib inline\n",
+    "plt.style.use('seaborn-v0_8-darkgrid')\n",
+    "print(\"Libraries imported and plotting configured.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ce9338",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 2. Configuration ---\n",
+    "# Nothing is downloaded here: the Parquet files must be copied by hand\n",
+    "# into this project's own 'data/parquet' directory.\n",
+    "\n",
+    "# Name of the file to analyze inside the local data directory\n",
+    "# << COPY THE PARQUET FILE HERE AND UPDATE THE FILENAME >>\n",
+    "DATA_FILENAME = \"EURUSD_M15_2024-09-14_to_2025-09-14.parquet\"\n",
+    "\n",
+    "# The project root is the parent of the current working directory\n",
+    "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
+    "data_dir = os.path.join(project_root, \"data\", \"parquet\")\n",
+    "file_path = os.path.join(data_dir, DATA_FILENAME)\n",
+    "\n",
+    "print(f\"Attempting to load data from local path: {file_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "483c6732",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 3. Data Loading & Initial Inspection ---\n",
+    "# Only the read itself is guarded, so a failure in the inspection code\n",
+    "# below is never misreported as a problem reading the file.\n",
+    "try:\n",
+    "    df = pd.read_parquet(file_path)\n",
+    "except FileNotFoundError:\n",
+    "    print(f\"ERROR: File not found at '{file_path}'.\")\n",
+    "    print(\"Please ensure the 'py-mt5-trader' project has been run and the filename is correct.\")\n",
+    "    # Re-raise so execution stops here instead of failing in a later cell\n",
+    "    # with a NameError because 'df' was never created.\n",
+    "    raise\n",
+    "except Exception as e:\n",
+    "    print(f\"An error occurred while reading the file: {e}\")\n",
+    "    # Same reasoning: show the short message, then keep the full traceback.\n",
+    "    raise\n",
+    "\n",
+    "print(\"Parquet file loaded successfully!\")\n",
+    "\n",
+    "print(\"\\n--- DataFrame Info ---\")\n",
+    "# .info() prints its own summary of columns, data types, and non-null values\n",
+    "df.info()\n",
+    "\n",
+    "print(f\"\\nShape of the data: {df.shape[0]} rows, {df.shape[1]} columns\")\n",
+    "\n",
+    "print(\"\\n--- First 5 Rows (Head) ---\")\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5726d7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 4. Data Preprocessing ---\n",
+    "# For time series analysis, the 'Time' column should be the index.\n",
+    "\n",
+    "print(\"Preprocessing data...\")\n",
+    "# Set the 'Time' column as the DataFrame index.\n",
+    "# Skipped when the index is already 'Time' so the cell can be re-run:\n",
+    "# after the first run 'Time' is no longer a column and set_index would\n",
+    "# raise KeyError. A missing 'Time' column still raises on the first run.\n",
+    "if df.index.name != 'Time':\n",
+    "    df.set_index('Time', inplace=True)\n",
+    "\n",
+    "# Verify that the index is a DatetimeIndex\n",
+    "print(f\"Index type: {type(df.index)}\")\n",
+    "# Check the type before reading .tz, which other index types lack.\n",
+    "is_tz_aware = isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None\n",
+    "print(f\"Is the index timezone-aware? {'Yes' if is_tz_aware else 'No'}\")\n",
+    "\n",
+    "# Check for duplicate timestamps in the index\n",
+    "duplicate_count = df.index.duplicated().sum()\n",
+    "if duplicate_count > 0:\n",
+    "    print(f\"\\nWarning: Found {duplicate_count} duplicate timestamps in the index. Consider handling them.\")\n",
+    "else:\n",
+    "    print(\"\\nNo duplicate timestamps found in the index. Good.\")\n",
+    "\n",
+    "print(\"\\nPreprocessing complete. DataFrame is ready for analysis.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1af3037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 5. Exploratory Data Analysis (EDA) ---\n",
+    "\n",
+    "# Step 1: chart the closing price to see the overall trend\n",
+    "print(\"Plotting closing price...\")\n",
+    "close_prices = df['Close']\n",
+    "close_prices.plot(figsize=(15, 7), title=f'{DATA_FILENAME} - Closing Price')\n",
+    "plt.ylabel('Price')\n",
+    "plt.show()\n",
+    "\n",
+    "# Step 2: bar-to-bar returns and how they are distributed\n",
+    "print(\"\\nAnalyzing price returns...\")\n",
+    "# Percentage change from one bar to the next, with missing values removed\n",
+    "returns = close_prices.pct_change().dropna()\n",
+    "\n",
+    "# Histogram of the returns with a KDE overlay\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "sns.histplot(returns, bins=100, kde=True)\n",
+    "plt.title('Distribution of Price Returns')\n",
+    "plt.xlabel('Return')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()\n",
+    "\n",
+    "# Summary statistics of the return series\n",
+    "mean_return = returns.mean()\n",
+    "volatility = returns.std()\n",
+    "print(f\"Average Return: {mean_return:.6f}\")\n",
+    "print(f\"Standard Deviation (Volatility): {volatility:.6f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "strategy-optimizer_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}