diff --git a/data/parquet/.gitkeep b/data/parquet/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/data_load_and_explore.ipynb b/notebooks/data_load_and_explore.ipynb
new file mode 100644
index 0000000..ede08d5
--- /dev/null
+++ b/notebooks/data_load_and_explore.ipynb
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "845bdca6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# filename: data_load_and_explore.ipynb\n",
+    "\n",
+    "# --- 1. Imports and Setup ---\n",
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "\n",
+    "# --- Plotting Configuration ---\n",
+    "%matplotlib inline\n",
+    "plt.style.use('seaborn-v0_8-darkgrid')\n",
+    "print(\"Libraries imported and plotting configured.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63ce9338",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 2. Configuration ---\n",
+    "# This project expects the necessary Parquet files to be manually placed\n",
+    "# in its own 'data/parquet' directory.\n",
+    "\n",
+    "# Construct the path to the local data directory.\n",
+    "# NOTE: the project root is taken as the parent of the current working\n",
+    "# directory, so this assumes the kernel runs with 'notebooks/' as its cwd.\n",
+    "project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
+    "data_dir = os.path.join(project_root, \"data\", \"parquet\")\n",
+    "\n",
+    "# Specify the exact file you want to analyze from the local directory\n",
+    "# << COPY THE PARQUET FILE HERE AND UPDATE THE FILENAME >>\n",
+    "DATA_FILENAME = \"EURUSD_M15_2024-09-14_to_2025-09-14.parquet\"\n",
+    "file_path = os.path.join(data_dir, DATA_FILENAME)\n",
+    "\n",
+    "print(f\"Attempting to load data from local path: {file_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "483c6732",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 3. Data Loading & Initial Inspection ---\n",
+    "# Load failures are reported and then re-raised, so that a full notebook run\n",
+    "# stops here instead of failing later on a missing (or stale) `df`.\n",
+    "try:\n",
+    "    df = pd.read_parquet(file_path)\n",
+    "except FileNotFoundError:\n",
+    "    print(f\"ERROR: File not found at '{file_path}'.\")\n",
+    "    print(f\"Please ensure the 'py-mt5-trader' project has been run, the file copied into '{data_dir}', and the filename is correct.\")\n",
+    "    raise\n",
+    "except Exception as e:\n",
+    "    print(f\"An error occurred while reading the file: {e}\")\n",
+    "    raise\n",
+    "\n",
+    "print(\"Parquet file loaded successfully!\")\n",
+    "\n",
+    "print(\"\\n--- DataFrame Info ---\")\n",
+    "# .info() gives a great overview of columns, data types, and non-null values\n",
+    "df.info()\n",
+    "\n",
+    "print(f\"\\nShape of the data: {df.shape[0]} rows, {df.shape[1]} columns\")\n",
+    "\n",
+    "print(\"\\n--- First 5 Rows (Head) ---\")\n",
+    "print(df.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5726d7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 4. Data Preprocessing ---\n",
+    "# For time series analysis, the 'Time' column should be the index.\n",
+    "\n",
+    "print(\"Preprocessing data...\")\n",
+    "# Set the 'Time' column as the DataFrame index. The guard keeps this cell\n",
+    "# re-runnable: after the first run 'Time' is the index, no longer a column.\n",
+    "if df.index.name != 'Time':\n",
+    "    df.set_index('Time', inplace=True)\n",
+    "\n",
+    "# Report the index type (a DatetimeIndex is expected) and its timezone, if any.\n",
+    "# getattr() is used because a non-datetime index has no `.tz` attribute.\n",
+    "index_tz = getattr(df.index, 'tz', None)\n",
+    "print(f\"Index type: {type(df.index)}\")\n",
+    "print(f\"Is the index timezone-aware? {'Yes' if index_tz is not None else 'No'}\")\n",
+    "\n",
+    "# Check for duplicate timestamps in the index\n",
+    "duplicate_count = df.index.duplicated().sum()\n",
+    "if duplicate_count > 0:\n",
+    "    print(f\"\\nWarning: Found {duplicate_count} duplicate timestamps in the index. Consider handling them.\")\n",
+    "else:\n",
+    "    print(\"\\nNo duplicate timestamps found in the index. Good.\")\n",
+    "\n",
+    "print(\"\\nPreprocessing complete. DataFrame is ready for analysis.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1af3037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%\n",
+    "# --- 5. Exploratory Data Analysis (EDA) ---\n",
+    "\n",
+    "# 1. Plot the closing price to get a feel for the overall trend\n",
+    "print(\"Plotting closing price...\")\n",
+    "df['Close'].plot(figsize=(15, 7), title=f'{DATA_FILENAME} - Closing Price')\n",
+    "plt.ylabel('Price')\n",
+    "plt.show()\n",
+    "\n",
+    "# 2. Calculate and plot the distribution of returns\n",
+    "print(\"\\nAnalyzing price returns...\")\n",
+    "# Simple return between consecutive bars (pct_change yields a fraction, not a percent)\n",
+    "returns = df['Close'].pct_change().dropna()\n",
+    "\n",
+    "# Plot a histogram of the returns\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "sns.histplot(returns, bins=100, kde=True)\n",
+    "plt.title('Distribution of Price Returns')\n",
+    "plt.xlabel('Return')\n",
+    "plt.ylabel('Frequency')\n",
+    "plt.show()\n",
+    "\n",
+    "print(f\"Average Return: {returns.mean():.6f}\")\n",
+    "print(f\"Standard Deviation (Volatility): {returns.std():.6f}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "strategy-optimizer_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}