diff --git a/finai_contest/env_stock_trading/env_stock_trading_trading_gym.py b/finai_contest/env_stock_trading/env_stock_trading_trading_gym.py new file mode 100644 index 00000000..b6281268 --- /dev/null +++ b/finai_contest/env_stock_trading/env_stock_trading_trading_gym.py @@ -0,0 +1,553 @@ +# Adapted from Trading_Gym (https://github.com/Yvictor/TradingGym) +# Original author: Yvictor (MIT License) +# Refactored by Xiangyu Ji, 2025 + +from __future__ import annotations + +from typing import Any, Dict, Tuple, List + +import numpy as np +import pandas as pd +import gymnasium as gym +from gymnasium import spaces +from stable_baselines3.common.vec_env import DummyVecEnv + + +class TradingEnvStandardized(gym.Env): + """ + Single-asset trading environment with discrete actions and rolling-window observations. + + ---------------------------------------------------- + Action Space + ---------------------------------------------------- + Discrete(3): + 0 -> Hold (do nothing) + 1 -> Buy (open / increase long) + 2 -> Sell (open / increase short) + + The agent cannot exceed the maximum allowed position `max_position`. + - If a Buy at +max_position is attempted, it becomes Hold. + - If a Sell at -max_position is attempted, it becomes Hold. + + ---------------------------------------------------- + State Space (internal) + ---------------------------------------------------- + Market Data + ----------- + - df_sample + Sub-DataFrame representing one continuous trading session. + - price + 1-D array of prices used for reward & PnL calculations. + - obs_features + 2-D array of selected market features at each time step. + + Position State + -------------- + - posi_arr + 1-D array of position (number of shares/contracts) at each time step. + - posi_variation_arr + 1-D array of position changes due to trades. + - posi_entry_cover_arr + 1-D array encoding type of trade: + +1 -> new entry + +2 -> increase existing position + -1 -> partial cover + -2 -> full close + 0 -> no trade + + Price Statistics + ---------------- + - price_mean_arr + 1-D array of average entry price of current position at each time. + + Reward and P&L + -------------- + - reward_fluctuant_arr + Unrealized P&L: + (price - price_mean_arr) * posi_arr + - reward_makereal_arr + Indicator whether P&L at a step is realized (1) or not (0). + - reward_arr + 1-D array of realized reward per time step. + + Control / Bookkeeping + --------------------- + - step_st + Start index of current observation window. + - info + Auxiliary info (e.g. joined df + transaction log) at episode end. + - transaction_details + DataFrame logging position, PnL, etc. at the end of the episode. + + ---------------------------------------------------- + Observation Space (agent-facing) + ---------------------------------------------------- + The agent sees a rolling window of length `obs_len`. 
+ + Base market window: + obs_state = obs_features[step_st : step_st + obs_len] + shape: (obs_len, feature_len) + + If `return_transaction=False`: + observation = obs_state + + If `return_transaction=True`: + For each timestep in the window, we concatenate: + [market features, + posi_arr, + posi_variation_arr, + posi_entry_cover_arr, + price, + price_mean_arr, + reward_fluctuant_arr, + reward_makereal_arr, + reward_arr] + so the number of columns becomes: + feature_len + 8 + + In both cases, the Gym observation space is a Box with shape: + (obs_len, obs_feature_len) + + ---------------------------------------------------- + Reward (Option A) + ---------------------------------------------------- + At each step, after trades and updates, the environment returns: + + step_reward = obs_reward.sum() + + where `obs_reward` is the slice of `reward_arr` over the current + observation window. (Unrealized reward is not directly returned.) + + ---------------------------------------------------- + Initial Condition (reset) + ---------------------------------------------------- + At reset(): + - df_sample <- random session from full df + - step_st <- 0 + - price <- df_sample[price_name].values + - obs_features <- df_sample[feature_names].values + - posi_arr <- zeros + - posi_variation_arr <- zeros + - posi_entry_cover_arr<- zeros + - price_mean_arr <- copy of price + - reward_fluctuant_arr<- zeros + - reward_makereal_arr <- zeros + - reward_arr <- zeros + - info <- None + - transaction_details <- empty DataFrame + - t_index <- 0 + - First observation window is taken from index 0 and padded if needed. + """ + metadata = {"render_modes": ["human"]} + + def __init__( + self, + df: pd.DataFrame, + obs_len: int, + step_len: int, + fee: float, + max_position: int = 5, + deal_col_name: str = "price", + feature_names: List[str] | None = None, + return_transaction: bool = False, + fluc_div: float = 100.0, + gameover_limit: float | None = None, + ) -> None: + super().__init__() + + assert obs_len >= 1, "obs_len must be >= 1" + assert step_len >= 1, "step_len must be >= 1" + feature_names = feature_names or ["price", "volume"] + + self.df = df.reset_index(drop=True) + self.obs_len = int(obs_len) + self.step_len = int(step_len) + self.fee = float(fee) + self.max_position = int(max_position) + self.price_name = deal_col_name + self.using_feature = feature_names + self.feature_len = len(self.using_feature) + self.return_transaction = return_transaction + self.fluc_div = float(fluc_div) + self.gameover_limit = gameover_limit + + # Precompute session boundaries: + # we assume 'serial_number' marks session start (==0) + if "serial_number" not in self.df.columns: + raise ValueError("DataFrame must contain 'serial_number' column.") + self.begin_fs = self.df[self.df["serial_number"] == 0] + self.date_leng = len(self.begin_fs) + + # ---- Gym spaces ---- + # Actions: 0=Hold, 1=Buy, 2=Sell + self.action_space = spaces.Discrete(3) + + # Observation: rolling window with optional transaction features + self.transaction_feature_len = 8 if self.return_transaction else 0 + self.obs_feature_len = self.feature_len + self.transaction_feature_len + + self.observation_space = spaces.Box( + low=-np.inf, + high=np.inf, + shape=(self.obs_len, self.obs_feature_len), + dtype=np.float32, + ) + + # Internal episode state placeholders + self.df_sample: pd.DataFrame | None = None + self.price: np.ndarray | None = None + self.obs_features: np.ndarray | None = None + + self.posi_arr: np.ndarray | None = None + self.posi_variation_arr: 
np.ndarray | None = None + self.posi_entry_cover_arr: np.ndarray | None = None + self.price_mean_arr: np.ndarray | None = None + self.reward_fluctuant_arr: np.ndarray | None = None + self.reward_makereal_arr: np.ndarray | None = None + self.reward_arr: np.ndarray | None = None + + self.step_st: int = 0 + self.t_index: int = 0 + self.info: Any = None + self.transaction_details: pd.DataFrame = pd.DataFrame() + + # Observation caches + self.obs_state: np.ndarray | None = None + self.obs_posi: np.ndarray | None = None + self.obs_posi_var: np.ndarray | None = None + self.obs_posi_entry_cover: np.ndarray | None = None + self.obs_price: np.ndarray | None = None + self.obs_price_mean: np.ndarray | None = None + self.obs_reward_fluctuant: np.ndarray | None = None + self.obs_makereal: np.ndarray | None = None + self.obs_reward: np.ndarray | None = None + self.obs_return: np.ndarray | None = None + # ------------------------------------------------------------------ + # Session sampler + # ------------------------------------------------------------------ + def _random_choice_section(self) -> pd.DataFrame: + """Randomly choose a session from df.""" + random_int = np.random.randint(self.date_leng) + if random_int == self.date_leng - 1: + begin_point = self.begin_fs.index[random_int] + end_point = None + else: + begin_point, end_point = self.begin_fs.index[random_int : random_int + 2] + df_section = self.df.iloc[begin_point:end_point] + return df_section + + # ------------------------------------------------------------------ + # Gymnasium API + # ------------------------------------------------------------------ + def reset( + self, + *, + seed: int | None = None, + options: Dict[str, Any] | None = None, + ) -> Tuple[np.ndarray, Dict[str, Any]]: + super().reset(seed=seed) + + # Sample one session + self.df_sample = self._random_choice_section() + self.step_st = 0 + + # Define price and features + self.price = self.df_sample[self.price_name].to_numpy(dtype=float) + self.obs_features = self.df_sample[self.using_feature].to_numpy(dtype=float) + + # Position & PnL arrays + self.posi_arr = np.zeros_like(self.price, dtype=float) + self.posi_variation_arr = np.zeros_like(self.posi_arr, dtype=float) + self.posi_entry_cover_arr = np.zeros_like(self.posi_arr, dtype=float) + + self.price_mean_arr = self.price.copy() + self.reward_fluctuant_arr = (self.price - self.price_mean_arr) * self.posi_arr + self.reward_makereal_arr = np.zeros_like(self.posi_arr, dtype=float) + self.reward_arr = np.zeros_like(self.posi_arr, dtype=float) + + self.info = None + self.transaction_details = pd.DataFrame() + self.t_index = 0 + + # Build initial observation window + self._update_obs_window() + + return self.obs_return.astype(np.float32), {} + + def step( + self, action: int + ) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: + """ + Execute one environment step. + + Parameters + ---------- + action : int + 0=Hold, 1=Buy (long), 2=Sell (short). + + Returns + ------- + obs : np.ndarray + Next observation window of shape (obs_len, obs_feature_len). + reward : float + Per-step reward (here: sum of realized rewards over the window). + terminated : bool + True if episode ended (end of session or gameover). + truncated : bool + False (no time-limit truncation implemented). + info : dict + Extra info (contains `info` only at episode end). 
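+
+        Notes
+        -----
+        The ``chg_*`` buffers sliced out below are NumPy views into the
+        episode arrays (``posi_arr``, ``price_mean_arr``, ``reward_arr``,
+        ...), so in-place writes to them update the full-episode
+        bookkeeping that later becomes ``transaction_details``.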
+ """ + assert self.df_sample is not None and self.price is not None + + current_index = self.step_st + self.obs_len - 1 + current_price_mean = float(self.price_mean_arr[current_index]) + current_mkt_position = float(self.posi_arr[current_index]) + + self.t_index += 1 + self.step_st += self.step_len + + # Refresh observation window & change slices + self._update_obs_window() + + # Change buffers: last `step_len` rows of the window + self.chg_posi = self.obs_posi[-self.step_len:] + self.chg_posi_var = self.obs_posi_var[-self.step_len:] + self.chg_posi_entry_cover = self.obs_posi_entry_cover[-self.step_len:] + self.chg_price = self.obs_price[-self.step_len:] + self.chg_price_mean = self.obs_price_mean[-self.step_len:] + self.chg_reward_fluctuant = self.obs_reward_fluctuant[-self.step_len:] + self.chg_makereal = self.obs_makereal[-self.step_len:] + self.chg_reward = self.obs_reward[-self.step_len:] + + terminated = False + + # ----------------- End-of-session forced close ----------------- + if self.step_st + self.obs_len + self.step_len >= len(self.price): + terminated = True + # No new action; force close on last segment if position != 0 + if current_mkt_position != 0: + self.chg_price_mean[:] = current_price_mean + self.chg_posi[:] = 0.0 + self.chg_posi_var[:1] = -current_mkt_position + self.chg_posi_entry_cover[:1] = -2.0 + self.chg_makereal[:1] = 1.0 + # Realized reward for closing all + self.chg_reward[:] = ( + (self.chg_price - self.chg_price_mean) * current_mkt_position + - abs(current_mkt_position) * self.fee + ) * self.chg_makereal + + # Build transaction_details & info at the end + self.transaction_details = pd.DataFrame( + [ + self.posi_arr, + self.posi_variation_arr, + self.posi_entry_cover_arr, + self.price_mean_arr, + self.reward_fluctuant_arr, + self.reward_makereal_arr, + self.reward_arr, + ], + index=[ + "position", + "position_variation", + "entry_cover", + "price_mean", + "reward_fluctuant", + "reward_makereal", + "reward", + ], + columns=self.df_sample.index, + ).T + self.info = self.df_sample.join(self.transaction_details) + + # ----------------- Trading logic (no forced close) -------------- + if not terminated: + # Use next tick inside the change segment as entry price + enter_price = float(self.chg_price[0]) + + # Respect position limits (convert impossible actions to Hold) + if action == 1 and current_mkt_position == self.max_position: + action = 0 + if action == 2 and current_mkt_position == -self.max_position: + action = 0 + + # Long / short / cover / stay + if action == 1 and self.max_position > current_mkt_position >= 0: + # open/increase long + open_posi = (current_mkt_position == 0) + self._long(open_posi, enter_price, current_mkt_position, current_price_mean) + + elif action == 2 and -self.max_position < current_mkt_position <= 0: + # open/increase short + open_posi = (current_mkt_position == 0) + self._short(open_posi, enter_price, current_mkt_position, current_price_mean) + + elif action == 1 and current_mkt_position < 0: + # covering short + self._short_cover(current_price_mean, current_mkt_position) + + elif action == 2 and current_mkt_position > 0: + # covering long + self._long_cover(current_price_mean, current_mkt_position) + + elif action == 0 and current_mkt_position != 0: + # stay on with current position + self._stayon(current_price_mean, current_mkt_position) + + # ----------------- Update unrealized P&L ------------------------ + self.chg_reward_fluctuant[:] = ( + (self.chg_price - self.chg_price_mean) * self.chg_posi + - np.abs(self.chg_posi) * 
self.fee + ) + + # At this point, reward_arr & reward_fluctuant_arr windows are updated + # and obs_return should be recomputed with latest arrays + self._update_obs_window() + + # Here we simply return the sum of reward over the observation window, + # consistent with your current implementation. + step_reward = float(self.obs_reward.sum()) + + # Optional gameover condition, if configured + if self.gameover_limit is not None: + # approximate equity = sum(realized) + last unrealized + current_idx = self.step_st + self.obs_len - 1 + realized_total = float(self.reward_arr[: current_idx + 1].sum()) + unrealized = float(self.reward_fluctuant_arr[current_idx]) + equity = realized_total + unrealized + if equity < -self.gameover_limit: + terminated = True + + return self.obs_return.astype(np.float32), step_reward, terminated, False, ( + {} if self.info is None else {"info": self.info} + ) + + # ------------------------------------------------------------------ + # Trading helpers (array-based) + # ------------------------------------------------------------------ + def _long( + self, + open_posi: bool, + enter_price: float, + current_mkt_position: float, + current_price_mean: float, + ) -> None: + if open_posi: + self.chg_price_mean[:] = enter_price + self.chg_posi[:] = 1.0 + self.chg_posi_var[:1] = 1.0 + self.chg_posi_entry_cover[:1] = 1.0 + else: + after_act_mkt_position = current_mkt_position + 1.0 + self.chg_price_mean[:] = (current_price_mean * current_mkt_position + enter_price) / after_act_mkt_position + self.chg_posi[:] = after_act_mkt_position + self.chg_posi_var[:1] = 1.0 + self.chg_posi_entry_cover[:1] = 2.0 + + def _short( + self, + open_posi: bool, + enter_price: float, + current_mkt_position: float, + current_price_mean: float, + ) -> None: + if open_posi: + self.chg_price_mean[:] = enter_price + self.chg_posi[:] = -1.0 + self.chg_posi_var[:1] = -1.0 + self.chg_posi_entry_cover[:1] = 1.0 + else: + after_act_mkt_position = current_mkt_position - 1.0 + self.chg_price_mean[:] = ( + current_price_mean * abs(current_mkt_position) + enter_price + ) / abs(after_act_mkt_position) + self.chg_posi[:] = after_act_mkt_position + self.chg_posi_var[:1] = -1.0 + self.chg_posi_entry_cover[:1] = 2.0 + + def _short_cover(self, current_price_mean: float, current_mkt_position: float) -> None: + self.chg_price_mean[:] = current_price_mean + self.chg_posi[:] = current_mkt_position + 1.0 + self.chg_makereal[:1] = 1.0 + self.chg_reward[:] = ( + (self.chg_price - self.chg_price_mean) * (-1.0) - self.fee + ) * self.chg_makereal + self.chg_posi_var[:1] = 1.0 + self.chg_posi_entry_cover[:1] = -1.0 + + def _long_cover(self, current_price_mean: float, current_mkt_position: float) -> None: + self.chg_price_mean[:] = current_price_mean + self.chg_posi[:] = current_mkt_position - 1.0 + self.chg_makereal[:1] = 1.0 + self.chg_reward[:] = ( + (self.chg_price - self.chg_price_mean) * (1.0) - self.fee + ) * self.chg_makereal + self.chg_posi_var[:1] = -1.0 + self.chg_posi_entry_cover[:1] = -1.0 + + def _stayon(self, current_price_mean: float, current_mkt_position: float) -> None: + self.chg_posi[:] = current_mkt_position + self.chg_price_mean[:] = current_price_mean + + # ------------------------------------------------------------------ + # Observation helpers + # ------------------------------------------------------------------ + def _update_obs_window(self) -> None: + """Update all obs_* slices and obs_return from current step_st.""" + s = self.step_st + e = self.step_st + self.obs_len + + # Clip window if near the 
end
+        e = min(e, len(self.price))
+
+        self.obs_state = self.obs_features[s:e]
+        self.obs_posi = self.posi_arr[s:e]
+        self.obs_posi_var = self.posi_variation_arr[s:e]
+        self.obs_posi_entry_cover = self.posi_entry_cover_arr[s:e]
+        self.obs_price = self.price[s:e]
+        self.obs_price_mean = self.price_mean_arr[s:e]
+        self.obs_reward_fluctuant = self.reward_fluctuant_arr[s:e]
+        self.obs_makereal = self.reward_makereal_arr[s:e]
+        self.obs_reward = self.reward_arr[s:e]
+
+        if self.return_transaction:
+            self.obs_return = np.concatenate(
+                (
+                    self.obs_state,
+                    self.obs_posi[:, np.newaxis],
+                    self.obs_posi_var[:, np.newaxis],
+                    self.obs_posi_entry_cover[:, np.newaxis],
+                    self.obs_price[:, np.newaxis],
+                    self.obs_price_mean[:, np.newaxis],
+                    self.obs_reward_fluctuant[:, np.newaxis],
+                    self.obs_makereal[:, np.newaxis],
+                    self.obs_reward[:, np.newaxis],
+                ),
+                axis=1,
+            )
+        else:
+            self.obs_return = self.obs_state
+
+        # If we're at the end and the window is shorter than obs_len, pad
+        if self.obs_return.shape[0] < self.obs_len:
+            pad_rows = self.obs_len - self.obs_return.shape[0]
+            pad = np.repeat(self.obs_return[-1:, :], pad_rows, axis=0)
+            self.obs_return = np.concatenate([self.obs_return, pad], axis=0)
+
+    # ------------------------------------------------------------------
+    # Render
+    # ------------------------------------------------------------------
+    def render(self) -> np.ndarray:
+        """Simple render: return the current observation (for debugging)."""
+        return self.obs_return.copy()
+
+    # ------------------------------------------------------------------
+    # SB3 helper
+    # ------------------------------------------------------------------
+    def get_sb_env(self) -> Tuple[DummyVecEnv, Any]:
+        """Wrap this env in an SB3 DummyVecEnv and return (env, obs)."""
+        e = DummyVecEnv([lambda: self])
+        obs = e.reset()
+        return e, obs
diff --git a/finai_contest/tutorial/envs_tutorial/TradingGym_Env_Spec.md b/finai_contest/tutorial/envs_tutorial/TradingGym_Env_Spec.md
new file mode 100644
index 00000000..9c68cff1
--- /dev/null
+++ b/finai_contest/tutorial/envs_tutorial/TradingGym_Env_Spec.md
@@ -0,0 +1,127 @@
+# Standardize the environment
+
+## Action Space
+The action space includes three discrete actions:
+- **0 — Hold**
+- **1 — Buy**
+- **2 — Sell**
+
+Internally, the environment sets:
+```python
+self.action_space = np.array([3,])
+self.gym_actions = range(3)
+```
+
+Action constraints:
+The agent **cannot exceed** the maximum allowed position `max_position`.
+If attempting to buy while already at `+max_position`, the action becomes **hold**.
+If attempting to sell while already at `-max_position`, the action becomes **hold**.
+
+## State Space
+### **Market Data**
+- **Market data segment** (`df_sample`): defines the time range over which the episode is executed.
+- **Stock price** (`price`): used as the reference for all reward and P&L calculations.
+- **Market feature matrix** (`obs_features`): A two-dimensional array containing the selected market features at each time step.
+### **Position State**
+- **Market position** (`posi_arr`): A one-dimensional array representing the agent's position at each time step, measured in number of shares or contracts.
+- **Position variation** (`posi_variation_arr`): records how the position changes due to trading actions, such as increasing or decreasing exposure.
+- **Position entry/cover indicator** (`posi_entry_cover_arr`): A one-dimensional array encoding whether a position change corresponds to an entry or a cover operation; see the toy example below.
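+
+For intuition, the toy example below (hypothetical values, not environment output) shows how the three position arrays relate to one another for a Buy at step 2, a second Buy at step 3, and a Sell at step 4:
+
+```python
+import numpy as np
+
+# Hypothetical five-step episode: open a long (t=2), add to it (t=3), partially cover (t=4).
+posi_arr             = np.array([0., 0., 1., 2., 1.])   # position held at each step
+posi_variation_arr   = np.array([0., 0., 1., 1., -1.])  # signed trade size at each step
+posi_entry_cover_arr = np.array([0., 0., 1., 2., -1.])  # 1=new entry, 2=add, -1=partial cover, -2=full close
+```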
+### **Price Statistics**
+- **Average position price** (`price_mean_arr`): A one-dimensional array representing the average entry price of the current position at each time step.
+### **Reward and Profit & Loss**
+- **Unrealized reward** (`reward_fluctuant_arr`): defined as the difference between the current price and the average position price, multiplied by the current position size.
+- **Realized reward indicator** (`reward_makereal_arr`): A one-dimensional array indicating whether the reward at a given time step is realized.
+- **Realized reward** (`reward_arr`): A one-dimensional array representing the realized profit or loss at each time step.
+### **Control and Bookkeeping**
+- **Step pointer** (`step_st`): An integer indicating the starting index of the current observation window within the trading session.
+- **Additional information** (`info`): A placeholder variable for returning auxiliary information when the episode terminates.
+- **Transaction log** (`transaction_details`): A DataFrame used to record detailed information about all trading actions, positions, and rewards during the episode.
+
+## Observation Space
+`obs_state` - **rolling market window**
+- Shape: `(obs_len, feature_len)`
+- Definition: a slice of `obs_features` starting at `step_st`:
+```python
+obs_state = obs_features[step_st : step_st + obs_len]
+```
+- Each row corresponds to one time step in the recent past.
+- Each column corresponds to one of the chosen `feature_names` (e.g. `price`, `volume`).
+
+## Reward Design
+The environment has two types of reward:
+- **Unrealized (fluctuating) reward**
+
+If the agent does not close a position:
+```python
+reward_ret = reward_fluctuant / fluc_div
+```
+- **Realized reward**
+
+When the agent closes a long or short position ("cover" action):
+```python
+reward = (sell_price - price_mean - fee) * position # long closing
+reward = (price_mean - buy_price - fee) * (-position) # short closing
+```
+This reward is added directly to `reward_sum`.
+
+Notes:
+- Rewards are positive for profitable closes, negative for losses.
+- Unrealized rewards are scaled by `fluc_div` to avoid exploding values.
+
+## Transition Dynamics
+At each environment step, the following steps occur:
+1. **Interpret Agent Action**
+- Action 1 -> attempt to **buy**
+- Action 2 -> attempt to **sell**
+- Action 0 -> **hold**
+
+Position constraints are enforced (the position cannot exceed ±`max_position`; see the sketch after step 3).
+
+2. **Execute Trade Logic**
+
+The environment handles:
+- **Opening** long/short positions (`transact_type = "new"`)
+- **Adding to existing** long/short positions
+- **Closing (covering)** opposite-side positions (`transact_type = "cover"`)
+
+For each trade:
+- Position count is updated
+- Average price (`price_mean`) is recalculated
+- Fees are applied
+- Profit/loss is recorded into `reward_sum`
+- A new row is appended to `transaction_details`
+
+3. **Advance Market Timeline**
+
+After trade evaluation:
+```python
+self.step_st += self.step_len
+```
+The new observation features:
+```python
+self.obs_res = self.obs_features[self.step_st : self.step_st + self.obs_len]
+```
+
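+A minimal, self-contained sketch of steps 1 and 3 follows (the helpers `clamp_action` and `next_window` are illustrative names, not part of the environment; the trade logic of step 2 is omitted):
+
+```python
+import numpy as np
+
+def clamp_action(action: int, position: int, max_position: int) -> int:
+    """Step 1: convert Buy/Sell into Hold when the position limit is already reached."""
+    if action == 1 and position >= max_position:    # cannot buy above +max_position
+        return 0
+    if action == 2 and position <= -max_position:   # cannot sell below -max_position
+        return 0
+    return action
+
+def next_window(obs_features: np.ndarray, step_st: int, obs_len: int, step_len: int):
+    """Step 3: advance the window start and slice the next observation."""
+    step_st += step_len
+    return step_st, obs_features[step_st : step_st + obs_len]
+
+# Toy usage with hypothetical data
+feats = np.random.randn(100, 2)
+step_st, obs = next_window(feats, step_st=0, obs_len=10, step_len=1)
+print(clamp_action(1, position=5, max_position=5), obs.shape)  # -> 0 (10, 2)
+```
+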
+4. **Check Termination Conditions**
+Episode ends if:
+- The observation window reaches the end of the available data (`step_st + obs_len + step_len ≥ total_length`)
+- The agent hits the game-over loss threshold (`reward_sum + reward_fluctuant < -gameover_limit`)
+
+When ending:
+- Any open position is automatically closed
+- The final realized reward is added to `reward_sum`
+
+## Initial Condition
+- **Market data segment** (`df_sample`) is set to a randomly chosen session via `_random_choice_section()`.
+- **Stock price** (`price`) is extracted from the selected session and stored as a NumPy array.
+- **Market feature matrix** (`obs_features`) is extracted from `df_sample` according to `feature_names`.
+- **Market position** (`posi_arr`) is initialized to an array of zeros.
+- **Position variation** (`posi_variation_arr`) is initialized to zeros.
+- **Position entry/cover indicator** (`posi_entry_cover_arr`) is initialized to zeros.
+- **Average position price** (`price_mean_arr`) is initialized as a copy of the stock price array.
+- **Unrealized reward** (`reward_fluctuant_arr`) is initialized to zero for all time steps.
+- **Realized reward indicator** (`reward_makereal_arr`) is initialized to zeros.
+- **Realized reward** (`reward_arr`) is initialized to zeros.
+- **Step pointer** (`step_st`) is set to `0`.
+- **Additional information** (`info`) is initialized to `None`.
+- **Transaction log** (`transaction_details`) is initialized as an empty DataFrame.
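+
+## Example Usage (illustrative)
+
+The sketch below ties the pieces together using the standardized Gymnasium class added in this PR. The import path and the synthetic DataFrame are assumptions for illustration; the constructor arguments, the `serial_number` requirement, and the 5-tuple returned by `step()` follow the class definition above.
+
+```python
+import numpy as np
+import pandas as pd
+
+# Hypothetical import path; adjust to how the package is laid out in your tree.
+from finai_contest.env_stock_trading.env_stock_trading_trading_gym import TradingEnvStandardized
+
+# Toy single-session data: rows with serial_number == 0 mark session starts.
+n = 300
+df = pd.DataFrame({
+    "serial_number": np.arange(n),                      # one session beginning at index 0
+    "price": 100 + np.cumsum(np.random.randn(n)),
+    "volume": np.random.randint(1, 100, size=n).astype(float),
+})
+
+env = TradingEnvStandardized(
+    df=df, obs_len=10, step_len=1, fee=0.1,
+    max_position=5, deal_col_name="price",
+    feature_names=["price", "volume"], return_transaction=True,
+)
+
+obs, info = env.reset(seed=0)
+terminated = False
+while not terminated:
+    action = env.action_space.sample()                  # 0 = Hold, 1 = Buy, 2 = Sell
+    obs, reward, terminated, truncated, info = env.step(action)
+
+print(obs.shape)   # (10, 10): 2 market features + 8 transaction features per time step
+```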