Smooth target env

jsuarez5341 · jsuarez5341 · commit ab45d0ebf63d · 2025-05-21T19:35:45.000Z
diff --git a/pufferlib/config/ocean/target.ini b/pufferlib/config/ocean/target.ini
@@ -10,8 +10,9 @@ num_agents = 8
 num_goals = 4
 
 [train]
-total_timesteps = 500_000_000
+total_timesteps = 100_000_000
 gamma = 0.99
-learning_rate = 0.0015
+learning_rate = 0.015
 minibatch_size = 32768
 ent_coef = 0.02
+
diff --git a/pufferlib/ocean/target/target.c b/pufferlib/ocean/target/target.c
@@ -22,8 +22,8 @@ int main() {
         .num_goals = 8
     };
     init(&env);
-    env.observations = calloc(env.num_agents*(2*(env.num_agents + env.num_goals)+1), sizeof(float));
-    env.actions = calloc(2*env.num_agents, sizeof(float));
+    env.observations = calloc(env.num_agents*(2*(env.num_agents + env.num_goals)+4), sizeof(float));
+    env.actions = calloc(2*env.num_agents, sizeof(int));
     env.rewards = calloc(env.num_agents, sizeof(float));
     env.terminals = calloc(env.num_agents, sizeof(unsigned char));
 
@@ -34,7 +34,8 @@ int main() {
             env.actions[0] = 0;
         } else {
             for (int i=0; i<env.num_agents; i++) {
-                env.actions[i] = (float)rand()/(float)RAND_MAX - 0.5f ;
+                env.actions[2*i] = rand() % 9;
+                env.actions[2*i + 1] = rand() % 5;
             }
             //forward_linearlstm(net, env.observations, env.actions);
         }
diff --git a/pufferlib/ocean/target/target.h b/pufferlib/ocean/target/target.h
@@ -4,7 +4,7 @@
 #include <stdio.h>
 #include "raylib.h"
 
-const float VELOCITY = 20.0f;
+const float MAX_SPEED = 20.0f;
 
 typedef struct {
     float perf;
@@ -23,6 +23,7 @@ typedef struct {
     float x;
     float y;
     float heading;
+    float speed;
     int ticks_since_reward;
 } Agent;
 
@@ -37,7 +38,7 @@ typedef struct {
     Agent* agents;
     Goal* goals;
     float* observations;
-    float* actions;
+    int* actions;
     float* rewards;
     unsigned char* terminals;
     int width;
@@ -113,21 +114,29 @@ void c_step(Target* env) {
         env->rewards[i] = 0;
         Agent* agent = &env->agents[i];
         agent->ticks_since_reward += 1;
-        agent->heading += env->actions[i];
+
+        agent->heading += ((float)env->actions[2*i] - 4.0f)/12.0f;
         if (agent->heading < 0) {
             agent->heading += 2*PI;
         } else if (agent->heading > 2*PI) {
             agent->heading -= 2*PI;
         }
 
-        agent->x += VELOCITY*cosf(agent->heading);
+        agent->speed += 1.0f*((float)env->actions[2*i + 1] - 2.0f);
+        if (agent->speed > MAX_SPEED) {
+            agent->speed = MAX_SPEED;
+        } else if (agent->speed < -MAX_SPEED) {
+            agent->speed = -MAX_SPEED;
+        }
+
+        agent->x += agent->speed*cosf(agent->heading);
         if (agent->x < 0) {
             agent->x = 0;
         } else if (agent->x > env->width) {
             agent->x = env->width;
         }
 
-        agent->y += VELOCITY*sinf(agent->heading);
+        agent->y += agent->speed*sinf(agent->heading);
         if (agent->y < 0) {
             agent->y = 0;
         } else if (agent->y > env->height) {
diff --git a/pufferlib/ocean/target/target.py b/pufferlib/ocean/target/target.py
@@ -24,16 +24,13 @@ def __init__(self, num_envs=1, width=1080, height=720, num_agents=8,
             num_goals=8, render_mode=None, log_interval=128, size=11, buf=None, seed=0):
         self.single_observation_space = gymnasium.spaces.Box(low=0, high=1,
             shape=(2*(num_agents+num_goals) + 4,), dtype=np.float32)
-        self.single_action_space = gymnasium.spaces.Box(
-            low=-0.5, high=0.5, shape=(1,), dtype=np.float32)
-        #self.single_action_space = gymnasium.spaces.Discrete(9)
+        self.single_action_space = gymnasium.spaces.MultiDiscrete([9, 5])
 
         self.render_mode = render_mode
         self.num_agents = num_envs*num_agents
         self.log_interval = log_interval
 
         super().__init__(buf)
-        #self.actions = self.actions.astype(np.float32)
         c_envs = []
         for i in range(num_envs):
             c_env = binding.env_init(
@@ -55,7 +52,6 @@ def reset(self, seed=None):
 
     def step(self, actions):
         self.tick += 1
-        #actions = (actions.astype(np.float32) - 4)/8
         self.actions[:] = actions
         binding.vec_step(self.c_envs)