From 8b4a98b5aba8f6aee59c0e8850db5cc393602f52 Mon Sep 17 00:00:00 2001 From: mx2000 Date: Thu, 27 Mar 2025 05:00:53 +0100 Subject: [PATCH 1/8] comma fix on setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9ce4537428..6aeef4cdbc 100644 --- a/setup.py +++ b/setup.py @@ -267,8 +267,7 @@ 'pufferlib/ocean/go/cy_go', 'pufferlib/ocean/rware/cy_rware', 'pufferlib/ocean/trash_pickup/cy_trash_pickup', - 'pufferlib/ocean/cpr/cy_cpr' - 'pufferlib/ocean/tower_climb/cy_tower_climb', + 'pufferlib/ocean/cpr/cy_cpr', ] system = platform.system() From 9c7cd87ad3742061dae1d54ea9b8ae5aba79fc58 Mon Sep 17 00:00:00 2001 From: mx2000 Date: Thu, 27 Mar 2025 07:08:41 +0100 Subject: [PATCH 2/8] Improve cpr from PR comments --- pufferlib/ocean/cpr/cpr.h | 119 +++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 47a2d81d49..cfc5e01887 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -179,6 +179,7 @@ int get_agent_id_from_tile(int tile) { return tile - AGENTS; } void add_food(CCpr *env, int grid_idx, int food_type) { // Add food to the grid and the food_list at grid_idx + assert(env->grid[grid_idx] == EMPTY); env->grid[grid_idx] = food_type; FoodList *foods = env->foods; foods->indexes[foods->size++] = grid_idx; @@ -218,24 +219,10 @@ void init_foods(CCpr *env) { int normal = available_tiles / (20 * normalizer); int interactive = available_tiles / (50 * normalizer); for (int i = 0; i < normal; i++) { - int idx, tile; - do { - int r = rand() % (env->height - 1); - int c = rand() % (env->width - 1); - idx = r * env->width + c; - tile = env->grid[idx]; - } while (tile != EMPTY); - add_food(env, idx, NORMAL_FOOD); + spawn_food(env, NORMAL_FOOD); } for (int i = 0; i < interactive; i++) { - int idx, tile; - do { - int r = rand() % (env->height - 1); - int c = rand() % (env->width - 1); - idx = r * env->width + c; - tile = env->grid[idx]; - } while (tile != EMPTY); - add_food(env, idx, INTERACTIVE_FOOD); + spawn_food(env, INTERACTIVE_FOOD); } } @@ -253,21 +240,22 @@ void spawn_foods(CCpr *env) { for (int ri = 0; ri < 3; ri++) { for (int ci = 0; ci < 3; ci++) { int grid_idx = grid_index(env, (r + ri), (c + ci)); - if (env->grid[grid_idx] == EMPTY) { - switch (env->grid[idx]) { - // %Chance spawning new food - case NORMAL_FOOD: - if ((rand() / (double)RAND_MAX) < env->food_base_spawn_rate) { - add_food(env, grid_idx, env->grid[idx]); - } - break; - case INTERACTIVE_FOOD: - if ((rand() / (double)RAND_MAX) < - (env->food_base_spawn_rate / 10.0)) { - add_food(env, grid_idx, env->grid[idx]); - } - break; + if (env->grid[grid_idx] != EMPTY) { + continue; + } + switch (env->grid[idx]) { + // %Chance spawning new food + case NORMAL_FOOD: + if ((rand() / (double)RAND_MAX) < env->food_base_spawn_rate) { + add_food(env, grid_idx, env->grid[idx]); } + break; + case INTERACTIVE_FOOD: + if ((rand() / (double)RAND_MAX) < + (env->food_base_spawn_rate / 10.0)) { + add_food(env, grid_idx, env->grid[idx]); + } + break; } } } @@ -305,33 +293,28 @@ void compute_observations(CCpr *env) { void c_reset(CCpr *env) { env->tick = 0; - for (int r = 0; r < env->height; r++) { - for (int c = 0; c < env->width; c++) { - int adr = grid_index(env, r, c); - env->grid[adr] = EMPTY; - } - } + memset(env->grid, EMPTY, (env->height * env->width) * sizeof(env->grid[0])); - // Walls need to cover vision radius around the grid + // top walling for (int r = 0; r < env->vision; r++) { - for (int c = 0; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->width * sizeof(env->grid[0])); } + // left side walling for (int r = 0; r < env->height; r++) { - for (int c = 0; c < env->vision; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->vision * sizeof(env->grid[0])); } + // bottom walling for (int r = env->height - env->vision; r < env->height; r++) { - for (int c = 0; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->width * sizeof(env->grid[0])); } + + // right side walling for (int r = 0; r < env->height; r++) { - for (int c = env->width - env->vision; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width) + (env->width - env->vision), WALL, + env->vision * sizeof(env->grid[0])); } // Agents @@ -384,21 +367,15 @@ void reward_agents_near(CCpr *env, int food_index) { if ((ac == food_c && (ar == food_r - 1 || ar == food_r + 1)) || (ar == food_r && (ac == food_c - 1 || ac == food_c + 1))) { env->rewards[i] += env->interactive_food_reward; - env->logs[i].score += env->interactive_food_reward; + env->logs[i].score += 5; add_log(env->log_buffer, &env->logs[i]); env->logs[i] = (Log){0}; } } - - // Empty grid cell - // env->grid[food_index] = EMPTY; remove_food(env, food_index); - - // Spawn new interactive food - // spawn_interactive_food(env); } -void step_agent(CCpr *env, int i) { +bool step_agent(CCpr *env, int i) { int action = env->actions[i]; @@ -418,10 +395,10 @@ void step_agent(CCpr *env, int i) { case 3: dc = 1; break; // RIGHT + case 4: + return false; // No moves } - - if (action != 4) - env->logs[i].moves += 1; + env->logs[i].moves += 1; // Get next row and column Agent *agent = &env->agents[i]; @@ -435,7 +412,7 @@ void step_agent(CCpr *env, int i) { // Anything above should be obstacle if (tile >= INTERACTIVE_FOOD) { - env->logs[i].score += env->reward_move; + env->logs[i].score += -0.01; env->rewards[i] += env->reward_move; next_r = agent->r; next_c = agent->c; @@ -467,15 +444,14 @@ void step_agent(CCpr *env, int i) { switch (tile) { case NORMAL_FOOD: - env->logs[i].score += env->reward_food; + env->logs[i].score += 1; env->rewards[i] = env->reward_food; - // spawn_food(env); remove_food(env, next_grid_idx); add_log(env->log_buffer, &env->logs[i]); env->logs[i] = (Log){0}; break; case EMPTY: - env->logs[i].score += env->reward_move; + env->logs[i].score += -0.01; env->rewards[i] = env->reward_move; break; } @@ -486,6 +462,8 @@ void step_agent(CCpr *env, int i) { env->grid[next_grid_idx] = agent_tile; agent->r = next_r; agent->c = next_c; + + return true; } void c_step(CCpr *env) { @@ -494,8 +472,14 @@ void c_step(CCpr *env) { memset(env->interactive_food_agent_count, 0, (env->width * env->height + 7) / 8); + bool logged = false; for (int i = 0; i < env->num_agents; i++) { - step_agent(env, i); + logged = step_agent(env, i); + } + // To cope with sweeps waiting for logs, in case nothing moves + if (!logged) { + env->logs[0].score += 0; + add_log(env->log_buffer, &env->logs[0]); } spawn_foods(env); @@ -521,12 +505,13 @@ Rectangle UV_COORDS[7] = { (Rectangle){384, 0, 128, 128}, }; -typedef struct { +typedef struct Renderer Renderer; +struct Renderer { int cell_size; int width; int height; Texture2D puffer; -} Renderer; +}; Renderer *init_renderer(int cell_size, int width, int height) { Renderer *renderer = (Renderer *)calloc(1, sizeof(Renderer)); From 27be9d2ae6a5c801b33af92472d8c3349d090114 Mon Sep 17 00:00:00 2001 From: mx2000 Date: Tue, 1 Apr 2025 19:32:06 +0200 Subject: [PATCH 3/8] make 1 log per env logging every step --- pufferlib/ocean/cpr/cpr.h | 52 ++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index cfc5e01887..4f7d2e21bb 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -122,7 +122,7 @@ struct CCpr { Agent *agents; LogBuffer *log_buffer; - Log *logs; + Log *log; uint8_t *interactive_food_agent_count; @@ -136,7 +136,7 @@ void init_ccpr(CCpr *env) { env->agents = (Agent *)calloc(env->num_agents, sizeof(Agent)); env->vision_window = 2 * env->vision + 1; env->obs_size = env->vision_window * env->vision_window; - env->logs = (Log *)calloc(env->num_agents, sizeof(Log)); + env->log = (Log *)calloc(1, sizeof(Log)); env->interactive_food_agent_count = (uint8_t *)calloc((env->width * env->height + 7) / 8, sizeof(uint8_t)); env->foods = allocate_foodlist(env->width * env->height); @@ -167,7 +167,7 @@ void free_CCpr(CCpr *env) { free(env->rewards); free(env->terminals); free_logbuffer(env->log_buffer); - free(env->logs); + free(env->log); free(env->interactive_food_agent_count); free_foodlist(env->foods); } @@ -294,6 +294,7 @@ void c_reset(CCpr *env) { env->tick = 0; memset(env->grid, EMPTY, (env->height * env->width) * sizeof(env->grid[0])); + memset(env->log, 0, sizeof(Log)); // top walling for (int r = 0; r < env->vision; r++) { @@ -320,7 +321,7 @@ void c_reset(CCpr *env) { // Agents srand(time(NULL)); for (int i = 0; i < env->num_agents; i++) { - env->logs[i] = (Log){0}; + // env->log[0] = (Log){0}; Agent *agent = &env->agents[i]; @@ -367,15 +368,15 @@ void reward_agents_near(CCpr *env, int food_index) { if ((ac == food_c && (ar == food_r - 1 || ar == food_r + 1)) || (ar == food_r && (ac == food_c - 1 || ac == food_c + 1))) { env->rewards[i] += env->interactive_food_reward; - env->logs[i].score += 5; - add_log(env->log_buffer, &env->logs[i]); - env->logs[i] = (Log){0}; + env->log->score += 5; + // add_log(env->log_buffer, env->log); + // memset(env->log, 0, sizeof(Log)); } } remove_food(env, food_index); } -bool step_agent(CCpr *env, int i) { +void step_agent(CCpr *env, int i) { int action = env->actions[i]; @@ -396,9 +397,9 @@ bool step_agent(CCpr *env, int i) { dc = 1; break; // RIGHT case 4: - return false; // No moves + return; // No moves } - env->logs[i].moves += 1; + env->log->moves += 1; // Get next row and column Agent *agent = &env->agents[i]; @@ -412,7 +413,7 @@ bool step_agent(CCpr *env, int i) { // Anything above should be obstacle if (tile >= INTERACTIVE_FOOD) { - env->logs[i].score += -0.01; + env->log->score += -0.01; env->rewards[i] += env->reward_move; next_r = agent->r; next_c = agent->c; @@ -444,14 +445,14 @@ bool step_agent(CCpr *env, int i) { switch (tile) { case NORMAL_FOOD: - env->logs[i].score += 1; + env->log->score += 1; env->rewards[i] = env->reward_food; remove_food(env, next_grid_idx); - add_log(env->log_buffer, &env->logs[i]); - env->logs[i] = (Log){0}; + // add_log(env->log_buffer, env->log); + // memset(env->log, 0, sizeof(Log)); break; case EMPTY: - env->logs[i].score += -0.01; + env->log->score += -0.01; env->rewards[i] = env->reward_move; break; } @@ -463,7 +464,7 @@ bool step_agent(CCpr *env, int i) { agent->r = next_r; agent->c = next_c; - return true; + return; } void c_step(CCpr *env) { @@ -472,19 +473,24 @@ void c_step(CCpr *env) { memset(env->interactive_food_agent_count, 0, (env->width * env->height + 7) / 8); - bool logged = false; + // bool logged = false; for (int i = 0; i < env->num_agents; i++) { - logged = step_agent(env, i); - } - // To cope with sweeps waiting for logs, in case nothing moves - if (!logged) { - env->logs[0].score += 0; - add_log(env->log_buffer, &env->logs[0]); + step_agent(env, i); } + add_log(env->log_buffer, env->log); + memset(env->log, 0, sizeof(Log)); + // // To cope with sweeps waiting for logs, in case nothing moves + // if (!logged) { + // add_log(env->log_buffer, env->log); + // memset(env->log, 0, sizeof(Log)); + // } + spawn_foods(env); compute_observations(env); + + env->tick++; } // Raylib client From e5f94ec16e6dc949099e4e3e572c98faf7b6c284 Mon Sep 17 00:00:00 2001 From: mx2000 Date: Wed, 9 Apr 2025 07:33:26 +0200 Subject: [PATCH 4/8] working cpr basline --- config/ocean/cpr.ini | 73 ++++++--- pufferlib/ocean/cpr/cpr.c | 14 +- pufferlib/ocean/cpr/cpr.h | 288 +++++++++++++++++++++++++-------- pufferlib/ocean/cpr/cpr.py | 17 +- pufferlib/ocean/cpr/cy_cpr.pyx | 22 ++- pufferlib/ocean/cpr/grid.h | 42 +++++ pufferlib/sweep.py | 120 +++++++++----- 7 files changed, 427 insertions(+), 149 deletions(-) create mode 100644 pufferlib/ocean/cpr/grid.h diff --git a/config/ocean/cpr.ini b/config/ocean/cpr.ini index 6a04582c81..f2e94ec6aa 100644 --- a/config/ocean/cpr.ini +++ b/config/ocean/cpr.ini @@ -1,46 +1,67 @@ [base] package = ocean -env_name = cpr puffer_cpr +env_name = cpr vec = multiprocessing rnn_name = Recurrent [env] -num_envs = 1024 +num_envs = 512 vision = 3 -widths = [32] -heights = [32] num_agents = [12] report_interval=1 -reward_food = 1.0 -interactive_food_reward = 5.0 -reward_move = -0.01 +reward_food = 0.001 +interactive_food_reward = 0.002 +reward_move = +0.00 food_base_spawn_rate = 2e-3 [train] -total_timesteps = 2_000_000 num_envs = 1 num_workers = 1 -env_batch_size = 1 -batch_size = 131072 -update_epochs = 1 -minibatch_size = 16384 -bptt_horizon = 8 -anneal_lr = False -learning_rate=0.001 -gamma = 0.95 -gae_lambda = 0.85 -vf_ceof = 0.4 -clip_coef = 0.1 -vf_clip_coef = 0.1 -ent_coef = 0.01 -max_grad_norm = 0.86 -checkpoint_interval = 1000 +total_timesteps = 1_000_000_000 device = cpu +batch_size = 32768 +minibatch_size = 8192 +bptt_horizon = 16 +checkpoint_interval = 200 +learning_rate = 0.0008524 +gamma = 0.9989 +gae_lambda = 0.99 +vf_coef = 1 +ent_coef = 0.01 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-12 +max_grad_norm = 0.5 +vf_clip_coef = 0.1 +update_epochs = 1 [workspace] name = boxingbytes project = pufferai -; [sweep.metric] -; goal = maximize -; name = environment/episode_return +[sweep.metric] +goal = maximize +name = score +min = -10 +max = 10 + +[sweep.env.reward_food] +distribution = log_normal +min = 0.0001 +max = 0.01 +mean = 0.001 +scale = auto + +[sweep.env.interactive_food_reward] +distribution = log_normal +min = 0.0001 +max = 0.02 +mean = 0.002 +scale = auto + +[sweep.train.total_timesteps] +distribution = log_normal +min = 50e6 +max = 75e6 +mean = 60e6 +scale = time \ No newline at end of file diff --git a/pufferlib/ocean/cpr/cpr.c b/pufferlib/ocean/cpr/cpr.c index 1c4f684452..f7896a9b60 100644 --- a/pufferlib/ocean/cpr/cpr.c +++ b/pufferlib/ocean/cpr/cpr.c @@ -3,16 +3,16 @@ #include int main() { - int width = 24; - int height = 24; + int width = 32; + int height = 32; int render_cell_size = 32; CCpr env = { - .num_agents = 8, + .num_agents = 1, .width = width, .height = height, - .vision = 2, + .vision = 3, .reward_food = 1.0f, .interactive_food_reward = 5.0f, .food_base_spawn_rate = 2e-3, @@ -21,9 +21,9 @@ int main() { c_reset(&env); Renderer *renderer = init_renderer(render_cell_size, width, height); - while (!WindowShouldClose()) { + c_render(renderer, &env); int st = 0; // User can take control of the first puffer if (IsKeyDown(KEY_LEFT_SHIFT)) { @@ -41,12 +41,10 @@ int main() { sleep(2); } for (int i = st; i < env.num_agents; i++) { - env.actions[i] = rand() % 4; + env.actions[i] = rand() % 5; // printf("Agent %d gets actions %d\n", i, env->actions[i]); } c_step(&env); - - c_render(renderer, &env); } close_renderer(renderer); free_CCpr(&env); diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 4f7d2e21bb..8331954df1 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -8,6 +8,8 @@ #include "raylib.h" +#include "grid.h" + #define EMPTY 0 #define NORMAL_FOOD 1 #define INTERACTIVE_FOOD 2 @@ -22,10 +24,28 @@ #define CHECK_BIT(arr, i) (arr[(i) / 8] & (1 << ((i) % 8))) #define min(a, b) ((a) < (b) ? (a) : (b)) +#define REWARD_20_HP -0 +#define REWARD_80_HP 0 +#define REWARD_DEATH -0.0f + + +#define LOG_SCORE_REWARD_SMALL 1 +#define LOG_SCORE_REWARD_MEDIUM 5 +#define LOG_SCORE_REWARD_MOVE - 0.0 +#define LOG_SCORE_REWARD_DEATH -1 + +#define HP_REWARD_FOOD_MEDIUM 50 +#define HP_REWARD_FOOD_SMALL 20 +#define HP_LOSS_PER_STEP 1 +#define MAX_HP 100 + typedef struct Log Log; struct Log { float score; float moves; + float food_nb; + float agents_alive; + float alive_steps; }; typedef struct LogBuffer LogBuffer; @@ -65,9 +85,15 @@ Log aggregate_and_clear(LogBuffer *logs) { for (int i = 0; i < logs->idx; i++) { log.score += logs->logs[i].score; log.moves += logs->logs[i].moves; + log.food_nb += logs->logs[i].food_nb; + log.agents_alive += logs->logs[i].agents_alive; + log.alive_steps += logs->logs[i].alive_steps; } log.score /= logs->idx; log.moves /= logs->idx; + log.food_nb /= logs->idx; + log.agents_alive /= logs->idx; + log.alive_steps /= logs->idx; logs->idx = 0; return log; } @@ -77,6 +103,8 @@ struct Agent { int r; int c; int id; + float hp; + int direction; }; typedef struct FoodList FoodList; @@ -118,6 +146,8 @@ struct CCpr { int *actions; float *rewards; unsigned char *terminals; + unsigned char *truncations; + unsigned char *masks; Agent *agents; @@ -135,7 +165,7 @@ void init_ccpr(CCpr *env) { (unsigned char *)calloc(env->width * env->height, sizeof(unsigned char)); env->agents = (Agent *)calloc(env->num_agents, sizeof(Agent)); env->vision_window = 2 * env->vision + 1; - env->obs_size = env->vision_window * env->vision_window; + env->obs_size = env->vision_window * env->vision_window;// + 1; env->log = (Log *)calloc(1, sizeof(Log)); env->interactive_food_agent_count = (uint8_t *)calloc((env->width * env->height + 7) / 8, sizeof(uint8_t)); @@ -144,13 +174,15 @@ void init_ccpr(CCpr *env) { void allocate_ccpr(CCpr *env) { // Called by C stuff - int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); + int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); //+ 1; env->observations = (unsigned char *)calloc(env->num_agents * obs_size, sizeof(unsigned char)); env->actions = (int *)calloc(env->num_agents, sizeof(unsigned int)); env->rewards = (float *)calloc(env->num_agents, sizeof(float)); env->terminals = (unsigned char *)calloc(env->num_agents, sizeof(unsigned char)); + env->truncations = (unsigned char*)calloc(env->num_agents, sizeof(unsigned char)); + env->masks = (unsigned char *)calloc(env->num_agents, sizeof(unsigned char)); env->log_buffer = allocate_logbuffer(LOG_BUFFER_SIZE); init_ccpr(env); } @@ -166,6 +198,8 @@ void free_CCpr(CCpr *env) { free(env->actions); free(env->rewards); free(env->terminals); + free(env->truncations); + free(env->masks); free_logbuffer(env->log_buffer); free(env->log); free(env->interactive_food_agent_count); @@ -185,6 +219,15 @@ void add_food(CCpr *env, int grid_idx, int food_type) { foods->indexes[foods->size++] = grid_idx; } +void reward_agent(CCpr *env, int agent_id, float reward) { + // We don't reward if agent is full life + // Agent *agent = &env->agents[agent_id]; + // if (agent->hp >= MAX_HP) { + // return; + // } + env->rewards[agent_id] += reward; +} + void spawn_food(CCpr *env, int food_type) { // Randomly spawns such food in the grid int idx, tile; @@ -261,22 +304,32 @@ void spawn_foods(CCpr *env) { } } - // Each turn there is random probability for a food to spawn at a random - // location To cope with resource depletion - int normalizer = (env->width * env->height) / 576; - if ((rand() / (double)RAND_MAX) < - min((env->food_base_spawn_rate * 2 * normalizer), 1e-2)) { - spawn_food(env, NORMAL_FOOD); - } - if ((rand() / (double)RAND_MAX) < - min((env->food_base_spawn_rate / 5.0 * normalizer), 5e-3)) { - spawn_food(env, INTERACTIVE_FOOD); - } + // // Each turn there is random probability for a food to spawn at a random + // // location To cope with resource depletion + // int normalizer = (env->width * env->height) / 576; + // if ((rand() / (double)RAND_MAX) < + // min((env->food_base_spawn_rate * 2 * normalizer), 1e-2)) { + // spawn_food(env, NORMAL_FOOD); + // } + // if ((rand() / (double)RAND_MAX) < + // min((env->food_base_spawn_rate / 5.0 * normalizer), 5e-3)) { + // spawn_food(env, INTERACTIVE_FOOD); + // } } void compute_observations(CCpr *env) { + // For full obs + // memcpy(env->observations, env->grid, + // env->width * env->height * sizeof(unsigned char)); + // return; + + // For partial obs for (int i = 0; i < env->num_agents; i++) { Agent *agent = &env->agents[i]; + // env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; + if (agent->hp == 0) { + continue; + } int obs_offset = i * env->obs_size; int r_offset = agent->r - env->vision; int c_offset = agent->c - env->vision; @@ -287,15 +340,50 @@ void compute_observations(CCpr *env) { env->observations[obs_idx] = env->grid[grid_idx]; } } + + } } -void c_reset(CCpr *env) { - env->tick = 0; +void add_hp(CCpr *env, int agent_id, float hp) { + Agent *agent = &env->agents[agent_id]; + agent->hp += hp; + if (agent->hp > MAX_HP) { + agent->hp = MAX_HP; + } else if (agent->hp <= 0) { + agent->hp = 0; + env->log->score += LOG_SCORE_REWARD_DEATH; + env->rewards[agent->id] += REWARD_DEATH; + env->terminals[agent->id] = 1; + } +} +void remove_hp(CCpr *env, int agent_id, float hp) { add_hp(env, agent_id, -hp); } - memset(env->grid, EMPTY, (env->height * env->width) * sizeof(env->grid[0])); - memset(env->log, 0, sizeof(Log)); +void save_grid_to_file(CCpr *env, const char *filename) { + FILE *file = fopen(filename, "w"); + if (!file) { + perror("Failed to open file"); + return; + } + fprintf(file, "#ifndef GRID_H\n#define GRID_H\n\n"); + fprintf(file, "#define GRID_HEIGHT %d\n", env->height); + fprintf(file, "#define GRID_WIDTH %d\n\n", env->width); + fprintf(file, "static const unsigned char grid[GRID_HEIGHT][GRID_WIDTH] = {\n"); + + for (int r = 0; r < env->height; r++) { + fprintf(file, " {"); + for (int c = 0; c < env->width; c++) { + unsigned char val = env->grid[r * env->width + c]; + fprintf(file, "0x%02X%s", val, (c == env->width - 1) ? "" : ", "); + } + fprintf(file, "}%s\n", (r == env->height - 1) ? "" : ","); + } + fprintf(file, "};\n\n#endif // GRID_H\n"); + fclose(file); +} +void make_grid_from_scratch(CCpr *env){ + memset(env->grid, EMPTY, (env->height * env->width) * sizeof(env->grid[0])); // top walling for (int r = 0; r < env->vision; r++) { memset(env->grid + (r * env->width), WALL, @@ -317,41 +405,46 @@ void c_reset(CCpr *env) { memset(env->grid + (r * env->width) + (env->width - env->vision), WALL, env->vision * sizeof(env->grid[0])); } + save_grid_to_file(env, "grid.h"); +} - // Agents - srand(time(NULL)); - for (int i = 0; i < env->num_agents; i++) { - // env->log[0] = (Log){0}; - - Agent *agent = &env->agents[i]; - - agent->id = i; - - int adr = 0; - bool allocated = false; - - // Random allocation - while (!allocated) { - - adr = rand() % (env->height * env->width); - - if (env->grid[adr] == EMPTY) { - int r = adr / env->width; - int c = adr % env->width; - agent->r = r; - agent->c = c; - allocated = true; - } +void spawn_agent(CCpr *env, int i){ + Agent *agent = &env->agents[i]; + agent->id = i; + agent->hp = 80; + int adr = 0; + + bool allocated = false; + while (!allocated) { + adr = rand() % (env->height * env->width); + if (env->grid[adr] == EMPTY) { + int r = adr / env->width; + int c = adr % env->width; + agent->r = r; + agent->c = c; + allocated = true; } + } + assert(env->grid[adr] == EMPTY); + env->grid[adr] = get_agent_tile_from_id(agent->id); +} +void c_reset(CCpr *env) { + env->tick = 0; + memset(env->log, 0, sizeof(Log)); + env->foods->size = 0; + memset(env->foods->indexes, 0, env->width * env->height * sizeof(int)); + // make_grid_from_scratch(env); + memcpy(env->grid, grid_32_32_3v, env->width * env->height * sizeof(unsigned char)); - assert(env->grid[adr] == EMPTY); - - env->grid[adr] = get_agent_tile_from_id(agent->id); + for (int i = 0; i < env->num_agents; i++) { + spawn_agent(env, i); } init_foods(env); - + memset(env->observations, 0, env->num_agents * env->obs_size * sizeof(unsigned char)); + memset(env->truncations, 0, env->num_agents * sizeof(unsigned char)); memset(env->terminals, 0, env->num_agents * sizeof(unsigned char)); + memset(env->masks, 1, env->num_agents * sizeof(unsigned char)); compute_observations(env); } @@ -367,8 +460,9 @@ void reward_agents_near(CCpr *env, int food_index) { if ((ac == food_c && (ar == food_r - 1 || ar == food_r + 1)) || (ar == food_r && (ac == food_c - 1 || ac == food_c + 1))) { - env->rewards[i] += env->interactive_food_reward; - env->log->score += 5; + reward_agent(env, i, env->interactive_food_reward); + env->log->score += LOG_SCORE_REWARD_MEDIUM; + add_hp(env, i, HP_REWARD_FOOD_MEDIUM); // add_log(env->log_buffer, env->log); // memset(env->log, 0, sizeof(Log)); } @@ -378,6 +472,8 @@ void reward_agents_near(CCpr *env, int food_index) { void step_agent(CCpr *env, int i) { + Agent *agent = &env->agents[i]; + int action = env->actions[i]; int dr = 0; @@ -386,15 +482,19 @@ void step_agent(CCpr *env, int i) { switch (action) { case 0: dr = -1; + agent->direction = 3; break; // UP case 1: dr = 1; + agent->direction = 1; break; // DOWN case 2: dc = -1; + agent->direction = 2; break; // LEFT case 3: dc = 1; + agent->direction = 0; break; // RIGHT case 4: return; // No moves @@ -402,7 +502,6 @@ void step_agent(CCpr *env, int i) { env->log->moves += 1; // Get next row and column - Agent *agent = &env->agents[i]; int next_r = agent->r + dr; int next_c = agent->c + dc; @@ -413,7 +512,7 @@ void step_agent(CCpr *env, int i) { // Anything above should be obstacle if (tile >= INTERACTIVE_FOOD) { - env->log->score += -0.01; + env->log->score += LOG_SCORE_REWARD_MOVE; env->rewards[i] += env->reward_move; next_r = agent->r; next_c = agent->c; @@ -445,14 +544,15 @@ void step_agent(CCpr *env, int i) { switch (tile) { case NORMAL_FOOD: - env->log->score += 1; - env->rewards[i] = env->reward_food; + env->log->score += LOG_SCORE_REWARD_SMALL; + reward_agent(env, i, env->reward_food); + add_hp(env, i, HP_REWARD_FOOD_SMALL); remove_food(env, next_grid_idx); // add_log(env->log_buffer, env->log); // memset(env->log, 0, sizeof(Log)); break; case EMPTY: - env->log->score += -0.01; + env->log->score += LOG_SCORE_REWARD_MOVE; env->rewards[i] = env->reward_move; break; } @@ -467,30 +567,77 @@ void step_agent(CCpr *env, int i) { return; } +void clear_agent(CCpr *env, int agent_id) { + Agent *agent = &env->agents[agent_id]; + if (agent->r < 0 || agent->c < 0) { + return; + } + int grid_idx = grid_index(env, agent->r, agent->c); + env->grid[grid_idx] = EMPTY; + agent->r = -1; + agent->c = -1; +} + void c_step(CCpr *env) { + env->tick++; + // memset(env->truncations, 0, env->num_agents * sizeof(unsigned char)); memset(env->rewards, 0, env->num_agents * sizeof(float)); + // memset(env->terminals, 0, env->num_agents * sizeof(unsigned char)); memset(env->interactive_food_agent_count, 0, (env->width * env->height + 7) / 8); - // bool logged = false; for (int i = 0; i < env->num_agents; i++) { + if (env->agents[i].hp == 0) { + env->masks[i] = 0; + clear_agent(env, i); + continue; + } step_agent(env, i); + remove_hp(env, i, HP_LOSS_PER_STEP); } - add_log(env->log_buffer, env->log); - memset(env->log, 0, sizeof(Log)); - // // To cope with sweeps waiting for logs, in case nothing moves - // if (!logged) { - // add_log(env->log_buffer, env->log); - // memset(env->log, 0, sizeof(Log)); - // } - spawn_foods(env); + //We loop again here because in the future an entity might have attacked an agent in the process + int alive_agents = 0; + for (int i = 0; i < env->num_agents; i++) { + if (env->agents[i].hp > 0) { + alive_agents += 1; + if (env->agents[i].hp < 20) { + env->rewards[i] += REWARD_20_HP; + env->log->score += REWARD_20_HP; + } else if (env->agents[i].hp > 80) { + env->rewards[i] += REWARD_80_HP; + env->log->score += REWARD_80_HP; + } + } + // else { + // int grid_idx = grid_index(env, env->agents[i].r, env->agents[i].c); + // env->grid[grid_idx] = EMPTY; + // spawn_agent(env, i); + // } + } + if (alive_agents == 0) { + env->log->moves = 0; + }else{ + env->log->moves /= alive_agents; + } + env->log->food_nb = env->foods->size; + env->log->agents_alive = alive_agents; + env->log->alive_steps = env->tick; compute_observations(env); - - env->tick++; + + if (alive_agents < env->num_agents || env->tick > 2000) { + c_reset(env); + if (alive_agents < env->num_agents) { + memset(env->terminals, 1, env->num_agents * sizeof(unsigned char)); + } else { + memset(env->truncations, 1, env->num_agents * sizeof(unsigned char)); + } + } + add_log(env->log_buffer, env->log); + memset(env->log, 0, sizeof(Log)); } // Raylib client @@ -557,15 +704,20 @@ void c_render(Renderer *renderer, CCpr *env) { } else if (tile == NORMAL_FOOD || tile == INTERACTIVE_FOOD) { DrawRectangle(c * ts, r * ts, ts, ts, COLORS[tile]); } else { - int u = 128 * (tile % 8); - int v = 128 * (tile / 8); - Rectangle source_rect = (Rectangle){u, v, 128, 128}; - Rectangle dest_rect = (Rectangle){c * ts, r * ts, ts, ts}; + int agent_id = get_agent_id_from_tile(tile); int col_id = agent_id % (sizeof(COLORS) / sizeof(COLORS[0])); Color color = COLORS[col_id]; + int starting_sprite_x = 0; + float rotation = env->agents[agent_id].direction * 90.0f; + if (rotation == 180) { + starting_sprite_x = 128; + rotation = 0; + } + Rectangle source_rect = (Rectangle){starting_sprite_x, 0, 128, 128}; + Rectangle dest_rect = (Rectangle){c * ts + ts/2, r * ts + ts/2, ts, ts}; DrawTexturePro(renderer->puffer, source_rect, dest_rect, - (Vector2){0, 0}, 0, color); + (Vector2){ts/2, ts/2}, rotation, color); } } } diff --git a/pufferlib/ocean/cpr/cpr.py b/pufferlib/ocean/cpr/cpr.py index aedfae6447..382936be13 100644 --- a/pufferlib/ocean/cpr/cpr.py +++ b/pufferlib/ocean/cpr/cpr.py @@ -6,16 +6,16 @@ class PyCPR(pufferlib.PufferEnv): def __init__(self, - num_envs = 1, + num_envs=1, widths=[32], heights=[32], num_agents=[8], - vision=2, + vision=3, reward_food=1.0, interactive_food_reward=5.0, reward_move=-0.01, food_base_spawn_rate=2e-3, - report_interval=250, + report_interval=1, render_mode=None, buf=None ): @@ -23,7 +23,7 @@ def __init__(self, heights = num_envs*heights num_agents = num_envs*num_agents - self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1),(2*vision+1)), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1),), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = sum(num_agents) @@ -38,6 +38,8 @@ def __init__(self, self.actions, self.rewards, self.terminals, + self.truncations, + self.masks, widths, heights, num_agents, @@ -78,13 +80,16 @@ def close(self): timeout=30 tot_agents = env.num_agents - actions = np.random.randint(0,4,(1024,tot_agents)) + actions = np.random.randint(0,5,(1024,tot_agents)) import time start = time.time() - while time.time() - start < timeout: + # while time.time() - start < timeout: + while tick < 500: atns = actions[tick % 1024] env.step(atns) + if -1 in env.rewards: + breakpoint() # env.render() tick += 1 diff --git a/pufferlib/ocean/cpr/cy_cpr.pyx b/pufferlib/ocean/cpr/cy_cpr.pyx index a8f92d70a3..f766713ab4 100644 --- a/pufferlib/ocean/cpr/cy_cpr.pyx +++ b/pufferlib/ocean/cpr/cy_cpr.pyx @@ -15,6 +15,9 @@ cdef extern from "cpr.h": ctypedef struct Log: float score float moves + float food_nb + float agents_alive + float alive_steps ctypedef struct LogBuffer: Log logs @@ -29,6 +32,8 @@ cdef extern from "cpr.h": int r int c int id + int hp + int direction ctypedef struct FoodList: int *indexes @@ -53,11 +58,13 @@ cdef extern from "cpr.h": int *actions float *rewards unsigned char *terminals + unsigned char *masks + unsigned char *truncations Agent *agents LogBuffer *log_buffer - Log *logs + Log *log uint8_t *interactive_food_agent_count float interactive_food_reward @@ -86,9 +93,12 @@ cdef class CyEnv: LogBuffer *logs int num_envs - def __init__(self, unsigned char[:,:,:] observations, int[:] actions, float[:] rewards, unsigned char[:] terminals, - list widths, list heights, list num_agents,int vision, - float reward_food,float interactive_food_reward,float reward_move, float food_base_spawn_rate) -> None: + def __init__( + self, unsigned char[:,:] observations, int[:] actions, float[:] rewards, + unsigned char[:] terminals, unsigned char[:] truncations, unsigned char[:] masks, + list widths, list heights, list num_agents,int vision, + float reward_food,float interactive_food_reward,float reward_move, float food_base_spawn_rate + ) -> None: self.num_envs = len(num_agents) self.envs = calloc(self.num_envs, sizeof(CCpr)) self.logs = allocate_logbuffer(LOG_BUFFER_SIZE) @@ -97,10 +107,12 @@ cdef class CyEnv: cdef int n = 0 for i in range(self.num_envs): self.envs[i] = CCpr( - observations = &observations[n,0,0], + observations = &observations[n,0], actions=&actions[n], rewards=&rewards[n], terminals=&terminals[n], + truncations=&truncations[n], + masks=&masks[n], log_buffer=self.logs, width=widths[i], height=heights[i], diff --git a/pufferlib/ocean/cpr/grid.h b/pufferlib/ocean/cpr/grid.h new file mode 100644 index 0000000000..7682999664 --- /dev/null +++ b/pufferlib/ocean/cpr/grid.h @@ -0,0 +1,42 @@ +#ifndef GRID_H +#define GRID_H + +#define GRID_HEIGHT 32 +#define GRID_WIDTH 32 + +static const unsigned char grid_32_32_3v[GRID_HEIGHT][GRID_WIDTH] = { + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03} +}; + +#endif // GRID_H diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index 2105cde210..49c2494a8d 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -27,6 +27,12 @@ def __init__(self, min, max, scale, mean, is_integer=False): self.is_integer = is_integer class Linear(Space): + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == 'auto': + scale = 0.5 + + super().__init__(min, max, scale, mean, is_integer) + def normalize(self, value): #assert isinstance(value, (int, float)) zero_one = (value - self.min)/(self.max - self.min) @@ -40,6 +46,13 @@ def unnormalize(self, value): return value class Pow2(Space): + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == 'auto': + scale = 0.5 + #scale = 2 / (np.log2(max) - np.log2(min)) + + super().__init__(min, max, scale, mean, is_integer) + def normalize(self, value): #assert isinstance(value, (int, float)) #assert value != 0.0 @@ -56,9 +69,11 @@ class Log(Space): base: int = 10 def __init__(self, min, max, scale, mean, is_integer=False): - if scale == 'auto': + if scale == 'time': # TODO: Set scaling param intuitively based on number of jumps from min to max scale = 1 / (np.log2(max) - np.log2(min)) + elif scale == 'auto': + scale = 0.5 super().__init__(min, max, scale, mean, is_integer) @@ -79,6 +94,12 @@ def unnormalize(self, value): class Logit(Space): base: int = 10 + def __init__(self, min, max, scale, mean, is_integer=False): + if scale == 'auto': + scale = 0.5 + + super().__init__(min, max, scale, mean, is_integer) + def normalize(self, value): #assert isinstance(value, (int, float)) #assert value != 0.0 @@ -91,7 +112,7 @@ def unnormalize(self, value): log_spaced = zero_one*(math.log(1-self.max, self.base) - math.log(1-self.min, self.base)) + math.log(1-self.min, self.base) return 1 - self.base**log_spaced -def _carbs_params_from_puffer_sweep(sweep_config): +def _params_from_puffer_sweep(sweep_config): param_spaces = {} for name, param in sweep_config.items(): if name in ('method', 'name', 'metric', 'max_score'): @@ -99,7 +120,7 @@ def _carbs_params_from_puffer_sweep(sweep_config): assert isinstance(param, dict) if any(isinstance(param[k], dict) for k in param): - param_spaces[name] = _carbs_params_from_puffer_sweep(param) + param_spaces[name] = _params_from_puffer_sweep(param) continue assert 'distribution' in param @@ -130,7 +151,7 @@ def _carbs_params_from_puffer_sweep(sweep_config): class Hyperparameters: def __init__(self, config, verbose=True): - self.spaces = _carbs_params_from_puffer_sweep(config) + self.spaces = _params_from_puffer_sweep(config) self.flat_spaces = dict(pufferlib.utils.unroll_nested_dict(self.spaces)) self.num = len(self.flat_spaces) @@ -156,7 +177,6 @@ def __init__(self, config, verbose=True): for name, space in self.flat_spaces.items(): print(f'\t{name}: {space.unnormalize(min(space.norm_mean + space.scale, space.norm_max))}') - def sample(self, n, mu=None, scale=1): if mu is None: mu = self.search_centers @@ -372,14 +392,14 @@ def suggest(self, fill): # Transformed scores min_score = self.min_score if min_score is None: - min_score = np.min(y) - abs(np.min(y)) + min_score = np.min(y) - np.min(np.abs(y)) if np.min(y) < min_score - 1e-6: raise ValueError(f'Min score {min_score} is less than min score in data {np.min(y)}') max_score = self.max_score if max_score is None: - max_score = np.max(y) + abs(np.max(y)) + max_score = np.max(y) + np.max(np.abs(y)) if np.max(y) > max_score + 1e-6: raise ValueError(f'Max score {max_score} is greater than max score in data {np.max(y)}') @@ -632,55 +652,87 @@ def observe(self, hypers, score, cost, is_failure=False): else: self.success_observations.append(new_observation) -''' -from carbs import ( - CARBS, - CARBSParams, - ObservationInParam, - Param, - LinearSpace, - Pow2Space, - LogSpace, - LogitSpace, -) - -class PufferCarbs: +def _carbs_params_from_puffer_sweep(sweep_config): + from carbs import ( + Param, + LinearSpace, + LogSpace, + LogitSpace, + ) + + param_spaces = {} + for name, param in sweep_config.items(): + if name in ('method', 'name', 'metric', 'max_score'): + continue + + assert isinstance(param, dict) + if any(isinstance(param[k], dict) for k in param): + param_spaces[name] = _carbs_params_from_puffer_sweep(param) + continue + + assert 'distribution' in param + distribution = param['distribution'] + search_center = param['mean'] + kwargs = dict( + min=param['min'], + max=param['max'], + ) + if distribution == 'uniform': + space = LinearSpace(**kwargs) + elif distribution in ('int_uniform', 'uniform_pow2'): + space = LinearSpace(**kwargs, is_integer=True) + elif distribution == 'log_normal': + space = LogSpace(**kwargs) + elif distribution == 'logit_normal': + space = LogitSpace(**kwargs) + else: + raise ValueError(f'Invalid distribution: {distribution}') + + param_spaces[name] = Param( + name=name, + space=space, + search_center=search_center + ) + + return param_spaces + +class Carbs: def __init__(self, sweep_config: dict, max_suggestion_cost: float = None, resample_frequency: int = 5, num_random_samples: int = 10, ): + param_spaces = _carbs_params_from_puffer_sweep(sweep_config) flat_spaces = [e[1] for e in pufferlib.utils.unroll_nested_dict(param_spaces)] for e in flat_spaces: print(e.name, e.space) - metric = sweep_config['metric'] - goal = metric['goal'] - assert goal in ['maximize', 'minimize'], f"Invalid goal {goal}" - self.carbs_params = CARBSParams( - better_direction_sign=1 if goal == 'maximize' else -1, + from carbs import ( + CARBSParams, + CARBS, + ) + + carbs_params = CARBSParams( + better_direction_sign=1, is_wandb_logging_enabled=False, resample_frequency=resample_frequency, num_random_samples=num_random_samples, max_suggestion_cost=max_suggestion_cost, is_saved_on_every_observation=False, - #num_candidates_for_suggestion_per_dim=10 ) - self.carbs = CARBS(self.carbs_params, flat_spaces) + self.carbs = CARBS(carbs_params, flat_spaces) def suggest(self, args): - #start = time.time() self.suggestion = self.carbs.suggest().suggestion - #print(f'Suggestion took {time.time() - start} seconds') for k in ('train', 'env'): for name, param in args['sweep'][k].items(): if name in self.suggestion: args[k][name] = self.suggestion[name] - def observe(self, score, cost, is_failure=False): - #start = time.time() + def observe(self, hypers, score, cost, is_failure=False): + from carbs import ObservationInParam self.carbs.observe( ObservationInParam( input=self.suggestion, @@ -688,8 +740,4 @@ def observe(self, score, cost, is_failure=False): cost=cost, is_failure=is_failure, ) - ) - #print(f'Observation took {time.time() - start} seconds') - - -''' + ) \ No newline at end of file From 81aabd2761b3c8eb3da158094388046ca034c3de Mon Sep 17 00:00:00 2001 From: mx2000 Date: Fri, 11 Apr 2025 07:24:27 +0200 Subject: [PATCH 5/8] Cpr resets when all agents are dead or truncated --- config/ocean/cpr.ini | 6 +++--- pufferlib/ocean/cpr/cpr.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/ocean/cpr.ini b/config/ocean/cpr.ini index f2e94ec6aa..abd21ea736 100644 --- a/config/ocean/cpr.ini +++ b/config/ocean/cpr.ini @@ -9,15 +9,15 @@ num_envs = 512 vision = 3 num_agents = [12] report_interval=1 -reward_food = 0.001 -interactive_food_reward = 0.002 +reward_food = 0.1 +interactive_food_reward = 0.2 reward_move = +0.00 food_base_spawn_rate = 2e-3 [train] num_envs = 1 num_workers = 1 -total_timesteps = 1_000_000_000 +total_timesteps = 60_000_000 device = cpu batch_size = 32768 minibatch_size = 8192 diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 8331954df1..1600260101 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -26,7 +26,7 @@ #define REWARD_20_HP -0 #define REWARD_80_HP 0 -#define REWARD_DEATH -0.0f +#define REWARD_DEATH -1.0f #define LOG_SCORE_REWARD_SMALL 1 @@ -628,9 +628,9 @@ void c_step(CCpr *env) { env->log->alive_steps = env->tick; compute_observations(env); - if (alive_agents < env->num_agents || env->tick > 2000) { + if (alive_agents == 0|| env->tick > 1000) { c_reset(env); - if (alive_agents < env->num_agents) { + if (alive_agents == 0) { memset(env->terminals, 1, env->num_agents * sizeof(unsigned char)); } else { memset(env->truncations, 1, env->num_agents * sizeof(unsigned char)); From 28316815cfe0153141e88386be93774abed30eac Mon Sep 17 00:00:00 2001 From: mx2000 Date: Wed, 30 Apr 2025 06:30:45 +0200 Subject: [PATCH 6/8] Improve common pool resource env --- pufferlib/ocean/cpr/cpr.h | 22 ++++++++++++++-------- pufferlib/ocean/cpr/cpr.py | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 1600260101..89380f6cef 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -165,7 +165,7 @@ void init_ccpr(CCpr *env) { (unsigned char *)calloc(env->width * env->height, sizeof(unsigned char)); env->agents = (Agent *)calloc(env->num_agents, sizeof(Agent)); env->vision_window = 2 * env->vision + 1; - env->obs_size = env->vision_window * env->vision_window;// + 1; + env->obs_size = env->vision_window * env->vision_window + 1; env->log = (Log *)calloc(1, sizeof(Log)); env->interactive_food_agent_count = (uint8_t *)calloc((env->width * env->height + 7) / 8, sizeof(uint8_t)); @@ -174,7 +174,7 @@ void init_ccpr(CCpr *env) { void allocate_ccpr(CCpr *env) { // Called by C stuff - int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); //+ 1; + int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1) + 1; env->observations = (unsigned char *)calloc(env->num_agents * obs_size, sizeof(unsigned char)); env->actions = (int *)calloc(env->num_agents, sizeof(unsigned int)); @@ -221,10 +221,10 @@ void add_food(CCpr *env, int grid_idx, int food_type) { void reward_agent(CCpr *env, int agent_id, float reward) { // We don't reward if agent is full life - // Agent *agent = &env->agents[agent_id]; - // if (agent->hp >= MAX_HP) { - // return; - // } + Agent *agent = &env->agents[agent_id]; + if (agent->hp >= MAX_HP) { + return; + } env->rewards[agent_id] += reward; } @@ -326,7 +326,7 @@ void compute_observations(CCpr *env) { // For partial obs for (int i = 0; i < env->num_agents; i++) { Agent *agent = &env->agents[i]; - // env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; + env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; if (agent->hp == 0) { continue; } @@ -553,7 +553,7 @@ void step_agent(CCpr *env, int i) { break; case EMPTY: env->log->score += LOG_SCORE_REWARD_MOVE; - env->rewards[i] = env->reward_move; + env->rewards[i] += env->reward_move; break; } @@ -633,6 +633,12 @@ void c_step(CCpr *env) { if (alive_agents == 0) { memset(env->terminals, 1, env->num_agents * sizeof(unsigned char)); } else { + //Agents get rewarded for going all the way to the end + for (int i = 0; i < env->num_agents; i++) { + if (env->agents[i].hp > 0) { + env->rewards[i] += 1; + } + } memset(env->truncations, 1, env->num_agents * sizeof(unsigned char)); } } diff --git a/pufferlib/ocean/cpr/cpr.py b/pufferlib/ocean/cpr/cpr.py index 382936be13..02a8ae01df 100644 --- a/pufferlib/ocean/cpr/cpr.py +++ b/pufferlib/ocean/cpr/cpr.py @@ -23,7 +23,7 @@ def __init__(self, heights = num_envs*heights num_agents = num_envs*num_agents - self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1),), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1)+1,), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = sum(num_agents) From 842a5dc120d67857e9b9561ad5bdf98e3cb0e22d Mon Sep 17 00:00:00 2001 From: mx2000 Date: Sat, 10 May 2025 18:17:49 +0200 Subject: [PATCH 7/8] Restore the previous commit --- pufferlib/ocean/cpr/cpr.h | 22 ++++++++-------------- pufferlib/ocean/cpr/cpr.py | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 89380f6cef..1600260101 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -165,7 +165,7 @@ void init_ccpr(CCpr *env) { (unsigned char *)calloc(env->width * env->height, sizeof(unsigned char)); env->agents = (Agent *)calloc(env->num_agents, sizeof(Agent)); env->vision_window = 2 * env->vision + 1; - env->obs_size = env->vision_window * env->vision_window + 1; + env->obs_size = env->vision_window * env->vision_window;// + 1; env->log = (Log *)calloc(1, sizeof(Log)); env->interactive_food_agent_count = (uint8_t *)calloc((env->width * env->height + 7) / 8, sizeof(uint8_t)); @@ -174,7 +174,7 @@ void init_ccpr(CCpr *env) { void allocate_ccpr(CCpr *env) { // Called by C stuff - int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1) + 1; + int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); //+ 1; env->observations = (unsigned char *)calloc(env->num_agents * obs_size, sizeof(unsigned char)); env->actions = (int *)calloc(env->num_agents, sizeof(unsigned int)); @@ -221,10 +221,10 @@ void add_food(CCpr *env, int grid_idx, int food_type) { void reward_agent(CCpr *env, int agent_id, float reward) { // We don't reward if agent is full life - Agent *agent = &env->agents[agent_id]; - if (agent->hp >= MAX_HP) { - return; - } + // Agent *agent = &env->agents[agent_id]; + // if (agent->hp >= MAX_HP) { + // return; + // } env->rewards[agent_id] += reward; } @@ -326,7 +326,7 @@ void compute_observations(CCpr *env) { // For partial obs for (int i = 0; i < env->num_agents; i++) { Agent *agent = &env->agents[i]; - env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; + // env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; if (agent->hp == 0) { continue; } @@ -553,7 +553,7 @@ void step_agent(CCpr *env, int i) { break; case EMPTY: env->log->score += LOG_SCORE_REWARD_MOVE; - env->rewards[i] += env->reward_move; + env->rewards[i] = env->reward_move; break; } @@ -633,12 +633,6 @@ void c_step(CCpr *env) { if (alive_agents == 0) { memset(env->terminals, 1, env->num_agents * sizeof(unsigned char)); } else { - //Agents get rewarded for going all the way to the end - for (int i = 0; i < env->num_agents; i++) { - if (env->agents[i].hp > 0) { - env->rewards[i] += 1; - } - } memset(env->truncations, 1, env->num_agents * sizeof(unsigned char)); } } diff --git a/pufferlib/ocean/cpr/cpr.py b/pufferlib/ocean/cpr/cpr.py index 02a8ae01df..382936be13 100644 --- a/pufferlib/ocean/cpr/cpr.py +++ b/pufferlib/ocean/cpr/cpr.py @@ -23,7 +23,7 @@ def __init__(self, heights = num_envs*heights num_agents = num_envs*num_agents - self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1)+1,), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1),), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = sum(num_agents) From 952689c925d6ca55ba3a6d6b04eb0dd345ba7f5b Mon Sep 17 00:00:00 2001 From: mx2000 Date: Sat, 10 May 2025 18:18:49 +0200 Subject: [PATCH 8/8] clean cpr commit --- pufferlib/sweep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index 49c2494a8d..ce9202603f 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -740,4 +740,5 @@ def observe(self, hypers, score, cost, is_failure=False): cost=cost, is_failure=is_failure, ) - ) \ No newline at end of file + ) + \ No newline at end of file