diff --git a/config/ocean/cpr.ini b/config/ocean/cpr.ini index 349b87d19a..ee33760075 100644 --- a/config/ocean/cpr.ini +++ b/config/ocean/cpr.ini @@ -5,16 +5,63 @@ vec = multiprocessing rnn_name = Recurrent [env] -num_envs = 256 +num_envs = 512 vision = 3 -widths = [32] -heights = [32] -num_agents = [8] +num_agents = [12] report_interval=1 -reward_food = 1.0 -interactive_food_reward = 5.0 -reward_move = -0.01 +reward_food = 0.1 +interactive_food_reward = 0.2 +reward_move = +0.00 food_base_spawn_rate = 2e-3 [train] -total_timesteps = 100_000_000 +num_envs = 1 +num_workers = 1 +total_timesteps = 60_000_000 +device = cpu +batch_size = 32768 +minibatch_size = 8192 +bptt_horizon = 16 +checkpoint_interval = 200 +learning_rate = 0.0008524 +gamma = 0.9989 +gae_lambda = 0.99 +vf_coef = 1 +ent_coef = 0.01 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-12 +max_grad_norm = 0.5 +vf_clip_coef = 0.1 +update_epochs = 1 + +[workspace] +name = boxingbytes +project = pufferai + +[sweep.metric] +goal = maximize +name = score +min = -10 +max = 10 + +[sweep.env.reward_food] +distribution = log_normal +min = 0.0001 +max = 0.01 +mean = 0.001 +scale = auto + +[sweep.env.interactive_food_reward] +distribution = log_normal +min = 0.0001 +max = 0.02 +mean = 0.002 +scale = auto + +[sweep.train.total_timesteps] +distribution = log_normal +min = 50e6 +max = 75e6 +mean = 60e6 +scale = time \ No newline at end of file diff --git a/pufferlib/ocean/cpr/cpr.c b/pufferlib/ocean/cpr/cpr.c index 1c4f684452..f7896a9b60 100644 --- a/pufferlib/ocean/cpr/cpr.c +++ b/pufferlib/ocean/cpr/cpr.c @@ -3,16 +3,16 @@ #include int main() { - int width = 24; - int height = 24; + int width = 32; + int height = 32; int render_cell_size = 32; CCpr env = { - .num_agents = 8, + .num_agents = 1, .width = width, .height = height, - .vision = 2, + .vision = 3, .reward_food = 1.0f, .interactive_food_reward = 5.0f, .food_base_spawn_rate = 2e-3, @@ -21,9 +21,9 @@ int main() { c_reset(&env); Renderer *renderer = init_renderer(render_cell_size, width, height); - while (!WindowShouldClose()) { + c_render(renderer, &env); int st = 0; // User can take control of the first puffer if (IsKeyDown(KEY_LEFT_SHIFT)) { @@ -41,12 +41,10 @@ int main() { sleep(2); } for (int i = st; i < env.num_agents; i++) { - env.actions[i] = rand() % 4; + env.actions[i] = rand() % 5; // printf("Agent %d gets actions %d\n", i, env->actions[i]); } c_step(&env); - - c_render(renderer, &env); } close_renderer(renderer); free_CCpr(&env); diff --git a/pufferlib/ocean/cpr/cpr.h b/pufferlib/ocean/cpr/cpr.h index 5db37c4c9f..9509027374 100644 --- a/pufferlib/ocean/cpr/cpr.h +++ b/pufferlib/ocean/cpr/cpr.h @@ -9,6 +9,8 @@ #include "raylib.h" +#include "grid.h" + #define EMPTY 0 #define NORMAL_FOOD 1 #define INTERACTIVE_FOOD 2 @@ -23,6 +25,21 @@ #define CHECK_BIT(arr, i) (arr[(i) / 8] & (1 << ((i) % 8))) #define min(a, b) ((a) < (b) ? (a) : (b)) +#define REWARD_20_HP -0 +#define REWARD_80_HP 0 +#define REWARD_DEATH -1.0f + + +#define LOG_SCORE_REWARD_SMALL 1 +#define LOG_SCORE_REWARD_MEDIUM 5 +#define LOG_SCORE_REWARD_MOVE - 0.0 +#define LOG_SCORE_REWARD_DEATH -1 + +#define HP_REWARD_FOOD_MEDIUM 50 +#define HP_REWARD_FOOD_SMALL 20 +#define HP_LOSS_PER_STEP 1 +#define MAX_HP 100 + typedef struct Log Log; struct Log { float perf; @@ -30,6 +47,9 @@ struct Log { float episode_return; float episode_length; float moves; + float food_nb; + float agents_alive; + float alive_steps; float n; }; @@ -74,6 +94,9 @@ Log aggregate_and_clear(LogBuffer *logs) { log.episode_return += logs->logs[i].episode_return; log.episode_length += logs->logs[i].episode_length; log.moves += logs->logs[i].moves; + log.food_nb += logs->logs[i].food_nb; + log.agents_alive += logs->logs[i].agents_alive; + log.alive_steps += logs->logs[i].alive_steps; log.n += 1; } log.score /= logs->idx; @@ -81,6 +104,9 @@ Log aggregate_and_clear(LogBuffer *logs) { log.episode_return /= logs->idx; log.episode_length /= logs->idx; log.moves /= logs->idx; + log.food_nb /= logs->idx; + log.agents_alive /= logs->idx; + log.alive_steps /= logs->idx; log.n /= logs->idx; logs->idx = 0; return log; @@ -91,6 +117,8 @@ struct Agent { int r; int c; int id; + float hp; + int direction; }; typedef struct FoodList FoodList; @@ -132,11 +160,13 @@ struct CCpr { int *actions; float *rewards; unsigned char *terminals; + unsigned char *truncations; + unsigned char *masks; Agent *agents; LogBuffer *log_buffer; - Log *logs; + Log *log; uint8_t *interactive_food_agent_count; @@ -149,8 +179,8 @@ void init_ccpr(CCpr *env) { (unsigned char *)calloc(env->width * env->height, sizeof(unsigned char)); env->agents = (Agent *)calloc(env->num_agents, sizeof(Agent)); env->vision_window = 2 * env->vision + 1; - env->obs_size = env->vision_window * env->vision_window; - env->logs = (Log *)calloc(env->num_agents, sizeof(Log)); + env->obs_size = env->vision_window * env->vision_window;// + 1; + env->log = (Log *)calloc(1, sizeof(Log)); env->interactive_food_agent_count = (uint8_t *)calloc((env->width * env->height + 7) / 8, sizeof(uint8_t)); env->foods = allocate_foodlist(env->width * env->height); @@ -158,13 +188,15 @@ void init_ccpr(CCpr *env) { void allocate_ccpr(CCpr *env) { // Called by C stuff - int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); + int obs_size = (2 * env->vision + 1) * (2 * env->vision + 1); //+ 1; env->observations = (unsigned char *)calloc(env->num_agents * obs_size, sizeof(unsigned char)); env->actions = (int *)calloc(env->num_agents, sizeof(unsigned int)); env->rewards = (float *)calloc(env->num_agents, sizeof(float)); env->terminals = (unsigned char *)calloc(env->num_agents, sizeof(unsigned char)); + env->truncations = (unsigned char*)calloc(env->num_agents, sizeof(unsigned char)); + env->masks = (unsigned char *)calloc(env->num_agents, sizeof(unsigned char)); env->log_buffer = allocate_logbuffer(LOG_BUFFER_SIZE); init_ccpr(env); } @@ -180,8 +212,10 @@ void free_CCpr(CCpr *env) { free(env->actions); free(env->rewards); free(env->terminals); + free(env->truncations); + free(env->masks); free_logbuffer(env->log_buffer); - free(env->logs); + free(env->log); free(env->interactive_food_agent_count); free_foodlist(env->foods); } @@ -193,11 +227,21 @@ int get_agent_id_from_tile(int tile) { return tile - AGENTS; } void add_food(CCpr *env, int grid_idx, int food_type) { // Add food to the grid and the food_list at grid_idx + assert(env->grid[grid_idx] == EMPTY); env->grid[grid_idx] = food_type; FoodList *foods = env->foods; foods->indexes[foods->size++] = grid_idx; } +void reward_agent(CCpr *env, int agent_id, float reward) { + // We don't reward if agent is full life + // Agent *agent = &env->agents[agent_id]; + // if (agent->hp >= MAX_HP) { + // return; + // } + env->rewards[agent_id] += reward; +} + void spawn_food(CCpr *env, int food_type) { // Randomly spawns such food in the grid int idx, tile; @@ -232,24 +276,10 @@ void init_foods(CCpr *env) { int normal = available_tiles / (20 * normalizer); int interactive = available_tiles / (50 * normalizer); for (int i = 0; i < normal; i++) { - int idx, tile; - do { - int r = rand() % (env->height - 1); - int c = rand() % (env->width - 1); - idx = r * env->width + c; - tile = env->grid[idx]; - } while (tile != EMPTY); - add_food(env, idx, NORMAL_FOOD); + spawn_food(env, NORMAL_FOOD); } for (int i = 0; i < interactive; i++) { - int idx, tile; - do { - int r = rand() % (env->height - 1); - int c = rand() % (env->width - 1); - idx = r * env->width + c; - tile = env->grid[idx]; - } while (tile != EMPTY); - add_food(env, idx, INTERACTIVE_FOOD); + spawn_food(env, INTERACTIVE_FOOD); } } @@ -267,42 +297,53 @@ void spawn_foods(CCpr *env) { for (int ri = 0; ri < 3; ri++) { for (int ci = 0; ci < 3; ci++) { int grid_idx = grid_index(env, (r + ri), (c + ci)); - if (env->grid[grid_idx] == EMPTY) { - switch (env->grid[idx]) { - // %Chance spawning new food - case NORMAL_FOOD: - if ((rand() / (double)RAND_MAX) < env->food_base_spawn_rate) { - add_food(env, grid_idx, env->grid[idx]); - } - break; - case INTERACTIVE_FOOD: - if ((rand() / (double)RAND_MAX) < - (env->food_base_spawn_rate / 10.0)) { - add_food(env, grid_idx, env->grid[idx]); - } - break; + if (env->grid[grid_idx] != EMPTY) { + continue; + } + switch (env->grid[idx]) { + // %Chance spawning new food + case NORMAL_FOOD: + if ((rand() / (double)RAND_MAX) < env->food_base_spawn_rate) { + add_food(env, grid_idx, env->grid[idx]); + } + break; + case INTERACTIVE_FOOD: + if ((rand() / (double)RAND_MAX) < + (env->food_base_spawn_rate / 10.0)) { + add_food(env, grid_idx, env->grid[idx]); } + break; } } } } - // Each turn there is random probability for a food to spawn at a random - // location To cope with resource depletion - int normalizer = (env->width * env->height) / 576; - if ((rand() / (double)RAND_MAX) < - min((env->food_base_spawn_rate * 2 * normalizer), 1e-2)) { - spawn_food(env, NORMAL_FOOD); - } - if ((rand() / (double)RAND_MAX) < - min((env->food_base_spawn_rate / 5.0 * normalizer), 5e-3)) { - spawn_food(env, INTERACTIVE_FOOD); - } + // // Each turn there is random probability for a food to spawn at a random + // // location To cope with resource depletion + // int normalizer = (env->width * env->height) / 576; + // if ((rand() / (double)RAND_MAX) < + // min((env->food_base_spawn_rate * 2 * normalizer), 1e-2)) { + // spawn_food(env, NORMAL_FOOD); + // } + // if ((rand() / (double)RAND_MAX) < + // min((env->food_base_spawn_rate / 5.0 * normalizer), 5e-3)) { + // spawn_food(env, INTERACTIVE_FOOD); + // } } void compute_observations(CCpr *env) { + // For full obs + // memcpy(env->observations, env->grid, + // env->width * env->height * sizeof(unsigned char)); + // return; + + // For partial obs for (int i = 0; i < env->num_agents; i++) { Agent *agent = &env->agents[i]; + // env->observations[env->vision_window*env->vision_window + i*env->obs_size] = agent->hp; + if (agent->hp == 0) { + continue; + } int obs_offset = i * env->obs_size; int r_offset = agent->r - env->vision; int c_offset = agent->c - env->vision; @@ -313,75 +354,111 @@ void compute_observations(CCpr *env) { env->observations[obs_idx] = env->grid[grid_idx]; } } + + } } -void c_reset(CCpr *env) { - env->tick = 0; +void add_hp(CCpr *env, int agent_id, float hp) { + Agent *agent = &env->agents[agent_id]; + agent->hp += hp; + if (agent->hp > MAX_HP) { + agent->hp = MAX_HP; + } else if (agent->hp <= 0) { + agent->hp = 0; + env->log->score += LOG_SCORE_REWARD_DEATH; + env->rewards[agent->id] += REWARD_DEATH; + env->terminals[agent->id] = 1; + } +} +void remove_hp(CCpr *env, int agent_id, float hp) { add_hp(env, agent_id, -hp); } - for (int r = 0; r < env->height; r++) { - for (int c = 0; c < env->width; c++) { - int adr = grid_index(env, r, c); - env->grid[adr] = EMPTY; +void save_grid_to_file(CCpr *env, const char *filename) { + FILE *file = fopen(filename, "w"); + if (!file) { + perror("Failed to open file"); + return; } - } + fprintf(file, "#ifndef GRID_H\n#define GRID_H\n\n"); + fprintf(file, "#define GRID_HEIGHT %d\n", env->height); + fprintf(file, "#define GRID_WIDTH %d\n\n", env->width); + fprintf(file, "static const unsigned char grid[GRID_HEIGHT][GRID_WIDTH] = {\n"); + + for (int r = 0; r < env->height; r++) { + fprintf(file, " {"); + for (int c = 0; c < env->width; c++) { + unsigned char val = env->grid[r * env->width + c]; + fprintf(file, "0x%02X%s", val, (c == env->width - 1) ? "" : ", "); + } + fprintf(file, "}%s\n", (r == env->height - 1) ? "" : ","); + } + fprintf(file, "};\n\n#endif // GRID_H\n"); + fclose(file); +} - // Walls need to cover vision radius around the grid +void make_grid_from_scratch(CCpr *env){ + memset(env->grid, EMPTY, (env->height * env->width) * sizeof(env->grid[0])); + // top walling for (int r = 0; r < env->vision; r++) { - for (int c = 0; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->width * sizeof(env->grid[0])); } + // left side walling for (int r = 0; r < env->height; r++) { - for (int c = 0; c < env->vision; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->vision * sizeof(env->grid[0])); } + // bottom walling for (int r = env->height - env->vision; r < env->height; r++) { - for (int c = 0; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width), WALL, + env->width * sizeof(env->grid[0])); } + + // right side walling for (int r = 0; r < env->height; r++) { - for (int c = env->width - env->vision; c < env->width; c++) { - env->grid[r * env->width + c] = WALL; - } + memset(env->grid + (r * env->width) + (env->width - env->vision), WALL, + env->vision * sizeof(env->grid[0])); } + save_grid_to_file(env, "grid.h"); +} - // Agents - srand(time(NULL)); - for (int i = 0; i < env->num_agents; i++) { - env->logs[i] = (Log){0}; - - Agent *agent = &env->agents[i]; - - agent->id = i; - - int adr = 0; - bool allocated = false; - - // Random allocation - while (!allocated) { - - adr = rand() % (env->height * env->width); - - if (env->grid[adr] == EMPTY) { - int r = adr / env->width; - int c = adr % env->width; - agent->r = r; - agent->c = c; - allocated = true; - } +void spawn_agent(CCpr *env, int i){ + Agent *agent = &env->agents[i]; + agent->id = i; + agent->hp = 80; + int adr = 0; + + bool allocated = false; + while (!allocated) { + adr = rand() % (env->height * env->width); + if (env->grid[adr] == EMPTY) { + int r = adr / env->width; + int c = adr % env->width; + agent->r = r; + agent->c = c; + allocated = true; } + } + assert(env->grid[adr] == EMPTY); + env->grid[adr] = get_agent_tile_from_id(agent->id); +} +void c_reset(CCpr *env) { + env->tick = 0; + memset(env->log, 0, sizeof(Log)); + env->foods->size = 0; + memset(env->foods->indexes, 0, env->width * env->height * sizeof(int)); + // make_grid_from_scratch(env); + memcpy(env->grid, grid_32_32_3v, env->width * env->height * sizeof(unsigned char)); - assert(env->grid[adr] == EMPTY); - - env->grid[adr] = get_agent_tile_from_id(agent->id); + for (int i = 0; i < env->num_agents; i++) { + spawn_agent(env, i); } init_foods(env); - + memset(env->observations, 0, env->num_agents * env->obs_size * sizeof(unsigned char)); + memset(env->truncations, 0, env->num_agents * sizeof(unsigned char)); memset(env->terminals, 0, env->num_agents * sizeof(unsigned char)); + memset(env->masks, 1, env->num_agents * sizeof(unsigned char)); compute_observations(env); } @@ -397,23 +474,20 @@ void reward_agents_near(CCpr *env, int food_index) { if ((ac == food_c && (ar == food_r - 1 || ar == food_r + 1)) || (ar == food_r && (ac == food_c - 1 || ac == food_c + 1))) { - env->rewards[i] += env->interactive_food_reward; - env->logs[i].score += 1; - add_log(env->log_buffer, &env->logs[i]); - env->logs[i] = (Log){0}; + reward_agent(env, i, env->interactive_food_reward); + env->log->score += LOG_SCORE_REWARD_MEDIUM; + add_hp(env, i, HP_REWARD_FOOD_MEDIUM); + // add_log(env->log_buffer, env->log); + // memset(env->log, 0, sizeof(Log)); } } - - // Empty grid cell - // env->grid[food_index] = EMPTY; remove_food(env, food_index); - - // Spawn new interactive food - // spawn_interactive_food(env); } void step_agent(CCpr *env, int i) { + Agent *agent = &env->agents[i]; + int action = env->actions[i]; env->logs[i].episode_length += 1; @@ -423,23 +497,26 @@ void step_agent(CCpr *env, int i) { switch (action) { case 0: dr = -1; + agent->direction = 3; break; // UP case 1: dr = 1; + agent->direction = 1; break; // DOWN case 2: dc = -1; + agent->direction = 2; break; // LEFT case 3: dc = 1; + agent->direction = 0; break; // RIGHT + case 4: + return; // No moves } - - if (action != 4) - env->logs[i].moves += 1; + env->log->moves += 1; // Get next row and column - Agent *agent = &env->agents[i]; int next_r = agent->r + dr; int next_c = agent->c + dc; @@ -450,7 +527,7 @@ void step_agent(CCpr *env, int i) { // Anything above should be obstacle if (tile >= INTERACTIVE_FOOD) { - env->logs[i].score += env->reward_move; + env->log->score += LOG_SCORE_REWARD_MOVE; env->rewards[i] += env->reward_move; next_r = agent->r; next_c = agent->c; @@ -482,15 +559,15 @@ void step_agent(CCpr *env, int i) { switch (tile) { case NORMAL_FOOD: - env->logs[i].score += 1.0; - env->rewards[i] = env->reward_food; - // spawn_food(env); + env->log->score += LOG_SCORE_REWARD_SMALL; + reward_agent(env, i, env->reward_food); + add_hp(env, i, HP_REWARD_FOOD_SMALL); remove_food(env, next_grid_idx); - add_log(env->log_buffer, &env->logs[i]); - env->logs[i] = (Log){0}; + // add_log(env->log_buffer, env->log); + // memset(env->log, 0, sizeof(Log)); break; case EMPTY: - env->logs[i].score += env->reward_move; + env->log->score += LOG_SCORE_REWARD_MOVE; env->rewards[i] = env->reward_move; break; } @@ -501,21 +578,81 @@ void step_agent(CCpr *env, int i) { env->grid[next_grid_idx] = agent_tile; agent->r = next_r; agent->c = next_c; + + return; +} + +void clear_agent(CCpr *env, int agent_id) { + Agent *agent = &env->agents[agent_id]; + if (agent->r < 0 || agent->c < 0) { + return; + } + int grid_idx = grid_index(env, agent->r, agent->c); + env->grid[grid_idx] = EMPTY; + agent->r = -1; + agent->c = -1; } void c_step(CCpr *env) { + env->tick++; + // memset(env->truncations, 0, env->num_agents * sizeof(unsigned char)); memset(env->rewards, 0, env->num_agents * sizeof(float)); + // memset(env->terminals, 0, env->num_agents * sizeof(unsigned char)); memset(env->interactive_food_agent_count, 0, (env->width * env->height + 7) / 8); for (int i = 0; i < env->num_agents; i++) { + if (env->agents[i].hp == 0) { + env->masks[i] = 0; + clear_agent(env, i); + continue; + } step_agent(env, i); + remove_hp(env, i, HP_LOSS_PER_STEP); } spawn_foods(env); + //We loop again here because in the future an entity might have attacked an agent in the process + int alive_agents = 0; + for (int i = 0; i < env->num_agents; i++) { + if (env->agents[i].hp > 0) { + alive_agents += 1; + if (env->agents[i].hp < 20) { + env->rewards[i] += REWARD_20_HP; + env->log->score += REWARD_20_HP; + } else if (env->agents[i].hp > 80) { + env->rewards[i] += REWARD_80_HP; + env->log->score += REWARD_80_HP; + } + } + // else { + // int grid_idx = grid_index(env, env->agents[i].r, env->agents[i].c); + // env->grid[grid_idx] = EMPTY; + // spawn_agent(env, i); + // } + } + if (alive_agents == 0) { + env->log->moves = 0; + }else{ + env->log->moves /= alive_agents; + } + env->log->food_nb = env->foods->size; + env->log->agents_alive = alive_agents; + env->log->alive_steps = env->tick; compute_observations(env); + + if (alive_agents == 0|| env->tick > 1000) { + c_reset(env); + if (alive_agents == 0) { + memset(env->terminals, 1, env->num_agents * sizeof(unsigned char)); + } else { + memset(env->truncations, 1, env->num_agents * sizeof(unsigned char)); + } + } + add_log(env->log_buffer, env->log); + memset(env->log, 0, sizeof(Log)); } // Raylib client @@ -536,12 +673,13 @@ Rectangle UV_COORDS[7] = { (Rectangle){384, 0, 128, 128}, }; -typedef struct { +typedef struct Renderer Renderer; +struct Renderer { int cell_size; int width; int height; Texture2D puffer; -} Renderer; +}; Renderer *init_renderer(int cell_size, int width, int height) { Renderer *renderer = (Renderer *)calloc(1, sizeof(Renderer)); @@ -581,15 +719,20 @@ void c_render(Renderer *renderer, CCpr *env) { } else if (tile == NORMAL_FOOD || tile == INTERACTIVE_FOOD) { DrawRectangle(c * ts, r * ts, ts, ts, COLORS[tile]); } else { - int u = 128 * (tile % 8); - int v = 128 * (tile / 8); - Rectangle source_rect = (Rectangle){u, v, 128, 128}; - Rectangle dest_rect = (Rectangle){c * ts, r * ts, ts, ts}; + int agent_id = get_agent_id_from_tile(tile); int col_id = agent_id % (sizeof(COLORS) / sizeof(COLORS[0])); Color color = COLORS[col_id]; + int starting_sprite_x = 0; + float rotation = env->agents[agent_id].direction * 90.0f; + if (rotation == 180) { + starting_sprite_x = 128; + rotation = 0; + } + Rectangle source_rect = (Rectangle){starting_sprite_x, 0, 128, 128}; + Rectangle dest_rect = (Rectangle){c * ts + ts/2, r * ts + ts/2, ts, ts}; DrawTexturePro(renderer->puffer, source_rect, dest_rect, - (Vector2){0, 0}, 0, color); + (Vector2){ts/2, ts/2}, rotation, color); } } } diff --git a/pufferlib/ocean/cpr/cpr.py b/pufferlib/ocean/cpr/cpr.py index 0d15d1eb43..a651b85953 100644 --- a/pufferlib/ocean/cpr/cpr.py +++ b/pufferlib/ocean/cpr/cpr.py @@ -6,16 +6,16 @@ class PyCPR(pufferlib.PufferEnv): def __init__(self, - num_envs = 1, + num_envs=1, widths=[32], heights=[32], num_agents=[8], - vision=2, + vision=3, reward_food=1.0, interactive_food_reward=5.0, reward_move=-0.01, food_base_spawn_rate=2e-3, - report_interval=250, + report_interval=1, render_mode=None, buf=None, seed=0, @@ -24,7 +24,7 @@ def __init__(self, heights = num_envs*heights num_agents = num_envs*num_agents - self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1),(2*vision+1)), dtype=np.uint8) + self.single_observation_space = gymnasium.spaces.Box(low=0, high=255, shape=((2*vision+1)*(2*vision+1),), dtype=np.uint8) self.single_action_space = gymnasium.spaces.Discrete(5) self.render_mode = render_mode self.num_agents = sum(num_agents) @@ -39,6 +39,8 @@ def __init__(self, self.actions, self.rewards, self.terminals, + self.truncations, + self.masks, widths, heights, num_agents, @@ -79,13 +81,16 @@ def close(self): timeout=30 tot_agents = env.num_agents - actions = np.random.randint(0,4,(1024,tot_agents)) + actions = np.random.randint(0,5,(1024,tot_agents)) import time start = time.time() - while time.time() - start < timeout: + # while time.time() - start < timeout: + while tick < 500: atns = actions[tick % 1024] env.step(atns) + if -1 in env.rewards: + breakpoint() # env.render() tick += 1 diff --git a/pufferlib/ocean/cpr/cy_cpr.pyx b/pufferlib/ocean/cpr/cy_cpr.pyx index ce457e1893..4aa9bf0e04 100644 --- a/pufferlib/ocean/cpr/cy_cpr.pyx +++ b/pufferlib/ocean/cpr/cy_cpr.pyx @@ -18,6 +18,9 @@ cdef extern from "cpr.h": float episode_return float episode_length float moves + float food_nb + float agents_alive + float alive_steps float n ctypedef struct LogBuffer: @@ -33,6 +36,8 @@ cdef extern from "cpr.h": int r int c int id + int hp + int direction ctypedef struct FoodList: int *indexes @@ -57,11 +62,13 @@ cdef extern from "cpr.h": int *actions float *rewards unsigned char *terminals + unsigned char *masks + unsigned char *truncations Agent *agents LogBuffer *log_buffer - Log *logs + Log *log uint8_t *interactive_food_agent_count float interactive_food_reward @@ -90,9 +97,12 @@ cdef class CyEnv: LogBuffer *logs int num_envs - def __init__(self, unsigned char[:,:,:] observations, int[:] actions, float[:] rewards, unsigned char[:] terminals, - list widths, list heights, list num_agents,int vision, - float reward_food,float interactive_food_reward,float reward_move, float food_base_spawn_rate) -> None: + def __init__( + self, unsigned char[:,:] observations, int[:] actions, float[:] rewards, + unsigned char[:] terminals, unsigned char[:] truncations, unsigned char[:] masks, + list widths, list heights, list num_agents,int vision, + float reward_food,float interactive_food_reward,float reward_move, float food_base_spawn_rate + ) -> None: self.num_envs = len(num_agents) self.envs = calloc(self.num_envs, sizeof(CCpr)) self.logs = allocate_logbuffer(LOG_BUFFER_SIZE) @@ -101,10 +111,12 @@ cdef class CyEnv: cdef int n = 0 for i in range(self.num_envs): self.envs[i] = CCpr( - observations = &observations[n,0,0], + observations = &observations[n,0], actions=&actions[n], rewards=&rewards[n], terminals=&terminals[n], + truncations=&truncations[n], + masks=&masks[n], log_buffer=self.logs, width=widths[i], height=heights[i], diff --git a/pufferlib/ocean/cpr/grid.h b/pufferlib/ocean/cpr/grid.h new file mode 100644 index 0000000000..7682999664 --- /dev/null +++ b/pufferlib/ocean/cpr/grid.h @@ -0,0 +1,42 @@ +#ifndef GRID_H +#define GRID_H + +#define GRID_HEIGHT 32 +#define GRID_WIDTH 32 + +static const unsigned char grid_32_32_3v[GRID_HEIGHT][GRID_WIDTH] = { + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03}, + {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03} +}; + +#endif // GRID_H diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index cac9b0ef61..49c2494a8d 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -740,4 +740,4 @@ def observe(self, hypers, score, cost, is_failure=False): cost=cost, is_failure=is_failure, ) - ) + ) \ No newline at end of file