diff --git a/pufferlib/ocean/go/binding.c b/pufferlib/ocean/go/binding.c new file mode 100644 index 0000000000..272c0ba415 --- /dev/null +++ b/pufferlib/ocean/go/binding.c @@ -0,0 +1,33 @@ +#include "go.h" +#define Env CGo +#include "../env_binding.h" + +static int my_init(Env* env, PyObject* args, PyObject* kwargs) { + env->width = unpack(kwargs, "width"); + env->height = unpack(kwargs, "height"); + env->grid_size = unpack(kwargs, "grid_size"); + env->board_width = unpack(kwargs, "board_width"); + env->board_height = unpack(kwargs, "board_height"); + env->grid_square_size = unpack(kwargs, "grid_square_size"); + env->moves_made = unpack(kwargs, "moves_made"); + env->komi = unpack(kwargs, "komi"); + env->score = unpack(kwargs, "score"); + env->last_capture_position = unpack(kwargs, "last_capture_position"); + env->reward_move_pass = unpack(kwargs, "reward_move_pass"); + env->reward_move_invalid = unpack(kwargs, "reward_move_invalid"); + env->reward_move_valid = unpack(kwargs, "reward_move_valid"); + env->reward_player_capture = unpack(kwargs, "reward_player_capture"); + env->reward_opponent_capture = unpack(kwargs, "reward_opponent_capture"); + + init(env); + return 0; +} + +static int my_log(PyObject* dict, Log* log) { + assign_to_dict(dict, "perf", log->perf); + assign_to_dict(dict, "score", log->score); + assign_to_dict(dict, "episode_length", log->episode_length); + assign_to_dict(dict, "episode_return", log->episode_return); + assign_to_dict(dict, "n", log->n); + return 0; +} diff --git a/pufferlib/ocean/go/cy_go.pyx b/pufferlib/ocean/go/cy_go.pyx deleted file mode 100644 index da88e91942..0000000000 --- a/pufferlib/ocean/go/cy_go.pyx +++ /dev/null @@ -1,145 +0,0 @@ -cimport numpy as cnp -from libc.stdlib cimport calloc, free -import os - -cdef extern from "go.h": - - int LOG_BUFFER_SIZE - - ctypedef struct Log: - float perf - float score - float episode_return - float episode_length - float n - - ctypedef struct LogBuffer: - Log* logs - int length - int idx - LogBuffer* allocate_logbuffer(int) - void free_logbuffer(LogBuffer*) - Log aggregate_and_clear(LogBuffer*) - - ctypedef struct Group: - int parent - int rank - int size - int liberties - - int find(Group*) - void union_groups(Group*, int, int) - - ctypedef struct CGo: - float* observations - int* actions - float* rewards - unsigned char* dones - LogBuffer* log_buffer - Log log - float score - int width - int height - int* board_x - int* board_y - int board_width - int board_height - int grid_square_size - int grid_size - int* board_states - int* previous_board_state - int last_capture_position - int* temp_board_states - int moves_made - int* capture_count - float komi - int* visited - Group* groups - Group* temp_groups - float reward_move_pass - float reward_move_invalid - float reward_move_valid - float reward_player_capture - float reward_opponent_capture - - ctypedef struct Client - - void init(CGo* env) - void free_initialized(CGo* env) - void c_reset(CGo* env) - void c_step(CGo* env) - - Client* make_client(float width, float height) - void close_client(Client* client) - void c_render(Client* client, CGo* env) - - -cdef class CyGo: - cdef: - CGo* envs - Client* client - LogBuffer* logs - int num_envs - def __init__(self, float[:, :] observations, int[:] actions, - float[:] rewards, unsigned char[:] terminals, int num_envs, - int width, int height, int grid_size, int board_width, int board_height, - int grid_square_size, int moves_made, float komi, - float score, int last_capture_position, float reward_move_pass, - float reward_move_invalid, float reward_move_valid, float reward_player_capture, float reward_opponent_capture ): - - self.num_envs = num_envs - self.client = NULL - self.envs = calloc(num_envs, sizeof(CGo)) - self.logs = allocate_logbuffer(LOG_BUFFER_SIZE) - - cdef int i - for i in range(num_envs): - self.envs[i] = CGo( - observations=&observations[i, 0], - actions=&actions[i], - rewards=&rewards[i], - dones=&terminals[i], - log_buffer=self.logs, - width=width, - height=height, - grid_size=grid_size, - board_width=board_width, - board_height=board_height, - grid_square_size=grid_square_size, - moves_made=moves_made, - komi=komi, - score=score, - last_capture_position=last_capture_position, - reward_move_pass=reward_move_pass, - reward_move_invalid=reward_move_invalid, - reward_move_valid=reward_move_valid - ) - init(&self.envs[i]) - self.client = NULL - - def reset(self): - cdef int i - for i in range(self.num_envs): - c_reset(&self.envs[i]) - - def step(self): - cdef int i - for i in range(self.num_envs): - c_step(&self.envs[i]) - - def render(self): - cdef CGo* env = &self.envs[0] - if self.client == NULL: - self.client = make_client(env.width,env.height) - - c_render(self.client, &self.envs[0]) - - def close(self): - if self.client != NULL: - close_client(self.client) - self.client = NULL - free(self.envs) - - def log(self): - cdef Log log = aggregate_and_clear(self.logs) - return log diff --git a/pufferlib/ocean/go/go.h b/pufferlib/ocean/go/go.h index 8be8d6a058..c743f8a263 100644 --- a/pufferlib/ocean/go/go.h +++ b/pufferlib/ocean/go/go.h @@ -13,7 +13,6 @@ #define PLAYER_WIN 1 static const int DIRECTIONS[NUM_DIRECTIONS][2] = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}}; // LD_LIBRARY_PATH=raylib/lib ./go -#define LOG_BUFFER_SIZE 1024 typedef struct Log Log; struct Log { @@ -24,55 +23,6 @@ struct Log { float n; }; -typedef struct LogBuffer LogBuffer; -struct LogBuffer { - Log* logs; - int length; - int idx; -}; - -LogBuffer* allocate_logbuffer(int size) { - LogBuffer* logs = (LogBuffer*)calloc(1, sizeof(LogBuffer)); - logs->logs = (Log*)calloc(size, sizeof(Log)); - logs->length = size; - logs->idx = 0; - return logs; -} - -void free_logbuffer(LogBuffer* buffer) { - free(buffer->logs); - free(buffer); -} - -void add_log(LogBuffer* logs, Log* log) { - if (logs->idx == logs->length) { - return; - } - logs->logs[logs->idx] = *log; - logs->idx += 1; - //printf("Log: %f, %f, %f\n", log->episode_return, log->episode_length, log->score); -} - -Log aggregate_and_clear(LogBuffer* logs) { - Log log = {0}; - if (logs->idx == 0) { - return log; - } - for (int i = 0; i < logs->idx; i++) { - log.episode_return += logs->logs[i].episode_return; - log.episode_length += logs->logs[i].episode_length; - log.n += logs->logs[i].n; - log.score += logs->logs[i].score; - log.perf += logs->logs[i].perf; - } - log.episode_return /= logs->idx; - log.episode_length /= logs->idx; - log.score /= logs->idx; - log.perf /= logs->idx; - logs->idx = 0; - return log; -} - typedef struct Group Group; struct Group { int parent; @@ -114,8 +64,7 @@ struct CGo { float* observations; int* actions; float* rewards; - unsigned char* dones; - LogBuffer* log_buffer; + unsigned char* terminals; Log log; float score; int width; @@ -141,8 +90,31 @@ struct CGo { float reward_move_valid; float reward_player_capture; float reward_opponent_capture; + float tick; }; +void add_log(CGo* env) { + env->log.episode_length += env->tick; + + // Calculate perf as a win rate (1.0 if win, 0.0 if loss) + float win_value = 0.0; + if (env->score > 0) { + win_value = 1.0; // Win + } + else if (env->score < 0) { + win_value = 0.0; // Loss + } + else { + win_value = 0.0; // Tie + } + + env->log.perf = (env->log.perf * env->log.n + win_value) / (env->log.n + 1.0); + + env->log.score += env->score; + env->log.episode_return += env->rewards[0]; + env->log.n += 1.0; +} + void generate_board_positions(CGo* env) { for (int i = 0; i < (env->grid_size-1) * (env->grid_size-1); i++) { int row = i / (env->grid_size-1); @@ -182,8 +154,7 @@ void allocate(CGo* env) { env->observations = (float*)calloc((env->grid_size)*(env->grid_size)*2 + 2, sizeof(float)); env->actions = (int*)calloc(1, sizeof(int)); env->rewards = (float*)calloc(1, sizeof(float)); - env->dones = (unsigned char*)calloc(1, sizeof(unsigned char)); - env->log_buffer = allocate_logbuffer(LOG_BUFFER_SIZE); + env->terminals = (unsigned char*)calloc(1, sizeof(unsigned char)); } void free_initialized(CGo* env) { @@ -201,9 +172,8 @@ void free_initialized(CGo* env) { void free_allocated(CGo* env) { free(env->actions); free(env->observations); - free(env->dones); + free(env->terminals); free(env->rewards); - free_logbuffer(env->log_buffer); free_initialized(env); } @@ -547,7 +517,7 @@ void enemy_random_move(CGo* env){ } } // If no move is possible, pass or end the game - env->dones[0] = 1; + env->terminals[0] = 1; } int find_group_liberty(CGo* env, int root){ @@ -649,8 +619,9 @@ void enemy_greedy_easy(CGo* env){ } void c_reset(CGo* env) { - env->log = (Log){0}; - env->dones[0] = 0; + env->tick = 0; + // We don't reset the log struct - leave it accumulating like in Pong + env->terminals[0] = 0; env->score = 0; for (int i = 0; i < (env->grid_size)*(env->grid_size); i++) { env->board_states[i] = 0; @@ -672,32 +643,26 @@ void c_reset(CGo* env) { void end_game(CGo* env){ compute_score_tromp_taylor(env); if (env->score > 0) { - env->rewards[0] = 1.0 ; - env->log.perf = 1.0; + env->rewards[0] = 1.0; } else if (env->score < 0) { env->rewards[0] = -1.0; - env->log.perf = 0.0; } else { env->rewards[0] = 0.0; - env->log.perf = 0.0; } - env->log.score = env->score; - env->log.n++; - env->log.episode_return += env->rewards[0]; - add_log(env->log_buffer, &env->log); + add_log(env); c_reset(env); } void c_step(CGo* env) { - env->log.episode_length += 1; + env->tick += 1; env->rewards[0] = 0.0; int action = (int)env->actions[0]; // useful for training , can prob be a hyper param. Recommend to increase with larger board size float max_moves = 3 * env->grid_size * env->grid_size; - if (env->log.episode_length > max_moves) { - env->dones[0] = 1; + if (env->tick > max_moves) { + env->terminals[0] = 1; end_game(env); compute_observations(env); return; @@ -706,7 +671,7 @@ void c_step(CGo* env) { env->rewards[0] = env->reward_move_pass; env->log.episode_return += env->reward_move_pass; enemy_greedy_hard(env); - if (env->dones[0] == 1) { + if (env->terminals[0] == 1) { end_game(env); return; } @@ -735,7 +700,7 @@ void c_step(CGo* env) { env->rewards[0] = -1; } - if (env->dones[0] == 1) { + if (env->terminals[0] == 1) { end_game(env); return; } @@ -767,7 +732,7 @@ Client* make_client(int width, int height) { return client; } -void c_render(Client* client, CGo* env) { +void c_render(CGo* env) { if (IsKeyDown(KEY_ESCAPE)) { exit(0); } diff --git a/pufferlib/ocean/go/go.py b/pufferlib/ocean/go/go.py index adfe71135b..06b87155d3 100644 --- a/pufferlib/ocean/go/go.py +++ b/pufferlib/ocean/go/go.py @@ -9,10 +9,10 @@ import gymnasium import pufferlib -from pufferlib.ocean.go.cy_go import CyGo +from pufferlib.ocean.go import binding class Go(pufferlib.PufferEnv): - def __init__(self, num_envs=1, render_mode=None, report_interval=1, + def __init__(self, num_envs=1, render_mode=None, log_interval=1, width=950, height=800, grid_size=7, board_width=600, board_height=600, @@ -31,8 +31,8 @@ def __init__(self, num_envs=1, render_mode=None, report_interval=1, # env self.num_agents = num_envs self.render_mode = render_mode - self.report_interval = report_interval - + self.log_interval = log_interval + self.tick = 0 self.num_obs = (grid_size) * (grid_size)*2 + 2 self.num_act = (grid_size) * (grid_size) + 1 self.single_observation_space = gymnasium.spaces.Box(low=0, high=1, @@ -41,37 +41,38 @@ def __init__(self, num_envs=1, render_mode=None, report_interval=1, super().__init__(buf=buf) height = 64*(grid_size+1) - self.c_envs = CyGo(self.observations, self.actions, self.rewards, - self.terminals, num_envs, width, height, grid_size, board_width, - board_height, grid_square_size, moves_made, komi, score, - last_capture_position, reward_move_pass, reward_move_invalid, - reward_move_valid, reward_player_capture, reward_opponent_capture) + self.c_envs = binding.vec_init(self.observations, self.actions, self.rewards, + self.terminals, self.truncations, num_envs, seed, width=width, height=height, grid_size=grid_size, + board_width=board_width, board_height=board_height, grid_square_size=grid_square_size, + moves_made=moves_made, komi=komi, score=score, last_capture_position=last_capture_position, + reward_move_pass=reward_move_pass, reward_move_invalid=reward_move_invalid, + reward_move_valid=reward_move_valid, reward_player_capture=reward_player_capture, + reward_opponent_capture=reward_opponent_capture) def reset(self, seed=None): - self.c_envs.reset() + binding.vec_reset(self.c_envs, seed) self.tick = 0 return self.observations, [] def step(self, actions): self.actions[:] = actions - self.c_envs.step() + binding.vec_step(self.c_envs) self.tick += 1 info = [] - if self.tick % self.report_interval == 0: - log = self.c_envs.log() - if log['episode_length'] > 0: - info.append(log) + if self.tick % self.log_interval == 0: + info.append(binding.vec_log(self.c_envs)) + return (self.observations, self.rewards, self.terminals, self.truncations, info) def render(self): - self.c_envs.render() + binding.vec_render(self.c_envs, 0) def close(self): - self.c_envs.close() + binding.vec_close(self.c_envs) def test_performance(timeout=10, atn_cache=1024): - num_envs=1000; + num_envs=1000 env = Go(num_envs=num_envs) env.reset() tick = 0 diff --git a/setup.py b/setup.py index d4d922c335..af74569466 100644 --- a/setup.py +++ b/setup.py @@ -276,7 +276,7 @@ 'pufferlib/ocean/connect4/cy_connect4', #'pufferlib/ocean/grid/cy_grid', 'pufferlib/ocean/tripletriad/cy_tripletriad', - 'pufferlib/ocean/go/cy_go', + # 'pufferlib/ocean/go/cy_go', 'pufferlib/ocean/rware/cy_rware', 'pufferlib/ocean/trash_pickup/cy_trash_pickup', 'pufferlib/ocean/cpr/cy_cpr', @@ -313,7 +313,9 @@ #c_args = ['-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION', '-DPLATFORM_DESKTOP', '-O2'] #c_args += "-Wsign-compare -DNDEBUG -g -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC".split() -pure_c_extensions = ['squared', 'pong', 'breakout', 'enduro', 'blastar', 'grid', 'nmmo3', 'tactical', 'cartpole'] + +pure_c_extensions = ['squared', 'pong', 'breakout', 'enduro', 'blastar', 'grid', 'nmmo3', 'tactical', 'go', 'cartpole'] + extensions += [ Extension( f'pufferlib.ocean.{name}.binding',