diff --git a/docs/rating_and_ranking.md b/docs/rating_and_ranking.md
index dea1c43..6bca0b6 100644
--- a/docs/rating_and_ranking.md
+++ b/docs/rating_and_ranking.md
@@ -27,13 +27,26 @@ Calculates the rating based on a dictionary of performances and the ID of the fi
 ### Example
 ```python
 performances = {
-    "ahc001": 2000,
-    "ahc002": 2200,
-    "ahc003": 1800
+    "ahc008": 1189,
+    "ahc011": 1652,
+    "ahc015": 2446,
+    "ahc016": 1457,
+    "ahc024": 1980,
+    "ahc025": 1331,
+    "ahc026": 1965,
+    "ahc027": 1740,
+    "ahc039": 2880,
+    "ahc046": 2153,
 }
-# Assuming ahc003 is the latest contest to consider for this rating calculation
-final_rating = rating_calculator.calculate_rating(performances, "ahc003")
+
+average_performance = sum(performances.values()) / len(performances)
+print(f"Average Performance: {average_performance}")
+# Average Performance: 1879.3
+
+# "ahc046" is the latest contest in ALE-Bench. Always set final_contest to this value when reporting the rating.
+final_rating = rating_calculator.calculate_rating(performances, "ahc046")
 print(f"Calculated Rating: {final_rating}")
+# Calculated Rating: 2222
 ```
 
 ## `RankingCalculator`
@@ -44,8 +57,8 @@ The `RankingCalculator` class allows you to determine a user's rank based on the
 from ale_bench.data import RankingCalculator
 
 # Initialize with a minimum number of contest participations to be included in the ranking pool
-# (default is 5)
-ranking_calculator = RankingCalculator(minimum_participation=5)
+ranking_calculator = RankingCalculator()  # The default minimum_participation is 5
+ranking_calculator = RankingCalculator(minimum_participation=10)  # Example with a custom minimum_participation
 ```
 
 ### Core Methods
@@ -71,15 +84,41 @@ Calculates the rank based on an overall rating.
 *Returns:*
 - `int`: The calculated rank. Lower is better.
 
+---
+**`convert_rank_to_percentile`**
+
+Converts a rank to a percentile based on the number of active users in the ranking pool.
+
+*Parameters:*
+- `rank (int)`: The rank to convert.
+- `method (Literal["original", "hazen", "weibull"])`: The method to use for conversion. Defaults to `"weibull"`.
+  - `"original"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{\text{\#active\_users}}$, capped at 100% when $\text{rank} = \text{\#active\_users} + 1$
+  - `"hazen"`: $\text{percentile} = 100.0 \times \frac{(\text{rank} - 0.5)}{(\text{\#active\_users} + 1)}$
+  - `"weibull"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{(\text{\#active\_users} + 2)}$
+
+  > *Note:* The `"weibull"` method is recommended because it avoids the 0%/100% endpoints (exclusive percentiles) and is widely used in the literature. We chose `"weibull"` over `"hazen"` as the default because it stays slightly closer to the original percentile calculation for higher (smaller-numbered) ranks. The original paper uses the `"original"` method, but that mapping has less desirable statistical properties. Any of the methods is acceptable as long as the one used is documented.
+
+*Returns:*
+- `float`: The corresponding percentile.
+
 ### Example
 ```python
 # Example average performance and rating
-my_avg_performance = 2150.75
-my_rating = 2345
+my_avg_performance = 1879.3
+my_rating = 2222
+
+print("The number of active users in the ranking pool:", ranking_calculator.num_active_users)
+# The number of active users in the ranking pool: 2220
 
 avg_perf_rank = ranking_calculator.calculate_avg_perf_rank(my_avg_performance)
-rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
+avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank, "original")  # If you want to use the "original" method
+avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank)  # Using the default "weibull" method
+print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank} ({avg_perf_rank_percentile:.1f}%)")
+# Rank based on Average Performance (1879.3): 150 (6.8%)
 
-print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank}")
-print(f"Rank based on Rating ({my_rating}): {rating_rank}")
+rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
+rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank, "original")  # If you want to use the "original" method
+rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank)  # Using the default "weibull" method
+print(f"Rank based on Rating ({my_rating}): {rating_rank} ({rating_rank_percentile:.1f}%)")
+# Rank based on Rating (2222): 191 (8.6%)
 ```
diff --git a/src/ale_bench/data.py b/src/ale_bench/data.py
index b491927..af05230 100644
--- a/src/ale_bench/data.py
+++ b/src/ale_bench/data.py
@@ -80,7 +80,8 @@ def model_post_init(self, __context: Any) -> None:
             rank, performance = self.raw_data[i]
             next_rank, next_performance = self.raw_data[i + 1]
             if rank == next_rank:
-                assert performance == next_performance, "Something went wrong: `performance` != `next_performance`."
+                if performance != next_performance:
+                    raise RuntimeError("Something went wrong: `performance` != `next_performance`.")
                 continue  # NOTE: tie
             actual_rank = float((rank + next_rank - 1) / 2)  # NOTE: Use the average of the rank range for performance
             self.data[actual_rank] = performance
@@ -116,7 +117,7 @@ def get_performance(self, rank: int | float) -> int:
             else:
                 lose = mid
         if win == len(sorted_keys):
-            assert False, "Something went wrong: `win` should be less than `len(sorted_keys)`."
+            raise RuntimeError("Something went wrong: `win` should be less than `len(sorted_keys)`.")
         if win == 0:
             win = 1  # NOTE: to avoid index out of range and use the 1st & 2nd entries
         rank_high = sorted_keys[win - 1]
@@ -301,7 +302,8 @@ def model_post_init(self, __context: Any) -> None:
         for i in range(len(self.score_rank_list) - 1):
             _, _, end_rank = self.score_rank_list[i]
             _, next_start_rank, _ = self.score_rank_list[i + 1]
-            assert next_start_rank == end_rank + 1, "Something went wrong: `next_start_rank` != `end_rank + 1`."
+            if next_start_rank != end_rank + 1:
+                raise RuntimeError("Something went wrong: `next_start_rank` != `end_rank + 1`.")
 
     def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
         """Get the new rank for a given private result.
@@ -364,7 +366,7 @@ def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
                 return start_rank, float(start_rank), new_case_scores
             elif new_score == score:
                 return start_rank, float((start_rank + end_rank) / 2), new_case_scores  # NOTE: Average of the ranks
-        assert False, "Something went wrong in `get_new_rank` method."
+        raise RuntimeError("Something went wrong in `get_new_rank` method.")
 
 
 class ProblemMetaData(BaseModel):
@@ -761,6 +763,14 @@ def load_contest_schedule() -> dict[str, tuple[dt.datetime, dt.datetime, float]]
     }
 
 
+class RankPercentileMapMethod(str, Enum):
+    """Method for converting rank to percentile."""
+
+    ORIGINAL = "original"
+    HAZEN = "hazen"
+    WEIBULL = "weibull"
+
+
 class RankingCalculator:
     """Ranking calculator for ALE-Bench."""
 
@@ -781,7 +791,7 @@ def __init__(self, minimum_participation: int = 5) -> None:
         ranking_file_path = str(local_data_dir / "ranking.csv")
         df_ranking = pl.read_csv(ranking_file_path).filter(pl.col("competitions") >= minimum_participation)
         # Prepare member variables
-        num_active_users = len(df_ranking)
+        self.num_active_users = len(df_ranking)
         self.avg_perfs = df_ranking["avg_perf"].sort(descending=True).to_list()
         self.ratings = df_ranking["rating"].sort(descending=True).to_list()
         self.avg_perf_ranks, self.rating_ranks = [], []
@@ -798,9 +808,9 @@ def __init__(self, minimum_participation: int = 5) -> None:
             self.rating_ranks.append(current_rating_rank)
         # Append the last entry for the average performance and rating
         self.avg_perfs.append(-1000.0)
-        self.avg_perf_ranks.append(num_active_users + 1)
+        self.avg_perf_ranks.append(self.num_active_users + 1)
         self.ratings.append(0)
-        self.rating_ranks.append(num_active_users + 1)
+        self.rating_ranks.append(self.num_active_users + 1)
 
     def calculate_avg_perf_rank(self, avg_perf: float) -> int:
         """Calculate the rank based on the rating.
@@ -826,7 +836,7 @@ def calculate_avg_perf_rank(self, avg_perf: float) -> int:
             else:  # Exactly matched
                 return self.avg_perf_ranks[mid]
         if ok == len(self.avg_perfs):
-            assert False, "Something went wrong: `ok` should be less than `len(self.avg_perfs)`."
+            raise RuntimeError("Something went wrong: `ok` should be less than `len(self.avg_perfs)`.")
         return self.avg_perf_ranks[ok]
 
     def calculate_rating_rank(self, rating: int) -> int:
         """Calculate the rank based on the rating.
@@ -853,5 +863,41 @@ def calculate_rating_rank(self, rating: int) -> int:
             else:  # Exactly matched
                 return self.rating_ranks[mid]
         if ok == len(self.ratings):
-            assert False, "Something went wrong: `ok` should be less than `len(self.ratings)`."
+            raise RuntimeError("Something went wrong: `ok` should be less than `len(self.ratings)`.")
         return self.rating_ranks[ok]
+
+    def convert_rank_to_percentile(
+        self,
+        rank: int,
+        method: RankPercentileMapMethod | str = RankPercentileMapMethod.WEIBULL,
+    ) -> float:
+        """Convert the rank to percentile.
+
+        Args:
+            rank (int): The rank to convert.
+            method (RankPercentileMapMethod | str): The method to use for conversion. Defaults to "weibull".
+                "original": percentile = 100.0 * rank / num_active_users
+                "hazen": percentile = 100.0 * (rank - 0.5) / (num_active_users + 1)
+                "weibull": percentile = 100.0 * rank / (num_active_users + 2)
+
+        Returns:
+            float: The converted percentile.
+
+        Raises:
+            ValueError: If the rank is less than 1 or greater than the number of active users + 1.
+            ValueError: If the method is invalid.
+ """ + if rank < 1 or rank > self.num_active_users + 1: + raise ValueError(f"The rank must be between 1 and {self.num_active_users + 1} (the number of users + 1).") + try: + method = RankPercentileMapMethod(method) + except ValueError: + raise ValueError(f"Invalid method: {method}. Supported methods are 'original', 'hazen', and 'weibull'.") + if method == RankPercentileMapMethod.ORIGINAL: + if rank == self.num_active_users + 1: + return 100.0 # NOTE: The lowest rank is always 100.0% (avoid exceeding 100.0%) + return 100.0 * rank / self.num_active_users + elif method == RankPercentileMapMethod.HAZEN: + return 100.0 * (rank - 0.5) / (self.num_active_users + 1) + elif method == RankPercentileMapMethod.WEIBULL: + return 100.0 * rank / (self.num_active_users + 2) diff --git a/tests/test_data.py b/tests/test_data.py index 2401bd0..b73681d 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -119,7 +119,7 @@ def test_init( [(1, 3200), (2, 200)], 2.01, pytest.raises( - AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." + RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." ), 0, id="2rows_not_within_2nd", @@ -133,7 +133,7 @@ def test_init( [(1, 3200), (2, 2800), (3, 200)], 3.14, pytest.raises( - AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." + RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." ), 0, id="3rows_not_within_3rd", @@ -147,7 +147,7 @@ def test_init( [(1, 3200), (99, 2800), (100, 200)], 101, pytest.raises( - AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." + RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\." 
                 ),
                 0,
                 id="3rows_n100_not_within_100th",
@@ -1617,6 +1617,17 @@ class TestRankingCalculator:
     def ranking_calculator_instance(self) -> RankingCalculator:
         return RankingCalculator(minimum_participation=0)
 
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "minimum_participation,expected",
+        [
+            pytest.param(0, 6139, id="minimum_participation_0"),
+            pytest.param(5, 2220, id="minimum_participation_5"),
+        ],
+    )
+    def test_num_active_users(self, minimum_participation: int, expected: int) -> None:
+        assert RankingCalculator(minimum_participation=minimum_participation).num_active_users == expected
+
     @pytest.mark.slow
     @pytest.mark.parametrize(
         "rating,context,expected",
@@ -1644,3 +1655,84 @@ def test_calculate_rating_rank(
     ) -> None:
         with context:
             assert ranking_calculator_instance.calculate_rating_rank(rating) == expected
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "rank,method,context,expected",
+        [
+            pytest.param(1, "original", does_not_raise(), 100.0 * 1.0 / 6139, id="1st_original"),
+            pytest.param(1, "hazen", does_not_raise(), 100.0 * 0.5 / 6140, id="1st_hazen"),
+            pytest.param(1, "weibull", does_not_raise(), 100.0 * 1.0 / 6141, id="1st_weibull"),
+            pytest.param(150, "original", does_not_raise(), 100.0 * 150.0 / 6139, id="150th_original"),
+            pytest.param(150, "hazen", does_not_raise(), 100.0 * 149.5 / 6140, id="150th_hazen"),
+            pytest.param(150, "weibull", does_not_raise(), 100.0 * 150.0 / 6141, id="150th_weibull"),
+            pytest.param(6139, "original", does_not_raise(), 100.0, id="6139th_original"),
+            pytest.param(6139, "hazen", does_not_raise(), 100.0 * 6138.5 / 6140, id="6139th_hazen"),
+            pytest.param(6139, "weibull", does_not_raise(), 100.0 * 6139.0 / 6141, id="6139th_weibull"),
+            pytest.param(6140, "original", does_not_raise(), 100.0, id="6140th_original"),
+            pytest.param(6140, "hazen", does_not_raise(), 100.0 * 6139.5 / 6140, id="6140th_hazen"),
+            pytest.param(6140, "weibull", does_not_raise(), 100.0 * 6140.0 / 6141, id="6140th_weibull"),
+            pytest.param(
+                0,
+                "original",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_0th_original",
+            ),
+            pytest.param(
+                0,
+                "hazen",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_0th_hazen",
+            ),
+            pytest.param(
+                0,
+                "weibull",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_0th_weibull",
+            ),
+            pytest.param(
+                6141,
+                "original",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_6141st_original",
+            ),
+            pytest.param(
+                6141,
+                "hazen",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_6141st_hazen",
+            ),
+            pytest.param(
+                6141,
+                "weibull",
+                pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
+                0.0,
+                id="invalid_rank_6141st_weibull",
+            ),
+            pytest.param(
+                3070,
+                "hoge",
+                pytest.raises(
+                    ValueError,
+                    match=r"Invalid method: hoge\. Supported methods are 'original', 'hazen', and 'weibull'\.",
+                ),
+                0.0,
+                id="invalid_method",
+            ),
+        ],
+    )
+    def test_convert_rank_to_percentile(
+        self,
+        rank: int,
+        method: str,
+        context: AbstractContextManager[None],
+        expected: float,
+        ranking_calculator_instance: RankingCalculator,
+    ) -> None:
+        with context:
+            assert ranking_calculator_instance.convert_rank_to_percentile(rank, method) == pytest.approx(expected)
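
For reference, here is a minimal standalone sketch (not part of the diff) of the three rank-to-percentile mappings introduced above. It is an illustration only: `num_active_users` is hard-coded to 2220, the pool size quoted in the documentation example for `minimum_participation=5`, so it does not need the `ale_bench` ranking data; the validation and clamping mirror the `convert_rank_to_percentile` logic added in `src/ale_bench/data.py`.

```python
# Standalone sketch of the three rank -> percentile mappings from this change.
# `num_active_users` is hard-coded here; the real implementation derives it
# from ranking.csv filtered by minimum_participation.

def rank_to_percentile(rank: int, num_active_users: int, method: str = "weibull") -> float:
    if rank < 1 or rank > num_active_users + 1:
        raise ValueError(f"The rank must be between 1 and {num_active_users + 1} (the number of users + 1).")
    if method == "original":
        if rank == num_active_users + 1:
            return 100.0  # the lowest rank is clamped so it never exceeds 100%
        return 100.0 * rank / num_active_users
    if method == "hazen":
        return 100.0 * (rank - 0.5) / (num_active_users + 1)
    if method == "weibull":
        return 100.0 * rank / (num_active_users + 2)
    raise ValueError(f"Invalid method: {method}. Supported methods are 'original', 'hazen', and 'weibull'.")


num_active_users = 2220  # value quoted in the docs example (minimum_participation=5)
for method in ("original", "hazen", "weibull"):
    print(method, round(rank_to_percentile(150, num_active_users, method), 1))
# original 6.8
# hazen 6.7
# weibull 6.8  <- matches the 6.8% shown for the average-performance rank of 150
```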