63 changes: 51 additions & 12 deletions docs/rating_and_ranking.md
@@ -27,13 +27,26 @@ Calculates the rating based on a dictionary of performances and the ID of the fi
### Example
```python
performances = {
"ahc001": 2000,
"ahc002": 2200,
"ahc003": 1800
"ahc008": 1189,
"ahc011": 1652,
"ahc015": 2446,
"ahc016": 1457,
"ahc024": 1980,
"ahc025": 1331,
"ahc026": 1965,
"ahc027": 1740,
"ahc039": 2880,
"ahc046": 2153,
}
# Assuming ahc003 is the latest contest to consider for this rating calculation
final_rating = rating_calculator.calculate_rating(performances, "ahc003")

average_performance = sum(performances.values()) / len(performances)
print(f"Average Performance: {average_performance}")
# Average Performance: 1879.3

# "ahc046" is the latest contest in ALE-Bench. You always need to set the final_contest to this value to report the rating.
final_rating = rating_calculator.calculate_rating(performances, "ahc046")
print(f"Calculated Rating: {final_rating}")
# Calculated Rating: 2222
```

## `RankingCalculator`
@@ -44,8 +57,8 @@ The `RankingCalculator` class allows you to determine a user's rank based on the
from ale_bench.data import RankingCalculator

# Initialize with a minimum number of contest participations to be included in the ranking pool
# (default is 5)
ranking_calculator = RankingCalculator(minimum_participation=5)
ranking_calculator = RankingCalculator() # Default minimum participation is 5
ranking_calculator = RankingCalculator(minimum_participation=10) # Example with custom minimum participation
```
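
The `minimum_participation` threshold controls how many users remain in the comparison pool, which is exposed as `num_active_users`. A rough illustration is sketched below (the pool sizes shown, 6139 users with no threshold and 2220 with the default of 5, are taken from this PR's tests and will change as the underlying ranking data changes):

```python
# Hedged illustration: pool sizes come from the repository's tests and are
# data-dependent, so treat the printed numbers as approximate.
for minimum_participation in (0, 5):
    calc = RankingCalculator(minimum_participation=minimum_participation)
    print(minimum_participation, calc.num_active_users)
# 0 6139
# 5 2220
```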

### Core Methods
@@ -71,15 +84,41 @@ Calculates the rank based on an overall rating.
*Returns:*
- `int`: The calculated rank. Lower is better.

---
**`convert_rank_to_percentile`**

Converts a rank to a percentile based on the distribution of ranks.

*Parameters:*
- `rank (int)`: The rank to convert.
- `method (Literal["original", "hazen", "weibull"])`: The method to use for conversion. Defaults to `"weibull"`.
- `"original"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{\text{\#active\_users}}$, capped at 100% when $\text{rank} = \text{\#active\_users} + 1$
- `"hazen"`: $\text{percentile} = 100.0 \times \frac{(\text{rank} - 0.5)}{(\text{\#active\_users} + 1)}$
- `"weibull"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{(\text{\#active\_users} + 2)}$

> *Note:* The `"weibull"` method is recommended because it avoids the 0%/100% endpoints (exclusive percentiles) and is widely used in the literature. We selected `"weibull"` as the default rather than `"hazen"` because it stays slightly closer to the original percentile calculation at higher ranks. The original paper uses the `"original"` method, but it has weaker statistical properties. All methods are acceptable as long as the chosen method is documented.

*Returns:*
- `float`: The corresponding percentile.

### Example
```python
# Example average performance and rating
my_avg_performance = 2150.75
my_rating = 2345
my_avg_performance = 1879.3
my_rating = 2222

print("The number of active users in the ranking pool:", ranking_calculator.num_active_users)
# The number of active users in the ranking pool: 2220

avg_perf_rank = ranking_calculator.calculate_avg_perf_rank(my_avg_performance)
rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank, "original") # If you want to use the "original" method
avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank) # Using the default "weibull" method
print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank} ({avg_perf_rank_percentile:.1f}%)")
# Rank based on Average Performance (1879.3): 150 (6.8%)

print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank}")
print(f"Rank based on Rating ({my_rating}): {rating_rank}")
rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank, "original") # If you want to use the "original" method
rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank) # Using the default "weibull" method
print(f"Rank based on Rating ({my_rating}): {rating_rank} ({rating_rank_percentile:.1f}%)")
# Rank based on Rating (2222): 191 (8.6%)
```
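
For intuition on why `"weibull"` is the default, the sketch below compares the three methods at the boundary ranks, assuming the same pool of 2220 active users as in the example above; only `"weibull"` keeps both the best and the worst rank strictly between 0% and 100%.

```python
# Minimal sketch: boundary behaviour of the three conversion methods.
# The printed values assume num_active_users == 2220 and are data-dependent.
best_rank = 1
worst_rank = ranking_calculator.num_active_users + 1  # 2221

for method in ("original", "hazen", "weibull"):
    best = ranking_calculator.convert_rank_to_percentile(best_rank, method)
    worst = ranking_calculator.convert_rank_to_percentile(worst_rank, method)
    print(f"{method:>8}: best rank -> {best:.3f}%, worst rank -> {worst:.3f}%")
# original: best rank -> 0.045%, worst rank -> 100.000%
#    hazen: best rank -> 0.023%, worst rank -> 99.977%
#  weibull: best rank -> 0.045%, worst rank -> 99.955%
```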
64 changes: 55 additions & 9 deletions src/ale_bench/data.py
@@ -80,7 +80,8 @@ def model_post_init(self, __context: Any) -> None:
rank, performance = self.raw_data[i]
next_rank, next_performance = self.raw_data[i + 1]
if rank == next_rank:
assert performance == next_performance, "Something went wrong: `performance` != `next_performance`."
if performance != next_performance:
raise RuntimeError("Something went wrong: `performance` != `next_performance`.")
continue # NOTE: tie
actual_rank = float((rank + next_rank - 1) / 2) # NOTE: Use the average of the rank range for performance
self.data[actual_rank] = performance
@@ -116,7 +117,7 @@ def get_performance(self, rank: int | float) -> int:
else:
lose = mid
if win == len(sorted_keys):
assert False, "Something went wrong: `win` should be less than `len(sorted_keys)`."
raise RuntimeError("Something went wrong: `win` should be less than `len(sorted_keys)`.")
if win == 0:
win = 1 # NOTE: to avoid index out of range and use the 1st & 2nd entries
rank_high = sorted_keys[win - 1]
@@ -301,7 +302,8 @@ def model_post_init(self, __context: Any) -> None:
for i in range(len(self.score_rank_list) - 1):
_, _, end_rank = self.score_rank_list[i]
_, next_start_rank, _ = self.score_rank_list[i + 1]
assert next_start_rank == end_rank + 1, "Something went wrong: `next_start_rank` != `end_rank + 1`."
if next_start_rank != end_rank + 1:
raise RuntimeError("Something went wrong: `next_start_rank` != `end_rank + 1`.")

def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
"""Get the new rank for a given private result.
@@ -364,7 +366,7 @@ def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
return start_rank, float(start_rank), new_case_scores
elif new_score == score:
return start_rank, float((start_rank + end_rank) / 2), new_case_scores # NOTE: Average of the ranks
assert False, "Something went wrong in `get_new_rank` method."
raise RuntimeError("Something went wrong in `get_new_rank` method.")


class ProblemMetaData(BaseModel):
@@ -761,6 +763,14 @@ def load_contest_schedule() -> dict[str, tuple[dt.datetime, dt.datetime, float]]
}


class RankPercentileMapMethod(str, Enum):
"""Method for converting rank to percentile."""

ORIGINAL = "original"
HAZEN = "hazen"
WEIBULL = "weibull"


class RankingCalculator:
"""Ranking calculator for ALE-Bench."""

@@ -781,7 +791,7 @@ def __init__(self, minimum_participation: int = 5) -> None:
ranking_file_path = str(local_data_dir / "ranking.csv")
df_ranking = pl.read_csv(ranking_file_path).filter(pl.col("competitions") >= minimum_participation)
# Prepare member variables
num_active_users = len(df_ranking)
self.num_active_users = len(df_ranking)
self.avg_perfs = df_ranking["avg_perf"].sort(descending=True).to_list()
self.ratings = df_ranking["rating"].sort(descending=True).to_list()
self.avg_perf_ranks, self.rating_ranks = [], []
@@ -798,9 +808,9 @@ def __init__(self, minimum_participation: int = 5) -> None:
self.rating_ranks.append(current_rating_rank)
# Append the last entry for the average performance and rating
self.avg_perfs.append(-1000.0)
self.avg_perf_ranks.append(num_active_users + 1)
self.avg_perf_ranks.append(self.num_active_users + 1)
self.ratings.append(0)
self.rating_ranks.append(num_active_users + 1)
self.rating_ranks.append(self.num_active_users + 1)

def calculate_avg_perf_rank(self, avg_perf: float) -> int:
"""Calculate the rank based on the rating.
@@ -826,7 +836,7 @@ def calculate_avg_perf_rank(self, avg_perf: float) -> int:
else: # Exactly matched
return self.avg_perf_ranks[mid]
if ok == len(self.avg_perfs):
assert False, "Something went wrong: `ok` should be less than `len(self.avg_perfs)`."
raise RuntimeError("Something went wrong: `ok` should be less than `len(self.avg_perfs)`.")
return self.avg_perf_ranks[ok]

def calculate_rating_rank(self, rating: int) -> int:
@@ -853,5 +863,41 @@ def calculate_rating_rank(self, rating: int) -> int:
else: # Exactly matched
return self.rating_ranks[mid]
if ok == len(self.ratings):
assert False, "Something went wrong: `ok` should be less than `len(self.ratings)`."
raise RuntimeError("Something went wrong: `ok` should be less than `len(self.ratings)`.")
return self.rating_ranks[ok]

def convert_rank_to_percentile(
self,
rank: int,
method: RankPercentileMapMethod | str = RankPercentileMapMethod.WEIBULL,
) -> float:
"""Convert the rank to percentile.

Args:
rank (int): The rank to convert.
method (RankPercentileMapMethod | str): The method to use for conversion. Defaults to "weibull".
"original": percentile = 100.0 * rank / num_active_users
"hazen": percentile = 100.0 * (rank - 0.5) / (num_active_users + 1)
"weibull": percentile = 100.0 * rank / (num_active_users + 2)
Copilot AI Oct 27, 2025

The formula should use + 1 instead of + 2 in the denominator for the Weibull method, as the correct formula is rank / (num_active_users + 1).

Returns:
float: The converted percentile.

Raises:
ValueError: If the rank is less than 1 or greater than the number of active users + 1.
ValueError: If the method is invalid.
"""
if rank < 1 or rank > self.num_active_users + 1:
raise ValueError(f"The rank must be between 1 and {self.num_active_users + 1} (the number of users + 1).")
try:
method = RankPercentileMapMethod(method)
except ValueError:
raise ValueError(f"Invalid method: {method}. Supported methods are 'original', 'hazen', and 'weibull'.")
if method == RankPercentileMapMethod.ORIGINAL:
if rank == self.num_active_users + 1:
return 100.0 # NOTE: The lowest rank is always 100.0% (avoid exceeding 100.0%)
return 100.0 * rank / self.num_active_users
elif method == RankPercentileMapMethod.HAZEN:
return 100.0 * (rank - 0.5) / (self.num_active_users + 1)
elif method == RankPercentileMapMethod.WEIBULL:
return 100.0 * rank / (self.num_active_users + 2)
Copilot AI Oct 27, 2025

The Weibull percentile calculation uses an incorrect denominator. According to the standard Weibull plotting position formula, it should be (num_active_users + 1), not (num_active_users + 2). The current implementation would produce percentiles that don't match the documented formula or standard statistical references.

Suggested change
return 100.0 * rank / (self.num_active_users + 2)
return 100.0 * rank / (self.num_active_users + 1)
98 changes: 95 additions & 3 deletions tests/test_data.py
@@ -119,7 +119,7 @@ def test_init(
[(1, 3200), (2, 200)],
2.01,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="2rows_not_within_2nd",
@@ -133,7 +133,7 @@
[(1, 3200), (2, 2800), (3, 200)],
3.14,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="3rows_not_within_3rd",
@@ -147,7 +147,7 @@
[(1, 3200), (99, 2800), (100, 200)],
101,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="3rows_n100_not_within_100th",
@@ -1617,6 +1617,17 @@ class TestRankingCalculator:
def ranking_calculator_instance(self) -> RankingCalculator:
return RankingCalculator(minimum_participation=0)

@pytest.mark.slow
@pytest.mark.parametrize(
"minimum_participation,expected",
[
pytest.param(0, 6139, id="minimum_participation_0"),
pytest.param(5, 2220, id="minimum_participation_5"),
],
)
def test_num_active_users(self, minimum_participation: int, expected: int) -> None:
assert RankingCalculator(minimum_participation=minimum_participation).num_active_users == expected

@pytest.mark.slow
@pytest.mark.parametrize(
"rating,context,expected",
@@ -1644,3 +1655,84 @@ def test_calculate_rating_rank(
) -> None:
with context:
assert ranking_calculator_instance.calculate_rating_rank(rating) == expected

@pytest.mark.slow
@pytest.mark.parametrize(
"rank,method,context,expected",
[
pytest.param(1, "original", does_not_raise(), 100.0 * 1.0 / 6139, id="1st_original"),
pytest.param(1, "hazen", does_not_raise(), 100.0 * 0.5 / 6140, id="1st_hazen"),
pytest.param(1, "weibull", does_not_raise(), 100.0 * 1.0 / 6141, id="1st_weibull"),
pytest.param(150, "original", does_not_raise(), 100.0 * 150.0 / 6139, id="150th_original"),
pytest.param(150, "hazen", does_not_raise(), 100.0 * 149.5 / 6140, id="150th_hazen"),
pytest.param(150, "weibull", does_not_raise(), 100.0 * 150.0 / 6141, id="150th_weibull"),
pytest.param(6139, "original", does_not_raise(), 100.0, id="6139th_original"),
pytest.param(6139, "hazen", does_not_raise(), 100.0 * 6138.5 / 6140, id="6139th_hazen"),
pytest.param(6139, "weibull", does_not_raise(), 100.0 * 6139.0 / 6141, id="6139th_weibull"),
pytest.param(6140, "original", does_not_raise(), 100.0, id="6140th_original"),
pytest.param(6140, "hazen", does_not_raise(), 100.0 * 6139.5 / 6140, id="6140th_hazen"),
pytest.param(6140, "weibull", does_not_raise(), 100.0 * 6140.0 / 6141, id="6140th_weibull"),
pytest.param(
0,
"original",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_original",
),
pytest.param(
0,
"hazen",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_hazen",
),
pytest.param(
0,
"weibull",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_weibull",
),
pytest.param(
6141,
"original",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_original",
),
pytest.param(
6141,
"hazen",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_hazen",
),
pytest.param(
6141,
"weibull",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_weibull",
),
pytest.param(
3070,
"hoge",
pytest.raises(
ValueError,
match=r"Invalid method: hoge\. Supported methods are 'original', 'hazen', and 'weibull'\.",
),
0.0,
id="invalid_method",
),
],
)
def test_convert_rank_to_percentile(
self,
rank: int,
method: str,
context: AbstractContextManager[None],
expected: float,
ranking_calculator_instance: RankingCalculator,
) -> None:
with context:
assert ranking_calculator_instance.convert_rank_to_percentile(rank, method) == pytest.approx(expected)