63 changes: 51 additions & 12 deletions docs/rating_and_ranking.md
@@ -27,13 +27,26 @@ Calculates the rating based on a dictionary of performances and the ID of the fi
### Example
```python
performances = {
"ahc001": 2000,
"ahc002": 2200,
"ahc003": 1800
"ahc008": 1189,
"ahc011": 1652,
"ahc015": 2446,
"ahc016": 1457,
"ahc024": 1980,
"ahc025": 1331,
"ahc026": 1965,
"ahc027": 1740,
"ahc039": 2880,
"ahc046": 2153,
}
# Assuming ahc003 is the latest contest to consider for this rating calculation
final_rating = rating_calculator.calculate_rating(performances, "ahc003")

average_performance = sum(performances.values()) / len(performances)
print(f"Average Performance: {average_performance}")
# Average Performance: 1879.3

# "ahc046" is the latest contest in ALE-Bench. You always need to set the final_contest to this value to report the rating.
final_rating = rating_calculator.calculate_rating(performances, "ahc046")
print(f"Calculated Rating: {final_rating}")
# Calculated Rating: 2222
```

## `RankingCalculator`
@@ -44,8 +57,8 @@ The `RankingCalculator` class allows you to determine a user's rank based on the
from ale_bench.data import RankingCalculator

# Initialize with a minimum number of contest participations to be included in the ranking pool
# (default is 5)
ranking_calculator = RankingCalculator(minimum_participation=5)
ranking_calculator = RankingCalculator() # Default minimum participation is 5
ranking_calculator = RankingCalculator(minimum_participation=10) # Example with custom minimum participation
```
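
The `minimum_participation` threshold controls how many users remain in the comparison pool, which is exposed as `num_active_users`. A rough illustration is sketched below (the pool sizes shown, 6139 users with no threshold and 2220 with the default of 5, are taken from this PR's tests and will change as the underlying ranking data changes):

```python
# Hedged illustration: pool sizes come from the repository's tests and are
# data-dependent, so treat the printed numbers as approximate.
for minimum_participation in (0, 5):
    calc = RankingCalculator(minimum_participation=minimum_participation)
    print(minimum_participation, calc.num_active_users)
# 0 6139
# 5 2220
```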

### Core Methods
@@ -71,15 +84,41 @@ Calculates the rank based on an overall rating.
*Returns:*
- `int`: The calculated rank. Lower is better.

---
**`convert_rank_to_percentile`**

Converts a rank to a percentile based on the distribution of ranks.

*Parameters:*
- `rank (int)`: The rank to convert.
- `method (Literal["original", "hazen", "weibull"])`: The method to use for conversion. Defaults to `"weibull"`.
- `"original"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{\text{\#active\_users}}$, capped at 100% when $\text{rank} = \text{\#active\_users} + 1$
- `"hazen"`: $\text{percentile} = 100.0 \times \frac{(\text{rank} - 0.5)}{(\text{\#active\_users} + 1)}$
- `"weibull"`: $\text{percentile} = 100.0 \times \frac{\text{rank}}{(\text{\#active\_users} + 2)}$

> *Note:* The `"weibull"` method is recommended because it avoids the 0%/100% endpoints (exclusive percentiles) and is widely used in the literature. We selected `"weibull"` as the default rather than `"hazen"` because it stays slightly closer to the original percentile calculation at higher ranks. The original paper uses the `"original"` method, but it has weaker statistical properties. All methods are acceptable as long as the chosen method is documented.

*Returns:*
- `float`: The corresponding percentile.

### Example
```python
# Example average performance and rating
my_avg_performance = 2150.75
my_rating = 2345
my_avg_performance = 1879.3
my_rating = 2222

print("The number of active users in the ranking pool:", ranking_calculator.num_active_users)
# The number of active users in the ranking pool: 2220

avg_perf_rank = ranking_calculator.calculate_avg_perf_rank(my_avg_performance)
rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank, "original") # If you want to use the "original" method
avg_perf_rank_percentile = ranking_calculator.convert_rank_to_percentile(avg_perf_rank) # Using the default "weibull" method
print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank} ({avg_perf_rank_percentile:.1f}%)")
# Rank based on Average Performance (1879.3): 150 (6.8%)

print(f"Rank based on Average Performance ({my_avg_performance}): {avg_perf_rank}")
print(f"Rank based on Rating ({my_rating}): {rating_rank}")
rating_rank = ranking_calculator.calculate_rating_rank(my_rating)
rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank, "original") # If you want to use the "original" method
rating_rank_percentile = ranking_calculator.convert_rank_to_percentile(rating_rank) # Using the default "weibull" method
print(f"Rank based on Rating ({my_rating}): {rating_rank} ({rating_rank_percentile:.1f}%)")
# Rank based on Rating (2222): 191 (8.6%)
```
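
For intuition on why `"weibull"` is the default, the sketch below compares the three methods at the boundary ranks, assuming the same pool of 2220 active users as in the example above; only `"weibull"` keeps both the best and the worst rank strictly between 0% and 100%.

```python
# Minimal sketch: boundary behaviour of the three conversion methods.
# The printed values assume num_active_users == 2220 and are data-dependent.
best_rank = 1
worst_rank = ranking_calculator.num_active_users + 1  # 2221

for method in ("original", "hazen", "weibull"):
    best = ranking_calculator.convert_rank_to_percentile(best_rank, method)
    worst = ranking_calculator.convert_rank_to_percentile(worst_rank, method)
    print(f"{method:>8}: best rank -> {best:.3f}%, worst rank -> {worst:.3f}%")
# original: best rank -> 0.045%, worst rank -> 100.000%
#    hazen: best rank -> 0.023%, worst rank -> 99.977%
#  weibull: best rank -> 0.045%, worst rank -> 99.955%
```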
64 changes: 55 additions & 9 deletions src/ale_bench/data.py
@@ -80,7 +80,8 @@ def model_post_init(self, __context: Any) -> None:
rank, performance = self.raw_data[i]
next_rank, next_performance = self.raw_data[i + 1]
if rank == next_rank:
assert performance == next_performance, "Something went wrong: `performance` != `next_performance`."
if performance != next_performance:
raise RuntimeError("Something went wrong: `performance` != `next_performance`.")
continue # NOTE: tie
actual_rank = float((rank + next_rank - 1) / 2) # NOTE: Use the average of the rank range for performance
self.data[actual_rank] = performance
@@ -116,7 +117,7 @@ def get_performance(self, rank: int | float) -> int:
else:
lose = mid
if win == len(sorted_keys):
assert False, "Something went wrong: `win` should be less than `len(sorted_keys)`."
raise RuntimeError("Something went wrong: `win` should be less than `len(sorted_keys)`.")
if win == 0:
win = 1 # NOTE: to avoid index out of range and use the 1st & 2nd entries
rank_high = sorted_keys[win - 1]
@@ -301,7 +302,8 @@ def model_post_init(self, __context: Any) -> None:
for i in range(len(self.score_rank_list) - 1):
_, _, end_rank = self.score_rank_list[i]
_, next_start_rank, _ = self.score_rank_list[i + 1]
assert next_start_rank == end_rank + 1, "Something went wrong: `next_start_rank` != `end_rank + 1`."
if next_start_rank != end_rank + 1:
raise RuntimeError("Something went wrong: `next_start_rank` != `end_rank + 1`.")

def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
"""Get the new rank for a given private result.
@@ -364,7 +366,7 @@ def get_new_rank(self, private_result: Result) -> tuple[int, float, list[int]]:
return start_rank, float(start_rank), new_case_scores
elif new_score == score:
return start_rank, float((start_rank + end_rank) / 2), new_case_scores # NOTE: Average of the ranks
assert False, "Something went wrong in `get_new_rank` method."
raise RuntimeError("Something went wrong in `get_new_rank` method.")


class ProblemMetaData(BaseModel):
@@ -761,6 +763,14 @@ def load_contest_schedule() -> dict[str, tuple[dt.datetime, dt.datetime, float]]
}


class RankPercentileMapMethod(str, Enum):
"""Method for converting rank to percentile."""

ORIGINAL = "original"
HAZEN = "hazen"
WEIBULL = "weibull"


class RankingCalculator:
"""Ranking calculator for ALE-Bench."""

@@ -781,7 +791,7 @@ def __init__(self, minimum_participation: int = 5) -> None:
ranking_file_path = str(local_data_dir / "ranking.csv")
df_ranking = pl.read_csv(ranking_file_path).filter(pl.col("competitions") >= minimum_participation)
# Prepare member variables
num_active_users = len(df_ranking)
self.num_active_users = len(df_ranking)
self.avg_perfs = df_ranking["avg_perf"].sort(descending=True).to_list()
self.ratings = df_ranking["rating"].sort(descending=True).to_list()
self.avg_perf_ranks, self.rating_ranks = [], []
@@ -798,9 +808,9 @@ def __init__(self, minimum_participation: int = 5) -> None:
self.rating_ranks.append(current_rating_rank)
# Append the last entry for the average performance and rating
self.avg_perfs.append(-1000.0)
self.avg_perf_ranks.append(num_active_users + 1)
self.avg_perf_ranks.append(self.num_active_users + 1)
self.ratings.append(0)
self.rating_ranks.append(num_active_users + 1)
self.rating_ranks.append(self.num_active_users + 1)

def calculate_avg_perf_rank(self, avg_perf: float) -> int:
"""Calculate the rank based on the rating.
@@ -826,7 +836,7 @@ def calculate_avg_perf_rank(self, avg_perf: float) -> int:
else: # Exactly matched
return self.avg_perf_ranks[mid]
if ok == len(self.avg_perfs):
assert False, "Something went wrong: `ok` should be less than `len(self.avg_perfs)`."
raise RuntimeError("Something went wrong: `ok` should be less than `len(self.avg_perfs)`.")
return self.avg_perf_ranks[ok]

def calculate_rating_rank(self, rating: int) -> int:
@@ -853,5 +863,41 @@ def calculate_rating_rank(self, rating: int) -> int:
else: # Exactly matched
return self.rating_ranks[mid]
if ok == len(self.ratings):
assert False, "Something went wrong: `ok` should be less than `len(self.ratings)`."
raise RuntimeError("Something went wrong: `ok` should be less than `len(self.ratings)`.")
return self.rating_ranks[ok]

def convert_rank_to_percentile(
self,
rank: int,
method: RankPercentileMapMethod | str = RankPercentileMapMethod.WEIBULL,
) -> float:
"""Convert the rank to percentile.

Args:
rank (int): The rank to convert.
method (RankPercentileMapMethod | str): The method to use for conversion. Defaults to "weibull".
"original": percentile = 100.0 * rank / num_active_users
"hazen": percentile = 100.0 * (rank - 0.5) / (num_active_users + 1)
"weibull": percentile = 100.0 * rank / (num_active_users + 2)
Copilot AI Oct 27, 2025

The formula should use + 1 instead of + 2 in the denominator for the Weibull method, as the correct formula is rank / (num_active_users + 1).

Returns:
float: The converted percentile.

Raises:
ValueError: If the rank is less than 1 or greater than the number of active users + 1.
ValueError: If the method is invalid.
"""
if rank < 1 or rank > self.num_active_users + 1:
raise ValueError(f"The rank must be between 1 and {self.num_active_users + 1} (the number of users + 1).")
try:
method = RankPercentileMapMethod(method)
except ValueError:
raise ValueError(f"Invalid method: {method}. Supported methods are 'original', 'hazen', and 'weibull'.")
if method == RankPercentileMapMethod.ORIGINAL:
if rank == self.num_active_users + 1:
return 100.0 # NOTE: The lowest rank is always 100.0% (avoid exceeding 100.0%)
return 100.0 * rank / self.num_active_users
elif method == RankPercentileMapMethod.HAZEN:
return 100.0 * (rank - 0.5) / (self.num_active_users + 1)
elif method == RankPercentileMapMethod.WEIBULL:
return 100.0 * rank / (self.num_active_users + 2)
Copilot AI Oct 27, 2025

The Weibull percentile calculation uses an incorrect denominator. According to the standard Weibull plotting position formula, it should be (num_active_users + 1), not (num_active_users + 2). The current implementation would produce percentiles that don't match the documented formula or standard statistical references.

Suggested change
return 100.0 * rank / (self.num_active_users + 2)
return 100.0 * rank / (self.num_active_users + 1)
98 changes: 95 additions & 3 deletions tests/test_data.py
@@ -119,7 +119,7 @@ def test_init(
[(1, 3200), (2, 200)],
2.01,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="2rows_not_within_2nd",
@@ -133,7 +133,7 @@
[(1, 3200), (2, 2800), (3, 200)],
3.14,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="3rows_not_within_3rd",
@@ -147,7 +147,7 @@
[(1, 3200), (99, 2800), (100, 200)],
101,
pytest.raises(
AssertionError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
RuntimeError, match=r"Something went wrong: `win` should be less than `len\(sorted_keys\)`\."
),
0,
id="3rows_n100_not_within_100th",
@@ -1617,6 +1617,17 @@ class TestRankingCalculator:
def ranking_calculator_instance(self) -> RankingCalculator:
return RankingCalculator(minimum_participation=0)

@pytest.mark.slow
@pytest.mark.parametrize(
"minimum_participation,expected",
[
pytest.param(0, 6139, id="minimum_participation_0"),
pytest.param(5, 2220, id="minimum_participation_5"),
],
)
def test_num_active_users(self, minimum_participation: int, expected: int) -> None:
assert RankingCalculator(minimum_participation=minimum_participation).num_active_users == expected

@pytest.mark.slow
@pytest.mark.parametrize(
"rating,context,expected",
@@ -1644,3 +1655,84 @@ def test_calculate_rating_rank(
) -> None:
with context:
assert ranking_calculator_instance.calculate_rating_rank(rating) == expected

@pytest.mark.slow
@pytest.mark.parametrize(
"rank,method,context,expected",
[
pytest.param(1, "original", does_not_raise(), 100.0 * 1.0 / 6139, id="1st_original"),
pytest.param(1, "hazen", does_not_raise(), 100.0 * 0.5 / 6140, id="1st_hazen"),
pytest.param(1, "weibull", does_not_raise(), 100.0 * 1.0 / 6141, id="1st_weibull"),
pytest.param(150, "original", does_not_raise(), 100.0 * 150.0 / 6139, id="150th_original"),
pytest.param(150, "hazen", does_not_raise(), 100.0 * 149.5 / 6140, id="150th_hazen"),
pytest.param(150, "weibull", does_not_raise(), 100.0 * 150.0 / 6141, id="150th_weibull"),
pytest.param(6139, "original", does_not_raise(), 100.0, id="6139th_original"),
pytest.param(6139, "hazen", does_not_raise(), 100.0 * 6138.5 / 6140, id="6139th_hazen"),
pytest.param(6139, "weibull", does_not_raise(), 100.0 * 6139.0 / 6141, id="6139th_weibull"),
pytest.param(6140, "original", does_not_raise(), 100.0, id="6140th_original"),
pytest.param(6140, "hazen", does_not_raise(), 100.0 * 6139.5 / 6140, id="6140th_hazen"),
pytest.param(6140, "weibull", does_not_raise(), 100.0 * 6140.0 / 6141, id="6140th_weibull"),
pytest.param(
0,
"original",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_original",
),
pytest.param(
0,
"hazen",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_hazen",
),
pytest.param(
0,
"weibull",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_0th_weibull",
),
pytest.param(
6141,
"original",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_original",
),
pytest.param(
6141,
"hazen",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_hazen",
),
pytest.param(
6141,
"weibull",
pytest.raises(ValueError, match=r"The rank must be between 1 and 6140 \(the number of users \+ 1\)\."),
0.0,
id="invalid_rank_6141st_weibull",
),
pytest.param(
3070,
"hoge",
pytest.raises(
ValueError,
match=r"Invalid method: hoge\. Supported methods are 'original', 'hazen', and 'weibull'\.",
),
0.0,
id="invalid_method",
),
],
)
def test_convert_rank_to_percentile(
self,
rank: int,
method: str,
context: AbstractContextManager[None],
expected: float,
ranking_calculator_instance: RankingCalculator,
) -> None:
with context:
assert ranking_calculator_instance.convert_rank_to_percentile(rank, method) == pytest.approx(expected)