Add evaluation result aggregation and rendering
IlyaMuravjov committed Mar 5, 2024
1 parent 916dadb commit a56189e
Showing 2 changed files with 137 additions and 2 deletions.
138 changes: 136 additions & 2 deletions cli/eval_all_pairs_cflr.py
@@ -3,17 +3,23 @@
import os
import subprocess
import sys
import warnings
from math import floor, log10
from pathlib import Path
from typing import Optional, List

import pandas as pd

from cli.runners.all_pairs_cflr_tool_runner import IncompatibleCflrToolError
from cli.runners.all_pairs_cflr_tool_runner_facade import run_appropriate_all_pairs_cflr_tool

DISPLAY_STD_THRESHOLD = 0.1

# see `man timeout`
TIMEOUT_EXIT_CODE = 124
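# Illustrative note, not part of the commit: GNU coreutils `timeout` exits
# with status 124 when the wrapped command exceeds its time limit, so a
# runner that launches e.g. `timeout <sec> <cmd>` via subprocess can compare
# the process's returncode against TIMEOUT_EXIT_CODE to detect a timeout.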


- def is_enough_data_collected(result_file_path, rounds):
+ def is_enough_data_collected(result_file_path: Path, rounds: int):
try:
with open(result_file_path, 'r') as file:
reader = list(csv.reader(file))
@@ -78,7 +84,7 @@ def run_experiment(
print(f" {s_edges} {ram_kb} {time_sec}")
writer = csv.writer(csvfile)
writer.writerow([
- {algo_name},
+ algo_name,
os.path.basename(graph_base_name),
os.path.basename(grammar_base_name),
s_edges,
@@ -87,13 +93,139 @@
])
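# Illustrative note, not part of the commit: the one-character fix above is
# significant because `{algo_name}` is a Python set literal, so csv.writer
# previously serialized the repr of a one-element set (e.g. "{'pocr'}")
# into the 'algo' column instead of the bare algorithm name.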


def round_to_significant_digits(x: float, digits: int = 2) -> float:
if x == 0:
return x
return round(x, max(0, -int(floor(log10(abs(x)))) + digits - 1))
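# Illustrative note, not part of the commit: worked values for the default
# digits=2:
#   round_to_significant_digits(0.012345)  # -> 0.012
#   round_to_significant_digits(3.456)     # -> 3.5
#   round_to_significant_digits(12.34)     # -> 12.0
# The max(0, ...) clamp never rounds to the left of the decimal point, so
# values >= 100 keep all integer digits (123.45 -> 123.0, not 120.0).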


def reduce_result_file_to_one_row(result_file_path: Path) -> pd.DataFrame:
df = pd.read_csv(result_file_path)

if len(df) == 0:
return df

df['ram_gb'] = df['ram_kb'].apply(
lambda x: x / 10**6 if isinstance(x, int) or isinstance(x, float) else x
)
assert df['algo'].nunique() <= 1
assert df['graph'].nunique() <= 1
assert df['grammar'].nunique() <= 1
if df['s_edges'].isin(['OOM', 'OOT', '-']).any():
# leave only one entry
df = df[df['s_edges'].isin(['OOM', 'OOT', '-'])].head(1)
else:
unique_s_edges = df['s_edges'].unique()
if len(unique_s_edges) > 1:
warnings.warn(f"Inconsistent 's_edges' values {unique_s_edges} found in {result_file_path}. "
f"Using first 's_edges' value.")

ram_gb_mean = df['ram_gb'].mean()
time_sec_mean = df['time_sec'].mean()

# sample standard deviation
ram_gb_std = df['ram_gb'].std(ddof=1) if len(df) > 1 else -1
time_sec_std = df['time_sec'].std(ddof=1) if len(df) > 1 else -1

df = pd.DataFrame({
'algo': [df['algo'].iloc[0]],
'graph': [df['graph'].iloc[0]],
'grammar': [df['grammar'].iloc[0]],
's_edges': [df['s_edges'].iloc[0]],
'ram_gb': [
round_to_significant_digits(ram_gb_mean)
if ram_gb_std < DISPLAY_STD_THRESHOLD * ram_gb_mean
else f"{round_to_significant_digits(ram_gb_mean)} ± {round_to_significant_digits(ram_gb_std)}"
],
'time_sec': [
# Graspan reports analysis time in whole seconds, so it may report 0
(round_to_significant_digits(time_sec_mean) if time_sec_mean != 0 else "< 1")
if time_sec_std < DISPLAY_STD_THRESHOLD * time_sec_mean
else f"{round_to_significant_digits(time_sec_mean)} ± {round_to_significant_digits(time_sec_std)}"
]
})
return df
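# Illustrative note, not part of the commit: with a hypothetical result file
# holding three measurement rounds
#   algo,graph,grammar,s_edges,ram_kb,time_sec
#   pocr,g1,java_pt,1000,2000000,10.0
#   pocr,g1,java_pt,1000,2100000,11.0
#   pocr,g1,java_pt,1000,1900000,9.0
# ram_gb has mean 2.0 and sample std 0.1 (under 10% of the mean), so RAM is
# rendered as plain "2.0", while time_sec has mean 10.0 and std 1.0 (not
# under 10% of the mean), so time is rendered as "10.0 ± 1.0".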


def pprint_df(df: pd.DataFrame, title: str):
df_string = df.to_markdown(maxheadercolwidths=12, maxcolwidths=12)
width = max(len(line) for line in df_string.splitlines())
print(title.center(width, "="))
print(df_string)
print("=" * width)


def min_numeric(series: pd.Series) -> float:
numeric_series = pd.to_numeric(series, errors='coerce').dropna()
return float('inf') if numeric_series.empty else numeric_series.min()
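# Illustrative note, not part of the commit: pd.to_numeric(errors='coerce')
# turns markers like 'OOM', 'OOT', or '-' into NaN, which dropna() removes,
# so e.g. min_numeric(pd.Series(['3.2', 'OOM', '1.5'])) == 1.5, and a series
# with no numeric entries yields float('inf'), sorting such graphs last.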


def display_results_for_grammar(df: pd.DataFrame, grammar: str):
df = df[df['grammar'] == grammar].copy()
df['algo'] = df['algo'].apply(lambda algo: algo.lower())
df.drop(columns=['grammar'], inplace=True)

df['graph'] = pd.Categorical(df['graph'], sorted(
df['graph'].unique(),
key=lambda graph: min_numeric(df[df['graph'] == graph]['time_sec'])
))

s_edges_df = df.pivot(index='graph', columns='algo', values='s_edges').sort_index()
s_edges_df.columns = [
f'{col} (HAS KNOWN BUGS)'
if "pocr" in col.lower()
else col
for col in s_edges_df.columns
]
pprint_df(
s_edges_df,
title=f" #ANSWER (grammar '{grammar}') ",
)

print()
ram_df = df.pivot(index='graph', columns='algo', values='ram_gb').sort_index()
pprint_df(
ram_df,
title=f" RAM, GB (grammar '{grammar}') "
)
print()
time_df = df.pivot(index='graph', columns='algo', values='time_sec').sort_index()
pprint_df(
time_df,
title=f" TIME, SEC (grammar '{grammar}') "
)
print()
print()
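# Illustrative note, not part of the commit: each pivot produces one table
# per metric with graphs as rows and algorithms as columns; because 'graph'
# was made a Categorical ordered by best observed time above, sort_index()
# lists graphs fastest-first rather than alphabetically. Any algorithm
# column containing "pocr" is suffixed "(HAS KNOWN BUGS)" in the #ANSWER
# table only.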


def display_results(result_files_paths: List[Path]) -> None:
print()
print("RESULTS:")
print(f"Sample std is shown when it's over {DISPLAY_STD_THRESHOLD * 100}% of the mean.")
print()

df = pd.concat(
[reduce_result_file_to_one_row(result_file_path) for result_file_path in result_files_paths],
ignore_index=True
)
df['algo'] = pd.Categorical(df['algo'], categories=df['algo'].unique())
with pd.option_context(
'display.max_rows', None,
'display.max_columns', None
):
for grammar in df['grammar'].unique():
display_results_for_grammar(df, grammar)
print(f"Sample std is shown when it's over {DISPLAY_STD_THRESHOLD * 100}% of the mean.")


def eval_all_pairs_cflr(
algo_config: Path,
data_config: Path,
result_path: Path,
rounds: Optional[int],
timeout_sec: Optional[int],
):
result_files_paths = []
with open(algo_config, mode='r') as algo_file:
algo_reader = csv.DictReader(algo_file)
for algo_row in algo_reader:
@@ -122,6 +254,8 @@ def eval_all_pairs_cflr(
timeout_sec=timeout_sec,
result_file_path=result_file_path
)
result_files_paths.append(result_file_path)
display_results(result_files_paths)
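# Illustrative note, not part of the commit: each (algo, graph, grammar)
# combination appends its per-pair CSV path to result_files_paths, so the
# aggregated tables are rendered once, after every experiment has finished.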


def main(raw_args: List[str]):
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ python-graphblas==2023.7.0
pandas==2.0.3
numpy==1.23.5
psutil==5.9.8
tabulate==0.9.0
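Note: `tabulate` is the backend pandas uses for `DataFrame.to_markdown`, which the new `pprint_df` relies on; without it, rendering the result tables raises an ImportError.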
