diff --git a/cli/eval_all_pairs_cflr.py b/cli/eval_all_pairs_cflr.py
index ceef7e3..bda32a7 100644
--- a/cli/eval_all_pairs_cflr.py
+++ b/cli/eval_all_pairs_cflr.py
@@ -3,17 +3,23 @@
 import os
 import subprocess
 import sys
+import warnings
+from math import floor, log10
 from pathlib import Path
 from typing import Optional, List
 
+import pandas as pd
+
 from cli.runners.all_pairs_cflr_tool_runner import IncompatibleCflrToolError
 from cli.runners.all_pairs_cflr_tool_runner_facade import run_appropriate_all_pairs_cflr_tool
 
+DISPLAY_STD_THRESHOLD = 0.1
+
 # see `man timeout`
 TIMEOUT_EXIT_CODE = 124
 
 
-def is_enough_data_collected(result_file_path, rounds):
+def is_enough_data_collected(result_file_path: Path, rounds: int):
     try:
         with open(result_file_path, 'r') as file:
             reader = list(csv.reader(file))
@@ -78,9 +84,9 @@ def run_experiment(
             print(f" {s_edges} {ram_kb} {time_sec}")
             writer = csv.writer(csvfile)
             writer.writerow([
-                {algo_name},
+                algo_name,
                 os.path.basename(graph_base_name),
                 os.path.basename(grammar_base_name),
                 s_edges,
                 ram_kb,
                 time_sec
@@ -87,6 +93,131 @@ def run_experiment(
             ])
 
 
+def round_to_significant_digits(x: float, digits: int = 2) -> float:
+    if x == 0:
+        return x
+    return round(x, max(0, -int(floor(log10(abs(x)))) + digits - 1))
+
+
+def reduce_result_file_to_one_row(result_file_path: Path) -> pd.DataFrame:
+    df = pd.read_csv(result_file_path)
+
+    if len(df) == 0:
+        return df
+
+    df['ram_gb'] = df['ram_kb'].apply(
+        lambda x: x / 10**6 if isinstance(x, int) or isinstance(x, float) else x
+    )
+    assert df['algo'].nunique() <= 1
+    assert df['graph'].nunique() <= 1
+    assert df['grammar'].nunique() <= 1
+    if df['s_edges'].isin(['OOM', 'OOT', '-']).any():
+        # leave only one entry
+        df = df[df['s_edges'].isin(['OOM', 'OOT', '-'])].head(1)
+    else:
+        unique_s_edges = df['s_edges'].unique()
+        if len(unique_s_edges) > 1:
+            warnings.warn(f"Inconsistent 's_edges' values {unique_s_edges} found in {result_file_path}. "
+                          f"Using first 's_edges' value.")
+
+    ram_gb_mean = df['ram_gb'].mean()
+    time_sec_mean = df['time_sec'].mean()
+
+    # sample standard deviation
+    ram_gb_std = df['ram_gb'].std(ddof=1) if len(df) > 1 else -1
+    time_sec_std = df['time_sec'].std(ddof=1) if len(df) > 1 else -1
+
+    df = pd.DataFrame({
+        'algo': [df['algo'].iloc[0]],
+        'graph': [df['graph'].iloc[0]],
+        'grammar': [df['grammar'].iloc[0]],
+        's_edges': [df['s_edges'].iloc[0]],
+        'ram_gb': [
+            round_to_significant_digits(ram_gb_mean)
+            if ram_gb_std < DISPLAY_STD_THRESHOLD * ram_gb_mean
+            else f"{round_to_significant_digits(ram_gb_mean)} ± {round_to_significant_digits(ram_gb_std)}"
+        ],
+        'time_sec': [
+            # Graspan reports analysis time in whole seconds, so it may report 0
+            (round_to_significant_digits(time_sec_mean) if time_sec_mean != 0 else "< 1")
+            if time_sec_std < DISPLAY_STD_THRESHOLD * time_sec_mean
+            else f"{round_to_significant_digits(time_sec_mean)} ± {round_to_significant_digits(time_sec_std)}"
+        ]
+    })
+    return df
+
+
+def pprint_df(df: pd.DataFrame, title: str):
+    df_string = df.to_markdown(maxheadercolwidths=12, maxcolwidths=12)
+    width = max(len(line) for line in df_string.splitlines())
+    print(title.center(width, "="))
+    print(df_string)
+    print("=" * width)
+
+
+def min_numeric(series: pd.Series) -> float:
+    numeric_series = pd.to_numeric(series, errors='coerce').dropna()
+    return float('inf') if numeric_series.empty else numeric_series.min()
+
+
+def display_results_for_grammar(df: pd.DataFrame, grammar: str):
+    df = df[df['grammar'] == grammar].copy()
+    df['algo'] = df['algo'].apply(lambda algo: algo.lower())
+    df.drop(columns=['grammar'], inplace=True)
+
+    df['graph'] = pd.Categorical(df['graph'], sorted(
+        df['graph'].unique(),
+        key=lambda graph: min_numeric(df[df['graph'] == graph]['time_sec'])
+    ))
+
+    s_edges_df = df.pivot(index='graph', columns='algo', values='s_edges').sort_index()
+    s_edges_df.columns = [
+        f'{col} (HAS KNOWN BUGS)'
+        if "pocr" in col.lower()
+        else col
+        for col in s_edges_df.columns
+    ]
+    pprint_df(
+        s_edges_df,
+        title=f" #ANSWER (grammar '{grammar}') ",
+    )
+
+    print()
+    ram_df = df.pivot(index='graph', columns='algo', values='ram_gb').sort_index()
+    pprint_df(
+        ram_df,
+        title=f" RAM, GB (grammar '{grammar}') "
+    )
+    print()
+    time_df = df.pivot(index='graph', columns='algo', values='time_sec').sort_index()
+    pprint_df(
+        time_df,
+        title=f" TIME, SEC (grammar '{grammar}') "
+    )
+    print()
+    print()
+
+
+def display_results(result_files_paths: List[Path]) -> None:
+    print()
+    print("RESULTS:")
+    print(f"Sample std is shown when it's over {DISPLAY_STD_THRESHOLD * 100}% of the mean.")
+    print()
+
+    df = pd.concat(
+        [reduce_result_file_to_one_row(result_file_path) for result_file_path in result_files_paths],
+        ignore_index=True
+    )
+    df['algo'] = pd.Categorical(df['algo'], categories=df['algo'].unique())
+    with pd.option_context(
+        'display.max_rows', None,
+        'display.max_columns', None
+    ):
+        for grammar in df['grammar'].unique():
+            display_results_for_grammar(df, grammar)
+    print(f"Sample std is shown when it's over {DISPLAY_STD_THRESHOLD * 100}% of the mean.")
+
+
 def eval_all_pairs_cflr(
     algo_config: Path,
     data_config: Path,
@@ -94,6 +225,7 @@ def eval_all_pairs_cflr(
     rounds: Optional[int],
     timeout_sec: Optional[int],
 ):
+    result_files_paths = []
     with open(algo_config, mode='r') as algo_file:
         algo_reader = csv.DictReader(algo_file)
         for algo_row in algo_reader:
@@ -122,6 +254,8 @@ def eval_all_pairs_cflr(
                     timeout_sec=timeout_sec,
                     result_file_path=result_file_path
                 )
+                result_files_paths.append(result_file_path)
+    display_results(result_files_paths)
 
 
 def main(raw_args: List[str]):
diff --git a/requirements.txt b/requirements.txt
index 69e5ee6..415fdef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ python-graphblas==2023.7.0
 pandas==2.0.3
 numpy==1.23.5
 psutil==5.9.8
+tabulate==0.9.0