import logging
import sys
from pathlib import Path
from typing import Any, Callable, Literal, Optional, Sequence, Union

import fire
import pandas as pd

from . import analyze, annotators, constants, decoders, metrics, utils
from .types import AnyData, AnyLoadableDF, AnyPath

CUR_DIR = Path(__file__).parent

__all__ = ["evaluate", "evaluate_from_model", "analyze_evaluators", "make_leaderboard"]


def evaluate(
model_outputs: Optional[AnyLoadableDF] = None,
reference_outputs: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
name: Optional[str] = None,
output_path: Optional[Union[AnyPath, str]] = "auto",
precomputed_leaderboard: Optional[Union[str, AnyPath, AnyData]] = "auto",
is_overwrite_leaderboard: bool = False,
leaderboard_mode_to_print: Optional[Union[str, Sequence[str]]] = "minimal",
current_leaderboard_mode: str = "community",
is_return_instead_of_print: bool = False,
fn_metric: Union[str, callable] = "get_length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "get_winrate",
metric_kwargs: Optional[dict[str, Any]] = None,
is_recompute_metrics_only: bool = False,
sort_by: str = "length_controlled_winrate" if constants.IS_ALPACA_EVAL_2 else "win_rate",
is_cache_leaderboard: Optional[bool] = None,
max_instances: Optional[int] = None,
annotation_kwargs: Optional[dict[str, Any]] = None,
Annotator=annotators.PairwiseAnnotator,
**annotator_kwargs,
):
"""Evaluate a model based on its outputs. This is the default entrypoint if no command is specified.
Parameters
----------
model_outputs : path or data or dict
The outputs of the model to add to the leaderboard. Accepts data (list of dictionary, pd.dataframe,
datasets.Dataset) or a path to read those (json, csv, tsv) or a function to generate those. Each dictionary
(or row of dataframe) should contain the keys that are formatted in the prompts. E.g. by default `instruction`
and `output` with optional `input`. If None, we just print the leaderboard.
reference_outputs : path or data, optional
The outputs of the reference model. Same format as `model_outputs`. If None, the reference outputs are a
specific set of Davinci 003 outputs on the AlpacaEval set:
https://huggingface.co/datasets/tatsu-lab/alpaca_eval.
annotators_config : path or list of dict, optional
        The path to (or list of dicts for) the annotator's config file. For details see the docstring of
`PairwiseAnnotator`.
name : str, optional
        The name of the model to add to the leaderboard. If None, we check whether `model_outputs` has a
        `generator` column; if not, we use "Current model".
output_path : path, optional
Path to the directory where the new leaderboard and the annotations should be stored. If None we don't save.
If `auto` we use `model_outputs` if it is a path, and otherwise use the directory from which we call the script.
precomputed_leaderboard : path or data, optional
The precomputed leaderboard or a path to it (json, csv, or tsv). The leaderboard should contain at least the
column `win_rate`. If `auto` we will try to use the corresponding leaderboard for the reference outputs (only if
in CORRESPONDING_OUTPUTS_LEADERBOARDS). If `None` we won't add other models from the leaderboard.
is_overwrite_leaderboard : bool, optional
Whether to overwrite the leaderboard if the model is already in it.
leaderboard_mode_to_print : {"minimal", "verified", "community", None} or list, optional
The mode of the leaderboard to use. Only used if the precomputed leaderboard has a column `mode`, in which case
it will filter the leaderboard by this mode. If None keeps all. If a list, will print all the models in the
list.
current_leaderboard_mode : {"minimal", "verified", "community"}, optional
The mode of the leaderboard for the current method.
is_return_instead_of_print : bool, optional
Whether to return the metrics instead of printing the results.
fn_metric : str or callable, optional
        The function or function name in `metrics` used to convert preferences into metrics. The function should
        take a sequence of dict annotations, each with a `preference` key (1.5 for a draw, 1 if the baseline wins,
        2 if the model under evaluation wins), and return a dictionary of metrics including the key by which to
        sort the leaderboard. Common choices: `get_winrate`, `get_length_controlled_winrate`, `get_length_controlled_elo`.
metric_kwargs : dict, optional
Additional arguments to pass to `fn_metric`.
is_recompute_metrics_only : bool, optional
        Whether to recompute the metrics only. Useful if all you want is to recompute the metrics without reannotating.
sort_by : str, optional
The key by which to sort the leaderboard.
is_cache_leaderboard : bool, optional
        Whether to save the resulting leaderboard to `precomputed_leaderboard`. If None, we save only if
        `max_instances` is None. The preferred way of adding models to the leaderboard is to set
        `precomputed_leaderboard` to the previously saved leaderboard at `<output_path>/leaderboard.csv`.
max_instances : int, optional
The maximum number of instances to annotate. Useful for testing.
annotation_kwargs : dict, optional
Additional arguments to pass to `PairwiseAnnotator.annotate_head2head`.
Annotator : class, optional
The annotator class to use.
annotator_kwargs :
Additional arguments to pass to `PairwiseAnnotator`.
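
    Examples
    --------
    A minimal usage sketch; the outputs file and evaluator config name below are illustrative, not shipped
    defaults::

        from alpaca_eval.main import evaluate

        df_leaderboard, annotations = evaluate(
            model_outputs="outputs.json",          # rows with `instruction`, `output`, and ideally `generator`
            annotators_config="alpaca_eval_gpt4",  # name of an annotator config (assumed to exist)
            max_instances=3,                       # small smoke test
            is_return_instead_of_print=True,
        )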
"""
if (
isinstance(current_leaderboard_mode, str)
and current_leaderboard_mode not in constants.ORDERED_LEADERBOARD_MODES
):
raise ValueError(f"current_leaderboard_mode should be one of {constants.ORDERED_LEADERBOARD_MODES}")
annotation_kwargs = annotation_kwargs or dict()
leaderboard, precomputed_leaderboard = utils.get_precomputed_leaderboard(
precomputed_leaderboard, reference_outputs, annotators_config
)
annotations = None
arg_model_outputs = model_outputs
if model_outputs is not None:
model_outputs = utils.load_or_convert_to_dataframe(model_outputs)
reference_outputs = utils.load_or_convert_to_dataframe(reference_outputs)
name = utils.get_generator_name(name, model_outputs)
if (name not in leaderboard) or is_overwrite_leaderboard or is_recompute_metrics_only:
logging.info(f"Evaluating the {name} outputs.")
if not is_recompute_metrics_only:
leaderboard[name] = {}
if max_instances is not None:
                    # first we shuffle both outputs with a fixed seed => the subsample is more representative
if len(model_outputs) != len(reference_outputs):
logging.warning(
"model_outputs and reference_outputs have different lengths, so we cannot shuffle before taking the first max_instances."
)
else:
seed = 123
model_outputs = model_outputs.sample(frac=1, random_state=seed)
reference_outputs = reference_outputs.sample(frac=1, random_state=seed)
model_outputs = model_outputs[:max_instances]
reference_outputs = reference_outputs[:max_instances]
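                # head-to-head annotation: outputs_1 is the reference/baseline and outputs_2 the model being
                # evaluated, matching the preference convention (1 = baseline wins, 2 = model wins) in `fn_metric`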
annotator = Annotator(annotators_config=annotators_config, **annotator_kwargs)
annotations = annotator.annotate_head2head(
outputs_1=reference_outputs, outputs_2=model_outputs, **annotation_kwargs
)
leaderboard[name]["mode"] = current_leaderboard_mode
leaderboard[name]["avg_length"] = int(model_outputs["output"].str.len().mean())
else:
# load previously computed annotations so that we can recompute metrics
assert output_path is not None and name in leaderboard
output_path = utils.get_output_path(
output_path, arg_model_outputs, name, annotators_config=annotators_config
)
annotations = pd.read_json(output_path / "annotations.json")
# Note: I'm using _ to make clear that we may change the annotations in-place. This is bad practice
# but gives much more control for saving annotations with desired metrics. E.g. that's how we save
# "glm_preference" in the annotations
# TODO: change this and use classes
if isinstance(fn_metric, str):
fn_metric_ = getattr(metrics, fn_metric)
else:
fn_metric_ = fn_metric
leaderboard[name].update(fn_metric_(annotations, **(metric_kwargs or {})))
else:
logging.info(f"Skipping evaluation of {name} as it is already in the precomputed leaderboard.")
output_path = utils.get_output_path(output_path, arg_model_outputs, name, annotators_config=annotators_config)
df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(by=sort_by, ascending=False)
df_leaderboard = df_leaderboard[
utils.prioritize_elements(list(df_leaderboard.columns), ["win_rate", "standard_error"])
]
if output_path is not None:
logging.info(f"Saving all results to {output_path}")
df_leaderboard.to_csv(output_path / "leaderboard.csv")
if annotations is not None:
utils.convert_to_dataframe(annotations).to_json(
output_path / "annotations.json", orient="records", indent=2
)
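    # by default, only cache the leaderboard when the full evaluation set was annotated, so truncated
    # test runs (max_instances is not None) do not end up in the precomputed leaderboard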
if is_cache_leaderboard is None:
is_cache_leaderboard = max_instances is None
if is_cache_leaderboard:
if isinstance(precomputed_leaderboard, AnyPath):
logging.info(f"Saving result to the precomputed leaderboard at {precomputed_leaderboard}")
df_leaderboard.to_csv(precomputed_leaderboard)
else:
logging.info(
f"Not saving the result to the cached leaderboard because precomputed_leaderboard is not a "
f"path but {type(precomputed_leaderboard)}."
)
if is_return_instead_of_print:
return df_leaderboard, annotations
else:
utils.print_leaderboard(
df_leaderboard,
leaderboard_mode_to_print,
current_name=name,
cols_to_print=[sort_by, "win_rate", "standard_error", "n_total", "avg_length"],
)


def evaluate_from_model(
model_configs: Union[AnyPath, dict],
reference_model_configs: Optional[Union[AnyPath, dict]] = None,
evaluation_dataset: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
output_path: AnyPath = "auto",
max_instances: int = None,
is_strip_output: bool = True,
is_load_outputs: bool = True,
chunksize: int = 64,
**kwargs,
):
"""Evaluate a model from HuggingFace or an API provider. This is a wrapper around `evaluate` which includes
generating from
a desired model.
Parameters
----------
model_configs : path or dict
A dictionary or path (relative to `models_configs`) to a yaml file containing the configuration of the model to
        decode from. If a directory, we search for 'configs.yaml' in it. The keys in the first dictionary should be the
generator's name, and the value should be a dictionary of the generator's configuration which should have the
following keys:
- prompt_template (str): a prompt template or path to one. Each template should contain placeholders for
keys in the data dictionary, typically {instruction} and {output}.
        - fn_completions (str): function in `decoders` for completions. Needs to accept as first argument
          `prompts`, which is a list of strings.
- completions_kwargs (dict): kwargs for fn_completions. E.g. model_name, max_tokens, temperature...
reference_model_configs : path or dict, optional
Same as in `model_configs` but for the reference model. If None, we use the default Davinci003 outputs.
evaluation_dataset : path or callable, optional
        Path to the evaluation dataset or a function that returns a dataframe. If None, we use the default evaluation dataset.
annotators_config : path or dict, optional
Path to the annotators configuration or a dictionary. If None, we use the default annotators configuration.
output_path : path, optional
        Path to save the generations, annotations and leaderboard. If `auto`, we save at `results/<model_name>`.
max_instances : int, optional
Maximum number of instances to generate and evaluate. If None, we evaluate all instances.
is_strip_output : bool, optional
Whether to strip trailing and leading whitespaces from the outputs.
is_load_outputs : bool, optional
Whether to try to load outputs from the output path. If True and outputs exist we only generate outputs for
instructions that don't have outputs yet.
chunksize : int, optional
Number of instances to generate before saving. If None, we save after all generations.
kwargs:
Other kwargs to `evaluate`
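
    Examples
    --------
    A minimal sketch with an inline model config; the generator name, prompt path, decoder name and completion
    kwargs are illustrative (see the `models_configs` directory for real configurations)::

        evaluate_from_model(
            model_configs={
                "my_model": {
                    "prompt_template": "my_model/prompt.txt",
                    "fn_completions": "openai_completions",
                    "completions_kwargs": {"model_name": "gpt-4o-mini", "max_tokens": 512},
                }
            },
            max_instances=3,
        )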
"""
df_dataset = utils.load_or_convert_to_dataframe(evaluation_dataset)
if chunksize is not None and not is_load_outputs:
logging.info("`is_load_outputs` has to be true to use chunksize. Setting it to True.")
is_load_outputs = True
if chunksize is not None and max_instances is not None:
logging.info("cannot use `chunksize` with max_instances. Setting `chunksize` to None.")
chunksize = None
base_dir = Path(kwargs.get("base_dir", constants.MODELS_CONFIG_DIR))
model_configs = utils.load_configs(model_configs, relative_to=base_dir)
if reference_model_configs is not None:
reference_model_configs = utils.load_configs(reference_model_configs, relative_to=base_dir)
if output_path == "auto":
output_path = Path("results") / list(model_configs.keys())[0]
if output_path is not None:
output_path = Path(output_path)
output_path.mkdir(exist_ok=True, parents=True)

    def get_completions(configs, df: pd.DataFrame, old_output_path: Optional[Path] = None):
columns_to_keep = ["dataset", "instruction", "output", "generator"]
columns_to_keep = [c for c in columns_to_keep if c in df.columns]
curr_outputs = df[columns_to_keep].copy()
is_loading_old_outputs = old_output_path is not None and old_output_path.exists()
assert len(configs) == 1
generator = list(configs.keys())[0]
configs = list(configs.values())[0]
if is_loading_old_outputs:
logging.info(f"Loading outputs from {old_output_path}")
old_outputs = utils.load_or_convert_to_dataframe(old_output_path)
# select only rows in curr_outputs that have "instruction" that are not in old_outputs
idx_found_old_outputs = curr_outputs["instruction"].isin(old_outputs["instruction"])
curr_outputs = curr_outputs[~idx_found_old_outputs]
assert (old_outputs["generator"] == generator).all()
logging.info(f"Found {len(old_outputs)}. Only generating {len(curr_outputs)} .")
if max_instances is not None:
curr_outputs = curr_outputs.iloc[:max_instances]
if len(curr_outputs) > 0:
prompts, _ = utils.make_prompts(
curr_outputs,
template=utils.read_or_return(base_dir / configs["prompt_template"]),
)
fn_completions = decoders.get_fn_completions(configs["fn_completions"])
completions = fn_completions(prompts=prompts, **configs["completions_kwargs"])["completions"]
if is_strip_output:
completions = [c.strip() for c in completions]
curr_outputs["output"] = completions
curr_outputs["generator"] = generator
if is_loading_old_outputs:
curr_outputs = pd.concat([old_outputs, curr_outputs], axis=0)
return curr_outputs
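    # generate outputs chunk by chunk, rewriting model_outputs.json after each chunk so that long decoding
    # jobs can be interrupted and resumed (instructions already present in the file are skipped above)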
for df_chunk in utils.dataframe_chunk_generator(
df_dataset, chunksize=chunksize, tqdm_desc="Chunking for generation"
):
if is_load_outputs and output_path is not None:
model_outputs = get_completions(
model_configs, df=df_chunk, old_output_path=output_path / "model_outputs.json"
)
else:
model_outputs = get_completions(model_configs, df=df_chunk)
if reference_model_configs is None:
if "output" not in df_chunk.columns:
raise ValueError("evaluation_dataset should have a column 'output' containing references outputs")
reference_outputs = df_dataset.copy()
else:
reference_outputs = get_completions(
reference_model_configs,
df=df_chunk,
old_output_path=output_path / "reference_outputs.json",
)
if output_path is not None:
model_outputs.to_json(output_path / "model_outputs.json", orient="records", indent=2)
reference_outputs.to_json(output_path / "reference_outputs.json", orient="records", indent=2)
if reference_model_configs is None:
        # using the default reference outputs => use the corresponding precomputed leaderboard
if evaluation_dataset in [constants.ALPACAEVAL_REFERENCE_OUTPUTS]:
reference_outputs = evaluation_dataset
return evaluate(
model_outputs=model_outputs,
reference_outputs=reference_outputs,
annotators_config=annotators_config,
output_path=output_path,
max_instances=max_instances,
**kwargs,
)


def make_leaderboard(
leaderboard_path: Optional[AnyPath] = None,
annotators_config: AnyPath = constants.DEFAULT_ANNOTATOR_CONFIG,
all_model_outputs: AnyLoadableDF = constants.ALPACAFARM_ALL_OUTPUTS,
reference_outputs: AnyLoadableDF = constants.ALPACAEVAL_REFERENCE_OUTPUTS,
fn_add_to_leaderboard: Callable = "evaluate",
leaderboard_mode: str = "verified",
is_return_instead_of_print: bool = False,
**kwargs,
):
"""Precompute and save an entire leaderboard for a given dataset / evaluator / set of models generations.
Parameters
----------
leaderboard_path : path
        The path to save the leaderboard to. The leaderboard will be saved as a csv file; if it already exists, we
        will append to it.
annotators_config : path or list of dict, optional
        The path to (or list of dicts for) the annotator's config file.
all_model_outputs : path or data or callable, optional
The outputs of all models to add to the leaderboard. Accepts data (list of dictionary, pd.dataframe,
datasets.Dataset) or a path to read those (json, csv, tsv potentially with globbing) or a function to generate
those. If the path contains a globbing pattern, we will read all files matching the pattern and concatenate
them. Each dictionary (or row of dataframe) should contain the keys that are formatted in the prompts. E.g. by
default `instruction` and `output` with optional `input`. It should also contain a column `generator` with the
name of the current model. Could also be a list of the above, in which case the output is the concatenation.
reference_outputs : path or data, optional
The outputs of the reference model. Same format as `all_model_outputs` but without needing `generator`. By
        default, the reference outputs are the text-davinci-003 outputs on the AlpacaEval set.
fn_add_to_leaderboard : callable or str, optional
The function to use to add a model to the leaderboard. If a string, it should be the name of a function in
`main.py`. The function should take the arguments: `model_outputs`, `annotators_config`, `name`,
`precomputed_leaderboard`, `is_return_instead_of_print`, `reference_outputs`.
leaderboard_mode : {"minimal", "verified", "community"}, optional
The mode of the leaderboard to save all new entries with.
is_return_instead_of_print : bool, optional
Whether to return the metrics instead of printing the results.
kwargs :
Additional arguments to pass to `fn_add_to_leaderboard`.
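
    Examples
    --------
    A sketch assuming `all_outputs.json` contains the outputs of several models, each row carrying a
    `generator` column (file and config names are illustrative)::

        make_leaderboard(
            leaderboard_path="my_leaderboard.csv",
            all_model_outputs="all_outputs.json",
            annotators_config="alpaca_eval_gpt4",
        )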
"""
if isinstance(fn_add_to_leaderboard, str):
fn_add_to_leaderboard = globals()[fn_add_to_leaderboard]
if leaderboard_path is None:
assert isinstance(annotators_config, str) and "/" not in annotators_config, (
"If `leaderboard_path` is None, `annotators_config` should be a string with the name of the annotator "
"configuration."
)
leaderboard_path = Path(constants.ALPACAEVAL_LEADERBOARD_PATHS) / f"{annotators_config}_leaderboard.csv"
Path(leaderboard_path).parent.mkdir(exist_ok=True, parents=True)
all_model_outputs = utils.load_or_convert_to_dataframe(all_model_outputs)
if "generator" not in all_model_outputs.columns:
raise ValueError(f"all_model_outputs should have a column 'generator' with the name of the model.")
all_annotations = []
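    # evaluate each model's outputs against the shared reference outputs, appending its row (and
    # annotations) to the leaderboard and re-saving the csv after every model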
for model in all_model_outputs["generator"].unique():
model_outputs = all_model_outputs[all_model_outputs["generator"] == model]
df_leaderboard, annotations = fn_add_to_leaderboard(
model_outputs=model_outputs,
reference_outputs=reference_outputs,
annotators_config=annotators_config,
name=model,
precomputed_leaderboard=leaderboard_path,
is_return_instead_of_print=True,
current_leaderboard_mode=leaderboard_mode,
**kwargs,
)
if annotations is not None:
all_annotations += annotations
df_leaderboard.to_csv(leaderboard_path)
leaderboard = utils.load_or_convert_to_dataframe(leaderboard_path)
df_leaderboard = pd.DataFrame(leaderboard)
if is_return_instead_of_print:
return df_leaderboard, all_annotations
else:
utils.print_leaderboard(
df_leaderboard, leaderboard_mode=None, cols_to_print=["win_rate", "standard_error", "n_total"]
)


def analyze_evaluators(
annotators_config: Optional[AnyPath] = constants.DEFAULT_ANNOTATOR_CONFIG,
Annotator=annotators.PairwiseAnnotator,
analyzer_kwargs: Optional[dict] = None,
precomputed_leaderboard: Optional[Union[AnyPath, AnyData]] = CUR_DIR
/ "leaderboards/evaluators/evaluators_leaderboard.csv",
is_save_leaderboard: bool = False,
is_return_instead_of_print: bool = False,
is_overwrite_leaderboard: bool = False,
max_instances: Optional[int] = None,
is_single_annotator: bool = False,
leaderboard_mode_to_print: str = "minimal",
current_leaderboard_mode: str = "minimal",
output_path: Optional[Union[AnyPath, str]] = "auto",
**annotator_kwargs,
):
"""Analyze an evaluator and populates the evaluators leaderboard (agreement with human, speed, price,...).
Parameters
----------
annotators_config : path or list of dict, optional
        The path to (or list of dicts for) the annotator's config file.
Annotator : class, optional
The annotator class to use.
analyzer_kwargs : dict, optional
Additional arguments to pass to the analyzer.
precomputed_leaderboard : path or data, optional
The precomputed (meta)leaderboard of annotators or a path to it (json, csv, or tsv).
is_save_leaderboard : bool, optional
        Whether to save the leaderboard (i.e., the analyzed results).
is_return_instead_of_print : bool, optional
        Whether to return the leaderboard (i.e., the analyzed results). If True, we do not print the results.
is_overwrite_leaderboard : bool, optional
Whether to overwrite the leaderboard if it already exists.
max_instances : int, optional
The maximum number of instances to analyze.
is_single_annotator : bool, optional
Whether to analyze a single annotator. If True, will not be able to estimate the annotator's bias.
leaderboard_mode_to_print : {"minimal", "verified", "community"}, optional
The mode of the leaderboard to print.
current_leaderboard_mode : {"minimal", "verified", "community"}, optional
The mode of the leaderboard to save all new entries with.
output_path : path, optional
        Path to save the leaderboard and annotations. If None, we don't save.
annotator_kwargs :
Additional arguments to pass to `Annotator`.
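
    Examples
    --------
    A sketch that scores a single annotator configuration against the gold human cross-annotations
    (the config name is illustrative)::

        analyze_evaluators(
            annotators_config="alpaca_eval_gpt4",
            max_instances=10,
            is_single_annotator=True,
        )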
"""
leaderboard = dict()
if precomputed_leaderboard is not None:
try:
leaderboard = utils.load_or_convert_to_dataframe(precomputed_leaderboard).to_dict(orient="index")
except FileNotFoundError:
logging.warning(
f"Could not find precomputed leaderboard at {precomputed_leaderboard}. Starting from " f"scratch."
)
analyzer_kwargs = analyzer_kwargs or {}
all_crossannotations = dict()
key = None
if annotators_config is not None:
key = annotators_config.replace("/", "_").replace("_configs.yaml", "")
if key not in leaderboard or is_overwrite_leaderboard:
analyzer = analyze.Analyzer(**analyzer_kwargs)
if key == "humans":
df_crossannotations = analyzer.df_gold_crossannotations
elif key == "longest":
df_crossannotations = analyze._get_longest_predictor(analyzer.df_gold_crossannotations)
else:
annotator_kwargs = annotator_kwargs or {}
df_crossannotations = analyze.get_crossannotations(
analyzer=analyzer,
Annotator=Annotator,
max_instances=max_instances,
annotators_config=annotators_config,
is_single_annotator=is_single_annotator,
**annotator_kwargs,
)
leaderboard[key] = analyze.get_metrics_evaluator(analyzer, df_crossannotations, evaluator_name=key)
leaderboard[key]["mode"] = current_leaderboard_mode
all_crossannotations[key] = df_crossannotations
df_leaderboard = pd.DataFrame.from_dict(leaderboard, orient="index").sort_values(
by="Human agreement", ascending=False
)
df_leaderboard = df_leaderboard[
utils.prioritize_elements(list(df_leaderboard.columns), constants.EVALUATORS_LEADERBOARD_COLS_TO_PRIORITIZE)
]
if is_save_leaderboard:
df_leaderboard.to_csv(precomputed_leaderboard)
if key is not None and output_path is not None:
output_path = utils.get_output_path(output_path, annotators_config, key, dflt_dir="results_evaluators")
if isinstance(annotators_config, str) and "/" not in annotators_config:
output_path = Path(output_path) / annotators_config
output_path.mkdir(exist_ok=True, parents=True)
logging.info(f"Saving all results to {output_path}")
        df_leaderboard.to_csv(output_path / "leaderboard.csv")
for annotator_name, df_crossannotations in all_crossannotations.items():
            annotations_name = "annotation.json"
df_crossannotations.to_json(output_path / annotations_name, orient="records", indent=2)
if is_return_instead_of_print:
return df_leaderboard, all_crossannotations
else:
utils.print_leaderboard(
df_leaderboard, leaderboard_mode_to_print, cols_to_print=constants.EVALUATORS_LEADERBOARD_COLS_TO_PRINT
)


ALL_FUNCTIONS = {
"evaluate": evaluate,
"evaluate_from_model": evaluate_from_model,
"make_leaderboard": make_leaderboard,
"analyze_evaluators": analyze_evaluators,
}


def main():
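    # if the first CLI argument is a bare sub-command name (e.g. `evaluate_from_model`) or `--help` is
    # given, expose all entrypoints through fire; otherwise fall back to the default `evaluate` command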
is_fn_name = len(sys.argv) > 1 and "--" not in sys.argv[1]
is_help = any(a == "--help" for a in sys.argv)
if is_fn_name or is_help:
fire.Fire(ALL_FUNCTIONS)
else:
# default behavior if no function is specified
fire.Fire(evaluate)


if __name__ == "__main__":
fire.Fire(ALL_FUNCTIONS)