
Commit 84ee579

Refactor metrics per filter (#250)
* Refactor metrics per filter
* Apply suggestions from code review
  Co-authored-by: Lindsay Brin <[email protected]>
* Add profiling details and dependency
* Change according to review
* Change threading fct

Co-authored-by: Lindsay Brin <[email protected]>
1 parent e9dcea0 commit 84ee579

14 files changed: +284, -127 lines

.gitignore (+1)

@@ -9,6 +9,7 @@ cache
 *.swp
 !poetry.lock
 !yarn.lock
+tests/logs.txt
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

azimuth/modules/base_classes/aggregation_module.py (+3, -43)

@@ -3,14 +3,13 @@
 # in the root directory of this source tree.
 import time
 from abc import ABC
-from typing import List, Optional, cast
+from typing import List, Optional
 
 from datasets import Dataset
 
 from azimuth.modules.base_classes import ConfigScope, ExpirableMixin, Module
-from azimuth.types import DatasetColumn, DatasetSplitName, ModuleOptions, ModuleResponse
-from azimuth.types.outcomes import OutcomeName
-from azimuth.utils.filtering import filter_dataset_split
+from azimuth.types import DatasetSplitName, ModuleOptions, ModuleResponse
+from azimuth.utils.dataset_operations import filter_dataset_split
 
 
 class AggregationModule(Module[ConfigScope], ABC):

@@ -62,42 +61,3 @@ def get_dataset_split(self, name: DatasetSplitName = None) -> Dataset:
             config=self.config,
             without_postprocessing=self.mod_options.without_postprocessing,
         )
-
-    def _get_predictions_from_ds(self) -> List[int]:
-        """Get predicted classes according to the module options (with or without postprocessing).
-
-        Returns: List of Predictions
-        """
-        ds = self.get_dataset_split()
-        if self.mod_options.without_postprocessing:
-            return cast(List[int], [preds[0] for preds in ds[DatasetColumn.model_predictions]])
-        else:
-            return cast(List[int], ds[DatasetColumn.postprocessed_prediction])
-
-    def _get_confidences_from_ds(self) -> List[List[float]]:
-        """Get confidences according to the module options (with or without postprocessing).
-
-        Notes: Confidences are sorted according to their values (not the class id).
-
-        Returns: List of Confidences
-        """
-        ds = self.get_dataset_split()
-        confidences = (
-            ds[DatasetColumn.model_confidences]
-            if self.mod_options.without_postprocessing
-            else ds[DatasetColumn.postprocessed_confidences]
-        )
-        return cast(List[List[float]], confidences)
-
-    def _get_outcomes_from_ds(self) -> List[OutcomeName]:
-        """Get outcomes according to the module options (with or without postprocessing).
-
-        Returns: List of Outcomes
-        """
-        ds = self.get_dataset_split()
-        outcomes = (
-            ds[DatasetColumn.model_outcome]
-            if self.mod_options.without_postprocessing
-            else ds[DatasetColumn.postprocessed_outcome]
-        )
-        return cast(List[OutcomeName], outcomes)
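The three removed helpers are replaced by module-level functions in the new azimuth/utils/dataset_operations module, which take the dataset split and a without_postprocessing flag explicitly instead of reading self.mod_options. That file is not part of this view, so the sketch below is only an approximation reconstructed from the removed method bodies above and from the call sites added in the files further down:

# Approximate shape of the new helpers in azimuth/utils/dataset_operations.py
# (file not shown in this diff); names and signatures inferred from call sites.
from typing import List, cast

from datasets import Dataset

from azimuth.types import DatasetColumn
from azimuth.types.outcomes import OutcomeName


def get_predictions_from_ds(ds: Dataset, without_postprocessing: bool = False) -> List[int]:
    """Predicted classes, with or without pipeline postprocessing."""
    if without_postprocessing:
        return cast(List[int], [preds[0] for preds in ds[DatasetColumn.model_predictions]])
    return cast(List[int], ds[DatasetColumn.postprocessed_prediction])


def get_outcomes_from_ds(ds: Dataset, without_postprocessing: bool = False) -> List[OutcomeName]:
    """Outcomes, with or without pipeline postprocessing."""
    column = (
        DatasetColumn.model_outcome
        if without_postprocessing
        else DatasetColumn.postprocessed_outcome
    )
    return cast(List[OutcomeName], ds[column])

# get_confidences_from_ds and filter_dataset_split presumably follow the same pattern,
# with filter_dataset_split moved here from azimuth.utils.filtering.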

azimuth/modules/model_performance/confidence_binning.py (+46, -11)

@@ -16,6 +16,10 @@
     ConfidenceHistogramResponse,
 )
 from azimuth.types.outcomes import ALL_OUTCOMES, OutcomeName
+from azimuth.utils.dataset_operations import (
+    get_confidences_from_ds,
+    get_outcomes_from_ds,
+)
 from azimuth.utils.validation import assert_not_none
 
 CONFIDENCE_BINS_COUNT = 20

@@ -24,23 +28,38 @@
 class ConfidenceHistogramModule(FilterableModule[ModelContractConfig]):
     """Return a confidence histogram of the predictions."""
 
-    def get_outcome_mask(self, outcome: OutcomeName) -> List[bool]:
-        return [utterance_outcome == outcome for utterance_outcome in self._get_outcomes_from_ds()]
-
-    def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]:  # type: ignore
-        """Compute the confidence histogram with CONFIDENCE_BINS_COUNT bins on the dataset split.
+    @staticmethod
+    def get_outcome_mask(
+        ds, outcome: OutcomeName, without_postprocessing: bool = False
+    ) -> List[bool]:
+        return [
+            utterance_outcome == outcome
+            for utterance_outcome in get_outcomes_from_ds(ds, without_postprocessing)
+        ]
+
+    @classmethod
+    def get_bins(
+        cls, ds: Dataset, without_postprocessing: bool = False
+    ) -> List[ConfidenceBinDetails]:
+        """Compute the bins on the specified dataset split.
+
+        Note: This lives outside of `compute_on_dataset_split()` so that it can be called without
+            going through calling the module and filtering the dataset.
+
+        Args:
+            ds: Dataset Split on which to compute bins
+            without_postprocessing: Whether to use outcomes and confidences without pipeline
+                postprocessing
 
         Returns:
             List of the confidence bins with their confidence and the outcome count.
-
         """
-        bins = np.linspace(0, 1, CONFIDENCE_BINS_COUNT + 1)
 
-        ds: Dataset = assert_not_none(self.get_dataset_split())
+        bins = np.linspace(0, 1, CONFIDENCE_BINS_COUNT + 1)
 
         if len(ds) > 0:
             # Get the bin index for each prediction.
-            confidences = np.max(self._get_confidences_from_ds(), axis=1)
+            confidences = np.max(get_confidences_from_ds(ds, without_postprocessing), axis=1)
             bin_indices = np.floor(confidences * CONFIDENCE_BINS_COUNT)
 
             # Create the records. We drop the last bin as it's the maximum.

@@ -50,7 +69,7 @@ def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]:  # type
                 outcome_count = defaultdict(int)
                 for outcome in ALL_OUTCOMES:
                     outcome_count[outcome] = np.logical_and(
-                        bin_mask, self.get_outcome_mask(outcome)
+                        bin_mask, cls.get_outcome_mask(ds, outcome, without_postprocessing)
                     ).sum()
                 mean_conf = (
                     0 if bin_mask.sum() == 0 else np.nan_to_num(confidences[bin_mask].mean())

@@ -75,7 +94,23 @@ def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]:  # type
                 for bin_index, bin_min_value in enumerate(bins[:-1])
             ]
 
-        return [ConfidenceHistogramResponse(bins=result, confidence_threshold=self.get_threshold())]
+        return result
+
+    def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]:  # type: ignore
+        """Compute the confidence histogram with CONFIDENCE_BINS_COUNT bins on the dataset split.
+
+        Returns:
+            Confidence bins and threshold.
+
+        """
+        ds: Dataset = assert_not_none(self.get_dataset_split())
+
+        return [
+            ConfidenceHistogramResponse(
+                bins=self.get_bins(ds, self.mod_options.without_postprocessing),
+                confidence_threshold=self.get_threshold(),
+            )
+        ]
 
 
 class ConfidenceBinIndexModule(DatasetResultModule[ModelContractConfig]):
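Exposing get_bins as a classmethod lets callers bin an already-filtered split without instantiating and dispatching the module, which is exactly what MetricsModule relies on in metrics.py below. A minimal usage sketch, assuming ds is any (possibly filtered) Dataset split with prediction columns:

# Usage sketch: bin a pre-filtered split directly, then derive ECE as metrics.py does.
from azimuth.utils.ml.ece import compute_ece_from_bins

bins = ConfidenceHistogramModule.get_bins(ds, without_postprocessing=False)
ece, acc, expected = compute_ece_from_bins(bins)
count_per_bin = [sum(b.outcome_count.values()) for b in bins]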

azimuth/modules/model_performance/confusion_matrix.py (+2, -1)

@@ -13,6 +13,7 @@
 from azimuth.config import ModelContractConfig
 from azimuth.modules.base_classes import FilterableModule
 from azimuth.types.model_performance import ConfusionMatrixResponse
+from azimuth.utils.dataset_operations import get_predictions_from_ds
 from azimuth.utils.validation import assert_not_none
 
 MIN_CONFUSION_CUTHILL_MCKEE = 0.1

@@ -35,7 +36,7 @@ def compute_on_dataset_split(self) -> List[ConfusionMatrixResponse]:  # type: ig
         """
         ds: Dataset = assert_not_none(self.get_dataset_split())
         predictions, labels = (
-            self._get_predictions_from_ds(),
+            get_predictions_from_ds(ds, self.mod_options.without_postprocessing),
             ds[self.config.columns.label],
         )
         ds_mng = self.get_dataset_split_manager()
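For intuition, the predictions/labels pair extracted here is the raw input to the confusion matrix; a minimal stand-in using scikit-learn (the module's actual matrix computation is outside this diff, so this is only an illustration, not the project's implementation):

# Illustration only; the real computation in this module is not shown in the diff.
from sklearn.metrics import confusion_matrix

class_ids = list(range(len(ds_mng.get_class_names())))  # get_class_names() as used in metrics.py
cm = confusion_matrix(labels, predictions, labels=class_ids)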

azimuth/modules/model_performance/metrics.py (+76, -57)

@@ -33,6 +33,12 @@
     SmartTag,
     SmartTagFamily,
 )
+from azimuth.utils.dataset_operations import (
+    filter_dataset_split,
+    get_confidences_from_ds,
+    get_outcomes_from_ds,
+    get_predictions_from_ds,
+)
 from azimuth.utils.ml.ece import compute_ece_from_bins
 from azimuth.utils.ml.model_performance import sorted_by_utterance_count_with_last
 from azimuth.utils.validation import assert_not_none

@@ -58,65 +64,78 @@ def first_value(di: Optional[Dict]) -> Optional[float]:
 class MetricsModule(FilterableModule[ModelContractConfig]):
     """Computes different metrics on each dataset split."""
 
-    def compute_on_dataset_split(self) -> List[MetricsModuleResponse]:  # type: ignore
-        ds: Dataset = assert_not_none(self.get_dataset_split())
-        indices = self.get_indices()
-        if len(indices) == 0:
+    def compute_metrics(self, ds: Dataset) -> List[MetricsModuleResponse]:
+        """Compute all metrics on the specified dataset split.
+
+        Note: This lives outside of `compute_on_dataset_split()` so that it can be called without
+            going through calling the module and filtering the dataset.
+
+        Args:
+            ds: Dataset Split for which to compute metrics.
+
+        Returns:
+            MetricsModuleResponse with all metrics.
+        """
+        if len(ds) == 0:
             # Nothing to do, we return an empty response.
             return [BASE_RESPONSE]
-
-        utterance_count = len(indices)
-        outcome_count = Counter(self._get_outcomes_from_ds())
-        outcome_count.update({outcome: 0 for outcome in ALL_OUTCOMES})
-
-        # Compute ECE
-        conf_hist_mod = ConfidenceHistogramModule(
-            dataset_split_name=self.dataset_split_name,
-            config=self.config,
-            mod_options=self.mod_options,
-        )
-        bins = conf_hist_mod.compute_on_dataset_split()[0].bins
-        ece, acc, expected = compute_ece_from_bins(bins)
-        count_per_bin = [sum(b.outcome_count.values()) for b in bins]
-
-        metric_values = {}
-        dm = self.get_dataset_split_manager()
-        for metric_name, metric_obj_def in self.config.metrics.items():
-            met: Metric = self.artifact_manager.get_metric(
-                self.config,
-                metric_name,
-                label_list=dm.get_class_names(),
-                rejection_class_idx=dm.rejection_class_idx,
-                force_kwargs=True,  # Set True here as load_metrics has **kwargs.
+        else:
+            utterance_count = len(ds)
+            outcome_count = Counter(
+                get_outcomes_from_ds(ds, self.mod_options.without_postprocessing)
             )
-            accept_probabilities = "probabilities" in inspect.signature(met._compute).parameters
-            extra_kwargs = (
-                dict(probabilities=self.make_probabilities()) if accept_probabilities else {}
-            )
-            extra_kwargs.update(metric_obj_def.additional_kwargs)
-            with warnings.catch_warnings():
-                # Ignore warnings such as
-                # UndefinedMetricWarning: Precision is ill-defined and being set to 0.0
-                warnings.simplefilter("ignore", category=UndefinedMetricWarning)
-                metric_values[metric_name] = assert_not_none(
-                    first_value(
-                        met.compute(
-                            predictions=self._get_predictions_from_ds(),
-                            references=ds[self.config.columns.label],
-                            **extra_kwargs,
+            outcome_count.update({outcome: 0 for outcome in ALL_OUTCOMES})
+
+            # Compute ECE
+            bins = ConfidenceHistogramModule.get_bins(ds, self.mod_options.without_postprocessing)
+            ece, acc, expected = compute_ece_from_bins(bins)
+            count_per_bin = [sum(b.outcome_count.values()) for b in bins]
+
+            metric_values = {}
+            dm = self.get_dataset_split_manager()
+            for metric_name, metric_obj_def in self.config.metrics.items():
+                met: Metric = self.artifact_manager.get_metric(
+                    self.config,
+                    metric_name,
+                    label_list=dm.get_class_names(),
+                    rejection_class_idx=dm.rejection_class_idx,
+                    force_kwargs=True,  # Set True here as load_metrics has **kwargs.
+                )
+                accept_probabilities = "probabilities" in inspect.signature(met._compute).parameters
+                extra_kwargs = (
+                    dict(probabilities=self.make_probabilities()) if accept_probabilities else {}
+                )
+                extra_kwargs.update(metric_obj_def.additional_kwargs)
+                with warnings.catch_warnings():
+                    # Ignore warnings such as
+                    # UndefinedMetricWarning: Precision is ill-defined and being set to 0.0
+                    warnings.simplefilter("ignore", category=UndefinedMetricWarning)
+                    metric_values[metric_name] = assert_not_none(
+                        first_value(
+                            met.compute(
+                                predictions=get_predictions_from_ds(
+                                    ds, self.mod_options.without_postprocessing
+                                ),
+                                references=ds[self.config.columns.label],
+                                **extra_kwargs,
+                            )
                         )
                     )
-                )
+
+            return [
+                MetricsModuleResponse(
+                    outcome_count=outcome_count,
+                    ece=ece,
+                    ece_plot_args=(acc, expected, ece, count_per_bin),
+                    utterance_count=utterance_count,
+                    custom_metrics=metric_values,
                 )
-            )
+            ]
 
-        return [
-            MetricsModuleResponse(
-                outcome_count=outcome_count,
-                ece=ece,
-                ece_plot_args=(acc, expected, ece, count_per_bin),
-                utterance_count=utterance_count,
-                custom_metrics=metric_values,
-            )
-        ]
+    def compute_on_dataset_split(self) -> List[MetricsModuleResponse]:  # type: ignore
+        """Computes different metrics according to the specified module options."""
+        ds: Dataset = assert_not_none(self.get_dataset_split())
+        return self.compute_metrics(ds)
 
     @staticmethod
     def module_to_api_response(res: List[MetricsModuleResponse]) -> List[MetricsAPIResponse]:

@@ -150,7 +169,7 @@ def make_probabilities(self) -> np.ndarray:
         probs = np.zeros([len(ds), num_classes])
         for idx, (confidences, predictions) in enumerate(
             zip(
-                self._get_confidences_from_ds(),
+                get_confidences_from_ds(ds, self.mod_options.without_postprocessing),
                 ds[DatasetColumn.model_predictions],
             )
         ):

@@ -173,14 +192,14 @@ def get_metrics_for_filter(
         Returns:
             Metrics for all provided filters.
         """
+        ds = self.get_dataset_split()
         accumulator = []
         for filter_value, filters in filters_dict.items():
-            met_module = MetricsModule(
+            ds_filtered = filter_dataset_split(ds, filters, config=self.config)
+            metric = MetricsModule(
                 dataset_split_name=self.dataset_split_name,
                 config=self.config,
-                mod_options=self.mod_options.copy(update={"filters": filters}),
-            )
-            metric = met_module.compute_on_dataset_split()[0]
+            ).compute_metrics(ds_filtered)[0]
             accumulator.append(MetricsPerFilterValue(**metric.dict(), filter_value=filter_value))
         return accumulator
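The net effect of this file's changes is that per-filter metrics no longer round-trip through a fully configured child module per filter: get_metrics_for_filter filters the split once with filter_dataset_split and hands the result straight to compute_metrics. A hypothetical standalone use of the same path, assuming a loaded config, a DatasetFilters instance filters, and that DatasetSplitName exposes an eval member (none of these are shown in the diff):

# Hypothetical direct call; names other than those appearing in the diff are assumptions.
module = MetricsModule(dataset_split_name=DatasetSplitName.eval, config=config)
slice_ds = filter_dataset_split(module.get_dataset_split(), filters, config=config)
response = module.compute_metrics(slice_ds)[0]
print(response.utterance_count, response.ece, response.custom_metrics)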

azimuth/modules/model_performance/outcome_count.py (+10, -3)

@@ -23,6 +23,7 @@
     SMART_TAGS_FAMILY_MAPPING,
     SmartTag,
 )
+from azimuth.utils.dataset_operations import get_outcomes_from_ds
 from azimuth.utils.ml.model_performance import (
     sorted_by_utterance_count,
     sorted_by_utterance_count_with_last,

@@ -50,7 +51,9 @@ def get_outcome_count_per_class(
         """
         outcome_count_per_class: Dict[Tuple[str, OutcomeName], int] = defaultdict(int)
 
-        for utterance_class, outcome in zip(ds[dataset_column], self._get_outcomes_from_ds()):
+        for utterance_class, outcome in zip(
+            ds[dataset_column], get_outcomes_from_ds(ds, self.mod_options.without_postprocessing)
+        ):
             outcome_count_per_class[(dm.get_class_names()[utterance_class], outcome)] += 1
 
         return sorted_by_utterance_count_with_last(

@@ -77,7 +80,9 @@ def get_outcome_count_per_tag(
         all_tags = dm.get_tags(
             indices=assert_is_list(ds[DatasetColumn.row_idx]), table_key=self._get_table_key()
         )
-        for utterance_tags, outcome in zip(all_tags, self._get_outcomes_from_ds()):
+        for utterance_tags, outcome in zip(
+            all_tags, get_outcomes_from_ds(ds, self.mod_options.without_postprocessing)
+        ):
             no_tag = True
             for filter_, tagged in utterance_tags.items():
                 if tagged and filter_ in filters[:-1]:

@@ -100,7 +105,9 @@ def get_outcome_count_per_outcome(self, ds: Dataset) -> List[OutcomeCountPerFilt
             List of Outcome Count for each outcome.
 
         """
-        outcome_count = defaultdict(int, Counter(self._get_outcomes_from_ds()))
+        outcome_count = defaultdict(
+            int, Counter(get_outcomes_from_ds(ds, self.mod_options.without_postprocessing))
+        )
         empty_outcome_count = {outcome: 0 for outcome in OutcomeName}
 
         metrics = [
