I want to implement a Performance Benchmark Python module. The idea: for a task (e.g. text-classification), given models and a corresponding dataset, compute performance metrics, disk size, and latency. Here is the abstract base class:
from abc import ABC, abstractmethod
from pathlib import Path
from time import perf_counter

import numpy as np
import torch
import transformers
from datasets import load_metric


class PerformanceBenchmark(ABC):
    """Task-agnostic base class."""

    def __init__(self, pipeline):
        assert isinstance(pipeline, transformers.Pipeline)
        self.pipeline = pipeline
        self.dataset = None  # task specific

    @abstractmethod
    def compute_performance(self, dataset) -> dict:
        """Abstract method.

        Example:
            preds, labels = [], []
            for example in self.dataset:
                preds.append(self.pipeline(example['input']))
                labels.append(example['label'])
            score = self.metric.compute(predictions=preds, references=labels)
            return {self.metric.name: score}
        """

    def compute_size(self) -> dict:
        # Serialize the weights to a temporary file to measure on-disk size.
        state_dict = self.pipeline.model.state_dict()
        tmp = Path("model.pt")
        torch.save(state_dict, tmp)
        size_mb = tmp.stat().st_size / (1024 * 1024)
        tmp.unlink()
        print(f"Model size (MB) - {size_mb:.2f}")
        return {'size_mb': size_mb}

    def compute_time(self) -> dict:
        # Time 100 single-example passes through the pipeline.
        latencies = []
        for _ in range(100):
            start = perf_counter()
            _ = self.pipeline(self.dataset[0]['text'])
            latencies.append(perf_counter() - start)
        time_avg_ms = 1_000 * np.mean(latencies)
        time_std_ms = 1_000 * np.std(latencies)
        print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {'time_avg_ms': time_avg_ms, 'time_std_ms': time_std_ms}

    def run_benchmark(self):
        metrics = {
            **self.compute_size(),
            **self.compute_time(),
            **self.compute_performance(self.dataset),
        }
        return metrics
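Before wiring up a task-specific subclass, the size and latency paths can be smoke-tested on their own. A minimal sketch, assuming a throwaway subclass is acceptable and the sst-2 checkpoint below is available (both are assumptions, not part of the module):

from transformers import pipeline

class _SmokeTestBenchmark(PerformanceBenchmark):
    # Hypothetical subclass, only here to satisfy the abstract method.
    def compute_performance(self, dataset) -> dict:
        return {}

pipe = pipeline('text-classification',
                model='distilbert-base-uncased-finetuned-sst-2-english')
bench = _SmokeTestBenchmark(pipe)
bench.dataset = [{'text': 'I love this movie!'}]  # compute_time reads dataset[0]['text']
print(bench.compute_size())   # {'size_mb': ...}
print(bench.compute_time())   # {'time_avg_ms': ..., 'time_std_ms': ...}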
Here is the task-specific class for text-classification:
from nlphub import PerformanceBenchmark
from nlphub.utils import rename_dataset_label_key
import datasets


class ClassificationBenchmark(PerformanceBenchmark):
    """Measure latency, disk size, and performance on the input dataset."""

    def __init__(self, pipeline, dataset):
        super().__init__(pipeline)
        self.dataset = dataset  # compute_time reads self.dataset[0]['text']
        # Parse the label key from the dataset features (e.g. 'label').
        self._label = list(dataset.features.keys())[1]
        self.features = dataset.features[self._label]

    def compute_performance(self, dataset) -> dict:
        assert isinstance(dataset, datasets.Dataset), 'dataset is not of type datasets.Dataset'
        rename_dataset_label_key(dataset)
        assert 'text' in dataset.column_names and 'label' in dataset.column_names, \
            "dataset doesn't contain 'text' or 'label' columns"
        preds, labels = [], []
        for example in dataset:
            # The pipeline returns a list with one dict per input,
            # e.g. [{'label': 'joy', 'score': 0.98}].
            pred = self.pipeline(example['text'])
            pred_int = self.features.str2int(pred[0]['label'])
            preds.append(pred_int)
            labels.append(example['label'])
        score = self.metric.compute(predictions=preds, references=labels)
        return {self.metric.name: score}
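For reference, a usage sketch (the 'emotion' dataset ships 'text' and 'label' columns; the checkpoint name below is a hypothetical placeholder, substitute any model fine-tuned on the same label set):

from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset('emotion', split='test')
pipe = pipeline('text-classification',
                model='my-org/distilbert-finetuned-emotion')  # hypothetical checkpoint
bench = ClassificationBenchmark(pipe, dataset)
results = bench.run_benchmark()  # NOTE: fails until self.metric is defined, see below
print(results)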
As you have noticed, self.metric is not defined.
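One possible fix, a sketch only (the intended design may differ, and the default metric here is an assumption): define the metric in the subclass constructor with the load_metric helper the base module already imports.

from datasets import load_metric

class ClassificationBenchmark(PerformanceBenchmark):
    def __init__(self, pipeline, dataset, metric_name='accuracy'):
        super().__init__(pipeline)
        self.dataset = dataset
        self._label = list(dataset.features.keys())[1]
        self.features = dataset.features[self._label]
        # Assumption: accuracy as the default; exposes .compute() and .name
        self.metric = load_metric(metric_name)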