Commit 82a440f

Use Lazy Loaders (#1536)
* try lazy loadHF first (dafnapension)
* reduce benchmark profiling to generating the dataset only: not inferring (that is done by mocking anyhow) and not evaluating (of the mocked results); add trust_remote also to load_dataset_builder (dafnapension)
* try procrastination for load csv too (dafnapension)
* added split cache for the generators, log the limit once per dataset, and increase the loader cache (dafnapension)
* make the sklearn loader a lazy loader too (dafnapension)
* adjust to new readers for csv (dafnapension)
* Enhance LoadHF class to support optional splits and improve dataset loading logic (elronbandel)
* Refactor LoadHF class to improve dataset loading and implement a limit on yielded instances (elronbandel)
* Refactor LoadHF class to streamline dataset loading and enhance split handling (elronbandel)
* Remove unused import and update line number in secrets baseline (elronbandel)
* Refactor load_data method to simplify error handling and remove unnecessary cache checks (elronbandel)
* Merge origin/main (elronbandel)
* Refactor loaders to implement LazyLoader class and update load_iterables method for improved streaming support (elronbandel)
* Update exception handling in test_failed_load_csv to catch general exceptions (elronbandel)
* Refactor LoadHF class to streamline data loading and enhance error handling (elronbandel)

---------

Signed-off-by: dafnapension <[email protected]>
Signed-off-by: elronbandel <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
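The bullets above describe the change at a high level: loaders stop materializing datasets up front and instead yield instances on demand, with a per-split cache and a cap on how many instances are pulled. A minimal sketch of that pattern, using illustrative names (`LazySplitLoader`, `split_generator` are assumptions here, not the exact unitxt API; the commit's real class is `LazyLoader` in src/unitxt/loaders.py):

```python
from typing import Any, Dict, Iterable, Iterator, Optional


class LazySplitLoader:
    """Sketch of a lazy loader: instances are pulled only when iterated."""

    def __init__(self, limit: Optional[int] = None):
        self.limit = limit  # cap on yielded instances; None = unlimited
        self._split_cache: Dict[str, Iterable[Dict[str, Any]]] = {}

    def load_iterables(self) -> Dict[str, Iterable[Dict[str, Any]]]:
        # Subclasses (HF, csv, sklearn, ...) return lazily evaluated
        # iterables keyed by split, e.g. a streaming HF dataset.
        raise NotImplementedError

    def split_generator(self, split: str) -> Iterator[Dict[str, Any]]:
        # Cache per split so repeated reads do not re-trigger the source.
        if split not in self._split_cache:
            self._split_cache[split] = self.load_iterables()[split]
        for count, instance in enumerate(self._split_cache[split]):
            if self.limit is not None and count >= self.limit:
                return  # stop pulling from the source once the cap is hit
            yield instance
```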
1 parent 8157230 commit 82a440f

29 files changed (+272, -283 lines)

performance/bluebench_profiler.py (+3, -25)

```diff
@@ -93,14 +93,7 @@ def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
             benchmark_recipe=benchmark_recipe, split=split, **kwargs
         )
 
-        model = self.profiler_instantiate_model()
-
-        predictions = self.profiler_infer_predictions(model=model, dataset=dataset)
-
-        evaluation_result = self.profiler_evaluate_predictions(
-            predictions=predictions, dataset=dataset
-        )
-        logger.critical(f"length of evaluation_result: {len(evaluation_result)}")
+        logger.critical(f"length of bluebench generated dataset: {len(dataset)}")
 
 
 dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"
@@ -154,44 +147,29 @@ def main():
     pst.strip_dirs()
    pst.sort_stats("name")  # sort by function name
    pst.print_stats(
-        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|profiler_instantiate_model|profiler_infer_predictions|profiler_evaluate_predictions|load_data|load_iterables"
+        "profile_benchmark_blue_bench|profiler_instantiate_benchmark_recipe|profiler_generate_benchmark_dataset|load_data|load_iterables"
    )
    s = f.getvalue()
    assert s.split("\n")[7].split()[3] == "cumtime"
    overall_tot_time = find_cummtime_of(
        "profile_benchmark_blue_bench", "bluebench_profiler.py", s
    )
    load_time = find_cummtime_of("load_data", "loaders.py", s)
-    just_load_no_initial_ms_time = find_cummtime_of(
-        "load_iterables", "loaders.py", s
-    )
+
    instantiate_benchmark_time = find_cummtime_of(
        "profiler_instantiate_benchmark_recipe", "bluebench_profiler.py", s
    )
    generate_benchmark_dataset_time = find_cummtime_of(
        "profiler_generate_benchmark_dataset", "bluebench_profiler.py", s
    )
-    instantiate_model_time = find_cummtime_of(
-        "profiler_instantiate_model", "bluebench_profiler.py", s
-    )
-    inference_time = find_cummtime_of(
-        "profiler_infer_predictions", "bluebench_profiler.py", s
-    )
-    evaluation_time = find_cummtime_of(
-        "profiler_evaluate_predictions", "bluebench_profiler.py", s
-    )
 
    # Data to be written
    dictionary = {
        "dataset_query": dataset_query,
        "total_time": overall_tot_time,
        "load_time": load_time,
-        "load_time_no_initial_ms": just_load_no_initial_ms_time,
        "instantiate_benchmark_time": instantiate_benchmark_time,
        "generate_benchmark_dataset_time": generate_benchmark_dataset_time,
-        "instantiate_model_time": instantiate_model_time,
-        "inference_time": inference_time,
-        "evaluation_time": evaluation_time,
        "used_eager_mode": settings.use_eager_execution,
        "performance.prof file": temp_prof_file_path,
    }
```
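The profiler relies on a helper, `find_cummtime_of`, to pull cumulative times out of the captured pstats printout. A plausible sketch of such a helper, under the column layout the `assert` above pins down (header on row 7, `cumtime` at column index 3); the real implementation lives elsewhere in bluebench_profiler.py and may differ:

```python
def find_cummtime_of(func_name: str, file_name: str, pstats_printout: str) -> float:
    # Scan the pstats rows for the one naming both the function and its
    # file, and read the "cumtime" column (index 3 in the standard layout:
    # ncalls, tottime, percall, cumtime, ...).
    for row in pstats_printout.split("\n"):
        if func_name in row and file_name in row:
            return round(float(row.split()[3]), 3)
    return 0.0  # the function never showed up in the profile
```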

performance/compare_benchmark_performance_results.py (+10, -31)

```diff
@@ -1,6 +1,5 @@
 import argparse
 import json
-import os
 import sys
 
 # Argument parser to get file paths from the command line
@@ -23,24 +22,11 @@
 print(f'dataset_query = "{main_perf["dataset_query"]}"')
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
-print(f"use Mocked inference = {os.environ['UNITXT_MOCK_INFERENCE_MODE']}")
 
 ratio1 = (
-    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time_no_initial_ms"])
-    / (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    if (
-        main_perf["generate_benchmark_dataset_time"]
-        - main_perf["load_time_no_initial_ms"]
-    )
-    > 0
-    else 1
-)
-ratio2 = (
-    pr_perf["evaluation_time"] / main_perf["evaluation_time"]
-    if main_perf["evaluation_time"] > 0
+    (pr_perf["generate_benchmark_dataset_time"] - pr_perf["load_time"])
+    / (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"])
+    if (main_perf["generate_benchmark_dataset_time"] - main_perf["load_time"]) > 0
     else 1
 )
 # Markdown table formatting
@@ -49,26 +35,19 @@
 line2 = "--------------------|-------------|-------------|---------------\n"
 line3 = f" Total time | {main_perf['total_time']:>11} | {pr_perf['total_time']:>11} | {pr_perf['total_time'] / main_perf['total_time']:.2f}\n"
 ratio_line4 = (
-    pr_perf["load_time_no_initial_ms"] / main_perf["load_time_no_initial_ms"]
-    if main_perf["load_time_no_initial_ms"] > 0
-    else 1
+    pr_perf["load_time"] / main_perf["load_time"] if main_perf["load_time"] > 0 else 1
 )
-line4 = f" Load time | {main_perf['load_time_no_initial_ms']:>11} | {pr_perf['load_time_no_initial_ms']:>11} | {ratio_line4:.2f}\n"
+line4 = f" Load time | {main_perf['load_time']:>11} | {pr_perf['load_time']:>11} | {ratio_line4:.2f}\n"
 line5 = f" DS Gen. inc. Load | {main_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time']:>11} | {pr_perf['generate_benchmark_dataset_time'] / main_perf['generate_benchmark_dataset_time']:.2f}\n"
-line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time_no_initial_ms'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time_no_initial_ms'], 3):>11} | {ratio1:.2f}\n"
-line7 = f" Inference time | {main_perf['inference_time']:>11} | {pr_perf['inference_time']:>11} | {pr_perf['inference_time'] / main_perf['inference_time']:.2f}\n"
-line8 = f" Evaluate time | {main_perf['evaluation_time']:>11} | {pr_perf['evaluation_time']:>11} | {ratio2:.2f}\n"
-line9 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
-line10 = f" Model Instantiation| {main_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time']:>11} | {pr_perf['instantiate_model_time'] / main_perf['instantiate_model_time']:.2f}\n"
+line6 = f" DS Gen. exc. Load | {round(main_perf['generate_benchmark_dataset_time'] - main_perf['load_time'], 3):>11} | {round(pr_perf['generate_benchmark_dataset_time'] - pr_perf['load_time'], 3):>11} | {ratio1:.2f}\n"
+line7 = f" Benchmark Instant. | {main_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time']:>11} | {pr_perf['instantiate_benchmark_time'] / main_perf['instantiate_benchmark_time']:.2f}\n"
 
 print("### Performance Comparison Results, time expressed in seconds:\n")
-print(line1 + line2 + line3 + line4 + line5 + line6 + line7 + line8 + line9 + line10)
+print(line1 + line2 + line3 + line4 + line5 + line6 + line7)
 print("\n\n")
 # Performance degradation check (5% threshold)
-if ratio1 > 1.05 or ratio2 > 1.05:
-    print(
-        "\n**Warning**: Performance degradation in Dataset Generation and/or Evaluation exceeds 5%!"
-    )
+if ratio1 > 1.05:
+    print("\n**Warning**: Performance degradation in Dataset Generation exceeds 5%!")
 print(
     "Explore branch performance via 'python performance/bluebench_profiler.py --output_file=<path to json file>',"
     "followed by 'snakeviz <the performance.prof file specified in the output json file>'."
```

prepare/cards/universal_ner.py (+1, -1)

```diff
@@ -48,7 +48,7 @@
         loader=LoadHF(
             path="universalner/universal_ner",
             name=sub_task,
-            requirements_list=["conllu"],
+            requirements=["conllu"],
         ),
         preprocess_steps=[
             # The dataset is sorted by classes
```
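The same rename shows up in every catalog card below; as a usage sketch (assuming the `unitxt.loaders` import path, with the `en_ewt` config taken from one of those cards):

```python
from unitxt.loaders import LoadHF

loader = LoadHF(
    path="universalner/universal_ner",
    name="en_ewt",
    requirements=["conllu"],  # formerly `requirements_list`
)
```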

pyproject.toml (+1, -1)

```diff
@@ -190,7 +190,7 @@ keep-runtime-typing = true
 "src/unitxt/metric.py" = ["F811", "F401"]
 "src/unitxt/dataset.py" = ["F811", "F401"]
 "src/unitxt/blocks.py" = ["F811", "F401"]
-"tests/library/test_loaders.py" = ["N802", "N803"]
+"tests/library/test_loaders.py" = ["N802", "N803", "RUF015"]
 "tests/library/test_dataclass.py" = ["F811", "E731"]
 "src/unitxt/validate.py" = ["B024"]
 "src/unitxt/standard.py" = ["C901"]
```

src/unitxt/catalog/cards/universal_ner/ceb/gja.json (+1, -1)

```diff
@@ -4,7 +4,7 @@
         "__type__": "load_hf",
         "path": "universalner/universal_ner",
         "name": "ceb_gja",
-        "requirements_list": [
+        "requirements": [
             "conllu"
         ]
     },
```

The identical one-line rename (`"requirements_list"` to `"requirements"`, +1, -1 each, differing only in the `"name"` field) is applied to the remaining universal_ner catalog cards:

src/unitxt/catalog/cards/universal_ner/da/ddt.json
src/unitxt/catalog/cards/universal_ner/de/pud.json
src/unitxt/catalog/cards/universal_ner/en/ewt.json
src/unitxt/catalog/cards/universal_ner/en/pud.json
src/unitxt/catalog/cards/universal_ner/hr/set.json
src/unitxt/catalog/cards/universal_ner/pt/bosque.json
src/unitxt/catalog/cards/universal_ner/pt/pud.json
src/unitxt/catalog/cards/universal_ner/ru/pud.json
src/unitxt/catalog/cards/universal_ner/sk/snk.json
src/unitxt/catalog/cards/universal_ner/sr/set.json
src/unitxt/catalog/cards/universal_ner/sv/pud.json
src/unitxt/catalog/cards/universal_ner/sv/talbanken.json
src/unitxt/catalog/cards/universal_ner/tl/trg.json
src/unitxt/catalog/cards/universal_ner/tl/ugnayan.json
src/unitxt/catalog/cards/universal_ner/zh/gsd.json
src/unitxt/catalog/cards/universal_ner/zh/gsdsimp.json
src/unitxt/catalog/cards/universal_ner/zh/pud.json

src/unitxt/fusion.py (+1, -5)

```diff
@@ -34,11 +34,7 @@ def prepare_subsets(self):
             for i in range(len(self.subsets)):
                 self.named_subsets[i] = self.subsets[i]
         else:
-            for name, origin in self.subsets.items():
-                try:
-                    self.named_subsets[name] = origin
-                except Exception as e:
-                    raise RuntimeError(f"Exception in subset: {name}") from e
+            self.named_subsets = self.subsets
 
     def splits(self) -> List[str]:
         self.prepare_subsets()
```
