⚡️ Speed up method Sst2Processor._create_examples by 15%
#128
+12
−8
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 15% (0.15x) speedup for
`Sst2Processor._create_examples` in `src/transformers/data/processors/glue.py` ⏱️ Runtime:
2.46 milliseconds → 2.14 milliseconds (best of 250 runs) 📝 Explanation and details
The optimized code achieves a 15% speedup by eliminating expensive operations from the hot loop and leveraging Python's optimized list comprehensions.
Key optimizations applied:
- Pre-skip header row: Uses `next(enumerate(lines))` to skip the header once, eliminating the `if i == 0: continue` branch check that executed for every single row in the original loop.
- Replace loop with list comprehensions: Converts the manual loop + `append()` pattern to list comprehensions, which are internally optimized in CPython and have better memory locality.
- Hoist conditional logic: Moves the `set_type == "test"` check outside the loop, creating two separate list comprehensions instead of evaluating the ternary `label = None if set_type == "test" else line[1]` for every row.
- Reduce variable assignments: Eliminates intermediate variables (`guid`, `text_a`, `label`) that were created in each loop iteration.

Why this is faster:
- … `append()` calls and list resizing

Performance characteristics from tests:
This optimization is particularly valuable for data preprocessing pipelines that process large datasets, where the cumulative effect of these micro-optimizations becomes substantial.
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
import warnings
# imports
import pytest # used for our unit tests
from transformers.data.processors.glue import Sst2Processor
# Minimal InputExample for testing
class InputExample:
    """Minimal stand-in for an input example, used only by these tests.

    Stores the four constructor arguments unchanged as attributes.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        # The scraped snippet read `def init(...)`; the dunder underscores
        # were lost in extraction, so the constructor was never invoked.
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
# Minimal DataProcessor for testing
class DataProcessor:
    """Empty placeholder base class used only by these tests."""

    pass
# Deprecation message template; the text contains a positional "{0}" placeholder.
DEPRECATION_WARNING = (
"This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
"library. You can have a look at this example script for pointers: "
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)
from transformers.data.processors.glue import Sst2Processor
# unit tests
# ---------------------------
# Basic Test Cases
# ---------------------------
def test_basic_train_set():
    """Test normal train set input with header and two examples."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header
        ["I love this movie.", "1"],
        ["This film is terrible.", "0"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.01μs -> 3.04μs (1.02% slower)
    examples = codeflash_output
def test_basic_dev_set():
    """Test normal dev set input with header and one example."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header
        ["An average movie.", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "dev")  # 2.31μs -> 2.41μs (3.99% slower)
    examples = codeflash_output
def test_basic_test_set():
    """Test normal test set input with header and two examples; labels should be None."""
    processor = Sst2Processor()
    lines = [
        ["index", "sentence"],  # header
        ["0", "A masterpiece."],
        ["1", "Not my taste."],
    ]
    codeflash_output = processor._create_examples(lines, "test")  # 3.02μs -> 3.07μs (1.60% slower)
    examples = codeflash_output
# ---------------------------
# Edge Test Cases
# ---------------------------
def test_empty_lines():
    """Test with only header, no data rows."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header only
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 861ns -> 1.35μs (36.1% slower)
    examples = codeflash_output
def test_single_row_no_header():
    """Test with no header, should treat first row as header and skip it."""
    processor = Sst2Processor()
    lines = [
        ["I love it!", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 832ns -> 1.34μs (38.1% slower)
    examples = codeflash_output
def test_malformed_row_missing_label():
    """Test with a row missing label for train/dev set; should raise IndexError."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["Just text"],  # missing label
    ]
    with pytest.raises(IndexError):
        processor._create_examples(lines, "train")  # 1.67μs -> 2.24μs (25.4% slower)
def test_malformed_row_missing_sentence_test():
    """Test with a row missing sentence for test set; should raise IndexError."""
    processor = Sst2Processor()
    lines = [
        ["index", "sentence"],
        ["0"],  # missing sentence
    ]
    with pytest.raises(IndexError):
        processor._create_examples(lines, "test")  # 1.60μs -> 2.13μs (25.1% slower)
def test_non_string_labels_and_sentences():
    """Test with integer labels and sentences; should accept them as is."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        [12345, 1],
        [67890, 0],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.69μs -> 3.43μs (7.64% faster)
    examples = codeflash_output
def test_duplicate_rows():
    """Test with duplicate rows; should process both."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["Repeat", "1"],
        ["Repeat", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.36μs -> 3.21μs (4.39% faster)
    examples = codeflash_output
def test_guid_format():
    """Test that the GUID is correctly formatted for each set type."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["Text", "1"],
    ]
    for set_type in ["train", "dev", "test"]:
        codeflash_output = processor._create_examples(lines, set_type)  # 4.63μs -> 5.00μs (7.44% slower)
        examples = codeflash_output
        # Should be guid = "{set_type}-1"
        if examples:
            # The scraped snippet had only `pass` here; this is the check
            # the comment above describes.
            assert examples[0].guid == f"{set_type}-1"
def test_label_none_for_test():
    """Test that label is None for test set, even if a label column exists."""
    processor = Sst2Processor()
    lines = [
        ["index", "sentence"],
        ["0", "Text"],
    ]
    codeflash_output = processor._create_examples(lines, "test")  # 2.40μs -> 2.48μs (3.06% slower)
    examples = codeflash_output
def test_label_not_none_for_train():
    """Test that label is not None for train/dev set."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["Text", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 2.23μs -> 2.46μs (9.11% slower)
    examples = codeflash_output
# ---------------------------
# Large Scale Test Cases
# ---------------------------
def test_large_scale_train_set():
    """Test with a large train set (999 rows)."""
    processor = Sst2Processor()
    lines = [["sentence", "label"]]
    # Add 999 rows
    for i in range(1, 1000):
        lines.append([f"Sentence {i}", str(i % 2)])
    codeflash_output = processor._create_examples(lines, "train")  # 407μs -> 350μs (16.2% faster)
    examples = codeflash_output
def test_large_scale_test_set():
    """Test with a large test set (999 rows, labels should be None)."""
    processor = Sst2Processor()
    lines = [["index", "sentence"]]
    for i in range(1, 1000):
        lines.append([str(i), f"Test sentence {i}"])
    codeflash_output = processor._create_examples(lines, "test")  # 398μs -> 344μs (15.8% faster)
    examples = codeflash_output
def test_large_scale_dev_set_guid_and_label():
    """Test that guids and labels are correct for a large dev set."""
    processor = Sst2Processor()
    lines = [["sentence", "label"]]
    for i in range(1, 1000):
        lines.append([f"Dev sentence {i}", str((i+1) % 2)])
    codeflash_output = processor._create_examples(lines, "dev")  # 394μs -> 341μs (15.6% faster)
    examples = codeflash_output
    # Check a random sample
    for idx in [0, 499, 998]:
        expected_guid = f"dev-{idx+1}"
        expected_label = str(((idx+1)+1) % 2)
        # The scraped snippet computed these expectations but never used
        # them; compare them against the produced examples.
        assert examples[idx].guid == expected_guid
        assert examples[idx].label == expected_label
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import warnings
# imports
import pytest
from transformers.data.processors.glue import Sst2Processor
# Minimal InputExample and DataProcessor definitions for testing, as we don't import transformers
class InputExample:
    """Minimal stand-in for an input example, used only by these tests.

    Stores the four constructor arguments unchanged as attributes.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        # The scraped snippet read `def init(...)`; the dunder underscores
        # were lost in extraction, so the constructor was never invoked.
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
class DataProcessor:
    """Empty placeholder base class used only by these tests."""

    pass
# Deprecation message template; the text contains a positional "{0}" placeholder.
DEPRECATION_WARNING = (
"This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
"library. You can have a look at this example script for pointers: "
"https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)
from transformers.data.processors.glue import Sst2Processor
# --------------------------
# Basic Test Cases
# --------------------------
def test_basic_train_example():
    """Basic: 2 training lines, with header."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header
        ["I love this movie.", "1"],
        ["This was bad.", "0"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.33μs -> 3.35μs (0.686% slower)
    examples = codeflash_output
def test_basic_dev_example():
    """Basic: dev set, similar to train."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header
        ["Okay film.", "1"],
        ["Not so great.", "0"],
    ]
    codeflash_output = processor._create_examples(lines, "dev")  # 2.94μs -> 2.96μs (0.743% slower)
    examples = codeflash_output
def test_basic_test_example():
    """Basic: test set, label should be None, text from 2nd column."""
    processor = Sst2Processor()
    lines = [
        ["index", "sentence"],  # header
        ["0", "What a masterpiece!"],
        ["1", "Terrible."],
    ]
    codeflash_output = processor._create_examples(lines, "test")  # 3.09μs -> 3.10μs (0.515% slower)
    examples = codeflash_output
# --------------------------
# Edge Test Cases
# --------------------------
def test_empty_lines():
    """Edge: Only header, no data."""
    processor = Sst2Processor()
    lines = [["sentence", "label"]]
    codeflash_output = processor._create_examples(lines, "train")  # 861ns -> 1.36μs (36.8% slower)
    examples = codeflash_output
def test_only_header_test():
    """Edge: Only header, test set."""
    processor = Sst2Processor()
    lines = [["index", "sentence"]]
    codeflash_output = processor._create_examples(lines, "test")  # 870ns -> 1.39μs (37.6% slower)
    examples = codeflash_output
def test_missing_label_column():
    """Edge: missing label column in train/dev."""
    processor = Sst2Processor()
    lines = [
        ["sentence"],  # header
        ["Something is missing"],  # no label
    ]
    with pytest.raises(IndexError):
        processor._create_examples(lines, "train")  # 1.62μs -> 2.19μs (25.8% slower)
def test_missing_sentence_column_test():
    """Edge: missing sentence column in test."""
    processor = Sst2Processor()
    lines = [
        ["index"],  # header
        ["0"],  # no sentence
    ]
    with pytest.raises(IndexError):
        processor._create_examples(lines, "test")  # 1.57μs -> 2.14μs (26.8% slower)
def test_extra_columns_train():
    """Edge: extra columns in train, should only use first two."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label", "extra"],  # header
        ["Great!", "1", "foo"],
        ["Awful.", "0", "bar"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.59μs -> 3.60μs (0.444% slower)
    examples = codeflash_output
def test_extra_columns_test():
    """Edge: extra columns in test, should use 2nd column for text."""
    processor = Sst2Processor()
    lines = [
        ["index", "sentence", "extra"],  # header
        ["0", "Nice!", "foo"],
        ["1", "Bad.", "bar"],
    ]
    codeflash_output = processor._create_examples(lines, "test")  # 3.32μs -> 3.29μs (0.974% faster)
    examples = codeflash_output
def test_incorrect_set_type():
    """Edge: unknown set_type, should treat as train/dev (text_index=0)."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],  # header
        ["Surprising.", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "unknown")  # 2.44μs -> 2.54μs (4.13% slower)
    examples = codeflash_output
def test_non_string_labels():
    """Edge: label is not a string."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["Good.", 1],
        ["Bad.", 0],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.29μs -> 3.18μs (3.24% faster)
    examples = codeflash_output
def test_non_string_text():
    """Edge: text_a is not a string."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        [123, "1"],
        [None, "0"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.17μs -> 3.09μs (2.65% faster)
    examples = codeflash_output
def test_guid_numbering():
    """Edge: guid numbering starts at 1, skips header."""
    processor = Sst2Processor()
    lines = [
        ["sentence", "label"],
        ["A", "1"],
        ["B", "0"],
        ["C", "1"],
    ]
    codeflash_output = processor._create_examples(lines, "train")  # 3.64μs -> 3.46μs (5.21% faster)
    examples = codeflash_output
    guids = [ex.guid for ex in examples]
    # The scraped snippet built `guids` but never checked it; numbering
    # starts at 1 because the header row takes index 0.
    assert guids == ["train-1", "train-2", "train-3"]
# --------------------------
# Large Scale Test Cases
# --------------------------
def test_large_train_set():
    """Large: 500 train examples."""
    processor = Sst2Processor()
    lines = [["sentence", "label"]]
    for i in range(500):
        lines.append([f"Sentence {i}", str(i % 2)])
    codeflash_output = processor._create_examples(lines, "train")  # 200μs -> 173μs (15.2% faster)
    examples = codeflash_output
def test_large_test_set():
    """Large: 999 test examples."""
    processor = Sst2Processor()
    lines = [["index", "sentence"]]
    for i in range(999):
        lines.append([str(i), f"Test sentence {i}"])
    codeflash_output = processor._create_examples(lines, "test")  # 401μs -> 350μs (14.5% faster)
    examples = codeflash_output
def test_large_mixed_labels():
    """Large: 1000 train examples, mixed string/integer labels."""
    processor = Sst2Processor()
    lines = [["sentence", "label"]]
    for i in range(1000):
        label = str(i % 2) if i % 3 else i % 2  # mix str and int
        lines.append([f"Sentence {i}", label])
    codeflash_output = processor._create_examples(lines, "train")  # 393μs -> 340μs (15.6% faster)
    examples = codeflash_output
    # Check some types
    for i in (0, 2, 3, 999):
        label = examples[i].label
        # Both branches were bare `pass` in the scraped snippet. Rows whose
        # index is a multiple of 3 were built with an int label, the rest
        # with a str label.
        if i % 3 == 0:
            assert isinstance(label, int)
        else:
            assert isinstance(label, str)
def test_large_extra_columns():
    """Large: 500 examples, 5 columns per line, only first two used."""
    processor = Sst2Processor()
    lines = [["sentence", "label", "c1", "c2", "c3"]]
    for i in range(500):
        lines.append([f"Sent {i}", str(i % 2), "x", "y", "z"])
    codeflash_output = processor._create_examples(lines, "train")  # 197μs -> 171μs (15.7% faster)
    examples = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes, run
`git checkout codeflash/optimize-Sst2Processor._create_examples-mhvg4y0d` and push.