⚡️ Speed up method TranslationCleaner.clean by 12%
#133
📄 12% (0.12x) speedup for `TranslationCleaner.clean` in `invokeai/frontend/web/scripts/clean_translations.py`

⏱️ Runtime: 277 milliseconds → 247 milliseconds (best of 16 runs)

📝 Explanation and details
The optimized code achieves a 12% speedup through three key optimizations:
1. **Stack-based tree traversal in `_get_keys`** - Replaced recursive function calls with an iterative stack approach, eliminating Python's function call overhead. This is particularly effective for nested translation dictionaries, reducing execution time from 12.2ms to 9.9ms (~18% faster for this function).
2. **Precompiled regex patterns in `_search_codebase`** - Instead of compiling regex patterns for every file search, patterns are now compiled once per key at the start of the function. This eliminates redundant regex compilation overhead, which was a significant bottleneck when searching the same key across multiple files.
3. **Cached file system traversal** - The expensive `os.walk("../src")` operation is now performed only once and cached in `_src_files`. Previously, the directory was walked for every single translation key being checked. With thousands of keys to verify, this change dramatically reduces I/O operations.

**Performance impact analysis:** The optimizations maintain identical behavior while being particularly beneficial for the typical use case of cleaning large translation files with hundreds or thousands of keys across multiple source files. A simplified sketch of the three changes follows below.
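For readers who want the shape of these changes without opening the diff, here is a minimal sketch. It assumes only the names mentioned above (`_get_keys`, `_search_codebase`, and the `_src_files` cache) and simplifies everything else; the class name `TranslationCleanerSketch`, the `.ts`/`.tsx` filter, and the exact pattern set are illustrative assumptions, not the code that ships in `clean_translations.py`.

```python
import os
import re


class TranslationCleanerSketch:
    """Illustrative only -- not the actual TranslationCleaner implementation."""

    def __init__(self):
        # (3) Cached file system traversal: walk ../src once and reuse the file
        # list for every key instead of re-walking the directory per key.
        self._src_files: list[str] | None = None

    def _get_src_files(self) -> list[str]:
        if self._src_files is None:
            self._src_files = [
                os.path.join(root, name)
                for root, _dirs, names in os.walk("../src")
                for name in names
                if name.endswith((".ts", ".tsx"))
            ]
        return self._src_files

    def _get_keys(self, obj: dict) -> list[str]:
        # (1) Stack-based tree traversal: an explicit stack replaces recursion,
        # avoiding per-call overhead on deeply nested translation dicts.
        keys: list[str] = []
        stack: list[tuple[dict, str]] = [(obj, "")]
        while stack:
            node, prefix = stack.pop()
            for k, v in node.items():
                if "_" in k:  # skip pluralized keys such as key_one / key_other
                    continue
                path = f"{prefix}.{k}" if prefix else k
                if isinstance(v, dict):
                    stack.append((v, path))
                elif isinstance(v, str):
                    keys.append(path)
        return keys

    def _search_codebase(self, key: str) -> bool:
        # (2) Precompiled regex patterns: compile the patterns for this key once,
        # then reuse them across every cached source file instead of recompiling.
        stem = key.split(".")[-1]
        patterns = [re.compile(re.escape(key)), re.compile(re.escape(stem))]
        for path in self._get_src_files():
            with open(path, encoding="utf-8") as f:
                text = f.read()
            if any(p.search(text) for p in patterns):
                return True
        return False
```

The key design point is that all per-key work now touches only in-memory state (the cached file list and the per-key compiled patterns), so the cost of the directory walk and of pattern compilation is paid once rather than once per key or once per file.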
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
```python
import os

# function to test (provided as per instructions)
import re
import shutil
import tempfile
from typing import TypeAlias, Union

# imports
import pytest
from invokeai.frontend.web.scripts.clean_translations import TranslationCleaner
from tqdm import tqdm

# ---- TESTS ----

@pytest.fixture(autouse=True)
def setup_and_teardown_src(tmp_path, monkeypatch):
    """
    Creates a temporary ../src directory with .ts/.tsx files for the tests.
    Cleans up after each test.
    """
    # Create a temporary directory to act as ../src
    orig_cwd = os.getcwd()
    test_dir = tmp_path / "test_dir"
    test_dir.mkdir()
    src_dir = test_dir / "src"
    src_dir.mkdir()
    # Change the working directory to test_dir, so ../src resolves to src_dir
    monkeypatch.chdir(test_dir)
    # Patch os.walk to walk our src_dir instead of a real src
    # (capture the real os.walk first so the lambda does not call itself recursively)
    real_walk = os.walk
    monkeypatch.setattr(os, "walk", lambda path: real_walk(str(src_dir)))
    yield src_dir
    monkeypatch.chdir(orig_cwd)


def write_file(src_dir, rel_path, content):
    """Helper to write a file in the src_dir."""
    file_path = src_dir / rel_path
    file_path.parent.mkdir(parents=True, exist_ok=True)
    file_path.write_text(content, encoding="utf-8")
    return file_path

# ---- BASIC TEST CASES ----

def test_clean_removes_unused_flat_key(setup_and_teardown_src):
    """
    Test that a flat key not present in codebase is removed.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", "const foo = 'bar';")
    cleaner = TranslationCleaner()
    obj = {"hello": "world", "unused": "bye"}
    # Only 'hello' is referenced in codebase
    write_file(src_dir, "file2.ts", 'console.log("hello");')
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_keeps_used_flat_key(setup_and_teardown_src):
    """
    Test that a flat key present in codebase is kept.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("greet");')
    cleaner = TranslationCleaner()
    obj = {"greet": "hi"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_removes_unused_nested_key(setup_and_teardown_src):
    """
    Test that a nested key not present in codebase is removed.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("parent.child1");')
    cleaner = TranslationCleaner()
    obj = {"parent": {"child1": "keep", "child2": "remove"}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_keeps_used_nested_key_by_stem(setup_and_teardown_src):
    """
    Test that a nested key is kept if its stem is referenced in codebase.
    """
    src_dir = setup_and_teardown_src
    # Only reference 'child1' (the stem), not the full path
    write_file(src_dir, "file1.ts", "doSomething('child1');")
    cleaner = TranslationCleaner()
    obj = {"parent": {"child1": "keep", "child2": "remove"}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_ignores_pluralized_keys_with_underscore(setup_and_teardown_src):
    """
    Test that keys containing underscores are ignored (not removed).
    """
    src_dir = setup_and_teardown_src
    cleaner = TranslationCleaner()
    obj = {"greet_one": "hi", "greet_other": "hello", "greet": "yo"}
    # Only 'greet' is referenced, but underscore keys are ignored by _get_keys
    write_file(src_dir, "file1.ts", 'console.log("greet");')
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_removes_multiple_unused_keys(setup_and_teardown_src):
    """
    Test that multiple unused keys are all removed.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("used1");')
    cleaner = TranslationCleaner()
    obj = {"used1": "a", "unused1": "b", "unused2": "c"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output

# ---- EDGE TEST CASES ----

def test_clean_empty_dict_returns_empty(setup_and_teardown_src):
    """
    Test that cleaning an empty dictionary returns an empty dictionary.
    """
    cleaner = TranslationCleaner()
    obj = {}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output  # 143μs -> 144μs (1.01% slower)


def test_clean_all_keys_unused(setup_and_teardown_src):
    """
    Test that all keys are removed if none are referenced.
    """
    cleaner = TranslationCleaner()
    obj = {"foo": "bar", "baz": "qux"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_all_keys_used(setup_and_teardown_src):
    """
    Test that all keys are kept if all are referenced.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("foo");')
    write_file(src_dir, "file2.tsx", 'console.log("baz");')
    cleaner = TranslationCleaner()
    obj = {"foo": "bar", "baz": "qux"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_deeply_nested_keys(setup_and_teardown_src):
    """
    Test that deeply nested keys are handled correctly.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("a.b.c.d");')
    cleaner = TranslationCleaner()
    obj = {"a": {"b": {"c": {"d": "keep", "e": "remove"}}}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_non_string_leaf_values(setup_and_teardown_src):
    """
    Test that non-string leaf values (e.g., int, list) are ignored by _get_keys and remain untouched.
    """
    src_dir = setup_and_teardown_src
    cleaner = TranslationCleaner()
    obj = {"foo": 123, "bar": ["a", "b"], "baz": {"qux": None}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_keys_containing_dots(setup_and_teardown_src):
    """
    Test that keys containing dots are handled as single keys, not as nested paths.
    """
    src_dir = setup_and_teardown_src
    # Key with dot in name, not nested
    write_file(src_dir, "file1.ts", 'console.log("dot.key");')
    cleaner = TranslationCleaner()
    obj = {"dot.key": "keep", "other": "remove"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_empty_nested_dicts(setup_and_teardown_src):
    """
    Test that empty nested dictionaries remain after cleaning (manual removal required).
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("keep");')
    cleaner = TranslationCleaner()
    obj = {"parent": {"keep": "yes", "remove": "no"}, "empty": {}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_duplicate_key_stems(setup_and_teardown_src):
    """
    Test that only the correct key is kept when multiple keys have the same stem.
    """
    src_dir = setup_and_teardown_src
    # Only 'bar.foo' is referenced, not 'baz.foo'
    write_file(src_dir, "file1.ts", 'console.log("bar.foo");')
    cleaner = TranslationCleaner()
    obj = {"bar": {"foo": "keep"}, "baz": {"foo": "remove"}}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output


def test_clean_with_non_ascii_keys(setup_and_teardown_src):
    """
    Test that non-ASCII keys are handled correctly.
    """
    src_dir = setup_and_teardown_src
    write_file(src_dir, "file1.ts", 'console.log("привет");')
    cleaner = TranslationCleaner()
    obj = {"привет": "hello", "こんにちは": "hi"}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output

# ---- LARGE SCALE TEST CASES ----

def test_clean_large_flat_dict(setup_and_teardown_src):
    """
    Test cleaning a large flat dictionary (up to 1000 keys).
    """
    src_dir = setup_and_teardown_src
    used_keys = [f"key{i}" for i in range(500)]
    unused_keys = [f"unused{i}" for i in range(500)]
    # Reference all used keys in codebase
    content = "\n".join([f'console.log("{k}");' for k in used_keys])
    write_file(src_dir, "file1.ts", content)
    cleaner = TranslationCleaner()
    obj = {k: f"val{k}" for k in used_keys + unused_keys}
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output
    for k in used_keys:
        pass
    for k in unused_keys:
        pass


def test_clean_large_nested_dict(setup_and_teardown_src):
    """
    Test cleaning a large nested dictionary (up to 1000 leaf keys).
    """
    src_dir = setup_and_teardown_src
    # Create 10 parents, each with 100 children
    obj = {f"parent{i}": {f"child{j}": f"val{i}_{j}" for j in range(100)} for i in range(10)}
    # Reference only even children in codebase
    used_keys = [f"parent{i}.child{j}" for i in range(10) for j in range(0, 100, 2)]
    content = "\n".join([f'console.log("{k}");' for k in used_keys])
    write_file(src_dir, "file1.ts", content)
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean({k: v.copy() for k, v in obj.items()}); cleaned = codeflash_output
    for i in range(10):
        for j in range(100):
            k = f"child{j}"
            if j % 2 == 0:
                pass
            else:
                pass
    # Total keys should be 10 * 50 = 500
    total = sum(len(v) for v in cleaned.values())


def test_clean_performance_on_large_dict(monkeypatch, setup_and_teardown_src):
    """
    Test that cleaning a large dictionary runs in reasonable time (smoke test).
    """
    src_dir = setup_and_teardown_src
    # 500 used, 500 unused
    used_keys = [f"key{i}" for i in range(500)]
    unused_keys = [f"unused{i}" for i in range(500)]
    content = "\n".join([f'console.log("{k}");' for k in used_keys])
    write_file(src_dir, "file1.ts", content)
    cleaner = TranslationCleaner()
    obj = {k: f"val{k}" for k in used_keys + unused_keys}
    # Just ensure it completes and result is correct
    codeflash_output = cleaner.clean(obj.copy()); cleaned = codeflash_output

# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import os

# function to test
import re
import shutil
import tempfile
from typing import TypeAlias, Union

# imports
import pytest  # used for our unit tests
from invokeai.frontend.web.scripts.clean_translations import TranslationCleaner
from tqdm import tqdm

# Helper function to setup a temporary codebase for _search_codebase
def setup_codebase(files_content):
    """
    Creates a temporary ../src directory with .ts/.tsx files for testing.
    Returns the tempdir path, the src dir, and patch/unpatch helpers for os.walk.
    """
    tempdir = tempfile.mkdtemp()
    src_dir = os.path.join(tempdir, "src")
    os.makedirs(src_dir)
    for fname, content in files_content.items():
        with open(os.path.join(src_dir, fname), "w") as f:
            f.write(content)
    # Patch os.walk to point to our temp src directory
    orig_os_walk = os.walk

    # The closure bodies below are reconstructed from how the tests use this helper
    # (they unpack a 4-tuple and call patch_walk()/unpatch_walk()); the exact
    # originals were truncated in the generated output.
    def patch_walk():
        os.walk = lambda path: orig_os_walk(src_dir)

    def unpatch_walk():
        os.walk = orig_os_walk
        shutil.rmtree(tempdir, ignore_errors=True)

    return tempdir, src_dir, patch_walk, unpatch_walk

# -------- BASIC TEST CASES --------

def test_clean_removes_unused_keys_basic():
    """
    Basic: Removes keys not found in codebase.
    """
    files_content = {
        "file1.ts": '"used.key": "value",\nconsole.log("used.key");',
        "file2.tsx": 'console.log("another.used");'
    }
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "used": {"key": "value"},
        "unused": {"key": "not used"},
        "another": {"used": "yes"},
        "notfound": "nope"
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 631μs -> 557μs (13.2% faster)
    unpatch_walk()


def test_clean_keeps_all_keys_if_all_used():
    """
    Basic: Keeps all keys if all are found in codebase.
    """
    files_content = {
        "file.ts": 'console.log("a.b");\nconsole.log("c");'
    }
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "a": {"b": "val"},
        "c": "val2"
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 340μs -> 382μs (11.0% slower)
    unpatch_walk()


def test_clean_removes_nested_unused_keys():
    """
    Basic: Removes nested unused keys.
    """
    files_content = {
        "main.ts": 'console.log("parent.child1");'
    }
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "parent": {
            "child1": "used",
            "child2": "unused"
        }
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 393μs -> 336μs (16.9% faster)
    unpatch_walk()

# -------- EDGE TEST CASES --------

def test_clean_handles_empty_dict():
    """
    Edge: Handles empty dict gracefully.
    """
    files_content = {}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 128μs -> 128μs (0.103% faster)
    unpatch_walk()


def test_clean_handles_no_keys_found():
    """
    Edge: Removes all keys if none found in codebase.
    """
    files_content = {"f.ts": ""}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "a": "1",
        "b": {"c": "2", "d": "3"}
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 480μs -> 413μs (16.1% faster)
    unpatch_walk()


def test_clean_handles_pluralized_keys():
    """
    Edge: Skips keys with underscores (pluralized keys).
    """
    files_content = {"f.ts": 'console.log("normal.key");'}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "normal": {"key": "val"},
        "plural": {"key_one": "one", "key_other": "other"}
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 288μs -> 295μs (2.26% slower)
    unpatch_walk()


def test_clean_removes_keys_with_similar_names():
    """
    Edge: Only removes exact keys, not similar keys.
    """
    files_content = {"f.ts": 'console.log("exact.key");'}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {
        "exact": {"key": "yes"},
        "exactkey": "no"
    }
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 390μs -> 396μs (1.41% slower)
    unpatch_walk()


def test_clean_handles_deeply_nested_dict():
    """
    Edge: Handles deeply nested dicts.
    """
    files_content = {"f.ts": 'console.log("a.b.c.d");'}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {"a": {"b": {"c": {"d": "deep", "e": "notfound"}}}}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 373μs -> 317μs (17.5% faster)
    unpatch_walk()


def test_clean_handles_keys_with_dot_in_name():
    """
    Edge: Key names with dots are handled as path, not as literal key.
    """
    files_content = {"f.ts": 'console.log("weird.key.name");'}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {"weird": {"key": {"name": "found"}, "key.name": "notfound"}}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 301μs -> 335μs (10.1% slower)
    unpatch_walk()

# -------- LARGE SCALE TEST CASES --------

def test_clean_large_scale_removes_many_unused_keys():
    """
    Large: Handles large dict and removes unused keys efficiently.
    """
    # Only keep every 10th key
    used_keys = [f"key{i}" for i in range(0, 1000, 10)]
    files_content = {"large.ts": "\n".join([f'console.log("{k}");' for k in used_keys])}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {f"key{i}": f"val{i}" for i in range(1000)}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 87.1ms -> 80.0ms (8.99% faster)
    unpatch_walk()


def test_clean_large_scale_nested_dict():
    """
    Large: Handles large nested dict structure.
    """
    # 100 parents, 10 children each, only keep child0
    files_content = {
        "nested.ts": "\n".join([f'console.log("parent{i}.child0");' for i in range(100)])
    }
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {f"parent{i}": {f"child{j}": f"val{i}_{j}" for j in range(10)} for i in range(100)}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 72.4ms -> 63.0ms (15.1% faster)
    unpatch_walk()
    # Only child0 should remain for each parent
    for i in range(100):
        for j in range(1, 10):
            pass


def test_clean_large_scale_pluralized_keys_untouched():
    """
    Large: Pluralized keys (with underscores) are not removed, even if not in codebase.
    """
    files_content = {
        "plurals.ts": 'console.log("normal.key");'
    }
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {}
    for i in range(500):
        obj[f"normal{i}"] = {"key": "val", "key_one": "one", "key_other": "other"}
    cleaner = TranslationCleaner()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 29.2ms -> 24.7ms (18.2% faster)
    unpatch_walk()
    # "key" is removed except for normal0 (only normal.key is found), plural keys remain
    for i in range(500):
        if i == 0:
            pass
        else:
            pass


def test_clean_performance_large_scale(monkeypatch):
    """
    Large: Performance test - ensure clean runs within reasonable time for large input.
    """
    # Used keys: every 20th
    used_keys = [f"key{i}" for i in range(0, 1000, 20)]
    files_content = {"perf.ts": "\n".join([f'console.log("{k}");' for k in used_keys])}
    tempdir, src_dir, patch_walk, unpatch_walk = setup_codebase(files_content)
    patch_walk()
    obj = {f"key{i}": f"val{i}" for i in range(1000)}
    cleaner = TranslationCleaner()
    # Monkeypatch tqdm to avoid slowing down tests
    monkeypatch.setattr("tqdm.tqdm", lambda x, **kwargs: x)
    import time
    start = time.time()
    codeflash_output = cleaner.clean(obj); cleaned = codeflash_output  # 84.7ms -> 76.0ms (11.4% faster)
    end = time.time()
    unpatch_walk()

# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from invokeai.frontend.web.scripts.clean_translations import TranslationCleaner


def test_TranslationCleaner_clean():
    TranslationCleaner.clean(TranslationCleaner(), {})
```

🔎 Concolic Coverage Tests and Runtime

codeflash_concolic_iouvh9ci/tmp92lseqea/test_concolic_coverage.py::test_TranslationCleaner_clean

To edit these changes, run `git checkout codeflash/optimize-TranslationCleaner.clean-mhvg8zt3` and push.