9 changes: 3 additions & 6 deletions codeflash/api/aiservice.py
@@ -12,7 +12,6 @@
from codeflash.cli_cmds.console import console, logger
from codeflash.code_utils.code_replacer import is_zero_diff
from codeflash.code_utils.code_utils import unified_diff_strings
from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE
from codeflash.code_utils.env_utils import get_codeflash_api_key
from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name
from codeflash.code_utils.time_utils import humanize_runtime
@@ -141,15 +140,14 @@ def optimize_python_code( # noqa: D417
payload = {
"source_code": source_code,
"dependency_code": dependency_code,
"num_variants": num_candidates,
"n_candidates": num_candidates,
"trace_id": trace_id,
"python_version": platform.python_version(),
"experiment_metadata": experiment_metadata,
"codeflash_version": codeflash_version,
"current_username": get_last_commit_author_if_pr_exists(None),
"repo_owner": git_repo_owner,
"repo_name": git_repo_name,
"n_candidates": N_CANDIDATES_EFFECTIVE,
"is_async": is_async,
}

@@ -183,7 +181,7 @@ def optimize_python_code_line_profiler( # noqa: D417
dependency_code: str,
trace_id: str,
line_profiler_results: str,
num_candidates: int = 10,
num_candidates: int = 8,
experiment_metadata: ExperimentMetadata | None = None,
) -> list[OptimizedCandidate]:
"""Optimize the given python code for performance by making a request to the Django endpoint.
@@ -204,14 +202,13 @@
payload = {
"source_code": source_code,
"dependency_code": dependency_code,
"num_variants": num_candidates,
"n_candidates_lp": num_candidates,
"line_profiler_results": line_profiler_results,
"trace_id": trace_id,
"python_version": platform.python_version(),
"experiment_metadata": experiment_metadata,
"codeflash_version": codeflash_version,
"lsp_mode": is_LSP_enabled(),
"n_candidates_lp": N_CANDIDATES_LP_EFFECTIVE,
}

console.rule()
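For context, a minimal sketch of the caller side after this rename, mirroring the call site in function_optimizer.py later in this diff (aiservice_client, source_code, dependency_code, trace_id, lp_results, and args stand in for the real locals):

    from codeflash.code_utils.config_consts import EffortKeys, get_effort_value

    # The candidate count is now chosen by the caller from the effort level and
    # serialized as "n_candidates" (or "n_candidates_lp" for the line-profiler
    # endpoint), instead of being read from a module constant inside aiservice.py.
    candidates = aiservice_client.optimize_python_code_line_profiler(
        source_code=source_code,
        dependency_code=dependency_code,
        trace_id=trace_id,
        line_profiler_results=lp_results,
        num_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, args.effort),
    )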
3 changes: 3 additions & 0 deletions codeflash/cli_cmds/cli.py
@@ -104,6 +104,9 @@ def parse_args() -> Namespace:
action="store_true",
help="(Deprecated) Async function optimization is now enabled by default. This flag is ignored.",
)
parser.add_argument(
"--effort", type=str, help="Effort level for optimization", choices=["low", "medium", "high"], default="medium"
)

args, unknown_args = parser.parse_known_args()
sys.argv[:] = [sys.argv[0], *unknown_args]
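For illustration, a sketch of an invocation with the new flag (the companion flags below are placeholders; only --effort is introduced by this diff):

    codeflash --file path/to/module.py --function my_function --effort high

With no flag given, the default remains "medium".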
69 changes: 50 additions & 19 deletions codeflash/code_utils/config_consts.py
@@ -1,36 +1,22 @@
from enum import StrEnum, auto
from typing import Any

MAX_TEST_RUN_ITERATIONS = 5
INDIVIDUAL_TESTCASE_TIMEOUT = 15
MAX_FUNCTION_TEST_SECONDS = 60
N_CANDIDATES = 5
MIN_IMPROVEMENT_THRESHOLD = 0.05
MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD = 0.10 # 10% minimum improvement for async throughput
MAX_TEST_FUNCTION_RUNS = 50
MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6 # 100ms
N_TESTS_TO_GENERATE = 2
TOTAL_LOOPING_TIME = 10.0 # 10 second candidate benchmarking budget
COVERAGE_THRESHOLD = 60.0
MIN_TESTCASE_PASSED_THRESHOLD = 6
REPEAT_OPTIMIZATION_PROBABILITY = 0.1
DEFAULT_IMPORTANCE_THRESHOLD = 0.001
N_CANDIDATES_LP = 6

# Refinement
REFINE_ALL_THRESHOLD = 2 # when valid optimizations count is 2 or less, refine all optimizations
REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1) # (runtime, diff), runtime is more important than diff by a factor of 2
TOP_N_REFINEMENTS = 0.45 # top 45% of valid optimizations (based on the weighted score) are refined

# LSP-specific
N_CANDIDATES_LSP = 3
N_TESTS_TO_GENERATE_LSP = 2
TOTAL_LOOPING_TIME_LSP = 10.0 # Kept the same timing for LSP mode to avoid skewing performance reporting
N_CANDIDATES_LP_LSP = 3

# Code repair
REPAIR_UNMATCHED_PERCENTAGE_LIMIT = 0.4 # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair stricter)
MAX_REPAIRS_PER_TRACE = 4 # maximum number of repairs we will do for each function

MAX_N_CANDIDATES = 5
MAX_N_CANDIDATES_LP = 6

try:
from codeflash.lsp.helpers import is_LSP_enabled
@@ -39,9 +25,54 @@
except ImportError:
_IS_LSP_ENABLED = False

N_CANDIDATES_EFFECTIVE = min(N_CANDIDATES_LSP if _IS_LSP_ENABLED else N_CANDIDATES, MAX_N_CANDIDATES)
N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP)
N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE
TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME

MAX_CONTEXT_LEN_REVIEW = 1000


class EffortLevel(StrEnum):
LOW = auto()
MEDIUM = auto()
HIGH = auto()


class EffortKeys(StrEnum):
N_OPTIMIZER_CANDIDATES = auto()
N_OPTIMIZER_LP_CANDIDATES = auto()
N_GENERATED_TESTS = auto()
MAX_CODE_REPAIRS_PER_TRACE = auto()
REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto()
REFINE_ALL_THRESHOLD = auto()
TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto()


EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = {
EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6},
# we don't use effort with generated tests for now
EffortKeys.N_GENERATED_TESTS.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 2, EffortLevel.HIGH: 2},
# maximum number of repairs we will do for each function
EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
# if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair stricter)
# at low effort we lower the limit to 20% to be stricter (fewer repairs)
EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
EffortLevel.LOW: 0.2,
EffortLevel.MEDIUM: 0.4,
EffortLevel.HIGH: 0.5,
},
# when the count of valid optimizations is N or fewer, refine them all
EffortKeys.REFINE_ALL_THRESHOLD.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
# Number of top-ranked valid candidates selected for refinement
EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
}


def get_effort_value(key: EffortKeys, effort: EffortLevel) -> Any:
key_str = key.value
if key_str in EFFORT_VALUES:
if effort in EFFORT_VALUES[key_str]:
return EFFORT_VALUES[key_str][effort]
msg = f"Invalid effort level: {effort}"
raise ValueError(msg)
msg = f"Invalid key: {key_str}"
raise ValueError(msg)
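A minimal usage sketch, with return values read off the EFFORT_VALUES table above:

    from codeflash.code_utils.config_consts import EffortKeys, EffortLevel, get_effort_value

    get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, EffortLevel.LOW)      # 3
    get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, EffortLevel.HIGH)  # 6
    # An argparse string can be normalized to the enum first:
    get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, EffortLevel("medium"))  # 4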
34 changes: 0 additions & 34 deletions codeflash/code_utils/git_utils.py
@@ -1,10 +1,7 @@
from __future__ import annotations

import os
import shutil
import subprocess
import sys
import tempfile
import time
from functools import cache
from io import StringIO
@@ -16,7 +13,6 @@
from unidiff import PatchSet

from codeflash.cli_cmds.console import logger
from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE

if TYPE_CHECKING:
from git import Repo
@@ -153,36 +149,6 @@ def check_and_push_branch(repo: git.Repo, git_remote: str | None = "origin", *,
return True


def create_worktree_root_dir(module_root: Path) -> tuple[Path | None, Path | None]:
git_root = git_root_dir() if check_running_in_git_repo(module_root) else None
worktree_root_dir = Path(tempfile.mkdtemp()) if git_root else None
return git_root, worktree_root_dir


def create_git_worktrees(
git_root: Path | None, worktree_root_dir: Path | None, module_root: Path
) -> tuple[Path | None, list[Path]]:
if git_root and worktree_root_dir:
worktree_root = Path(tempfile.mkdtemp(dir=worktree_root_dir))
worktrees = [Path(tempfile.mkdtemp(dir=worktree_root)) for _ in range(N_CANDIDATES_EFFECTIVE + 1)]
for worktree in worktrees:
subprocess.run(["git", "worktree", "add", "-d", worktree], cwd=module_root, check=True)
else:
worktree_root = None
worktrees = []
return worktree_root, worktrees


def remove_git_worktrees(worktree_root: Path | None, worktrees: list[Path]) -> None:
try:
for worktree in worktrees:
subprocess.run(["git", "worktree", "remove", "-f", worktree], check=True)
except subprocess.CalledProcessError as e:
logger.warning(f"Error removing worktrees: {e}")
if worktree_root:
shutil.rmtree(worktree_root)


def get_last_commit_author_if_pr_exists(repo: Repo | None = None) -> str | None:
"""Return the author's name of the last commit in the current branch if PR_NUMBER is set.

40 changes: 23 additions & 17 deletions codeflash/optimization/function_optimizer.py
@@ -45,16 +45,11 @@
from codeflash.code_utils.config_consts import (
COVERAGE_THRESHOLD,
INDIVIDUAL_TESTCASE_TIMEOUT,
MAX_REPAIRS_PER_TRACE,
N_CANDIDATES_EFFECTIVE,
N_CANDIDATES_LP_EFFECTIVE,
N_TESTS_TO_GENERATE_EFFECTIVE,
REFINE_ALL_THRESHOLD,
REFINED_CANDIDATE_RANKING_WEIGHTS,
REPAIR_UNMATCHED_PERCENTAGE_LIMIT,
REPEAT_OPTIMIZATION_PROBABILITY,
TOP_N_REFINEMENTS,
TOTAL_LOOPING_TIME_EFFECTIVE,
EffortKeys,
get_effort_value,
)
from codeflash.code_utils.deduplicate_code import normalize_code
from codeflash.code_utils.edit_generated_tests import (
@@ -139,13 +134,15 @@ def __init__(
ai_service_client: AiServiceClient,
executor: concurrent.futures.ThreadPoolExecutor,
future_all_code_repair: list[concurrent.futures.Future],
effort: str,
) -> None:
self.candidate_queue = queue.Queue()
self.line_profiler_done = False
self.refinement_done = False
self.candidate_len = len(initial_candidates)
self.ai_service_client = ai_service_client
self.executor = executor
self.effort = effort

# Initialize queue with initial candidates
for candidate in initial_candidates:
@@ -193,8 +190,16 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur
def _process_refinement_results(self) -> OptimizedCandidate | None:
"""Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined."""
future_refinements: list[concurrent.futures.Future] = []
top_n_candidates = int(
min(
get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort),
len(self.all_refinements_data),
)
)

if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD:
if top_n_candidates == len(self.all_refinements_data) or len(self.all_refinements_data) <= get_effort_value(
EffortKeys.REFINE_ALL_THRESHOLD, self.effort
):
for data in self.all_refinements_data:
future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401
else:
@@ -211,7 +216,6 @@
diffs_norm = normalize_by_max(diff_lens_list)
# the lower the better
score_dict = create_score_dictionary_from_metrics(weights, runtime_norm, diffs_norm)
top_n_candidates = int((TOP_N_REFINEMENTS * len(runtimes_list)) + 0.5)
top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates]

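To make the ranking concrete, here is a self-contained sketch of the selection; the real code uses the normalize_by_max and create_score_dictionary_from_metrics helpers, and the numbers below are illustrative:

    # Lower score is better; runtime is weighted 2x relative to diff size,
    # per REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1).
    runtimes = [120.0, 95.0, 300.0]  # best runtime per valid candidate
    diff_lens = [40, 10, 5]          # diff size in lines per candidate
    w_runtime, w_diff = 2, 1

    runtime_norm = [r / max(runtimes) for r in runtimes]
    diff_norm = [d / max(diff_lens) for d in diff_lens]
    scores = {i: w_runtime * rn + w_diff * dn for i, (rn, dn) in enumerate(zip(runtime_norm, diff_norm))}

    top_n = 2  # e.g. TOP_VALID_CANDIDATES_FOR_REFINEMENT at low effort
    print(sorted(scores, key=scores.get)[:top_n])  # [1, 0] -- candidate 1 has the best weighted score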
for idx in top_indecies:
Expand Down Expand Up @@ -312,7 +316,7 @@ def __init__(
self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {}
self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {}
self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, args.effort)
self.executor = concurrent.futures.ThreadPoolExecutor(
max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4
)
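Worked example of the pool sizing above: N_GENERATED_TESTS is currently 2 at every effort level, so the executor gets 2 + 3 = 5 workers, or 2 + 4 = 6 when an experiment id is set.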
@@ -362,7 +366,7 @@ def generate_and_instrument_tests(
str,
]:
"""Generate and instrument tests for the function."""
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort)
generated_test_paths = [
get_test_file_path(
self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit"
@@ -927,7 +931,7 @@ def determine_best_candidate(
dependency_code=code_context.read_only_context_code,
trace_id=self.get_trace_id(exp_type),
line_profiler_results=original_code_baseline.line_profile_results["str_out"],
num_candidates=N_CANDIDATES_LP_EFFECTIVE,
num_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort),
experiment_metadata=ExperimentMetadata(
id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment"
)
@@ -942,6 +946,7 @@
self.aiservice_client,
self.executor,
self.future_all_code_repair,
self.args.effort,
)
candidate_index = 0

Expand Down Expand Up @@ -1292,7 +1297,7 @@ def generate_tests(
generated_perf_test_paths: list[Path],
) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]:
"""Generate unit tests and concolic tests for the function."""
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort)
assert len(generated_test_paths) == n_tests

# Submit test generation tasks
@@ -1354,7 +1359,7 @@ def generate_optimizations(
run_experiment: bool = False, # noqa: FBT001, FBT002
) -> Result[tuple[OptimizationSet, str], str]:
"""Generate optimization candidates for the function."""
n_candidates = N_CANDIDATES_EFFECTIVE
n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort)

future_optimization_candidates = self.executor.submit(
self.aiservice_client.optimize_python_code,
@@ -1921,8 +1926,9 @@ def repair_if_possible(
test_results_count: int,
exp_type: str,
) -> None:
if self.repair_counter >= MAX_REPAIRS_PER_TRACE:
logger.debug(f"Repair counter reached {MAX_REPAIRS_PER_TRACE}, skipping repair")
max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.args.effort)
if self.repair_counter >= max_repairs:
logger.debug(f"Repair counter reached {max_repairs}, skipping repair")
return
if candidate.source not in (OptimizedCandidateSource.OPTIMIZE, OptimizedCandidateSource.OPTIMIZE_LP):
# only repair the first pass of the candidates for now
Expand All @@ -1932,7 +1938,7 @@ def repair_if_possible(
logger.debug("No diffs found, skipping repair")
return
result_unmatched_perc = len(diffs) / test_results_count
if result_unmatched_perc > REPAIR_UNMATCHED_PERCENTAGE_LIMIT:
if result_unmatched_perc > get_effort_value(EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT, self.args.effort):
logger.debug(f"Result unmatched percentage is {result_unmatched_perc * 100}%, skipping repair")
return

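A worked example of the repair gate above, with illustrative numbers: given 10 test results and 3 mismatches, result_unmatched_perc is 0.3, so repair is skipped at low effort (limit 0.2) but attempted at medium or high effort (limits 0.4 and 0.5), and in all cases only until the per-trace repair cap (2/4/5 for low/medium/high) is reached.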