From b9f675a792e1451a3af9b7a25d8e5c24780ffab6 Mon Sep 17 00:00:00 2001
From: Arpad Borsos
Date: Thu, 29 Aug 2024 15:32:19 +0200
Subject: [PATCH] Implement fully parallel upload processing

This adds another feature/rollout flag which prefers the parallel upload
processing pipeline instead of running it as an experiment.

Upload Processing can run in essentially 4 modes:
- Completely serial processing
- Serial processing, but running "experiment" code (`is_experiment_serial`):
  - In this mode, each `UploadProcessor` task saves a copy of the raw upload,
    as well as a copy of the final report (`is_final`) for later verification.
- Parallel processing, but running "experiment" code (`is_experiment_parallel`):
  - In this mode, another parallel set of `UploadProcessor` tasks runs *after*
    the main set of tasks.
  - These tasks use the copied-over raw uploads that were prepared by
    the `is_experiment_serial` tasks to do their processing.
  - These tasks do not persist any of their results in the database;
    instead, the final `UploadFinisher` task launches the `ParallelVerification` task.
- Fully parallel processing (`is_fully_parallel`):
  - In this mode, the final `UploadFinisher` task is responsible for merging
    the final report and persisting it.

An example Task chain might look like this, in "experiment" mode:
- Upload
- UploadProcessor (`is_experiment_serial`)
- UploadProcessor (`is_experiment_serial`)
- UploadProcessor (`is_experiment_serial`, `is_final`)
- UploadFinisher
- UploadProcessor (`is_experiment_parallel`)
- UploadProcessor (`is_experiment_parallel`)
- UploadProcessor (`is_experiment_parallel`)
- UploadFinisher (`is_experiment_parallel`)
- ParallelVerification

The `is_fully_parallel` mode will look like this:
- Upload
- UploadProcessor (`is_fully_parallel`)
- UploadProcessor (`is_fully_parallel`)
- UploadProcessor (`is_fully_parallel`)
- UploadFinisher (`is_fully_parallel`)
---
 helpers/parallel.py                        | 100 +++++++++++++
 helpers/parallel_upload_processing.py      |   6 +-
 rollouts/__init__.py                       |   1 +
 services/report/__init__.py                |  74 +++++-----
 tasks/tests/integration/test_upload_e2e.py |  21 ++-
 tasks/upload.py                            |  63 +++++----
 tasks/upload_finisher.py                   | 155 ++++++++++++---------
 tasks/upload_processor.py                  |  93 ++++++------
 8 files changed, 329 insertions(+), 184 deletions(-)
 create mode 100644 helpers/parallel.py

diff --git a/helpers/parallel.py b/helpers/parallel.py
new file mode 100644
index 000000000..09220b463
--- /dev/null
+++ b/helpers/parallel.py
@@ -0,0 +1,100 @@
+import dataclasses
+from typing import Self
+
+from rollouts import (
+    FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO,
+    PARALLEL_UPLOAD_PROCESSING_BY_REPO,
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class ParallelProcessing:
+    """
+    This encapsulates Parallel Upload Processing logic
+
+    Upload Processing can run in essentially 4 modes:
+    - Completely serial processing
+    - Serial processing, but running "experiment" code (`is_experiment_serial`):
+      - In this mode, each `UploadProcessor` task saves a copy of the raw upload,
+        as well as a copy of the final report (`is_final`) for later verification.
+    - Parallel processing, but running "experiment" code (`is_experiment_parallel`):
+      - In this mode, another parallel set of `UploadProcessor` tasks runs *after*
+        the main set of tasks.
+      - These tasks use the copied-over raw uploads that were prepared by
+        the `is_experiment_serial` tasks to do their processing.
+      - These tasks do not persist any of their results in the database;
+        instead, the final `UploadFinisher` task launches the `ParallelVerification` task.
+    - Fully parallel processing (`is_fully_parallel`):
+      - In this mode, the final `UploadFinisher` task is responsible for merging
+        the final report and persisting it.
+
+    An example Task chain might look like this, in "experiment" mode:
+    - Upload
+    - UploadProcessor (`is_experiment_serial`)
+    - UploadProcessor (`is_experiment_serial`)
+    - UploadProcessor (`is_experiment_serial`, `is_final`)
+    - UploadFinisher
+    - UploadProcessor (`is_experiment_parallel`)
+    - UploadProcessor (`is_experiment_parallel`)
+    - UploadProcessor (`is_experiment_parallel`)
+    - UploadFinisher (`is_experiment_parallel`)
+    - ParallelVerification
+
+    The `is_fully_parallel` mode looks like this:
+    - Upload
+    - UploadProcessor (`is_fully_parallel`)
+    - UploadProcessor (`is_fully_parallel`)
+    - UploadProcessor (`is_fully_parallel`)
+    - UploadFinisher (`is_fully_parallel`)
+    """
+
+    run_experiment: bool = False
+    run_fully_parallel: bool = False
+
+    is_fully_parallel: bool = False
+    is_experiment_parallel: bool = False
+    is_experiment_serial: bool = False
+    is_final: bool = False
+    parallel_idx: int | None = None
+
+    def initial(repoid: int) -> Self:
+        run_fully_parallel = FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
+            identifier=repoid, default=False
+        )
+        run_experiment = (
+            False
+            if run_fully_parallel
+            else PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
+                identifier=repoid, default=False
+            )
+        )
+
+        return ParallelProcessing(
+            run_fully_parallel=run_fully_parallel,
+            run_experiment=run_experiment,
+            is_fully_parallel=run_fully_parallel,
+        )
+
+    def from_task_args(
+        repoid: int,
+        in_parallel: bool = False,
+        fully_parallel: bool = False,
+        is_final: bool = False,
+        parallel_idx: int | None = None,
+        **kwargs,
+    ) -> Self:
+        slf = ParallelProcessing.initial(repoid)
+
+        if fully_parallel:
+            return dataclasses.replace(slf, is_fully_parallel=True)
+
+        is_experiment_parallel = slf.run_experiment and in_parallel
+        is_experiment_serial = slf.run_experiment and not in_parallel
+
+        return dataclasses.replace(
+            slf,
+            is_experiment_parallel=is_experiment_parallel,
+            is_experiment_serial=is_experiment_serial,
+            is_final=is_final,
+            parallel_idx=parallel_idx,
+        )
diff --git a/helpers/parallel_upload_processing.py b/helpers/parallel_upload_processing.py
index 75a94ca4e..0edf753bd 100644
--- a/helpers/parallel_upload_processing.py
+++ b/helpers/parallel_upload_processing.py
@@ -43,16 +43,14 @@ def _adjust_sessions(
 def get_parallel_session_ids(
     sessions, argument_list, db_session, report_service, commit_yaml
 ):
-    num_sessions = len(argument_list)
-
     mock_sessions = copy.deepcopy(sessions)  # the sessions already in the report
     get_parallel_session_ids = []
 
     # iterate over all uploads, get the next session id, and adjust sessions (remove CFF logic)
-    for i in range(num_sessions):
+    for arguments in argument_list:
         next_session_id = next_session_number(mock_sessions)
 
-        upload_pk = argument_list[i]["upload_pk"]
+        upload_pk = arguments["upload_pk"]
         upload = db_session.query(Upload).filter_by(id_=upload_pk).first()
         to_merge_session = report_service.build_session(upload)
         flags = upload.flag_names
diff --git a/rollouts/__init__.py b/rollouts/__init__.py
index 67003e32f..2e28e8a6a 100644
--- a/rollouts/__init__.py
+++ b/rollouts/__init__.py
@@ -11,6 +11,7 @@
 )
 
 PARALLEL_UPLOAD_PROCESSING_BY_REPO = Feature("parallel_upload_processing")
+FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO = Feature("fully_parallel_upload_processing")
 
 CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER = Feature("carryforward_base_search_range")
 
diff --git a/services/report/__init__.py b/services/report/__init__.py
index 7e8747595..29f6f4bbb 100644
--- a/services/report/__init__.py
+++ b/services/report/__init__.py
@@ -43,11 +43,9 @@
     RepositoryWithoutValidBotError,
 )
 from helpers.labels import get_labels_per_session
+from helpers.parallel import ParallelProcessing
 from helpers.telemetry import MetricContext
-from rollouts import (
-    CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER,
-    PARALLEL_UPLOAD_PROCESSING_BY_REPO,
-)
+from rollouts import CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER
 from services.archive import ArchiveService
 from services.redis import (
     PARALLEL_UPLOAD_PROCESSING_SESSION_COUNTER_TTL,
@@ -61,9 +59,7 @@
     RAW_UPLOAD_RAW_REPORT_COUNT,
     RAW_UPLOAD_SIZE,
 )
-from services.report.raw_upload_processor import (
-    process_raw_upload,
-)
+from services.report.raw_upload_processor import process_raw_upload
 from services.repository import get_repo_provider_service
 from services.yaml.reader import get_paths_from_flags, read_yaml_field
 
@@ -207,7 +203,9 @@ def has_initialized_report(self, commit: Commit) -> bool:
 
     @sentry_sdk.trace
     def initialize_and_save_report(
-        self, commit: Commit, report_code: str = None
+        self,
+        commit: Commit,
+        report_code: str = None,
     ) -> CommitReport:
         """
         Initializes the commit report
@@ -287,26 +285,28 @@ def initialize_and_save_report(
                 # This means there is a report to carryforward
                 self.save_full_report(commit, report, report_code)
 
+                parallel_processing = ParallelProcessing.initial(
+                    commit.repository.repoid
+                )
                 # Behind parallel processing flag, save the CFF report to GCS so the parallel variant of
                 # finisher can build off of it later. Makes the assumption that the CFFs occupy the first
                 # j to i session ids where i is the max id of the CFFs and j is some integer less than i.
-                if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
-                    identifier=commit.repository.repoid
-                ):
+                if parallel_processing.run_experiment:
                     self.save_parallel_report_to_archive(commit, report, report_code)
-                    highest_session_id = max(
-                        report.sessions.keys()
-                    )  # the largest id among the CFFs
-                    get_redis_connection().incrby(
-                        name=get_parallel_upload_processing_session_counter_redis_key(
+                    # the largest id among the CFFs:
+                    highest_session_id = max(report.sessions.keys())
+                    redis = get_redis_connection()
+                    redis_key = (
+                        get_parallel_upload_processing_session_counter_redis_key(
                             commit.repository.repoid, commit.commitid
-                        ),
+                        )
+                    )
+                    redis.incrby(
+                        name=redis_key,
                         amount=highest_session_id + 1,
                     )
-                    get_redis_connection().expire(
-                        name=get_parallel_upload_processing_session_counter_redis_key(
-                            commit.repository.repoid, commit.commitid
-                        ),
+                    redis.expire(
+                        name=redis_key,
                         time=PARALLEL_UPLOAD_PROCESSING_SESSION_COUNTER_TTL,
                     )
 
@@ -840,7 +840,7 @@ def create_new_report_for_commit(self, commit: Commit) -> Report:
 
     @sentry_sdk.trace
     def parse_raw_report_from_storage(
-        self, repo: Repository, upload: Upload, is_parallel=False
+        self, repo: Repository, upload: Upload, parallel_processing: ParallelProcessing
     ) -> ParsedRawReport:
         """Pulls the raw uploaded report from storage and parses it so it's
         easier to access different parts of the raw upload.
@@ -851,23 +851,19 @@ def parse_raw_report_from_storage( archive_service = self.get_archive_service(repo) archive_url = upload.storage_path - # TODO: For the parallel experiment, can remove once finished log.info( "Parsing the raw report from storage", extra=dict( commit=upload.report.commit_id, repoid=repo.repoid, archive_url=archive_url, - is_parallel=is_parallel, ), ) # For the parallel upload verification experiment, we need to make a copy of the raw uploaded reports # so that the parallel pipeline can use those to parse. The serial pipeline rewrites the raw uploaded # reports to a human readable version that doesn't include file fixes, so that's why copying is necessary. - if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value( - identifier=repo.repoid, default=False - ): + if parallel_processing.run_experiment: parallel_url = archive_url.removesuffix(".txt") + "_PARALLEL.txt" log.info( "In the parallel experiment for parsing raw report in storage", @@ -878,7 +874,7 @@ def parse_raw_report_from_storage( archive_url=archive_url, ), ) - if not is_parallel: + if parallel_processing.is_experiment_serial: archive_file = archive_service.read_file(archive_url) archive_service.write_file(parallel_url, archive_file) log.info( @@ -929,7 +925,7 @@ def build_report_from_raw_content( report: Report, raw_report_info: RawReportInfo, upload: Upload, - parallel_idx=None, + parallel_processing: ParallelProcessing, ) -> ProcessingResult: """ Processes an upload on top of an existing report `master` and returns @@ -965,7 +961,7 @@ def build_report_from_raw_content( try: raw_report = self.parse_raw_report_from_storage( - commit.repository, upload, is_parallel=parallel_idx is not None + commit.repository, upload, parallel_processing ) raw_report_info.raw_report = raw_report except FileNotInStorageError: @@ -977,7 +973,7 @@ def build_report_from_raw_content( reportid=reportid, commit_yaml=self.current_yaml.to_dict(), archive_url=archive_url, - in_parallel=parallel_idx is not None, + parallel_processing=parallel_processing, ), ) result.error = ProcessingError( @@ -997,12 +993,17 @@ def build_report_from_raw_content( flags, session, upload=upload, - parallel_idx=parallel_idx, + parallel_idx=parallel_processing.parallel_idx, ) result.report = process_result.report log.info( "Successfully processed report" - + (" (in parallel)" if parallel_idx is not None else ""), + + ( + " (in parallel)" + if parallel_processing.is_experiment_parallel + or parallel_processing.is_fully_parallel + else "" + ), extra=dict( session=session.id, ci=f"{session.provider}:{session.build}:{session.job}", @@ -1049,13 +1050,6 @@ def update_upload_with_processing_result( db_session = upload_obj.get_db_session() session = processing_result.session if processing_result.error is None: - # this should be enabled for the actual rollout of parallel upload processing. 
- # if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value( - # "this should be the repo id" - # ): - # upload_obj.state_id = UploadState.PARALLEL_PROCESSED.db_id - # upload_obj.state = "parallel_processed" - # else: upload_obj.state_id = UploadState.PROCESSED.db_id upload_obj.state = "processed" upload_obj.order_number = session.id diff --git a/tasks/tests/integration/test_upload_e2e.py b/tasks/tests/integration/test_upload_e2e.py index 359437631..6fa7a407d 100644 --- a/tasks/tests/integration/test_upload_e2e.py +++ b/tasks/tests/integration/test_upload_e2e.py @@ -13,7 +13,10 @@ from database.models.core import Commit, CompareCommit, Repository from database.tests.factories import CommitFactory, RepositoryFactory from database.tests.factories.core import PullFactory -from rollouts import PARALLEL_UPLOAD_PROCESSING_BY_REPO +from rollouts import ( + FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO, + PARALLEL_UPLOAD_PROCESSING_BY_REPO, +) from services.archive import ArchiveService from services.redis import get_redis_connection from services.report import ReportService @@ -114,9 +117,18 @@ def setup_mock_get_compare( @pytest.mark.integration @pytest.mark.django_db() -@pytest.mark.parametrize("do_parallel_processing", [False, True]) +@pytest.mark.parametrize( + "do_fully_parallel_processing,do_parallel_processing", + [ + (False, False), + (False, True), + (True, True), + ], + ids=["fully synchronous", "parallel experiment", "fully parallel"], +) def test_full_upload( dbsession: Session, + do_fully_parallel_processing: bool, do_parallel_processing: bool, mocker, mock_repo_provider, @@ -146,6 +158,11 @@ def test_full_upload( } ) # use parallel processing: + mocker.patch.object( + FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO, + "check_value", + return_value=do_fully_parallel_processing, + ) mocker.patch.object( PARALLEL_UPLOAD_PROCESSING_BY_REPO, "check_value", diff --git a/tasks/upload.py b/tasks/upload.py index 725f973f7..45eca3be2 100644 --- a/tasks/upload.py +++ b/tasks/upload.py @@ -17,10 +17,7 @@ from shared.django_apps.codecov_metrics.service.codecov_metrics import ( UserOnboardingMetricsService, ) -from shared.torngit.exceptions import ( - TorngitClientError, - TorngitRepoNotFoundError, -) +from shared.torngit.exceptions import TorngitClientError, TorngitRepoNotFoundError from shared.yaml import UserYaml from shared.yaml.user_yaml import OwnerContext from sqlalchemy.orm import Session @@ -29,19 +26,14 @@ from database.enums import CommitErrorTypes, ReportType from database.models import Commit, CommitReport from database.models.core import GITHUB_APP_INSTALLATION_DEFAULT_NAME -from helpers.checkpoint_logger import ( - CheckpointLogger, - _kwargs_key, -) -from helpers.checkpoint_logger import ( - from_kwargs as checkpoints_from_kwargs, -) +from helpers.checkpoint_logger import CheckpointLogger, _kwargs_key +from helpers.checkpoint_logger import from_kwargs as checkpoints_from_kwargs from helpers.checkpoint_logger.flows import TestResultsFlow, UploadFlow from helpers.exceptions import RepositoryWithoutValidBotError from helpers.github_installation import get_installation_name_for_owner_for_task +from helpers.parallel import ParallelProcessing from helpers.parallel_upload_processing import get_parallel_session_ids from helpers.save_commit_error import save_commit_error -from rollouts import PARALLEL_UPLOAD_PROCESSING_BY_REPO from services.archive import ArchiveService from services.bundle_analysis.report import BundleAnalysisReportService from services.redis import ( @@ -518,7 +510,7 @@ def 
run_impl_within_lock( scheduled_tasks = self.schedule_task( db_session, commit, - commit_yaml, + commit_yaml.to_dict(), argument_list, commit_report, upload_context, @@ -548,14 +540,12 @@ def schedule_task( self, db_session: Session, commit: Commit, - commit_yaml: UserYaml, + commit_yaml: dict, argument_list: list[dict], commit_report: CommitReport, upload_context: UploadContext, checkpoints: CheckpointLogger | None, ): - commit_yaml = commit_yaml.to_dict() - # Carryforward the parent BA report for the current commit's BA report when handling uploads # that's not bundle analysis type. self.possibly_carryforward_bundle_report( @@ -603,6 +593,22 @@ def _schedule_coverage_processing_task( ): checkpoints.log(UploadFlow.INITIAL_PROCESSING_COMPLETE) + parallel_processing = ParallelProcessing.initial(upload_context.repoid) + + if parallel_processing.run_fully_parallel or parallel_processing.run_experiment: + parallel_tasks = self.create_parallel_tasks( + db_session, + commit, + commit_yaml, + argument_list, + commit_report, + upload_context, + checkpoints, + ) + + if parallel_processing.run_fully_parallel: + return parallel_tasks.apply_async() + processing_tasks = [ upload_processor_task.s( repoid=commit.repoid, @@ -630,16 +636,25 @@ def _schedule_coverage_processing_task( }, ) ) - serial_tasks = chain(processing_tasks) - do_parallel_processing = PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value( - identifier=commit.repository.repoid - ) - - if not do_parallel_processing: + if not parallel_processing.run_experiment: return serial_tasks.apply_async() + # else: + parallel_shadow_experiment = serial_tasks | parallel_tasks + return parallel_shadow_experiment.apply_async() + @sentry_sdk.trace + def create_parallel_tasks( + self, + db_session: Session, + commit: Commit, + commit_yaml: dict, + argument_list: list[dict], + commit_report: CommitReport, + upload_context: UploadContext, + checkpoints: CheckpointLogger, + ): report_service = ReportService(commit_yaml) sessions = report_service.build_sessions(commit=commit) @@ -691,6 +706,7 @@ def _schedule_coverage_processing_task( parallel_processing_tasks = [ upload_processor_task.s( + {}, repoid=commit.repoid, commitid=commit.commitid, commit_yaml=commit_yaml, @@ -717,8 +733,7 @@ def _schedule_coverage_processing_task( ) parallel_tasks = chord(parallel_processing_tasks, finish_parallel_sig) - parallel_shadow_experiment = serial_tasks | parallel_tasks - return parallel_shadow_experiment.apply_async() + return parallel_tasks def _schedule_bundle_analysis_processing_task( self, diff --git a/tasks/upload_finisher.py b/tasks/upload_finisher.py index 2c9372334..7b188b0f2 100644 --- a/tasks/upload_finisher.py +++ b/tasks/upload_finisher.py @@ -21,11 +21,12 @@ from app import celery_app from celery_config import notify_error_task_name from database.models import Commit, Pull +from database.models.core import Repository from helpers.checkpoint_logger import _kwargs_key from helpers.checkpoint_logger import from_kwargs as checkpoints_from_kwargs from helpers.checkpoint_logger.flows import UploadFlow -from helpers.metrics import KiB, MiB, metrics -from rollouts import PARALLEL_UPLOAD_PROCESSING_BY_REPO +from helpers.metrics import KiB, MiB +from helpers.parallel import ParallelProcessing from services.archive import ArchiveService, MinioEndpoints from services.comparison import get_or_create_comparison from services.redis import get_redis_connection @@ -35,6 +36,7 @@ from tasks.base import BaseCodecovTask from tasks.parallel_verification import 
parallel_verification_task from tasks.upload_clean_labels_index import task_name as clean_labels_index_task_name +from tasks.upload_processor import UploadProcessorTask log = logging.getLogger(__name__) @@ -99,7 +101,6 @@ def run_impl( repoid, commitid, commit_yaml, - in_parallel=False, report_code=None, **kwargs, ): @@ -127,27 +128,22 @@ def run_impl( assert commit, "Commit not found in database." repository = commit.repository + parallel_processing = ParallelProcessing.from_task_args(repoid, **kwargs) + if ( - PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(identifier=repository.repoid) - and in_parallel + parallel_processing.is_experiment_parallel + or parallel_processing.is_fully_parallel ): - actual_processing_results = { - "processings_so_far": [], - "parallel_incremental_result": [], - } - pr = None - # need to transform processing_results produced by chord to get it into the # same format as the processing_results produced from chain - for task in processing_results: - pr = task["processings_so_far"][0].get("pr") or pr - actual_processing_results["processings_so_far"].append( - task["processings_so_far"][0] - ) - actual_processing_results["parallel_incremental_result"].append( - task["parallel_incremental_result"] - ) - processing_results = actual_processing_results + processing_results = { + "processings_so_far": [ + task["processings_so_far"][0] for task in processing_results + ], + "parallel_incremental_result": [ + task["parallel_incremental_result"] for task in processing_results + ], + } report_service = ReportService(commit_yaml) report = self.merge_incremental_reports( @@ -156,6 +152,7 @@ def run_impl( commit, report_service, processing_results, + parallel_processing, ) log.info( @@ -168,29 +165,42 @@ def run_impl( ), ) - with metrics.timer(f"{self.metrics_prefix}.save_parallel_report_results"): + if parallel_processing.is_fully_parallel: + pr = processing_results["processings_so_far"][0]["arguments"].get("pr") + processor_task = UploadProcessorTask() + processor_task.save_report_results( + db_session, + report_service, + repository, + commit, + report, + pr, + report_code, + ) + + else: parallel_paths = report_service.save_parallel_report_to_archive( commit, report, report_code ) - # now that we've built the report and stored it to GCS, we have what we need to - # compare the results with the current upload pipeline. We end execution of the - # finisher task here so that we don't cause any additional side-effects - - # The verification task that will compare the results of the serial flow and - # the parallel flow, and log the result to determine if parallel flow is - # working properly. - parallel_verification_task.apply_async( - kwargs=dict( - repoid=repoid, - commitid=commitid, - commit_yaml=commit_yaml, - report_code=report_code, - parallel_paths=parallel_paths, - processing_results=processing_results, - ), - ) + # now that we've built the report and stored it to GCS, we have what we need to + # compare the results with the current upload pipeline. We end execution of the + # finisher task here so that we don't cause any additional side-effects + + # The verification task that will compare the results of the serial flow and + # the parallel flow, and log the result to determine if parallel flow is + # working properly. 
+ parallel_verification_task.apply_async( + kwargs=dict( + repoid=repoid, + commitid=commitid, + commit_yaml=commit_yaml, + report_code=report_code, + parallel_paths=parallel_paths, + processing_results=processing_results, + ), + ) - return + return lock_name = f"upload_finisher_lock_{repoid}_{commitid}" redis_connection = get_redis_connection() @@ -477,45 +487,56 @@ def invalidate_caches(self, redis_connection, commit: Commit): def merge_incremental_reports( self, commit_yaml: dict, - repository, + repository: Repository, commit: Commit, report_service: ReportService, processing_results, + parallel_processing: ParallelProcessing, ): archive_service = report_service.get_archive_service(repository) repoid = repository.repoid commitid = commit.id - fas_path = MinioEndpoints.parallel_upload_experiment.get_path( - version="v4", - repo_hash=archive_service.get_archive_hash(repository), - commitid=commit.commitid, - file_name="files_and_sessions", - ) - chunks_path = MinioEndpoints.parallel_upload_experiment.get_path( - version="v4", - repo_hash=archive_service.get_archive_hash(repository), - commitid=commit.commitid, - file_name="chunks", - ) + if parallel_processing.is_fully_parallel: + report = report_service.get_existing_report_for_commit(commit) + if report is None: + log.info( + "No base report found for parallel upload processing, using an empty report", + extra=dict(commit=commitid, repoid=repoid), + ) + report = Report() - try: - files_and_sessions = json.loads(archive_service.read_file(fas_path)) - chunks = archive_service.read_file(chunks_path).decode(errors="replace") - report = report_service.build_report( - chunks, - files_and_sessions["files"], - files_and_sessions["sessions"], - None, + else: + fas_path = MinioEndpoints.parallel_upload_experiment.get_path( + version="v4", + repo_hash=archive_service.get_archive_hash(repository), + commitid=commit.commitid, + file_name="files_and_sessions", ) - except ( - FileNotInStorageError - ): # there were no CFFs, so no report was stored in GCS - log.info( - "No base report found for parallel upload processing, using an empty report", - extra=dict(commit=commitid, repoid=repoid), + chunks_path = MinioEndpoints.parallel_upload_experiment.get_path( + version="v4", + repo_hash=archive_service.get_archive_hash(repository), + commitid=commit.commitid, + file_name="chunks", ) - report = Report() + + try: + files_and_sessions = json.loads(archive_service.read_file(fas_path)) + chunks = archive_service.read_file(chunks_path).decode(errors="replace") + report = report_service.build_report( + chunks, + files_and_sessions["files"], + files_and_sessions["sessions"], + None, + ) + except ( + FileNotInStorageError + ): # there were no CFFs, so no report was stored in GCS + log.info( + "No base report found for parallel upload processing, using an empty report", + extra=dict(commit=commitid, repoid=repoid), + ) + report = Report() log.info( "Downloading %s incremental reports that were processed in parallel", diff --git a/tasks/upload_processor.py b/tasks/upload_processor.py index e231e1de1..57c46d390 100644 --- a/tasks/upload_processor.py +++ b/tasks/upload_processor.py @@ -20,12 +20,12 @@ from helpers.exceptions import RepositoryWithoutValidBotError from helpers.github_installation import get_installation_name_for_owner_for_task from helpers.metrics import metrics +from helpers.parallel import ParallelProcessing from helpers.parallel_upload_processing import ( save_final_serial_report_results, save_incremental_report_results, ) from 
helpers.save_commit_error import save_commit_error -from rollouts import PARALLEL_UPLOAD_PROCESSING_BY_REPO from services.archive import ArchiveService from services.redis import get_redis_connection from services.report import ProcessingResult, RawReportInfo, Report, ReportService @@ -81,22 +81,22 @@ def run_impl( commit_yaml, arguments_list, report_code=None, - parallel_idx=None, - in_parallel=False, - is_final=False, **kwargs, ): repoid = int(repoid) log.info( "Received upload processor task", - extra=dict(repoid=repoid, commit=commitid, in_parallel=in_parallel), + extra=dict( + repoid=repoid, commit=commitid, in_parallel=kwargs.get("in_parallel") + ), ) - in_parallel = in_parallel and PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value( - identifier=repoid - ) + parallel_processing = ParallelProcessing.from_task_args(repoid, **kwargs) - if in_parallel: + if ( + parallel_processing.is_fully_parallel + or parallel_processing.is_experiment_parallel + ): log.info( "Using parallel upload processing, skip acquiring upload processing lock", extra=dict( @@ -107,8 +107,8 @@ def run_impl( ), ) - # This function is named `within_lock` but we gate any concurrency- - # unsafe operations with `PARALLEL_UPLOAD_PROCESSING_BY_REPO`. + # This function is named `_within_lock`, but locking is only necessary + # in the non-parallel variant of this task return self.process_impl_within_lock( db_session=db_session, previous_results={}, @@ -116,9 +116,8 @@ def run_impl( commitid=commitid, commit_yaml=commit_yaml, arguments_list=arguments_list, - parallel_idx=parallel_idx, report_code=report_code, - in_parallel=in_parallel, + parallel_processing=parallel_processing, ) lock_name = UPLOAD_PROCESSING_LOCK_NAME(repoid, commitid) @@ -139,6 +138,15 @@ def run_impl( timeout=max(60 * 5, self.hard_time_limit_task), blocking_timeout=5, ): + log.info( + "Obtained upload processing lock, starting", + extra=dict( + repoid=repoid, + commit=commitid, + parent_task=self.request.parent_id, + report_code=report_code, + ), + ) actual_arguments_list = deepcopy(arguments_list) return self.process_impl_within_lock( db_session=db_session, @@ -148,9 +156,7 @@ def run_impl( commit_yaml=commit_yaml, arguments_list=actual_arguments_list, report_code=report_code, - parallel_idx=parallel_idx, - in_parallel=in_parallel, - is_final=is_final, + parallel_processing=parallel_processing, ) except LockError: max_retry = 200 * 3**self.request.retries @@ -177,21 +183,8 @@ def process_impl_within_lock( commit_yaml: dict, arguments_list, report_code, - parallel_idx=None, - in_parallel=False, - is_final=False, + parallel_processing: ParallelProcessing, ): - if in_parallel: - log.info( - "Obtained upload processing lock, starting", - extra=dict( - repoid=repoid, - commit=commitid, - parent_task=self.request.parent_id, - report_code=report_code, - ), - ) - processings_so_far = previous_results.get("processings_so_far", []) n_processed = 0 n_failed = 0 @@ -206,6 +199,11 @@ def process_impl_within_lock( pr = None report_service = ReportService(UserYaml(commit_yaml)) + in_parallel = ( + parallel_processing.is_experiment_parallel + or parallel_processing.is_fully_parallel + ) + if in_parallel: log.info( "Creating empty report to store incremental result", @@ -213,16 +211,15 @@ def process_impl_within_lock( ) report = Report() else: - with metrics.timer(f"{self.metrics_prefix}.build_original_report"): - report = report_service.get_existing_report_for_commit( - commit, report_code=report_code + report = report_service.get_existing_report_for_commit( + commit, 
report_code=report_code + ) + if report is None: + log.info( + "No existing report for commit", + extra=dict(commit=commit.commitid), ) - if report is None: - log.info( - "No existing report for commit", - extra=dict(commit=commit.commitid), - ) - report = Report() + report = Report() raw_reports: list[RawReportInfo] = [] try: @@ -261,8 +258,7 @@ def process_impl_within_lock( report, upload_obj, raw_report_info, - parallel_idx=parallel_idx, - in_parallel=in_parallel, + parallel_processing, ) # NOTE: this is only used because test mocking messes with the return value here. # in normal flow, the function mutates the argument instead. @@ -320,7 +316,11 @@ def process_impl_within_lock( results_dict = {} if in_parallel: parallel_incremental_result = save_incremental_report_results( - report_service, commit, report, parallel_idx, report_code + report_service, + commit, + report, + parallel_processing.parallel_idx, + report_code, ) parallel_incremental_result["upload_pk"] = arguments_list[0].get( "upload_pk" @@ -349,7 +349,7 @@ def process_impl_within_lock( # ParallelVerification task to compare with later, for the parallel # experiment. The report being saved is not necessarily the final # report for the commit, as more uploads can still be made. - if is_final: + if parallel_processing.is_final: final_serial_report_url = save_final_serial_report_results( report_service, commit, report, report_code, arguments_list ) @@ -405,11 +405,10 @@ def process_individual_report( report: Report, upload: Upload, raw_report_info: RawReportInfo, - parallel_idx=None, - in_parallel=False, + parallel_processing: ParallelProcessing, ) -> ProcessingResult: processing_result = report_service.build_report_from_raw_content( - report, raw_report_info, upload=upload, parallel_idx=parallel_idx + report, raw_report_info, upload, parallel_processing ) if ( processing_result.error is not None @@ -431,7 +430,7 @@ def process_individual_report( # for the parallel experiment, we don't want to modify anything in the # database, so we disable it here - if not in_parallel: + if not parallel_processing.is_experiment_parallel: report_service.update_upload_with_processing_result( upload, processing_result )
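To make the mode selection described in the commit message concrete, here is a minimal usage sketch. It is an editorial illustration, not part of the patch: it assumes the new `helpers/parallel.py` above is importable in a worker environment, the repo id and task kwargs are hypothetical, and which mode you actually get depends on how the two rollout flags resolve for that repository.

```python
# Illustrative sketch only -- not part of the patch. Assumes the patched
# helpers/parallel.py is importable; REPOID and the kwargs below are
# hypothetical values mirroring what the scheduling code passes to tasks.
from helpers.parallel import ParallelProcessing

REPOID = 123  # hypothetical repository id

# Serial "experiment" processor task: is_experiment_serial is set only if
# PARALLEL_UPLOAD_PROCESSING_BY_REPO resolves to True for this repo.
serial = ParallelProcessing.from_task_args(REPOID, in_parallel=False, is_final=True)

# Shadow parallel processor task launched after the serial chain:
# is_experiment_parallel under the same rollout, with a per-task session index.
shadow = ParallelProcessing.from_task_args(REPOID, in_parallel=True, parallel_idx=4)

# Fully parallel processor task: is_fully_parallel when the task is invoked
# with fully_parallel=True (or when FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO
# is enabled for the repo).
full = ParallelProcessing.from_task_args(REPOID, fully_parallel=True)

print(serial)
print(shadow)
print(full)
```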