Implement fully parallel upload processing
This adds another feature/rollout flag which prefers the parallel upload processing pipeline over running it as an experiment.

Upload Processing can run in essentially 4 modes:
- Completely serial processing
- Serial processing, but running "experiment" code (`is_experiment_serial`):
  - In this mode, each `UploadProcessor` task saves a copy of the raw upload,
    as well as a copy of the final report (`is_final`) for later verification.
- Parallel processing, but running "experiment" code (`is_experiment_parallel`):
  - In this mode, another parallel set of `UploadProcessor` tasks runs *after*
    the main set of tasks.
  - These tasks use the copied-over raw uploads that were prepared by
    the `is_experiment_serial` tasks to do their processing.
  - These tasks do not persist any of their results in the database;
    instead, the final `UploadFinisher` task will launch the `ParallelVerification` task.
- Fully parallel processing (`is_fully_parallel`):
  - In this mode, the final `UploadFinisher` task is responsible for merging
    the final report and persisting it.

An example task chain in "experiment" mode might look like this:
- Upload
  - UploadProcessor (`is_experiment_serial`)
    - UploadProcessor (`is_experiment_serial`)
      - UploadProcessor (`is_experiment_serial`, `is_final`)
        - UploadFinisher
          - UploadProcessor (`is_experiment_parallel`)
          - UploadProcessor (`is_experiment_parallel`)
          - UploadProcessor (`is_experiment_parallel`)
            - UploadFinisher (`is_experiment_parallel`)
              - ParallelVerification

Once implemented, `is_fully_parallel` will look like this:
- Upload
  - UploadProcessor (`is_fully_parallel`)
  - UploadProcessor (`is_fully_parallel`)
  - UploadProcessor (`is_fully_parallel`)
    - UploadFinisher (`is_fully_parallel`)
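
For illustration, a minimal sketch of how a task would derive its mode from these flags, using the `ParallelProcessing` helper added below (the task entrypoint and its wiring here are hypothetical; only `ParallelProcessing.from_task_args` is part of this change):

```python
from helpers.parallel import ParallelProcessing

def upload_processor_task(repoid: int, **kwargs):
    # Hypothetical entrypoint: resolve the rollout flags for this repo and
    # the mode flags from the task kwargs (`in_parallel`, `fully_parallel`,
    # `is_final`, `parallel_idx`).
    parallel_processing = ParallelProcessing.from_task_args(repoid, **kwargs)

    if parallel_processing.is_experiment_parallel:
        # experiment: re-process the copied-over raw upload, do not persist
        ...
    elif parallel_processing.is_experiment_serial:
        # experiment: save a copy of the raw upload (and of the final report
        # when `is_final`) for later verification
        ...
    else:
        # plain serial or fully parallel processing
        ...
```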
Swatinem committed Sep 10, 2024
1 parent ab427f8 commit b9f675a
Showing 8 changed files with 329 additions and 184 deletions.
100 changes: 100 additions & 0 deletions helpers/parallel.py
@@ -0,0 +1,100 @@
import dataclasses
from typing import Self

from rollouts import (
    FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO,
    PARALLEL_UPLOAD_PROCESSING_BY_REPO,
)


@dataclasses.dataclass(frozen=True)
class ParallelProcessing:
    """
    This encapsulates Parallel Upload Processing logic.

    Upload Processing can run in essentially 4 modes:
    - Completely serial processing
    - Serial processing, but running "experiment" code (`is_experiment_serial`):
      - In this mode, each `UploadProcessor` task saves a copy of the raw upload,
        as well as a copy of the final report (`is_final`) for later verification.
    - Parallel processing, but running "experiment" code (`is_experiment_parallel`):
      - In this mode, another parallel set of `UploadProcessor` tasks runs *after*
        the main set of tasks.
      - These tasks use the copied-over raw uploads that were prepared by
        the `is_experiment_serial` tasks to do their processing.
      - These tasks do not persist any of their results in the database;
        instead, the final `UploadFinisher` task will launch the `ParallelVerification` task.
    - Fully parallel processing (`is_fully_parallel`):
      - In this mode, the final `UploadFinisher` task is responsible for merging
        the final report and persisting it.

    An example task chain in "experiment" mode might look like this:
    - Upload
      - UploadProcessor (`is_experiment_serial`)
        - UploadProcessor (`is_experiment_serial`)
          - UploadProcessor (`is_experiment_serial`, `is_final`)
            - UploadFinisher
              - UploadProcessor (`is_experiment_parallel`)
              - UploadProcessor (`is_experiment_parallel`)
              - UploadProcessor (`is_experiment_parallel`)
                - UploadFinisher (`is_experiment_parallel`)
                  - ParallelVerification

    The `is_fully_parallel` mode looks like this:
    - Upload
      - UploadProcessor (`is_fully_parallel`)
      - UploadProcessor (`is_fully_parallel`)
      - UploadProcessor (`is_fully_parallel`)
        - UploadFinisher (`is_fully_parallel`)
    """

    run_experiment: bool = False
    run_fully_parallel: bool = False

    is_fully_parallel: bool = False
    is_experiment_parallel: bool = False
    is_experiment_serial: bool = False
    is_final: bool = False
    parallel_idx: int | None = None

    def initial(repoid: int) -> Self:
        run_fully_parallel = FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
            identifier=repoid, default=False
        )
        run_experiment = (
            False
            if run_fully_parallel
            else PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
                identifier=repoid, default=False
            )
        )

        return ParallelProcessing(
            run_fully_parallel=run_fully_parallel,
            run_experiment=run_experiment,
            is_fully_parallel=run_fully_parallel,
        )

    def from_task_args(
        repoid: int,
        in_parallel: bool = False,
        fully_parallel: bool = False,
        is_final: bool = False,
        parallel_idx: int | None = None,
        **kwargs,
    ) -> Self:
        slf = ParallelProcessing.initial(repoid)

        if fully_parallel:
            return dataclasses.replace(slf, is_fully_parallel=True)

        is_experiment_parallel = slf.run_experiment and in_parallel
        is_experiment_serial = slf.run_experiment and not in_parallel

        return dataclasses.replace(
            slf,
            is_experiment_parallel=is_experiment_parallel,
            is_experiment_serial=is_experiment_serial,
            is_final=is_final,
            parallel_idx=parallel_idx,
        )
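
To make the resulting flag combinations concrete, a small usage sketch (assuming the stated rollout flags are enabled for the placeholder `repoid=42`):

```python
# Assuming FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO is enabled for repo 42:
# `initial` skips the experiment entirely and goes fully parallel.
state = ParallelProcessing.initial(42)
assert state.run_fully_parallel and state.is_fully_parallel
assert not state.run_experiment

# Assuming instead only PARALLEL_UPLOAD_PROCESSING_BY_REPO is enabled:
# a task invoked with `in_parallel=True` lands in the parallel half of
# the experiment and keeps its assigned session index.
state = ParallelProcessing.from_task_args(42, in_parallel=True, parallel_idx=3)
assert state.is_experiment_parallel and not state.is_experiment_serial
assert state.parallel_idx == 3
```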
6 changes: 2 additions & 4 deletions helpers/parallel_upload_processing.py
@@ -43,16 +43,14 @@ def _adjust_sessions(
 def get_parallel_session_ids(
     sessions, argument_list, db_session, report_service, commit_yaml
 ):
-    num_sessions = len(argument_list)
-
     mock_sessions = copy.deepcopy(sessions)  # the sessions already in the report
     get_parallel_session_ids = []
 
     # iterate over all uploads, get the next session id, and adjust sessions (remove CFF logic)
-    for i in range(num_sessions):
+    for arguments in argument_list:
         next_session_id = next_session_number(mock_sessions)
 
-        upload_pk = argument_list[i]["upload_pk"]
+        upload_pk = arguments["upload_pk"]
         upload = db_session.query(Upload).filter_by(id_=upload_pk).first()
         to_merge_session = report_service.build_session(upload)
         flags = upload.flag_names
1 change: 1 addition & 0 deletions rollouts/__init__.py
@@ -11,6 +11,7 @@
 )
 
 PARALLEL_UPLOAD_PROCESSING_BY_REPO = Feature("parallel_upload_processing")
+FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO = Feature("fully_parallel_upload_processing")
 
 CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER = Feature("carryforward_base_search_range")
 
74 changes: 34 additions & 40 deletions services/report/__init__.py
@@ -43,11 +43,9 @@
     RepositoryWithoutValidBotError,
 )
 from helpers.labels import get_labels_per_session
+from helpers.parallel import ParallelProcessing
 from helpers.telemetry import MetricContext
-from rollouts import (
-    CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER,
-    PARALLEL_UPLOAD_PROCESSING_BY_REPO,
-)
+from rollouts import CARRYFORWARD_BASE_SEARCH_RANGE_BY_OWNER
 from services.archive import ArchiveService
 from services.redis import (
     PARALLEL_UPLOAD_PROCESSING_SESSION_COUNTER_TTL,
@@ -61,9 +59,7 @@
     RAW_UPLOAD_RAW_REPORT_COUNT,
     RAW_UPLOAD_SIZE,
 )
-from services.report.raw_upload_processor import (
-    process_raw_upload,
-)
+from services.report.raw_upload_processor import process_raw_upload
 from services.repository import get_repo_provider_service
 from services.yaml.reader import get_paths_from_flags, read_yaml_field

@@ -207,7 +203,9 @@ def has_initialized_report(self, commit: Commit) -> bool:
 
     @sentry_sdk.trace
     def initialize_and_save_report(
-        self, commit: Commit, report_code: str = None
+        self,
+        commit: Commit,
+        report_code: str = None,
     ) -> CommitReport:
         """
         Initializes the commit report
@@ -287,26 +285,28 @@ def initialize_and_save_report(
             # This means there is a report to carryforward
             self.save_full_report(commit, report, report_code)
 
+            parallel_processing = ParallelProcessing.initial(
+                commit.repository.repoid
+            )
             # Behind parallel processing flag, save the CFF report to GCS so the parallel variant of
             # finisher can build off of it later. Makes the assumption that the CFFs occupy the first
             # j to i session ids where i is the max id of the CFFs and j is some integer less than i.
-            if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
-                identifier=commit.repository.repoid
-            ):
+            if parallel_processing.run_experiment:
                 self.save_parallel_report_to_archive(commit, report, report_code)
-                highest_session_id = max(
-                    report.sessions.keys()
-                )  # the largest id among the CFFs
-                get_redis_connection().incrby(
-                    name=get_parallel_upload_processing_session_counter_redis_key(
-                        commit.repository.repoid, commit.commitid
-                    ),
+                # the largest id among the CFFs:
+                highest_session_id = max(report.sessions.keys())
+                redis = get_redis_connection()
+                redis_key = get_parallel_upload_processing_session_counter_redis_key(
+                    commit.repository.repoid, commit.commitid
+                )
+                redis.incrby(
+                    name=redis_key,
                     amount=highest_session_id + 1,
                 )
-                get_redis_connection().expire(
-                    name=get_parallel_upload_processing_session_counter_redis_key(
-                        commit.repository.repoid, commit.commitid
-                    ),
+                redis.expire(
+                    name=redis_key,
                     time=PARALLEL_UPLOAD_PROCESSING_SESSION_COUNTER_TTL,
                 )

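The pre-seeding above only pays off on the consuming side; a hypothetical allocator (not part of this diff) that hands parallel uploads their session ids after the carried-forward ones might look like:

```python
def claim_session_id(redis, repoid: int, commitid: str) -> int:
    # Hypothetical sketch: atomically claim the next free session id.
    # The counter was pre-seeded with `highest_session_id + 1` above, so the
    # first claim lands just past the carried-forward sessions.
    key = get_parallel_upload_processing_session_counter_redis_key(repoid, commitid)
    return redis.incrby(name=key, amount=1) - 1
```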
@@ -840,7 +840,7 @@ def create_new_report_for_commit(self, commit: Commit) -> Report:
 
     @sentry_sdk.trace
     def parse_raw_report_from_storage(
-        self, repo: Repository, upload: Upload, is_parallel=False
+        self, repo: Repository, upload: Upload, parallel_processing: ParallelProcessing
     ) -> ParsedRawReport:
         """Pulls the raw uploaded report from storage and parses it so it's
         easier to access different parts of the raw upload.
@@ -851,23 +851,19 @@ def parse_raw_report_from_storage(
         archive_service = self.get_archive_service(repo)
         archive_url = upload.storage_path
 
-        # TODO: For the parallel experiment, can remove once finished
         log.info(
             "Parsing the raw report from storage",
             extra=dict(
                 commit=upload.report.commit_id,
                 repoid=repo.repoid,
                 archive_url=archive_url,
-                is_parallel=is_parallel,
             ),
         )
 
         # For the parallel upload verification experiment, we need to make a copy of the raw uploaded reports
         # so that the parallel pipeline can use those to parse. The serial pipeline rewrites the raw uploaded
         # reports to a human readable version that doesn't include file fixes, so that's why copying is necessary.
-        if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
-            identifier=repo.repoid, default=False
-        ):
+        if parallel_processing.run_experiment:
             parallel_url = archive_url.removesuffix(".txt") + "_PARALLEL.txt"
             log.info(
                 "In the parallel experiment for parsing raw report in storage",
@@ -878,7 +874,7 @@ def parse_raw_report_from_storage(
                 archive_url=archive_url,
             ),
         )
-        if not is_parallel:
+        if parallel_processing.is_experiment_serial:
             archive_file = archive_service.read_file(archive_url)
             archive_service.write_file(parallel_url, archive_file)
             log.info(
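
Condensed, the copy logic this hunk is part of (the read-back branch for `is_experiment_parallel` is not visible in this hunk and is sketched from the commit message):

```python
# Serial experiment tasks stash the pristine raw upload aside before the
# serial pipeline rewrites it; parallel experiment tasks read that copy back.
parallel_url = archive_url.removesuffix(".txt") + "_PARALLEL.txt"
if parallel_processing.is_experiment_serial:
    archive_file = archive_service.read_file(archive_url)
    archive_service.write_file(parallel_url, archive_file)
elif parallel_processing.is_experiment_parallel:
    archive_file = archive_service.read_file(parallel_url)
```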
@@ -929,7 +925,7 @@ def build_report_from_raw_content(
         report: Report,
         raw_report_info: RawReportInfo,
         upload: Upload,
-        parallel_idx=None,
+        parallel_processing: ParallelProcessing,
     ) -> ProcessingResult:
         """
         Processes an upload on top of an existing report `master` and returns
@@ -965,7 +961,7 @@ def build_report_from_raw_content(
 
         try:
             raw_report = self.parse_raw_report_from_storage(
-                commit.repository, upload, is_parallel=parallel_idx is not None
+                commit.repository, upload, parallel_processing
             )
             raw_report_info.raw_report = raw_report
         except FileNotInStorageError:
@@ -977,7 +973,7 @@ def build_report_from_raw_content(
                     reportid=reportid,
                     commit_yaml=self.current_yaml.to_dict(),
                     archive_url=archive_url,
-                    in_parallel=parallel_idx is not None,
+                    parallel_processing=parallel_processing,
                 ),
             )
             result.error = ProcessingError(
@@ -997,12 +993,17 @@ def build_report_from_raw_content(
             flags,
             session,
             upload=upload,
-            parallel_idx=parallel_idx,
+            parallel_idx=parallel_processing.parallel_idx,
         )
         result.report = process_result.report
         log.info(
             "Successfully processed report"
-            + (" (in parallel)" if parallel_idx is not None else ""),
+            + (
+                " (in parallel)"
+                if parallel_processing.is_experiment_parallel
+                or parallel_processing.is_fully_parallel
+                else ""
+            ),
             extra=dict(
                 session=session.id,
                 ci=f"{session.provider}:{session.build}:{session.job}",
@@ -1049,13 +1050,6 @@ def update_upload_with_processing_result(
         db_session = upload_obj.get_db_session()
         session = processing_result.session
         if processing_result.error is None:
-            # this should be enabled for the actual rollout of parallel upload processing.
-            # if PARALLEL_UPLOAD_PROCESSING_BY_REPO.check_value(
-            #     "this should be the repo id"
-            # ):
-            #     upload_obj.state_id = UploadState.PARALLEL_PROCESSED.db_id
-            #     upload_obj.state = "parallel_processed"
-            # else:
             upload_obj.state_id = UploadState.PROCESSED.db_id
             upload_obj.state = "processed"
             upload_obj.order_number = session.id
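With this change, call sites thread a single `ParallelProcessing` value through instead of the loose `is_parallel`/`parallel_idx` arguments; a sketch of the new call shape (the surrounding task code and any parameters not visible in this diff are elided):

```python
parallel_processing = ParallelProcessing.from_task_args(repoid, **task_kwargs)
result = report_service.build_report_from_raw_content(
    report=report,
    raw_report_info=raw_report_info,
    upload=upload,
    parallel_processing=parallel_processing,
)
```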
21 changes: 19 additions & 2 deletions tasks/tests/integration/test_upload_e2e.py
@@ -13,7 +13,10 @@
 from database.models.core import Commit, CompareCommit, Repository
 from database.tests.factories import CommitFactory, RepositoryFactory
 from database.tests.factories.core import PullFactory
-from rollouts import PARALLEL_UPLOAD_PROCESSING_BY_REPO
+from rollouts import (
+    FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO,
+    PARALLEL_UPLOAD_PROCESSING_BY_REPO,
+)
 from services.archive import ArchiveService
 from services.redis import get_redis_connection
 from services.report import ReportService
@@ -114,9 +117,18 @@ def setup_mock_get_compare(
 
 @pytest.mark.integration
 @pytest.mark.django_db()
-@pytest.mark.parametrize("do_parallel_processing", [False, True])
+@pytest.mark.parametrize(
+    "do_fully_parallel_processing,do_parallel_processing",
+    [
+        (False, False),
+        (False, True),
+        (True, True),
+    ],
+    ids=["fully synchronous", "parallel experiment", "fully parallel"],
+)
 def test_full_upload(
     dbsession: Session,
+    do_fully_parallel_processing: bool,
     do_parallel_processing: bool,
     mocker,
     mock_repo_provider,
@@ -146,6 +158,11 @@ def test_full_upload(
         }
     )
     # use parallel processing:
+    mocker.patch.object(
+        FULLY_PARALLEL_UPLOAD_PROCESSING_BY_REPO,
+        "check_value",
+        return_value=do_fully_parallel_processing,
+    )
     mocker.patch.object(
         PARALLEL_UPLOAD_PROCESSING_BY_REPO,
         "check_value",