Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# WIP

## Significant changes

* `DatapipeApp` becomes the main entry point for working with a pipeline
* `BatchTransform` metadata now tracks a status: "pending"/"clean"/"failed"
* `DatapipeApp.ingest_data` updates BatchTransform metadata on write

# 0.14.1

* Refactor metadata handling into `datapipe.meta` submodule
Expand Down
9 changes: 6 additions & 3 deletions datapipe/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,21 +182,24 @@ def setup_logging():
)
trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(cloud_trace_exporter)) # type: ignore

executor_instance: Executor
if executor == "SingleThreadExecutor":
ctx.obj["executor"] = SingleThreadExecutor()
executor_instance = SingleThreadExecutor()
elif executor == "RayExecutor":
import ray

from datapipe.executor.ray import RayExecutor

ray_ctx = ray.init()

ctx.obj["executor"] = RayExecutor()
executor_instance = RayExecutor()
else:
raise ValueError(f"Unknown executor: {executor}")

ctx.obj["executor"] = executor_instance

with tracer.start_as_current_span("init"):
ctx.obj["pipeline"] = load_pipeline(pipeline)
ctx.obj["pipeline"] = load_pipeline(pipeline).with_executor(executor_instance)


@cli.group()
Expand Down
66 changes: 65 additions & 1 deletion datapipe/compute.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import hashlib
import logging
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, Iterable, List, Literal, Optional, Sequence, Tuple

import pandas as pd
from opentelemetry import trace

from datapipe.datatable import DataStore, DataTable
Expand Down Expand Up @@ -192,6 +194,15 @@ def get_change_list_process_ids(
) -> Tuple[int, Iterable[IndexDF]]:
raise NotImplementedError()

def notify_change_list(
    self,
    ds: DataStore,
    change_list: ChangeList,
    now: Optional[float] = None,
    run_config: Optional[RunConfig] = None,
) -> None:
    """Hook invoked when tables this step consumes receive new data.

    The base implementation is a deliberate no-op; subclasses may
    override it to react to ``change_list`` without running the
    transform itself (presumably to update step metadata — e.g. the
    "pending" status mentioned in the changelog; confirm in subclass).

    Args:
        ds: Data store holding the affected tables.
        change_list: Per-table index of changed rows.
        now: Optional timestamp of the change; semantics defined by
            overriding implementations.
        run_config: Optional run configuration forwarded by the caller.
    """
    pass

def run_full(
self,
ds: DataStore,
Expand Down Expand Up @@ -237,13 +248,66 @@ class Pipeline:


class DatapipeApp:
def __init__(
    self,
    ds: DataStore,
    catalog: Catalog,
    pipeline: Pipeline,
    executor: Optional[Executor] = None,
):
    """Bind a pipeline to a data store and pre-build its compute steps.

    Args:
        ds: Data store backing all pipeline tables.
        catalog: Catalog describing the tables the pipeline uses.
        pipeline: Declarative pipeline definition.
        executor: Optional executor used when running steps; may also
            be attached later via ``with_executor``.
    """
    # Keep the raw inputs around for introspection.
    self.ds = ds
    self.catalog = catalog
    self.pipeline = pipeline
    self.executor = executor

    # Materialize the declarative pipeline into concrete compute steps.
    self.steps = build_compute(ds, catalog, pipeline)

def with_executor(self, executor: Executor) -> "DatapipeApp":
    """Attach ``executor`` to this app and return ``self``.

    Fluent setter so callers can chain it onto construction, e.g.
    ``load_pipeline(...).with_executor(executor)``.
    """
    self.executor = executor

    return self

def consumers(self, table_name: str) -> List[ComputeStep]:
    """Return every compute step that reads from ``table_name``."""
    matching: List[ComputeStep] = []
    for step in self.steps:
        input_names = {inp.dt.name for inp in step.input_dts}
        if table_name in input_names:
            matching.append(step)
    return matching

def producers(self, table_name: str) -> List[ComputeStep]:
    """Return every compute step that writes to ``table_name``."""
    matching: List[ComputeStep] = []
    for step in self.steps:
        if any(out.name == table_name for out in step.output_dts):
            matching.append(step)
    return matching

def ingest_data(
    self,
    table_name: str,
    data_df: pd.DataFrame,
    now: Optional[float] = None,
) -> ChangeList:
    """Store ``data_df`` into ``table_name`` and notify consuming steps.

    Writes the chunk into the target table, wraps the resulting changed
    index in a ``ChangeList``, and calls ``notify_change_list`` on every
    step that reads this table so step metadata can be updated without
    actually running the transforms.

    Args:
        table_name: Name of the table to write into.
        data_df: Rows to store.
        now: Optional explicit timestamp for the write; defaults to the
            current time.

    Returns:
        The ``ChangeList`` describing what changed in ``table_name``.
    """
    table = self.ds.get_table(table_name)

    # Compare against None rather than using `now or time.time()`:
    # an explicit now=0.0 (the Unix epoch) is falsy but valid and must
    # not be silently replaced with the current time.
    if now is None:
        now = time.time()

    changes = table.store_chunk(data_df, now=now)

    change_list = ChangeList({table_name: changes})

    # Give downstream steps a chance to update their own bookkeeping
    # for the changed rows (the transforms themselves are not run here).
    for step in self.consumers(table_name):
        step.notify_change_list(self.ds, change_list, now=now)

    return change_list

def ingest_and_process_data(
    self,
    table_name: str,
    data_df: pd.DataFrame,
    now: Optional[float] = None,
) -> None:
    """Ingest ``data_df`` into ``table_name`` and run the affected steps.

    Convenience wrapper: stores the chunk via ``ingest_data`` and then
    immediately processes the resulting change list through the
    pipeline's compute steps using the app's executor.

    Args:
        table_name: Name of the table to write into.
        data_df: Rows to store.
        now: Optional explicit timestamp for the write, forwarded to
            ``ingest_data`` (new keyword; defaults preserve the previous
            behavior of using the current time).
    """
    cl = self.ingest_data(table_name, data_df, now=now)

    run_steps_changelist(
        self.ds,
        self.steps,
        cl,
        executor=self.executor,
    )


def build_compute(
ds: DataStore, catalog: Catalog, pipeline: Pipeline
Expand Down
Loading
Loading