Commit 7a2cb59 (parent: 486e720)

Add import_csv_pandas and import_csv_dask utility primitives

5 files changed: 62 additions, 2 deletions

.github/workflows/main.yml (1 addition, 1 deletion)

@@ -53,7 +53,7 @@ jobs:
           pip install "setuptools>=64" --upgrade
 
           # Install package in editable mode.
-          pip install --use-pep517 --prefer-binary --editable=.[test,develop]
+          pip install --use-pep517 --prefer-binary --editable=.[io,test,develop]
 
       - name: Run linter and software tests
         run: |

CHANGES.md (1 addition, 0 deletions)

@@ -4,6 +4,7 @@
 ## Unreleased
 
 - Add SQL runner utility primitives to `io.sql` namespace
+- Add `import_csv_pandas` and `import_csv_dask` utility primitives
 
 
 ## 2023/11/06 v0.0.2

cratedb_toolkit/util/database.py (55 additions, 0 deletions)

@@ -194,6 +194,61 @@ def ensure_repository_az(
         """
         self.run_sql(sql)
 
+    def import_csv_pandas(
+        self, filepath: t.Union[str, Path], tablename: str, index=False, chunksize=1000, if_exists="replace"
+    ):
+        """
+        Import CSV data using pandas.
+        """
+        import pandas as pd
+        from crate.client.sqlalchemy.support import insert_bulk
+
+        df = pd.read_csv(filepath)
+        with self.engine.connect() as connection:
+            return df.to_sql(
+                tablename, connection, index=index, chunksize=chunksize, if_exists=if_exists, method=insert_bulk
+            )
+
+    def import_csv_dask(
+        self,
+        filepath: t.Union[str, Path],
+        tablename: str,
+        index=False,
+        chunksize=1000,
+        if_exists="replace",
+        npartitions: int = None,
+        progress: bool = False,
+    ):
+        """
+        Import CSV data using Dask.
+        """
+        import dask.dataframe as dd
+        import pandas as pd
+        from crate.client.sqlalchemy.support import insert_bulk
+
+        # Set a few defaults.
+        # TODO: Use the number of CPU cores instead?
+        npartitions = npartitions or 4
+
+        if progress:
+            from dask.diagnostics import ProgressBar
+
+            pbar = ProgressBar()
+            pbar.register()
+
+        # Read the CSV file and split it into a partitioned Dask dataframe.
+        df = pd.read_csv(filepath)
+        ddf = dd.from_pandas(df, npartitions=npartitions)
+        return ddf.to_sql(
+            tablename,
+            uri=self.dburi,
+            index=index,
+            chunksize=chunksize,
+            if_exists=if_exists,
+            method=insert_bulk,
+            parallel=True,
+        )
+
 
 def sa_is_empty(thing):
     """

pyproject.toml (4 additions, 0 deletions)

@@ -102,6 +102,10 @@ develop = [
   "ruff==0.1.3",
   "validate-pyproject<0.16",
 ]
+io = [
+  "dask<=2023.10.1,>=2020",
+  "pandas<3,>=2",
+]
 release = [
   "build<2",
   "twine<5",

release/oci/Dockerfile (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ COPY . /src
 
 # Install package.
 RUN --mount=type=cache,id=pip,target=/root/.cache/pip \
-    pip install --use-pep517 --prefer-binary '/src'
+    pip install --use-pep517 --prefer-binary '/src[io]'
 
 # Uninstall Git again.
 RUN apt-get --yes remove --purge git && apt-get --yes autoremove
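
Because the image now installs '/src[io]', the optional pandas and Dask dependencies are baked into the OCI artifact. A possible local smoke test, using a placeholder tag and assuming a Python base image:

    docker build --tag=cratedb-toolkit-local --file=release/oci/Dockerfile .
    docker run --rm cratedb-toolkit-local python -c "import dask, pandas"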
