From 4171dfa00cf7bfe166fd30072ad8ec4c8cde6967 Mon Sep 17 00:00:00 2001 From: David Wood Date: Mon, 2 Dec 2024 12:12:41 -0500 Subject: [PATCH] Fix for inability to read some parquet files (issue #816) (#817) * add polars to try and read some troublesome parquet files to arrow tables Signed-off-by: David Wood * fix bug in convert_binary_to_arrow() by returning table from polars Signed-off-by: David Wood * update convert_binary_to_arrow() by catching exceptions from polars Signed-off-by: David Wood * change filter's duckdb setting to allow large buffers on arrow tables Signed-off-by: David Wood * turn off changes to filter for now Signed-off-by: David Wood * add polars to core library Signed-off-by: David Wood * add comment to say why we're adding polars for reading some parquet files Signed-off-by: David Wood * pin core lib polars>=1.16.0 Signed-off-by: David Wood * change failure on polars read from warning to error Signed-off-by: David Wood * remove comments on duckdb settings for multimodal in FilterTransform.init(). 
Signed-off-by: David Wood * downgrade polars to >=1.9.0 Signed-off-by: David Wood --------- Signed-off-by: David Wood --- data-processing-lib/python/requirements.txt | 1 + .../data_processing/utils/transform_utils.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/data-processing-lib/python/requirements.txt b/data-processing-lib/python/requirements.txt index 7b363f2b56..318d715d53 100644 --- a/data-processing-lib/python/requirements.txt +++ b/data-processing-lib/python/requirements.txt @@ -4,3 +4,4 @@ argparse mmh3 psutil + polars>=1.9.0 diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py index e2d37581c0..ccb7f3fe83 100644 --- a/data-processing-lib/python/src/data_processing/utils/transform_utils.py +++ b/data-processing-lib/python/src/data_processing/utils/transform_utils.py @@ -11,6 +11,7 @@ ################################################################################ import hashlib +import io import os import string import sys @@ -144,8 +145,21 @@ def convert_binary_to_arrow(data: bytes, schema: pa.schema = None) -> pa.Table: table = pq.read_table(reader, schema=schema) return table except Exception as e: - logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it") - return None + logger.warning(f"Could not convert bytes to pyarrow: {e}") + + # We have seen this exception before when using pyarrow, but polars does not throw it. + # "Nested data conversions not implemented for chunked array outputs" + # See issue 816 https://github.com/IBM/data-prep-kit/issues/816. + logger.info(f"Attempting read of pyarrow Table using polars") + try: + import polars + + df = polars.read_parquet(io.BytesIO(data)) + table = df.to_arrow() + except Exception as e: + logger.error(f"Could not convert bytes to pyarrow using polars: {e}. 
Skipping.") + table = None + return table @staticmethod def convert_arrow_to_binary(table: pa.Table) -> bytes: