From 86537f261f9651e321ea11227c4cc6e98455a40d Mon Sep 17 00:00:00 2001 From: Jonathan Calderon Chavez Date: Tue, 2 Apr 2024 16:40:25 +0000 Subject: [PATCH 1/3] fix datasets --- Dockerfile.tmpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl index a7831a81..4f8a8c76 100644 --- a/Dockerfile.tmpl +++ b/Dockerfile.tmpl @@ -525,7 +525,6 @@ RUN pip install flashtext \ pyemd \ pyupset \ pympler \ - s3fs \ featuretools \ #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \ git+https://github.com/Kaggle/learntools \ @@ -561,8 +560,9 @@ RUN pip install pytorch-ignite \ bqplot \ earthengine-api \ transformers \ - # b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.* - datasets==2.1.0 \ + datasets \ + s3fs \ + gcsfs \ kaggle-environments \ geopandas \ "shapely<2" \ From 6fb8fa60396166a4740bb4322aa9023ff716354c Mon Sep 17 00:00:00 2001 From: Jonathan Calderon Chavez Date: Tue, 2 Apr 2024 17:37:51 +0000 Subject: [PATCH 2/3] add test --- tests/test_hf_datasets.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_hf_datasets.py b/tests/test_hf_datasets.py index 15a15ec9..46d66879 100644 --- a/tests/test_hf_datasets.py +++ b/tests/test_hf_datasets.py @@ -1,7 +1,8 @@ import unittest -from datasets import Dataset - +import datasets +import pandas as pd +import warnings class TestHuggingFaceDatasets(unittest.TestCase): @@ -10,7 +11,13 @@ def some_func(batch): batch['label'] = 'foo' return batch - df = Dataset.from_dict({'text': ['Kaggle rocks!']}) + df = datasets.Dataset.from_dict({'text': ['Kaggle rocks!']}) mapped_df = df.map(some_func) - self.assertEqual('foo', mapped_df[0]['label']) \ No newline at end of file + self.assertEqual('foo', mapped_df[0]['label']) + + def test_load_dataset(self): + warnings.simplefilter(action='ignore', category=FutureWarning) + dataset = datasets.load_dataset("csv", data_files="/input/tests/data/train.csv") + full_data = pd.DataFrame(dataset['train']) + self.assertFalse(full_data.empty) \ No newline at end of file From 79c3fd4ca44f233f84044fed2c17545227a115a5 Mon Sep 17 00:00:00 2001 From: Jonathan Calderon Chavez Date: Tue, 2 Apr 2024 17:39:51 +0000 Subject: [PATCH 3/3] add test pt2 --- tests/test_hf_datasets.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_hf_datasets.py b/tests/test_hf_datasets.py index 46d66879..048462a6 100644 --- a/tests/test_hf_datasets.py +++ b/tests/test_hf_datasets.py @@ -2,7 +2,6 @@ import datasets import pandas as pd -import warnings class TestHuggingFaceDatasets(unittest.TestCase): @@ -17,7 +16,6 @@ def some_func(batch): self.assertEqual('foo', mapped_df[0]['label']) def test_load_dataset(self): - warnings.simplefilter(action='ignore', category=FutureWarning) dataset = datasets.load_dataset("csv", data_files="/input/tests/data/train.csv") full_data = pd.DataFrame(dataset['train']) self.assertFalse(full_data.empty) \ No newline at end of file