Kaggle · calderjo · Apr 2, 2024
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -525,7 +525,6 @@ RUN pip install flashtext \
         pyemd \
         pyupset \
         pympler \
-        s3fs \
         featuretools \
         #-e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper \
         git+https://github.com/Kaggle/learntools \
@@ -561,8 +560,9 @@ RUN pip install pytorch-ignite \
         bqplot \
         earthengine-api \
         transformers \
-        # b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.*
-        datasets==2.1.0 \
+        datasets \
+        s3fs \
+        gcsfs \
         kaggle-environments \
         geopandas \
         "shapely<2" \

diff --git a/tests/test_hf_datasets.py b/tests/test_hf_datasets.py
@@ -1,7 +1,7 @@
 import unittest
 
-from datasets import Dataset
-
+import datasets
+import pandas as pd
 
 class TestHuggingFaceDatasets(unittest.TestCase):
 
@@ -10,7 +10,12 @@ def some_func(batch):
             batch['label'] = 'foo'
             return batch
 
-        df = Dataset.from_dict({'text': ['Kaggle rocks!']})
+        df = datasets.Dataset.from_dict({'text': ['Kaggle rocks!']})
         mapped_df = df.map(some_func)
 
-        self.assertEqual('foo', mapped_df[0]['label'])
+        self.assertEqual('foo', mapped_df[0]['label'])
+
+    def test_load_dataset(self):  # Smoke test: datasets.load_dataset("csv", ...) on a local CSV file yields data.
+        dataset = datasets.load_dataset("csv", data_files="/input/tests/data/train.csv")  # hard-coded fixture path; presumably provided by the test harness - TODO confirm
+        full_data = pd.DataFrame(dataset['train'])  # build a DataFrame from the entry stored under the 'train' key
+        self.assertFalse(full_data.empty)  # the test passes only when the resulting DataFrame is non-empty