Commit 8f382b0

Merge branch 'master' of https://github.com/modin-project/modin into fix-testsw

anmyachev committed Apr 5, 2024
2 parents b05cc60 + 0dfd88d
Showing 103 changed files with 4,452 additions and 1,430 deletions.
4 changes: 4 additions & 0 deletions .github/actions/mamba-env/action.yml
@@ -42,3 +42,7 @@ runs:
# we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
# for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
use-only-tar-bz2: false
- shell: bash -l {0}
run: |
conda run -n ${{ inputs.activate-environment }} pip install .
conda list -n ${{ inputs.activate-environment }}
2 changes: 0 additions & 2 deletions .github/actions/run-core-tests/group_2/action.yml
@@ -20,5 +20,3 @@ runs:
modin/pandas/test/dataframe/test_pickle.py
echo "::endgroup::"
shell: bash -l {0}
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
shell: bash -l {0}
7 changes: 6 additions & 1 deletion .github/actions/run-core-tests/group_3/action.yml
@@ -18,7 +18,12 @@ runs:
echo "::endgroup::"
shell: bash -l {0}
- run: |
echo "::group::Running experimental groupby tests (group 3)..."
echo "::group::Running range-partitioning tests (group 3)..."
MODIN_RANGE_PARTITIONING_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_series.py -k "test_unique or test_nunique or drop_duplicates or test_resample"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_general.py -k "test_unique"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_map_metadata.py -k "drop_duplicates"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_join_sort.py -k "merge"
MODIN_RANGE_PARTITIONING=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_default.py -k "resample"
echo "::endgroup::"
shell: bash -l {0}
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
@@ -188,7 +188,6 @@ jobs:
- run: python -m pytest modin/pandas/test/dataframe/test_binary.py
- run: python -m pytest modin/pandas/test/dataframe/test_reduce.py
- run: python -m pytest modin/pandas/test/dataframe/test_join_sort.py
- run: MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
- run: python -m pytest modin/pandas/test/test_general.py
- run: python -m pytest modin/pandas/test/dataframe/test_indexing.py
- run: python -m pytest modin/pandas/test/test_series.py
1 change: 0 additions & 1 deletion .github/workflows/push-to-master.yml
@@ -46,7 +46,6 @@ jobs:
python -m pytest modin/pandas/test/dataframe/test_indexing.py
python -m pytest modin/pandas/test/dataframe/test_iter.py
python -m pytest modin/pandas/test/dataframe/test_join_sort.py
MODIN_RANGE_PARTITIONING=1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py -k "merge"
python -m pytest modin/pandas/test/dataframe/test_map_metadata.py
python -m pytest modin/pandas/test/dataframe/test_reduce.py
python -m pytest modin/pandas/test/dataframe/test_udf.py
2 changes: 2 additions & 0 deletions README.md
@@ -24,6 +24,8 @@ Modin is a drop-in replacement for [pandas](https://github.com/pandas-dev/pandas)
single-threaded, Modin lets you instantly speed up your workflows by scaling pandas so it uses all of your
cores. Modin works especially well on larger datasets, where pandas becomes painfully slow or runs
[out of memory](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html).
Also, Modin comes with [additional APIs](https://modin.readthedocs.io/en/latest/usage_guide/advanced_usage/index.html#additional-apis)
that improve the user experience.

By simply replacing the import statement, Modin offers users effortless speed and scale for their pandas workflows:

58 changes: 13 additions & 45 deletions asv_bench/benchmarks/utils/common.py
@@ -114,11 +114,7 @@ def gen_nan_data(nrows: int, ncols: int) -> dict:

def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
"""
Generate int data with caching.
The generated data are saved in the dictionary and on a subsequent call,
if the keys match, saved data will be returned. Therefore, we need
to carefully monitor the changing of saved data and make its copy if needed.
Generate int data.
Parameters
----------
@@ -136,30 +132,16 @@ def gen_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
dict
Number of keys - `ncols`, each of them stores np.ndarray of `nrows` length.
"""
cache_key = ("int", nrows, ncols, rand_low, rand_high)
if cache_key in data_cache:
return data_cache[cache_key]

logging.info(
"Generating int data {} rows and {} columns [{}-{}]".format(
nrows, ncols, rand_low, rand_high
)
)
data = {
"col{}".format(i): np.random.randint(rand_low, rand_high, size=(nrows))
for i in range(ncols)
}
data_cache[cache_key] = weakdict(data)
return data


def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
"""
Generate int data and string data with caching.
The generated data are saved in the dictionary and on a subsequent call,
if the keys match, saved data will be returned. Therefore, we need
to carefully monitor the changing of saved data and make its copy if needed.
Generate int data and string data.
Parameters
----------
@@ -178,30 +160,16 @@ def gen_str_int_data(nrows: int, ncols: int, rand_low: int, rand_high: int) -> dict:
Number of keys - `ncols`, each of them stores np.ndarray of `nrows` length.
One of the columns with string values.
"""
cache_key = ("str_int", nrows, ncols, rand_low, rand_high)
if cache_key in data_cache:
return data_cache[cache_key]

logging.info(
"Generating str_int data {} rows and {} columns [{}-{}]".format(
nrows, ncols, rand_low, rand_high
)
)
data = gen_int_data(nrows, ncols, rand_low, rand_high).copy()
# convert values in an arbitrary column to string type
key = list(data.keys())[0]
data[key] = [f"str_{x}" for x in data[key]]
data_cache[cache_key] = weakdict(data)
return data


def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
"""
Generate int data and string data "true" and "false" values with caching.
The generated data are saved in the dictionary and on a subsequent call,
if the keys match, saved data will be returned. Therefore, we need
to carefully monitor the changing of saved data and make its copy if needed.
Generate int data and string data "true" and "false" values.
Parameters
----------
@@ -221,15 +189,6 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
One half of the columns with integer values, another half - with "true" and
"false" string values.
"""
cache_key = ("true_false_int", nrows, ncols, rand_low, rand_high)
if cache_key in data_cache:
return data_cache[cache_key]

logging.info(
"Generating true_false_int data {} rows and {} columns [{}-{}]".format(
nrows, ncols, rand_low, rand_high
)
)
data = gen_int_data(nrows // 2, ncols // 2, rand_low, rand_high)

data_true_false = {
@@ -239,7 +198,6 @@ def gen_true_false_int_data(nrows, ncols, rand_low, rand_high):
for i in range(ncols - ncols // 2)
}
data.update(data_true_false)
data_cache[cache_key] = weakdict(data)
return data


@@ -289,10 +247,20 @@ def gen_data(
"str_int": gen_str_int_data,
"true_false_int": gen_true_false_int_data,
}
cache_key = (data_type, nrows, ncols, rand_low, rand_high)
if cache_key in data_cache:
return data_cache[cache_key]

logging.info(
"Generating {} data {} rows and {} columns [{}-{}]".format(
data_type, nrows, ncols, rand_low, rand_high
)
)
assert data_type in type_to_generator
data_generator = type_to_generator[data_type]

data = data_generator(nrows, ncols, rand_low, rand_high)
data_cache[cache_key] = weakdict(data)

return data

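To make the consolidated caching concrete, a hedged usage sketch follows; the import path and the positional signature of ``gen_data`` are assumptions inferred from the file location and the ``cache_key`` tuple above, not confirmed by this diff.

.. code-block:: python

    # Hypothetical usage; import path and signature are assumptions (see above).
    from benchmarks.utils.common import gen_data

    data = gen_data("int", 10_000, 10, 0, 100)   # generated, then stored in data_cache
    again = gen_data("int", 10_000, 10, 0, 100)  # same key -> served from data_cache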
4 changes: 2 additions & 2 deletions docs/development/contributing.rst
@@ -63,8 +63,8 @@ or ``--signoff`` to your usual ``git commit`` commands:

.. code-block:: bash
git commit --signoff
git commit -s
git commit --signoff -m "This is my commit message"
git commit -s -m "This is my commit message"
This will use your default git configuration which is found in .git/config. To change
this, you can use the following commands:
27 changes: 27 additions & 0 deletions docs/ecosystem.rst
@@ -45,5 +45,32 @@ where NumPy can be used and what libraries it powers.
numpy_arr = to_numpy(modin_df)
to_ray
------

You can refer to the `Ray Data`_ page for more details on
where Ray Dataset can be used and what libraries it powers.

.. code-block:: python
from modin.pandas.io import to_ray
ray_dataset = to_ray(modin_df)
to_dask
-------

You can refer to the `Dask DataFrame`_ page for more details on
where Dask DataFrame can be used and what libraries it powers.

.. code-block:: python
from modin.pandas.io import to_dask
dask_df = to_dask(modin_df)
.. _pandas ecosystem: https://pandas.pydata.org/community/ecosystem.html
.. _NumPy ecosystem: https://numpy.org
.. _Ray Data: https://docs.ray.io/en/latest/data/data.html
.. _Dask DataFrame: https://docs.dask.org/en/stable/dataframe.html

27 changes: 27 additions & 0 deletions docs/flow/modin/config.rst
@@ -56,3 +56,30 @@ API.
# Changing value of `NPartitions`
modin.config.NPartitions.put(16)
print(modin.config.NPartitions.get()) # prints '16'
One can also use config variables with a context manager in order to apply
a config value only to a certain part of the code:

.. code-block:: python
import modin.config as cfg
# Default value for this config is 'False'
print(cfg.RangePartitioning.get()) # False
# Set the config to 'True' inside of the context-manager
with cfg.context(RangePartitioning=True):
print(cfg.RangePartitioning.get()) # True
df.merge(...) # will use range-partitioning impl
# Once the context is over, the config gets back to its previous value
print(cfg.RangePartitioning.get()) # False
# You can also set multiple configs at once by passing them as keyword arguments to 'cfg.context'
print(cfg.AsyncReadMode.get()) # False
with cfg.context(RangePartitioning=True, AsyncReadMode=True):
print(cfg.RangePartitioning.get()) # True
print(cfg.AsyncReadMode.get()) # True
print(cfg.RangePartitioning.get()) # False
print(cfg.AsyncReadMode.get()) # False
2 changes: 2 additions & 0 deletions docs/flow/modin/experimental/pandas.rst
@@ -16,6 +16,8 @@ Experimental API Reference
.. autofunction:: read_parquet_glob
.. autofunction:: read_json_glob
.. autofunction:: read_xml_glob
.. automethod:: modin.pandas.DataFrame.modin::to_pandas
.. automethod:: modin.pandas.DataFrame.modin::to_ray
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_glob
.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob
.. automethod:: modin.pandas.DataFrame.modin::to_json_glob
27 changes: 27 additions & 0 deletions docs/flow/modin/experimental/range_partitioning_groupby.rst
@@ -78,3 +78,30 @@ Range-partitioning Merge

It is recommended to use this implementation if the right dataframe in the merge is as big as
the left one; in this case, the range-partitioning implementation works faster and consumes less RAM.
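As a hedged illustration (the frames and column names below are invented; ``cfg.context`` and the ``RangePartitioning`` config are the ones shown in docs/flow/modin/config.rst in this same commit), one might enable the range-partitioning merge only around the call that benefits from it:

.. code-block:: python

    import modin.pandas as pd
    import modin.config as cfg

    # Two comparably sized frames -- the case where the range-partitioning
    # merge is recommended above.
    left = pd.DataFrame({"key": range(1_000_000), "x": range(1_000_000)})
    right = pd.DataFrame({"key": range(1_000_000), "y": range(1_000_000)})

    # Enable the range-partitioning implementation only for this block.
    with cfg.context(RangePartitioning=True):
        merged = left.merge(right, on="key")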

'.unique()' and '.drop_duplicates()'
""""""""""""""""""""""""""""""""""""

Range-partitioning implementation of '.unique()'/'.drop_duplicates()' works best when the input data size is big (more than
5_000_000 rows) and when the output size is also expected to be big (no more than 80% of the values are duplicates).

'.nunique()'
""""""""""""""""""""""""""""""""""""

.. note::

The range-partitioning approach is implemented only for 'pd.Series.nunique()' and 1-column dataframes.
For multi-column dataframes '.nunique()' can only use the full-axis reduce implementation.

Range-partitioning implementation of '.nunique()' works best when the input data size is big (more than
5_000_000 rows) and when the output size is also expected to be big (no more than 80% of the values are duplicates).
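A hedged sketch of using these operations under range partitioning (the data below is invented; ``cfg.context`` and ``RangePartitioning`` mirror docs/flow/modin/config.rst above, and the CI changes in this commit exercise the same paths via ``MODIN_RANGE_PARTITIONING=1``):

.. code-block:: python

    import modin.pandas as pd
    import modin.config as cfg

    # A big, mostly-unique input -- the case these implementations
    # are recommended for above.
    s = pd.Series(range(6_000_000))

    with cfg.context(RangePartitioning=True):
        uniques = s.unique()
        count = s.nunique()           # Series-only range-partitioning path (see note above)
        deduped = s.drop_duplicates()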

Resample
""""""""

.. note::

The range-partitioning approach doesn't support transform-like functions (like `.interpolate()`, `.ffill()`, `.bfill()`, ...).

It is recommended to use range-partitioning for resampling if you're dealing with a dataframe that has more than
5_000_000 rows and the output is also expected to be big (more than 500_000 rows).
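A hedged sketch under the same assumptions (invented data; an aggregation is used because transform-like functions are unsupported per the note above):

.. code-block:: python

    import modin.pandas as pd
    import modin.config as cfg

    df = pd.DataFrame(
        {"value": range(6_000_000)},
        index=pd.date_range("2024-01-01", periods=6_000_000, freq="s"),
    )

    with cfg.context(RangePartitioning=True):
        # Aggregations like .sum() are fine; .interpolate()/.ffill()/.bfill()
        # are transform-like and not supported by this implementation.
        hourly = df.resample("1h").sum()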
2 changes: 1 addition & 1 deletion docs/requirements-doc.txt
@@ -13,7 +13,7 @@ recommonmark
sphinx<6.0.0
sphinx-click
# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
ray[default]>=1.13.0,!=2.5.0
ray[default]>=2.1.0,!=2.5.0
# Override to latest version of modin-spreadsheet
git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
sphinxcontrib_plantuml
8 changes: 5 additions & 3 deletions docs/usage_guide/advanced_usage/index.rst
@@ -33,11 +33,13 @@ If you are familiar with a concrete execution engine, it is possible to initiali
Modin will automatically attach to it. Refer to :doc:`Modin engines </usage_guide/advanced_usage/modin_engines>` page
for more details.

Experimental APIs
-----------------
Additional APIs
---------------

Modin also supports these experimental APIs on top of pandas that are under active development.
Modin also supports these additional APIs on top of pandas to improve the user experience.

- :py:meth:`~modin.pandas.DataFrame.modin.to_pandas` -- convert Modin DataFrame/Series to pandas DataFrame/Series.
- :py:meth:`~modin.pandas.DataFrame.modin.to_ray` -- convert Modin DataFrame/Series to Ray Dataset.
- :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection
- :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file
2 changes: 1 addition & 1 deletion environment-dev.yml
@@ -13,7 +13,7 @@ dependencies:

# optional dependencies
# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
- ray-default>=1.13.0,!=2.5.0
- ray-default>=2.1.0,!=2.5.0
- pyarrow>=7.0.0
# workaround for https://github.com/conda/conda/issues/11744
- grpcio!=1.45.*