-
Notifications
You must be signed in to change notification settings - Fork 655
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PERF-#4494: Get partition widths/lengths in parallel instead of serially #4683
base: master
Are you sure you want to change the base?
Changes from all commits
cb4f35c
7183383
5e16ccf
80fa12c
3320f27
be0a146
09df9a5
af21d50
b7c3471
5729f18
3da8073
2d4a8d3
b10f6f5
490778c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -382,6 +382,20 @@ | |
""" | ||
if self._length_cache is None: | ||
if self.axis == 0: | ||
caches = [ | ||
Check warning on line 385 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L385
|
||
obj.build_length_cache() | ||
for obj in self.list_of_partitions_to_combine | ||
] | ||
new_lengths = DaskWrapper.materialize( | ||
Check warning on line 389 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L389
|
||
[cache for cache in caches if isinstance(cache, Future)] | ||
) | ||
dask_idx = 0 | ||
for i, cache in enumerate(caches): | ||
if isinstance(cache, Future): | ||
self.list_of_partitions_to_combine[i].set_length_cache( | ||
Check warning on line 395 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L392-L395
|
||
new_lengths[dask_idx] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this just be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, since |
||
) | ||
dask_idx += 1 | ||
Check warning on line 398 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L398
|
||
self._length_cache = sum( | ||
obj.length() for obj in self.list_of_block_partitions | ||
) | ||
|
@@ -402,6 +416,20 @@ | |
""" | ||
if self._width_cache is None: | ||
if self.axis == 1: | ||
caches = [ | ||
Check warning on line 419 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L419
|
||
obj.build_width_cache() | ||
for obj in self.list_of_partitions_to_combine | ||
] | ||
new_widths = DaskWrapper.materialize( | ||
Check warning on line 423 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L423
|
||
[cache for cache in caches if isinstance(cache, Future)] | ||
) | ||
dask_idx = 0 | ||
for i, cache in enumerate(caches): | ||
if isinstance(cache, Future): | ||
self.list_of_partitions_to_combine[i].set_width_cache( | ||
Check warning on line 429 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L426-L429
|
||
new_widths[dask_idx] | ||
) | ||
dask_idx += 1 | ||
Check warning on line 432 in modin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py Codecov / codecov/patchmodin/core/execution/dask/implementations/pandas_on_dask/partitioning/virtual_partition.py#L432
|
||
self._width_cache = sum( | ||
obj.width() for obj in self.list_of_block_partitions | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to compute dimensions here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The
length
andwidth
values of each partition are accessed in the localcompute_part_size
, defined immediately below. The doublefor
loop structure wherecompute_part_size
is called makes it hard to parallelize the computation of these dimensions, so I thought it would be simplest to precompute the relevant dimensions before the loop.