Skip to content

Commit a2acea7

Browse files
authored
Merge pull request #47 from mindsdb/columns_pool
Run as many processes as columns count
2 parents 9d7a09d + 36f866e commit a2acea7

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

type_infer/infer.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -392,9 +392,10 @@ def infer_types(
392 392
f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
393 393

394 394
nr_procs = get_nr_procs(df=sample_df)
395-
if data.size > mp_cutoff and nr_procs > 1:
396-
log.info(f'Using {nr_procs} processes to deduct types.')
397-
pool = mp.Pool(processes=nr_procs)
395+
pool_size = min(nr_procs, len(sample_df.columns.values))
396+
if data.size > mp_cutoff and pool_size > 1:
397+
log.info(f'Using {pool_size} processes to deduct types.')
398+
pool = mp.Pool(processes=pool_size)
398 399
# column-wise parallelization # TODO: evaluate switching to row-wise split instead
399 400
answer_arr = pool.starmap(get_column_data_type, [
400 401
(sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
@@ -422,8 +423,8 @@ def infer_types(
422 423
'dtype_dist': data_dtype_dist
423 424
}
424 425

425-
if data.size > mp_cutoff and nr_procs > 1:
426-
pool = mp.Pool(processes=nr_procs)
426+
if data.size > mp_cutoff and pool_size > 1:
427+
pool = mp.Pool(processes=pool_size)
427 428
answer_arr = pool.map(get_identifier_description_mp, [
428 429
(data[x], x, type_information.dtypes[x])
429 430
for x in sample_df.columns

0 commit comments

Comments
 (0)