@@ -392,9 +392,10 @@ def infer_types(
         f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')  # noqa
 
     nr_procs = get_nr_procs(df=sample_df)
-    if data.size > mp_cutoff and nr_procs > 1:
-        log.info(f'Using {nr_procs} processes to deduct types.')
-        pool = mp.Pool(processes=nr_procs)
+    pool_size = min(nr_procs, len(sample_df.columns.values))
+    if data.size > mp_cutoff and pool_size > 1:
+        log.info(f'Using {pool_size} processes to deduct types.')
+        pool = mp.Pool(processes=pool_size)
         # column-wise parallelization    # TODO: evaluate switching to row-wise split instead
         answer_arr = pool.starmap(get_column_data_type, [
             (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
@@ -422,8 +423,8 @@ def infer_types(
             'dtype_dist': data_dtype_dist
         }
 
-    if data.size > mp_cutoff and nr_procs > 1:
-        pool = mp.Pool(processes=nr_procs)
+    if data.size > mp_cutoff and pool_size > 1:
+        pool = mp.Pool(processes=pool_size)
         answer_arr = pool.map(get_identifier_description_mp, [
             (data[x], x, type_information.dtypes[x])
             for x in sample_df.columns
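
For context, a minimal, self-contained sketch of the pattern this diff applies: because the work is split column-wise, the multiprocessing pool is capped at the number of columns so no worker process sits idle. The toy DataFrame and the count_non_null worker below are illustrative stand-ins, not part of the real infer_types code or its get_column_data_type helper.

# Sketch of capping a column-wise worker pool at the column count.
import multiprocessing as mp

import pandas as pd


def count_non_null(series: pd.Series, col_name: str):
    # Stand-in for a per-column worker such as get_column_data_type.
    return col_name, int(series.notna().sum())


if __name__ == '__main__':
    df = pd.DataFrame({'a': [1, 2, None], 'b': ['x', None, 'z']})

    nr_procs = mp.cpu_count()
    # Never spawn more processes than there are columns to analyze.
    pool_size = min(nr_procs, len(df.columns.values))

    with mp.Pool(processes=pool_size) as pool:
        # Column-wise parallelization, mirroring the pool.starmap call in the diff.
        results = pool.starmap(count_non_null, [(df[c], c) for c in df.columns])

    print(dict(results))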