Skip to content

340 implement a function argument to Dataset.format that triggers removal of NA values in the returned DataFrames #404

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 28 additions & 50 deletions coderdata/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,41 +39,22 @@ class Split:

class Dataset:

# Lookup table keyed by data type name; each value is a tuple of field-name
# strings associated with that data type.
# NOTE(review): the names look like DataFrame column names (several of them
# are used as pivot index/columns/values elsewhere in this file) -- confirm.
data_format_params = dict(
    samples=(
        "improve_sample_id",
        "cancer_type",
        "model_type",
        "common_name",
        "other_id",
        "other_names",
        "id_source",
        "species",
    ),
    transcriptomics=("improve_sample_id", "entrez_id", "transcriptomics"),
    proteomics=("improve_sample_id", "entrez_id", "proteomics"),
    mutations=("improve_sample_id", "entrez_id", "mutation"),
    copy_number=("improve_sample_id", "entrez_id", "copy_number"),
    methylation=("improve_sample_id", "entrez_id", "methylation"),
    experiments=(
        "improve_sample_id",
        "improve_drug_id",
        "dose_response_value",
    ),
    drugs=("improve_drug_id", "chem_name", "isoSMILES"),
    genes=("entrez_id", "gene_symbol", "other_id"),
)

def __init__(
self,
name: str=None,
transcriptomics: pd.DataFrame=None,
proteomics: pd.DataFrame=None,
mutations: pd.DataFrame=None,
copy_number: pd.DataFrame=None,
samples: pd.DataFrame=None,
drugs: pd.DataFrame=None,
drug_descriptors: pd.DataFrame=None,
mirna: pd.DataFrame=None,
experiments: pd.DataFrame=None,
methylation: pd.DataFrame=None,
metabolomics: pd.DataFrame=None,
genes: pd.DataFrame=None,
combinations: pd.DataFrame=None,
name: Optional[str]=None,
transcriptomics: Optional[pd.DataFrame]=None,
proteomics: Optional[pd.DataFrame]=None,
mutations: Optional[pd.DataFrame]=None,
copy_number: Optional[pd.DataFrame]=None,
samples: Optional[pd.DataFrame]=None,
drugs: Optional[pd.DataFrame]=None,
drug_descriptors: Optional[pd.DataFrame]=None,
mirna: Optional[pd.DataFrame]=None,
experiments: Optional[pd.DataFrame]=None,
methylation: Optional[pd.DataFrame]=None,
metabolomics: Optional[pd.DataFrame]=None,
genes: Optional[pd.DataFrame]=None,
combinations: Optional[pd.DataFrame]=None,
):
"""
Load datasets of a specific type into predefined attributes of this class instance.
Expand Down Expand Up @@ -131,12 +112,6 @@ def __init__(
# getters / setters & deleters
# ----------------------------


# Read-only property: returns the instance's private `_data_format_params`
# attribute unchanged.
# NOTE(review): no assignment to `_data_format_params` is visible in this
# excerpt -- confirm where it is set before relying on this accessor.
@property
def data_format_params(self):
return self._data_format_params


# Read-only property: returns the instance's private `_name` attribute
# unchanged.
@property
def name(self):
return self._name
Expand Down Expand Up @@ -330,10 +305,10 @@ def format(
'experiments', 'combinations', 'drug_descriptor', 'drugs',
'genes', 'samples',
],
use_polars: bool=False,
remove_na: bool=False,
**kwargs: dict,
):
return format(self, data_type=data_type, use_polars=use_polars, **kwargs)
return format(self, data_type=data_type, remove_na=False, **kwargs)


def split_train_other(
Expand Down Expand Up @@ -526,6 +501,7 @@ def load(
dataset = pickle.load(file=file)
print("DONE", file=sys.stderr)
return dataset
raise FileNotFoundError("No suitable pickle file found.")



Expand All @@ -536,7 +512,7 @@ def format(
'experiments', 'combinations', 'drug_descriptor', 'drugs',
'genes', 'samples',
],
use_polars: bool=False,
remove_na: bool=False,
**kwargs: dict,
):

Expand Down Expand Up @@ -642,6 +618,8 @@ def format(
columns = 'dose_response_metric',
values = 'dose_response_value'
).reset_index().rename_axis(None, axis=1)
if remove_na:
ret.dropna(axis='index', inplace=True)
elif shape == 'matrix':
if len(metrics) > 1:
raise ValueError(
Expand All @@ -654,7 +632,6 @@ def format(
index='improve_drug_id',
columns='improve_sample_id'
)
return ret

elif data_type == "combinations":
raise NotImplementedError(
Expand Down Expand Up @@ -771,7 +748,7 @@ def split_train_test_validate(
train, other = _split_two_way(
data=data,
split_type=split_type,
ratio=[ratio[0], ratio[1] + ratio[2]],
ratio=(ratio[0], ratio[1] + ratio[2]),
stratify_by=stratify_by,
balance=balance,
random_state=random_state,
Expand All @@ -781,7 +758,7 @@ def split_train_test_validate(
test, val = _split_two_way(
data=other,
split_type=split_type,
ratio=[ratio[1], ratio[2]],
ratio=(ratio[1], ratio[2]),
stratify_by=stratify_by,
balance=balance,
random_state=random_state,
Expand Down Expand Up @@ -993,10 +970,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
return data_ret

def _balance_data(
data: pd.Dataframe,
data: pd.DataFrame,
random_state: Optional[Union[int,RandomState]]=None,
# oversample: bool=False,
) -> pd.Dataframe:
) -> pd.DataFrame:
tmp = deepcopy(data)
counts = tmp.value_counts('split_class')
ret_df = (
Expand All @@ -1012,7 +989,7 @@ def _create_classes(
metric: str,
num_classes: int=2,
quantiles: bool=True,
thresh: float=None,
thresh: Optional[float]=None,
) -> pd.DataFrame:
"""
Helper function that bins experiment data into a number of defined
Expand Down Expand Up @@ -1101,7 +1078,7 @@ def _split_two_way(
split_type: Literal[
'mixed-set', 'drug-blind', 'cancer-blind'
]='mixed-set',
ratio: tuple[int, int, int]=(8,2),
ratio: tuple[int, int]=(8,2),
balance: bool=False,
stratify_by: Optional[str]=None,
random_state: Optional[Union[int,RandomState]]=None,
Expand Down Expand Up @@ -1207,7 +1184,8 @@ def _split_two_way(
columns = 'dose_response_metric',
values = 'dose_response_value'
).reset_index()

if stratify_by is not None:
df_full.dropna(axis='index', subset=[stratify_by], inplace=True)
# Defining the split sizes.
train_size = float(ratio[0]) / sum(ratio)
test_val_size = float(ratio[1]) / sum(ratio)
Expand Down