Commit 3e93e96 (1 parent: bf30f50)
Showing 33 changed files with 30,937 additions and 0 deletions.
...Python Tutorial Preliminary Skills and Concepts/01 - Machine Learning for Beginners.ipynb: 5,378 additions & 0 deletions (large diff not rendered)
labs/01- Python Tutorial Preliminary Skills and Concepts/01 - Python for Beginners.ipynb: 4,075 additions & 0 deletions (large diff not rendered)
labs/02 - Linear Models - Regression and Classification Models/02 - Linear Models.ipynb: 739 additions & 0 deletions (large diff not rendered)
labs/03 - Probability and Information Theory/03 - PDF CDF.ipynb: 211 additions & 0 deletions (large diff not rendered)
labs/04 - Bayesian Models/04 - Probability and Bayesian Theory.ipynb: 1,532 additions & 0 deletions (large diff not rendered)
...- Ensemble Models and Kernel Based Models/05 - SVM Decision Trees and Random Forest.ipynb: 4,836 additions & 0 deletions (large diff not rendered)
labs/06 - Neural Networks/06 - Neural Networks.ipynb: 3,531 additions & 0 deletions (large diff not rendered)
Several binary or non-displayable files are not shown.
@@ -0,0 +1,320 @@
import ast
import os
import typing

import joblib
import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from torchvision.datasets.utils import download_and_extract_archive

# wfdb is an optional dependency, only needed for the PTB_XL dataset
try:
    import wfdb

    wfdb_import_error = False
except ImportError:
    wfdb_import_error = True

class PTB_XL(torch.utils.data.Dataset):
    def __init__(
        self,
        data_path: str = "./",
        train: bool = True,
        sampling_rate: typing.Literal[100, 500] = 100,
        binary: bool = False,
        subset: bool = False,
    ):
        """
        ECG data, as described here: https://physionet.org/content/ptb-xl/1.0.2/.
        When :code:`binary=True`, a positive class indicates that
        the ECG is abnormal.

        Examples
        ---------
        .. code-block::

            >>> dataset = PTB_XL(
            ...     data_path='../../data/',
            ...     train=True,
            ...     sampling_rate=500,
            ...     )

        Arguments
        ---------
        - data_path: str, optional:
            The path where the data is saved
            or will be saved.
            Defaults to :code:`'./'`.

        - train: bool, optional:
            Whether to load the training or testing set.
            Defaults to :code:`True`.

        - sampling_rate: typing.Literal[100, 500], optional:
            The sampling rate. This should be
            one of :code:`[100, 500]`.
            Defaults to :code:`100`.

        - binary: bool, optional:
            Whether to return a single label indicating
            whether the ECG is normal or abnormal, making this
            a binary classification problem.
            Defaults to :code:`False`.

        - subset: bool, optional:
            If :code:`True`, only the first 1000 items
            of the training and test set will be returned.
            Defaults to :code:`False`.

        """

        if wfdb_import_error:
            raise ImportError(
                "Please install wfdb before using this dataset. Use pip install wfdb."
            )

        assert sampling_rate in [
            100,
            500,
        ], "Please choose sampling_rate from [100, 500]"
        assert type(train) == bool, "Please use train = True or False"

        self.data_path = data_path
        self.download()
        self.data_path = os.path.join(
            self.data_path,
            "ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2/",
        )

        self.train = train
        self.sampling_rate = sampling_rate
        self.binary = binary
        self.meta_data = pd.read_csv(self.data_path + "ptbxl_database.csv")
        self.meta_data["scp_codes"] = self.meta_data["scp_codes"].apply(
            lambda x: ast.literal_eval(x)
        )
        self.aggregate_diagnostic()  # create diagnostic columns
        self.feature_names = [
            "I",
            "II",
            "III",
            "aVL",
            "aVR",
            "aVF",
            "V1",
            "V2",
            "V3",
            "V4",
            "V5",
            "V6",
        ]

        # stratified fold 10 is held out as the test set
        if self.train:
            self.meta_data = self.meta_data.query("strat_fold != 10")
        else:
            self.meta_data = self.meta_data.query("strat_fold == 10")
        if subset:
            self.meta_data = self.meta_data.iloc[:1000]

        self.targets = self.meta_data[["NORM", "CD", "HYP", "MI", "STTC"]].values
        if binary:
            # positive class = abnormal ECG (i.e. not NORM)
            self.targets = 1 - self.targets[:, 0]

        return

    def _check_exists(self):
        folder = os.path.join(
            self.data_path,
            "ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.2",
        )
        return os.path.exists(folder)

    def download(self):
        if self._check_exists():
            print("Files already downloaded.")
            return

        download_and_extract_archive(
            url="https://physionet.org/static"
            "/published-projects/ptb-xl/"
            "ptb-xl-a-large-publicly-available"
            "-electrocardiography-dataset-1.0.2.zip",
            download_root=self.data_path,
            extract_root=self.data_path,
            filename="ptbxl.zip",
            remove_finished=True,
        )

        return

    @staticmethod
    def single_diagnostic(y_dict, agg_df):
        # map each SCP code of a record to its diagnostic superclass
        tmp = []
        for key in y_dict.keys():
            if key in agg_df.index:
                tmp.append(agg_df.loc[key].diagnostic_class)
        return list(set(tmp))

    def aggregate_diagnostic(self):
        agg_df = pd.read_csv(self.data_path + "scp_statements.csv", index_col=0)
        agg_df = agg_df[agg_df.diagnostic == 1]
        self.meta_data["diagnostic_superclass"] = self.meta_data["scp_codes"].apply(
            self.single_diagnostic,
            agg_df=agg_df,
        )
        # one-hot encode the superclasses into separate columns
        mlb = MultiLabelBinarizer()
        self.meta_data = self.meta_data.join(
            pd.DataFrame(
                mlb.fit_transform(self.meta_data.pop("diagnostic_superclass")),
                columns=mlb.classes_,
                index=self.meta_data.index,
            )
        )
        return

    def __getitem__(self, index):
        data = self.meta_data.iloc[index]

        # load the waveform at the requested sampling rate
        if self.sampling_rate == 100:
            f = data["filename_lr"]
        elif self.sampling_rate == 500:
            f = data["filename_hr"]
        x = wfdb.rdsamp(self.data_path + f)
        x = torch.tensor(x[0]).transpose(0, 1).float()
        y = torch.tensor(
            data[["NORM", "CD", "HYP", "MI", "STTC"]].values.astype(np.int64)
        )
        if self.binary:
            y = 1 - y[0]

        return x, y

    def __len__(self):
        return len(self.meta_data)

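# A minimal usage sketch (assumptions: wfdb is installed and there is disk
# space for the PTB-XL archive; the data_path and batch size are illustrative):
#
#     from torch.utils.data import DataLoader
#
#     dataset = PTB_XL(data_path="./data/", train=True, sampling_rate=100, binary=True)
#     loader = DataLoader(dataset, batch_size=32, shuffle=True)
#     x, y = next(iter(loader))
#     # x: (32, 12, 1000), i.e. 12 leads sampled at 100 Hz for 10 s; y: (32,) labels
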
class MemoryDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        dataset: torch.utils.data.Dataset,
        now: bool = True,
        verbose: bool = True,
        n_jobs: int = 1,
    ):
        """
        This dataset wraps another dataset and
        loads all of its outputs into memory,
        so that they are accessed from RAM
        instead of storage. All attributes of
        the original dataset will still be available, except
        for :code:`._dataset` and :code:`._data_dict` if they
        were defined.
        The data can be loaded into memory either right
        away or the first time each item is accessed.

        Examples
        ---------
        .. code-block::

            >>> dataset = MemoryDataset(dataset, now=True)

        Arguments
        ---------
        - dataset: torch.utils.data.Dataset:
            The dataset to wrap and add to memory.

        - now: bool, optional:
            Whether to save the data to memory
            right away, or the first time the
            data is accessed. If :code:`True`, then
            this initialisation might take some time
            as it will need to load all of the data.
            Defaults to :code:`True`.

        - verbose: bool, optional:
            Whether to print progress
            as the data is being loaded into
            memory. This is ignored if :code:`now=False`.
            Defaults to :code:`True`.

        - n_jobs: int, optional:
            The number of parallel operations when loading
            the data to memory.
            Defaults to :code:`1`.

        """

        self._dataset = dataset
        self._data_dict = {}
        if now:
            pbar = tqdm.tqdm(
                total=len(dataset),
                desc="Loading into memory",
                disable=not verbose,
                smoothing=0,
            )

            def add_to_dict(index):
                # cache every item of this worker's index slice
                for i in index:
                    self._data_dict[i] = dataset[i]
                    pbar.update(1)
                pbar.refresh()
                return None

            # split the indices into n_jobs interleaved slices
            all_index = np.arange(len(dataset))
            index_list = [all_index[i::n_jobs] for i in range(n_jobs)]

            joblib.Parallel(
                n_jobs=n_jobs,
                backend="threading",
            )(joblib.delayed(add_to_dict)(index) for index in index_list)

            pbar.close()

        return

    def __getitem__(self, index):
        # serve from the cache if present, otherwise load and cache
        if index in self._data_dict:
            return self._data_dict[index]
        else:
            output = self._dataset[index]
            self._data_dict[index] = output
            return output

    def __len__(self):
        return len(self._dataset)

    # defined since __getattr__ causes pickling problems
    def __getstate__(self):
        return vars(self)

    # defined since __getattr__ causes pickling problems
    def __setstate__(self, state):
        vars(self).update(state)

    def __getattr__(self, name):
        # delegate unknown attributes to the wrapped dataset
        if hasattr(self._dataset, name):
            return getattr(self._dataset, name)
        else:
            raise AttributeError
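
# A minimal usage sketch (the wrapped PTB_XL instance and n_jobs value are
# illustrative; any map-style dataset works):
#
#     dataset = PTB_XL(data_path="./data/", train=True)
#     dataset_in_memory = MemoryDataset(dataset, now=True, verbose=True, n_jobs=4)
#     x, y = dataset_in_memory[0]  # served from the in-memory cache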
@@ -0,0 +1,35 @@
# probability decision surface for logistic regression on a binary classification dataset
import numpy as np
import matplotlib.pyplot as plt
import torch

def pytorch_decision_boundary(
    wrapped_model,
    X,
    grid_resolution=200,
    ax=None,
    cmap='RdBu',
    alpha=0.5,
    levels=100,
):

    if ax is None:
        ax = plt.gca()

    # define bounds of the domain
    x_min, x_max = X[:, 0].min().item() - 1, X[:, 0].max().item() + 1
    y_min, y_max = X[:, 1].min().item() - 1, X[:, 1].max().item() + 1
    # define the x and y scale
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, grid_resolution, endpoint=True),
        np.linspace(y_min, y_max, grid_resolution, endpoint=True),
    )
    # flatten the grid into a list of (x, y) points to predict on
    grid = np.c_[xx.ravel(), yy.ravel()]

    yhat = wrapped_model.predict(grid)
    # reshape the predictions back into a grid
    zz = yhat.reshape(xx.shape)

    # plot the grid of x, y and z values as a surface
    cs = ax.contourf(xx, yy, zz, cmap=cmap, alpha=alpha, levels=levels)
    # add a legend, called a color bar
    ax.figure.colorbar(cs, ax=ax)

    return ax
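
# A minimal usage sketch (TorchWrapper is a hypothetical stand-in for whatever
# wrapper the labs use; anything exposing .predict(numpy_grid) works here):
#
#     class TorchWrapper:
#         def __init__(self, model):
#             self.model = model
#
#         def predict(self, grid):
#             with torch.no_grad():
#                 logits = self.model(torch.tensor(grid).float())
#             return torch.sigmoid(logits).squeeze(-1).numpy()
#
#     X = torch.randn(100, 2)
#     model = torch.nn.Linear(2, 1)  # untrained stand-in for a fitted classifier
#     ax = pytorch_decision_boundary(TorchWrapper(model), X)
#     ax.scatter(X[:, 0], X[:, 1], s=10)
#     plt.show()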