NOTE(review): line breaks were reconstructed from a whitespace-collapsed paste. Hunk line counts match the @@ headers, but the positions of blank context lines are a best guess - confirm against the repository before applying.
diff --git a/DemoDNA.py b/DemoDNA.py
index 8fc7202..813b14e 100644
--- a/DemoDNA.py
+++ b/DemoDNA.py
@@ -1,17 +1,76 @@
-from NFS import *
+from naive_feature_selection import *
 from numpy import genfromtxt
 from sklearn.svm import LinearSVC
 from sklearn import metrics
 from pandas import read_csv
+import numpy as np
 from sklearn.model_selection import train_test_split
+import os
 
+#%% data downloader
+
+
+import gzip
+import wget
+import os
+import tarfile
+import zipfile
+import shutil
+
+
+def extract_file(path, to_directory='./data/'):
+    """Extract the archive at `path` into `to_directory`.
+
+    Supports .zip, .tar.gz/.tgz and .tar.bz2/.tbz archives; raises ValueError
+    for any other extension. The target directory is created if missing.
+    """
+    # exist_ok=True replaces the stat-then-mkdir check and its bare except
+    os.makedirs(to_directory, exist_ok=True)
+    # Pick the archive opener from the file extension
+    if path.endswith('.zip'):
+        opener, mode = zipfile.ZipFile, 'r'
+    elif path.endswith(('.tar.gz', '.tgz')):
+        opener, mode = tarfile.open, 'r:gz'
+    elif path.endswith(('.tar.bz2', '.tbz')):
+        opener, mode = tarfile.open, 'r:bz2'
+    else:
+        raise ValueError("Could not extract `%s` as no appropriate extractor is found" % path)
+    # Extract straight into the target directory rather than chdir-ing into
+    # it, so the process working directory is never changed and a relative
+    # `path` keeps resolving correctly.
+    with opener(path, mode) as archive:
+        archive.extractall(to_directory)
+
+def download_file(url, out_directory = './data/'):
+    """Download the archive at `url` and extract it; return 0 on success, 1 on failure."""
+    try:
+        # wget.download saves into the current directory and returns the file name
+        file = wget.download(url)
+        extract_file(os.path.abspath(file), out_directory)
+        return 0
+    except Exception as e:
+        # Best-effort helper: report the failure and signal it via the return code
+        print(e)
+        return 1
+
+
+
 #%% Test on UCI gene expression cancer RNA-Seq Data Set
 print("Importing RNA-Seq data...")
-X_data = genfromtxt('./data/data.csv', delimiter=',',skip_header=1)
+
+cwd = os.getcwd()
+data_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/data.csv'
+label_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/labels.csv'
+if not os.path.exists(data_path):
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00401/TCGA-PANCAN-HiSeq-801x20531.tar.gz'
+    download_file(url)
+
+
+X_data = genfromtxt(data_path, delimiter=',',skip_header=1)
 X_data=X_data[:,1:]
-X_labels=read_csv('./data/data.csv',nrows=1)
+X_labels=read_csv(data_path,nrows=1)
 X_labels=X_labels.columns[1:]
-labels=read_csv('./data/labels.csv',header=0)
+labels=read_csv(label_path,header=0)
 y_data=labels['Class']=='BRCA' # Check for BRCA labels
 y_data=1.0*y_data.to_numpy()
 
@@ -20,15 +79,18 @@
 
 # Test Naive Feature Selection, followed by l2 SVM
 k=100 # Target number of features
-nfs_res=nfs(X_train,y_train,k)
+nfs = NaiveFeatureSelection(k=k, alpha=1e-4)
+
+X_train_nfs=nfs.fit_transform(X_train,y_train)
 clfsv = LinearSVC(random_state=0, tol=1e-5)
-clfsv.fit(X_train[:,nfs_res['idx']], y_train)
-y_pred_NFS = clfsv.predict(X_test[:,nfs_res['idx']])
+clfsv.fit(X_train_nfs, y_train)
+X_test_nfs = nfs.transform(X_test)
+y_pred_NFS = clfsv.predict(X_test_nfs)
 score_nfs = metrics.accuracy_score(y_test==1, y_pred_NFS)
 print("NFS accuracy:\t%0.3f" % score_nfs)
 
 print('Positive genes:')
-print([X_labels[nfs_res['idx'][i]] for i in range(100) if clfsv.coef_[0][i]>=0])
+print([X_labels[j] for i, j in enumerate(nfs.get_support(indices=True)) if clfsv.coef_[0][i]>=0])
 
 
 #%% Plot sparsity / likelihood tradeoff
@@ -38,8 +100,9 @@
 resv=np.zeros(len(xvals))
 for i in range(0,len(xvals)):
     k=xvals[i]
-    nfs_res=nfs(X_train,y_train,k)
-    resv[i]=nfs_res['objv']
+    nfs = NaiveFeatureSelection(k=k, alpha=1e-4)
+    nfs.fit(X_train,y_train)
+    resv[i]=nfs.res_nfs['objv']
 
 plt.style.use('seaborn-white')
 plt.plot(xvals, resv,'-b',linewidth=4)
@@ -50,7 +113,7 @@
 
 #%% Plot curve
 
-import plotly.plotly as py
+import chart_studio.plotly as py
 import plotly.graph_objs as go
 from plotly.offline import iplot, init_notebook_mode
 import plotly.io as pio
diff --git a/naive_feature_selection/naive_feature_selection.py b/naive_feature_selection/naive_feature_selection.py
index 4830faf..1b6b5c8 100644
--- a/naive_feature_selection/naive_feature_selection.py
+++ b/naive_feature_selection/naive_feature_selection.py
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.base import BaseEstimator
-from sklearn.feature_selection.base import SelectorMixin
+from sklearn.feature_selection import SelectorMixin
 from sklearn.utils import check_X_y
 from sklearn.utils.validation import check_is_fitted
 from scipy.sparse import issparse
@@ -290,10 +290,14 @@ def fit(self, X, y):
         elif self._is_binary(X):
             res_nfs = self._binary_naive_feature_selection(X, y, self.k)
             mask[res_nfs["idx"]] = 1
+            self.res_nfs = res_nfs
             scores = np.square(res_nfs["w"])
+
         else:
             res_nfs = self._naive_feature_selection(X, y, self.k)
             mask[res_nfs["idx"]] = 1
+            # Expose the raw solver output (e.g. res_nfs['objv']) to callers
+            self.res_nfs = res_nfs
             scores = np.square(res_nfs["w"])
 
         self.mask_ = mask