aspremon · ThomasJanssoone · Dec 23, 2020 · Jan 17, 2021 · Jan 18, 2021
diff --git a/DemoDNA.py b/DemoDNA.py
@@ -1,17 +1,76 @@
-from NFS import *
+from naive_feature_selection import *
 from numpy import genfromtxt
 from sklearn.svm import LinearSVC
 from sklearn import metrics
 from pandas import read_csv
+import numpy as np
 from sklearn.model_selection import train_test_split
+import os
 
+#%% data downloader
+
+
+import gzip
+import wget
+import os
+import tarfile
+import zipfile
+import shutil
+
+
+def extract_file(path, to_directory='./data/'):
+    """Extract the archive at `path` into `to_directory`.
+
+    Supports .zip, .tar.gz/.tgz and .tar.bz2/.tbz; any other extension
+    raises ValueError. `to_directory` is created when it does not exist.
+    """
+    if path.endswith('.zip'):
+        opener, mode = zipfile.ZipFile, 'r'
+    elif path.endswith(('.tar.gz', '.tgz')):
+        opener, mode = tarfile.open, 'r:gz'
+    elif path.endswith(('.tar.bz2', '.tbz')):
+        opener, mode = tarfile.open, 'r:bz2'
+    else:
+        raise ValueError("Could not extract `%s` as no appropriate extractor is found" % path)
+    # exist_ok avoids a separate existence probe and also creates any
+    # missing parent directories.
+    os.makedirs(to_directory, exist_ok=True)
+    # Extract straight into the target rather than chdir-ing into it: the
+    # working directory is never changed and relative `path`s still resolve.
+    # NOTE(review): extractall trusts member paths; only use on trusted archives.
+    with opener(path, mode) as archive:
+        archive.extractall(to_directory)
+
+def download_file(url, out_directory = './data/'):
+    """Download `url` and extract it into `out_directory`; return 0 on success, 1 on failure."""
+    try:
+        # wget.download saves into the current directory and returns the file name.
+        archive = wget.download(url)
+        extract_file(os.path.abspath(archive), out_directory)
+        return 0
+    except Exception as e:
+        # Best-effort helper: report the problem and signal it via the status code.
+        print(e)
+        return 1
+
+
+
 #%% Test on UCI gene expression cancer RNA-Seq Data Set 
 print("Importing RNA-Seq data...")
-X_data = genfromtxt('./data/data.csv', delimiter=',',skip_header=1)
+# Fetch and unpack the UCI archive on first run; later runs reuse the files.
+cwd = os.getcwd()
+data_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/data.csv'
+label_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/labels.csv'
+if not os.path.exists(data_path):
+    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00401/TCGA-PANCAN-HiSeq-801x20531.tar.gz'
+    if download_file(url) != 0:  # failure is only reported via the status code
+        raise RuntimeError("Could not download or extract `%s`" % url)
+
+X_data = genfromtxt(data_path, delimiter=',',skip_header=1)
 X_data=X_data[:,1:]
-X_labels=read_csv('./data/data.csv',nrows=1)
+X_labels=read_csv(data_path,nrows=1)
 X_labels=X_labels.columns[1:]
-labels=read_csv('./data/labels.csv',header=0)
+labels=read_csv(label_path,header=0)
 y_data=labels['Class']=='BRCA' # Check for BRCA labels
 y_data=1.0*y_data.to_numpy()
 
@@ -20,15 +79,18 @@
 
 # Test Naive Feature Selection, followed by l2 SVM
 k=100 # Target number of features
-nfs_res=nfs(X_train,y_train,k)
+nfs = NaiveFeatureSelection(k=k, alpha=1e-4)
+
+X_train_nfs=nfs.fit_transform(X_train,y_train)
 clfsv = LinearSVC(random_state=0, tol=1e-5)
-clfsv.fit(X_train[:,nfs_res['idx']], y_train)
-y_pred_NFS = clfsv.predict(X_test[:,nfs_res['idx']])
+clfsv.fit(X_train_nfs, y_train)
+X_test_nfs = nfs.transform(X_test)
+y_pred_NFS = clfsv.predict(X_test_nfs)
 score_nfs = metrics.accuracy_score(y_test==1, y_pred_NFS)
 print("NFS accuracy:\t%0.3f" % score_nfs)
 
 print('Positive genes:')
-print([X_labels[nfs_res['idx'][i]] for i in range(100) if clfsv.coef_[0][i]>=0])
+print([X_labels[j] for j, w in zip(nfs.get_support(indices=True), clfsv.coef_[0]) if w>=0])  # j: original column of each kept feature
 
 
 #%% Plot sparsity / likelihood tradeoff
@@ -38,8 +100,9 @@
 resv=np.zeros(len(xvals))
 for i in range(0,len(xvals)):
     k=xvals[i]
-    nfs_res=nfs(X_train,y_train,k)
-    resv[i]=nfs_res['objv']
+    nfs = NaiveFeatureSelection(k=k, alpha=1e-4)
+    nfs.fit(X_train,y_train)
+    resv[i]=nfs.res_nfs['objv']
 
 plt.style.use('seaborn-white')
 plt.plot(xvals, resv,'-b',linewidth=4)
@@ -50,7 +113,7 @@
 
 
 #%% Plot curve
-import plotly.plotly as py
+import chart_studio.plotly as py
 import plotly.graph_objs as go
 from plotly.offline import iplot, init_notebook_mode
 import plotly.io as pio

diff --git a/naive_feature_selection/naive_feature_selection.py b/naive_feature_selection/naive_feature_selection.py
@@ -1,6 +1,6 @@
 import numpy as np
 from sklearn.base import BaseEstimator
-from sklearn.feature_selection.base import SelectorMixin
+from sklearn.feature_selection import SelectorMixin
 from sklearn.utils import check_X_y
 from sklearn.utils.validation import check_is_fitted
 from scipy.sparse import issparse
@@ -290,10 +290,14 @@ def fit(self, X, y):
         elif self._is_binary(X):
             res_nfs = self._binary_naive_feature_selection(X, y, self.k)
             mask[res_nfs["idx"]] = 1
+            self.res_nfs = res_nfs
             scores = np.square(res_nfs["w"])
+
         else:
             res_nfs = self._naive_feature_selection(X, y, self.k)
             mask[res_nfs["idx"]] = 1
+
+            self.res_nfs = res_nfs
             scores = np.square(res_nfs["w"])
 
         self.mask_ = mask