Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 74 additions & 11 deletions DemoDNA.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,76 @@
from NFS import *
from naive_feature_selection import *
from numpy import genfromtxt
from sklearn.svm import LinearSVC
from sklearn import metrics
from pandas import read_csv
import numpy as np
from sklearn.model_selection import train_test_split
import os

#%% data downloader


import gzip
import wget
import os
import tarfile
import zipfile
import shutil


def extract_file(path, to_directory='./data/'):
    """Extract the archive at *path* into *to_directory*.

    The archive type is chosen from the file extension: ``.zip``,
    ``.tar.gz`` / ``.tgz`` or ``.tar.bz2`` / ``.tbz``.  The destination
    directory is created when it does not exist yet.  The process working
    directory is left untouched, so a relative *path* keeps resolving
    against the caller's current directory.

    Parameters
    ----------
    path : str
        Location of the archive to unpack.
    to_directory : str, optional
        Directory that receives the archive content (default ``'./data/'``).

    Raises
    ------
    ValueError
        If the extension of *path* matches none of the supported formats.
    """
    if path.endswith('.zip'):
        opener, mode = zipfile.ZipFile, 'r'
    elif path.endswith(('.tar.gz', '.tgz')):
        opener, mode = tarfile.open, 'r:gz'
    elif path.endswith(('.tar.bz2', '.tbz')):
        opener, mode = tarfile.open, 'r:bz2'
    else:
        raise ValueError(
            "Could not extract `%s` as no appropriate extractor is found" % path
        )

    # Create the destination (and any missing parents) without failing when
    # it is already there.
    os.makedirs(to_directory, exist_ok=True)

    # NOTE(review): extractall() trusts the member names stored in the
    # archive; only use this on archives from a trusted source.
    # Both ZipFile and TarFile are context managers, so the handle is closed
    # even when extraction fails.
    with opener(path, mode) as archive:
        archive.extractall(to_directory)

def download_file(url, out_directory='./data/'):
    """Download the archive at *url* and unpack it into *out_directory*.

    The archive is fetched into the current working directory with
    ``wget.download`` and then handed to ``extract_file``.

    Parameters
    ----------
    url : str
        Address of the archive to download.
    out_directory : str, optional
        Directory that receives the extracted files (default ``'./data/'``).

    Returns
    -------
    int
        ``0`` on success, ``1`` if the download or the extraction failed
        (the error is printed, not raised).
    """
    try:
        # wget.download returns the name of the file it wrote.
        archive_name = wget.download(url)
        # Pass an absolute path so extraction does not depend on the
        # working directory.
        extract_file(os.path.abspath(archive_name), out_directory)
        return 0
    except Exception as e:
        # Best-effort helper: report the failure and signal it through the
        # return code instead of propagating.
        print(e)
        return 1



#%% Test on UCI gene expression cancer RNA-Seq Data Set
print("Importing RNA-Seq data...")
X_data = genfromtxt('./data/data.csv', delimiter=',',skip_header=1)

cwd = os.getcwd()
data_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/data.csv'
label_path = cwd+'/data/TCGA-PANCAN-HiSeq-801x20531/labels.csv'
if not os.path.exists(data_path):
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00401/TCGA-PANCAN-HiSeq-801x20531.tar.gz'
download_file(url)


X_data = genfromtxt(data_path, delimiter=',',skip_header=1)
X_data=X_data[:,1:]
X_labels=read_csv('./data/data.csv',nrows=1)
X_labels=read_csv(data_path,nrows=1)
X_labels=X_labels.columns[1:]
labels=read_csv('./data/labels.csv',header=0)
labels=read_csv(label_path,header=0)
y_data=labels['Class']=='BRCA' # Check for BRCA labels
y_data=1.0*y_data.to_numpy()

Expand All @@ -20,15 +79,18 @@

# Test Naive Feature Selection, followed by l2 SVM
k=100 # Target number of features
nfs_res=nfs(X_train,y_train,k)
nfs = NaiveFeatureSelection(k=k, alpha=1e-4)

X_train_nfs=nfs.fit_transform(X_train,y_train)
clfsv = LinearSVC(random_state=0, tol=1e-5)
clfsv.fit(X_train[:,nfs_res['idx']], y_train)
y_pred_NFS = clfsv.predict(X_test[:,nfs_res['idx']])
clfsv.fit(X_train_nfs, y_train)
X_test_nfs = nfs.transform(X_test)
y_pred_NFS = clfsv.predict(X_test_nfs)
score_nfs = metrics.accuracy_score(y_test==1, y_pred_NFS)
print("NFS accuracy:\t%0.3f" % score_nfs)

print('Positive genes:')
print([X_labels[nfs_res['idx'][i]] for i in range(100) if clfsv.coef_[0][i]>=0])
print([X_labels[i] for i in range(100) if clfsv.coef_[0][i]>=0])


#%% Plot sparsity / likelihood tradeoff
Expand All @@ -38,8 +100,9 @@
resv=np.zeros(len(xvals))
for i in range(0,len(xvals)):
k=xvals[i]
nfs_res=nfs(X_train,y_train,k)
resv[i]=nfs_res['objv']
nfs = NaiveFeatureSelection(k=k, alpha=1e-4)
nfs.fit(X_train,y_train)
resv[i]=nfs.res_nfs['objv']

plt.style.use('seaborn-white')
plt.plot(xvals, resv,'-b',linewidth=4)
Expand All @@ -50,7 +113,7 @@


#%% Plot curve
import plotly.plotly as py
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import plotly.io as pio
Expand Down
6 changes: 5 additions & 1 deletion naive_feature_selection/naive_feature_selection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from sklearn.feature_selection import SelectorMixin
from sklearn.utils import check_X_y
from sklearn.utils.validation import check_is_fitted
from scipy.sparse import issparse
Expand Down Expand Up @@ -290,10 +290,14 @@ def fit(self, X, y):
elif self._is_binary(X):
res_nfs = self._binary_naive_feature_selection(X, y, self.k)
mask[res_nfs["idx"]] = 1
self.res_nfs = res_nfs
scores = np.square(res_nfs["w"])

else:
res_nfs = self._naive_feature_selection(X, y, self.k)
mask[res_nfs["idx"]] = 1

self.res_nfs = res_nfs
scores = np.square(res_nfs["w"])

self.mask_ = mask
Expand Down