|
| 1 | +diff --git a/scSplit b/scSplit |
| 2 | +index 4847737..068fa04 100755 |
| 3 | +--- a/scSplit |
| 4 | ++++ b/scSplit |
| 5 | +@@ -16,7 +16,9 @@ from scipy.sparse import csr_matrix |
| 6 | + from sklearn.cluster import KMeans |
| 7 | + from sklearn.decomposition import PCA |
| 8 | + from sklearn.preprocessing import StandardScaler |
| 9 | +-import os, sys, io, vcf, csv, math, datetime, pickle, argparse, gzip |
| 10 | ++import os, sys, io, csv, math, datetime, pickle, argparse, gzip |
| 11 | ++import vcfpy as vcf |
| 12 | ++ |
| 13 | + |
| 14 | + |
| 15 | + class mixed_VCF: |
| 16 | +@@ -86,15 +88,15 @@ class models: |
| 17 | + self.ref_bc_mtx, self.alt_bc_mtx = base_calls_mtx[0], base_calls_mtx[1] |
| 18 | + self.all_POS, self.barcodes = base_calls_mtx[2].tolist(), base_calls_mtx[3].tolist() |
| 19 | + self.num = num |
| 20 | +- self.P_s_c = pd.DataFrame(0, index = self.barcodes, columns = range(self.num)) |
| 21 | +- self.lP_c_s = pd.DataFrame(0, index = self.barcodes, columns = range(self.num)) |
| 22 | ++ self.P_s_c = pd.DataFrame(0, index = self.barcodes, columns = range(self.num), dtype="float") |
| 23 | ++ self.lP_c_s = pd.DataFrame(0, index = self.barcodes, columns = range(self.num), dtype="float") |
| 24 | + self.lP_s = [np.log2(1/self.num)] * self.num |
| 25 | + self.assigned, self.reassigned = [], [] |
| 26 | + self.convergence = 0 |
| 27 | + for _ in range(self.num): |
| 28 | + self.assigned.append([]) |
| 29 | + self.reassigned.append([]) |
| 30 | +- self.model_af = pd.DataFrame(0, index=self.all_POS, columns=range(self.num)) |
| 31 | ++ self.model_af = pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="float") |
| 32 | + self.pseudo = 1 |
| 33 | + |
| 34 | + # background alt count proportion, with pseudo count added for 0 counts on multi-base SNPs |
| 35 | +@@ -142,7 +144,7 @@ class models: |
| 36 | + self.initial[n].append(self.barcodes[col]) |
| 37 | + barcode_alt = np.array(self.alt_bc_mtx[:, icols[kmeans.labels_==n]].sum(axis=1)) |
| 38 | + barcode_ref = np.array(self.ref_bc_mtx[:, icols[kmeans.labels_==n]].sum(axis=1)) |
| 39 | +- self.model_af.loc[:, n] = (barcode_alt + self.k_alt) / (barcode_alt + barcode_ref + self.pseudo) |
| 40 | ++ self.model_af.loc[:, n] = ((barcode_alt + self.k_alt) / (barcode_alt + barcode_ref + self.pseudo)).ravel() |
| 41 | + |
| 42 | + |
| 43 | + def run_EM(self, output): |
| 44 | +@@ -194,7 +196,7 @@ class models: |
| 45 | + """ |
| 46 | + |
| 47 | + self.model_af = pd.DataFrame((self.alt_bc_mtx.dot(self.P_s_c) + self.k_alt) / ((self.alt_bc_mtx + self.ref_bc_mtx).dot(self.P_s_c) + self.pseudo), |
| 48 | +- index = self.all_POS, columns = range(self.num)) |
| 49 | ++ index = self.all_POS, columns = range(self.num), dtype="float") |
| 50 | + self.lP_s = np.log2(self.P_s_c.sum(axis=0)) - np.log2(self.P_s_c.sum(axis=0).sum()) |
| 51 | + |
| 52 | + |
| 53 | +@@ -211,7 +213,7 @@ class models: |
| 54 | + """ |
| 55 | + Locate the doublet state |
| 56 | + """ |
| 57 | +- cross_state = pd.DataFrame(0, index = range(self.num), columns = range(self.num)) |
| 58 | ++ cross_state = pd.DataFrame(0, index = range(self.num), columns = range(self.num), dtype="float") |
| 59 | + for i in range(self.num): |
| 60 | + for j in range(self.num): |
| 61 | + index = [] |
| 62 | +@@ -262,17 +264,17 @@ class models: |
| 63 | + self.dist_variants, ncols = [], self.num - 1 + (self.doublet < 0) * 1 |
| 64 | + if len(pos) != 0: |
| 65 | + snv = [self.all_POS[i] for i in pos] |
| 66 | +- N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=snv, columns=range(self.num)), pd.DataFrame(0, index=snv, columns=range(self.num)) |
| 67 | ++ N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=snv, columns=range(self.num), dtype="int64"), pd.DataFrame(0, index=snv, columns=range(self.num), dtype="int64") |
| 68 | + else: |
| 69 | +- N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=self.all_POS, columns=range(self.num)), pd.DataFrame(0, index=self.all_POS, columns=range(self.num)) |
| 70 | ++ N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="int64"), pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="int64") |
| 71 | + |
| 72 | + for n in range(self.num): |
| 73 | + bc_idx = [i for i, e in enumerate(self.barcodes) if e in self.reassigned[n]] |
| 74 | + # REF/ALT alleles counts from cells assigned to state n |
| 75 | + if len(pos) == 0: |
| 76 | +- N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[:, bc_idx].sum(axis=1), self.alt_bc_mtx[:, bc_idx].sum(axis=1) |
| 77 | ++ N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[:, bc_idx].sum(axis=1).ravel(), self.alt_bc_mtx[:, bc_idx].sum(axis=1).ravel() |
| 78 | + else: |
| 79 | +- N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[pos][:, bc_idx].sum(axis=1), self.alt_bc_mtx[pos][:, bc_idx].sum(axis=1) |
| 80 | ++ N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[pos][:, bc_idx].sum(axis=1).ravel(), self.alt_bc_mtx[pos][:, bc_idx].sum(axis=1).ravel() |
| 81 | + |
| 82 | + # judge N(A) or N(R) for each cluster |
| 83 | + if self.doublet == -1: |
| 84 | +@@ -585,12 +587,12 @@ Options: |
| 85 | + assignment['Barcode'] = model.reassigned[n] |
| 86 | + if n != model.doublet: |
| 87 | + assignment['Cluster'] = 'SNG-' + str(n) |
| 88 | +- assignments = assignments.append(assignment) |
| 89 | ++ assignments = pd.concat([assignments, assignment]) |
| 90 | + if doublets != 0: # if doublet cluster is expected |
| 91 | + assignment = pd.DataFrame() |
| 92 | + assignment['Barcode'] = model.reassigned[model.doublet] |
| 93 | + assignment['Cluster'] = 'DBL-' + str(model.doublet) |
| 94 | +- assignments = assignments.append(assignment) |
| 95 | ++ assignments = pd.concat([assignments, assignment]) |
| 96 | + assignments.to_csv(os.path.join(args.out, r'scSplit_result.csv'), sep='\t', index=False) |
| 97 | + model.P_s_c.to_csv(os.path.join(args.out, r'scSplit_P_s_c.csv')) |
| 98 | + with open(os.path.join(args.out, r'scSplit_dist_variants.txt'), 'w') as logfile: |
0 commit comments