Commit 05a13ba

Fix genetic_demux/scsplit build (#1096)
1 parent 70a2dba · commit 05a13ba

3 files changed: +109, -7 lines


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@

 * `integrate/scarches` and `workflows/annotate/scanvi_scarches`: Enable correction for technical variability by multiple continuous and categorical covariates.

+* `genetic_demux/scsplit`: bump python to `3.13` and unpin pandas and numpy (were pinned to `<2.0` and `<2` respectively) (PR #1096).
+
 ## BUG FIXES

 * `differential_expression/create_pseudobulks`: Fixed the check to verify that the raw counts layer was passed (PR #1072).
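
The old `pandas<2.0` pin was most likely needed because upstream scSplit calls `DataFrame.append`, which pandas 2.0 removed; the bundled scSplit.patch (third file below) switches those call sites to `pd.concat`, which is what allows the pin to be dropped. A minimal sketch of that API change (illustrative barcodes, not project data):

    import pandas as pd

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    a = pd.DataFrame({"Barcode": ["AAACCTG", "AAACGGG"], "Cluster": ["SNG-0", "SNG-0"]})
    b = pd.DataFrame({"Barcode": ["AAAGATG"], "Cluster": ["DBL-1"]})

    # assignments = a.append(b)   # AttributeError on pandas >= 2.0
    assignments = pd.concat([a, b], ignore_index=True)
    print(assignments)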

src/genetic_demux/scsplit/config.vsh.yaml

Lines changed: 9 additions & 7 deletions
@@ -65,28 +65,30 @@ argument_groups:
 resources:
   - type: bash_script
     path: script.sh
+  - path: scSplit.patch
 test_resources:
   - type: bash_script
     path: test.sh
   - path: ../../../resources_test/demuxafy_test_data

 engines:
   - type: docker
-    image: python:3.11
+    image: python:3.13
     setup:
+      - type: docker
+        copy: ["scSplit.patch /opt/scSplit.patch"]
       - type: python
         pip:
-          - numpy<2
-          - pandas<2.0
+          - numpy
+          - pandas
           - pysam
-          - setuptools<58
-          - scikit-learn==1.1.3
+          - scikit-learn
           - scipy
           - statistics
       - type: python
-        pip: [ PyVCF ]
+        pip: [ vcfpy ]
       - type: docker
-        run: git clone https://github.com/jon-xu/scSplit && cp scSplit/scSplit /usr/local/bin && rm -rf scSplit
+        run: git clone https://github.com/jon-xu/scSplit && cd scSplit && git apply /opt/scSplit.patch && cp scSplit /usr/local/bin && cd .. && rm -rf scSplit
 runners:
   - type: executable
   - type: nextflow
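
Two things worth noting in this setup: the new `scSplit.patch` resource is copied into the image (`copy: ["scSplit.patch /opt/scSplit.patch"]`) so the final docker step can apply it with `git apply` right after cloning scSplit, and the unmaintained PyVCF dependency is swapped for vcfpy (PyVCF's installer relies on 2to3, which modern setuptools no longer supports and which is presumably why `setuptools<58` was pinned before; that pin is dropped together with it). A minimal sketch of the vcfpy reader API, as a generic example with a made-up file name rather than code taken from scSplit:

    import vcfpy

    # Hypothetical input file; scSplit itself imports the library as
    # "import vcfpy as vcf" via the bundled patch.
    reader = vcfpy.Reader.from_path("mixed_genotype.vcf")
    for record in reader:
        # CHROM, POS and REF are plain attributes; record.ALT is a list of alt-allele objects.
        print(record.CHROM, record.POS, record.REF, record.ALT)
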
src/genetic_demux/scsplit/scSplit.patch

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
diff --git a/scSplit b/scSplit
index 4847737..068fa04 100755
--- a/scSplit
+++ b/scSplit
@@ -16,7 +16,9 @@ from scipy.sparse import csr_matrix
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
-import os, sys, io, vcf, csv, math, datetime, pickle, argparse, gzip
+import os, sys, io, csv, math, datetime, pickle, argparse, gzip
+import vcfpy as vcf
+


 class mixed_VCF:
@@ -86,15 +88,15 @@ class models:
         self.ref_bc_mtx, self.alt_bc_mtx = base_calls_mtx[0], base_calls_mtx[1]
         self.all_POS, self.barcodes = base_calls_mtx[2].tolist(), base_calls_mtx[3].tolist()
         self.num = num
-        self.P_s_c = pd.DataFrame(0, index = self.barcodes, columns = range(self.num))
-        self.lP_c_s = pd.DataFrame(0, index = self.barcodes, columns = range(self.num))
+        self.P_s_c = pd.DataFrame(0, index = self.barcodes, columns = range(self.num), dtype="float")
+        self.lP_c_s = pd.DataFrame(0, index = self.barcodes, columns = range(self.num), dtype="float")
         self.lP_s = [np.log2(1/self.num)] * self.num
         self.assigned, self.reassigned = [], []
         self.convergence = 0
         for _ in range(self.num):
             self.assigned.append([])
             self.reassigned.append([])
-        self.model_af = pd.DataFrame(0, index=self.all_POS, columns=range(self.num))
+        self.model_af = pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="float")
         self.pseudo = 1

         # background alt count proportion, with pseudo count added for 0 counts on multi-base SNPs
@@ -142,7 +144,7 @@ class models:
                 self.initial[n].append(self.barcodes[col])
             barcode_alt = np.array(self.alt_bc_mtx[:, icols[kmeans.labels_==n]].sum(axis=1))
             barcode_ref = np.array(self.ref_bc_mtx[:, icols[kmeans.labels_==n]].sum(axis=1))
-            self.model_af.loc[:, n] = (barcode_alt + self.k_alt) / (barcode_alt + barcode_ref + self.pseudo)
+            self.model_af.loc[:, n] = ((barcode_alt + self.k_alt) / (barcode_alt + barcode_ref + self.pseudo)).ravel()


     def run_EM(self, output):
@@ -194,7 +196,7 @@ class models:
         """

         self.model_af = pd.DataFrame((self.alt_bc_mtx.dot(self.P_s_c) + self.k_alt) / ((self.alt_bc_mtx + self.ref_bc_mtx).dot(self.P_s_c) + self.pseudo),
-                                     index = self.all_POS, columns = range(self.num))
+                                     index = self.all_POS, columns = range(self.num), dtype="float")
         self.lP_s = np.log2(self.P_s_c.sum(axis=0)) - np.log2(self.P_s_c.sum(axis=0).sum())


@@ -211,7 +213,7 @@ class models:
         """
         Locate the doublet state
         """
-        cross_state = pd.DataFrame(0, index = range(self.num), columns = range(self.num))
+        cross_state = pd.DataFrame(0, index = range(self.num), columns = range(self.num), dtype="float")
         for i in range(self.num):
             for j in range(self.num):
                 index = []
@@ -262,17 +264,17 @@ class models:
         self.dist_variants, ncols = [], self.num - 1 + (self.doublet < 0) * 1
         if len(pos) != 0:
             snv = [self.all_POS[i] for i in pos]
-            N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=snv, columns=range(self.num)), pd.DataFrame(0, index=snv, columns=range(self.num))
+            N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=snv, columns=range(self.num), dtype="int64"), pd.DataFrame(0, index=snv, columns=range(self.num), dtype="int64")
         else:
-            N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=self.all_POS, columns=range(self.num)), pd.DataFrame(0, index=self.all_POS, columns=range(self.num))
+            N_ref_mtx, N_alt_mtx = pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="int64"), pd.DataFrame(0, index=self.all_POS, columns=range(self.num), dtype="int64")

         for n in range(self.num):
             bc_idx = [i for i, e in enumerate(self.barcodes) if e in self.reassigned[n]]
             # REF/ALT alleles counts from cells assigned to state n
             if len(pos) == 0:
-                N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[:, bc_idx].sum(axis=1), self.alt_bc_mtx[:, bc_idx].sum(axis=1)
+                N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[:, bc_idx].sum(axis=1).ravel(), self.alt_bc_mtx[:, bc_idx].sum(axis=1).ravel()
             else:
-                N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[pos][:, bc_idx].sum(axis=1), self.alt_bc_mtx[pos][:, bc_idx].sum(axis=1)
+                N_ref_mtx.loc[:, n], N_alt_mtx.loc[:, n] = self.ref_bc_mtx[pos][:, bc_idx].sum(axis=1).ravel(), self.alt_bc_mtx[pos][:, bc_idx].sum(axis=1).ravel()

         # judge N(A) or N(R) for each cluster
         if self.doublet == -1:
@@ -585,12 +587,12 @@ Options:
             assignment['Barcode'] = model.reassigned[n]
             if n != model.doublet:
                 assignment['Cluster'] = 'SNG-' + str(n)
-                assignments = assignments.append(assignment)
+                assignments = pd.concat([assignments, assignment])
         if doublets != 0: # if doublet cluster is expected
             assignment = pd.DataFrame()
             assignment['Barcode'] = model.reassigned[model.doublet]
             assignment['Cluster'] = 'DBL-' + str(model.doublet)
-            assignments = assignments.append(assignment)
+            assignments = pd.concat([assignments, assignment])
         assignments.to_csv(os.path.join(args.out, r'scSplit_result.csv'), sep='\t', index=False)
         model.P_s_c.to_csv(os.path.join(args.out, r'scSplit_P_s_c.csv'))
         with open(os.path.join(args.out, r'scSplit_dist_variants.txt'), 'w') as logfile:
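
Most of the remaining hunks address two behaviour changes in newer pandas: `pd.DataFrame(0, ...)` creates integer columns, and writing float results into them is deprecated (and eventually an error) rather than being silently upcast, hence the explicit `dtype="float"`/`dtype="int64"` arguments; and assigning the `(n, 1)`-shaped output of sparse-matrix sums into a single DataFrame column trips up newer pandas, hence the `.ravel()` calls. A small sketch of both points with toy data (the index labels and counts are made up, not scSplit data):

    import numpy as np
    import pandas as pd
    from scipy.sparse import csr_matrix

    # Declaring the dtype up front avoids writing floats into int64 columns,
    # which recent pandas rejects instead of upcasting.
    af = pd.DataFrame(0, index=["chr1:100", "chr1:200"], columns=range(2), dtype="float")

    # Summing a sparse matrix along an axis yields an (n, 1) matrix;
    # ravel() flattens it to 1-D before it is assigned to a column.
    counts = csr_matrix(np.array([[1, 0, 2], [0, 3, 1]]))
    af.loc[:, 0] = np.array(counts.sum(axis=1)).ravel() / 10
    print(af)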
