Allow column names in all backends

open2c · Nov 16, 2022 · e0d0223 · e0d0223
1 parent caf38db
commit e0d0223
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 44 deletions.
diff --git a/pairtools/cli/dedup.py b/pairtools/cli/dedup.py
@@ -9,13 +9,12 @@
 
 logger = get_logger()
 
-from ..lib import fileio, pairsam_format, headerops
-from . import cli, common_io_options
-
 import click
 
+from ..lib import fileio, headerops, pairsam_format
 from ..lib.dedup import streaming_dedup, streaming_dedup_cython
 from ..lib.stats import PairCounter
+from . import cli, common_io_options
 
 UTIL_NAME = "pairtools_dedup"
 
@@ -179,45 +178,45 @@
 )
 @click.option(
     "--c1",
-    type=int,
-    default=pairsam_format.COL_C1,
-    help=f"Chrom 1 column; default {pairsam_format.COL_C1}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[1],
+    help=f"Chrom 1 column; default {pairsam_format.COLUMNS_PAIRS[1]}"
+    "[input format option]",
 )
 @click.option(
     "--c2",
-    type=int,
-    default=pairsam_format.COL_C2,
-    help=f"Chrom 2 column; default {pairsam_format.COL_C2}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[3],
+    help=f"Chrom 2 column; default {pairsam_format.COLUMNS_PAIRS[3]}"
+    "[input format option]",
 )
 @click.option(
     "--p1",
-    type=int,
-    default=pairsam_format.COL_P1,
-    help=f"Position 1 column; default {pairsam_format.COL_P1}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[2],
+    help=f"Position 1 column; default {pairsam_format.COLUMNS_PAIRS[2]}"
+    "[input format option]",
 )
 @click.option(
     "--p2",
-    type=int,
-    default=pairsam_format.COL_P2,
-    help=f"Position 2 column; default {pairsam_format.COL_P2}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[4],
+    help=f"Position 2 column; default {pairsam_format.COLUMNS_PAIRS[4]}"
+    "[input format option]",
 )
 @click.option(
     "--s1",
-    type=int,
-    default=pairsam_format.COL_S1,
-    help=f"Strand 1 column; default {pairsam_format.COL_S1}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[5],
+    help=f"Strand 1 column; default {pairsam_format.COLUMNS_PAIRS[5]}"
+    "[input format option]",
 )
 @click.option(
     "--s2",
-    type=int,
-    default=pairsam_format.COL_S2,
-    help=f"Strand 2 column; default {pairsam_format.COL_S2}"
-    " Only works with '--backend cython'. [input format option]",
+    type=str,
+    default=pairsam_format.COLUMNS_PAIRS[6],
+    help=f"Strand 2 column; default {pairsam_format.COLUMNS_PAIRS[6]}"
+    "[input format option]",
 )
 @click.option(
     "--unmapped-chrom",
@@ -516,6 +515,12 @@ def dedup_py(
         # )
         extra_cols1 = [column_names.index(col) for col in extra_cols1]
         extra_cols2 = [column_names.index(col) for col in extra_cols2]
+        c1 = column_names.index(c1)
+        c2 = column_names.index(c2)
+        p1 = column_names.index(p1)
+        p2 = column_names.index(p2)
+        s1 = column_names.index(s1)
+        s2 = column_names.index(s2)
         streaming_dedup_cython(
             method,
             max_mismatch,
@@ -555,6 +560,12 @@ def dedup_py(
             out_stat=out_stat,
             backend=backend,
             n_proc=n_proc,
+            c1=c1,
+            c2=c2,
+            p1=p1,
+            p2=p2,
+            s1=s1,
+            s2=s2,
         )
     else:
         raise ValueError("Unknown backend")

diff --git a/pairtools/lib/dedup.py b/pairtools/lib/dedup.py
@@ -6,7 +6,6 @@
 from scipy.sparse.csgraph import connected_components
 
 from . import dedup_cython, pairsam_format
-from .stats import PairCounter
 
 from .._logging import get_logger
 
@@ -15,7 +14,8 @@
 
 # Ignore pandas future warnings:
 import warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
+
+warnings.simplefilter(action="ignore", category=FutureWarning)
 
 # Setting for cython deduplication:
 # you don't need to load more than 10k lines at a time b/c you get out of the
@@ -40,6 +40,12 @@ def streaming_dedup(
     out_stat,
     backend,
     n_proc,
+    c1="chrom1",
+    c2="chrom2",
+    p1="pos1",
+    p2="pos2",
+    s1="strand1",
+    s2="strand2",
 ):
 
     deduped_chunks = _dedup_stream(
@@ -54,6 +60,13 @@ def streaming_dedup(
         keep_parent_id=keep_parent_id,
         backend=backend,
         n_proc=n_proc,
+        c1=c1,
+        c2=c2,
+        p1=p1,
+        p2=p2,
+        s1=s1,
+        s2=s2,
+        unmapped_chrom=unmapped_chrom,
     )
 
     t0 = time.time()
@@ -68,8 +81,8 @@ def streaming_dedup(
 
         # Define masks of unmapped and duplicated reads:
         mask_mapped = np.logical_and(
-            (df_chunk["chrom1"] != unmapped_chrom),
-            (df_chunk["chrom2"] != unmapped_chrom),
+            (df_chunk[c1] != unmapped_chrom),
+            (df_chunk[c2] != unmapped_chrom),
         )
         mask_duplicates = df_chunk["duplicate"]
 
@@ -118,11 +131,16 @@ def _dedup_stream(
     keep_parent_id,
     backend,
     n_proc,
+    c1,
+    c2,
+    p1,
+    p2,
+    s1,
+    s2,
+    unmapped_chrom,
 ):
     # Stream the input dataframe:
-    dfs = pd.read_table(
-        in_stream, comment=None, names=colnames, chunksize=chunksize
-    )
+    dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)
 
     # Set up the carryover dataframe:
     df_prev_nodups = pd.DataFrame([])
@@ -140,6 +158,13 @@ def _dedup_stream(
             extra_col_pairs=extra_col_pairs,
             backend=backend,
             n_proc=n_proc,
+            c1=c1,
+            c2=c2,
+            p1=p1,
+            p2=p2,
+            s1=s1,
+            s2=s2,
+            unmapped_chrom=unmapped_chrom,
         )
         df_marked = df_marked.loc[prev_i:, :].reset_index(drop=True)
         mask_duplicated = df_marked["duplicate"]
@@ -163,8 +188,14 @@ def _dedup_chunk(
     keep_parent_id,
     extra_col_pairs,
     backend,
-    unmapped_chrom="!",
-    n_proc=1,
+    n_proc,
+    c1,
+    c2,
+    p1,
+    p2,
+    s1,
+    s2,
+    unmapped_chrom,
 ):
     """Mark duplicates in a dataframe of pairs
 
@@ -222,7 +253,7 @@ def _dedup_chunk(
     df.loc[:, "duplicate"] = False
 
     # Split mapped and unmapped reads:
-    mask_unmapped = (df["chrom1"] == unmapped_chrom) | (df["chrom2"] == unmapped_chrom)
+    mask_unmapped = (df[c1] == unmapped_chrom) | (df[c2] == unmapped_chrom)
     df_unmapped = df.loc[mask_unmapped, :].copy()
     df_mapped = df.loc[~mask_unmapped, :].copy()
     N_mapped = df_mapped.shape[0]
@@ -231,23 +262,23 @@ def _dedup_chunk(
     if N_mapped > 0:
         if backend == "sklearn":
             a = neighbors.radius_neighbors_graph(
-                df_mapped[["pos1", "pos2"]],
+                df_mapped[[p1, p2]],
                 radius=r,
                 p=p,
                 n_jobs=n_proc,
             )
             a0, a1 = a.nonzero()
         elif backend == "scipy":
-            z = scipy.spatial.cKDTree(df_mapped[["pos1", "pos2"]])
+            z = scipy.spatial.cKDTree(df_mapped[[p1, p2]])
             a = z.query_pairs(r=r, p=p, output_type="ndarray")
             a0 = a[:, 0]
             a1 = a[:, 1]
         need_to_match = np.array(
             [
-                ("chrom1", "chrom1"),
-                ("chrom2", "chrom2"),
-                ("strand1", "strand1"),
-                ("strand2", "strand2"),
+                (c1, c1),
+                (c2, c2),
+                (s1, s1),
+                (s2, s2),
             ]
             + extra_col_pairs
         )
@@ -376,7 +407,7 @@ def streaming_dedup_cython(
     read_idx = 0  # read index to mark the parent readID
     while True:
         rawline = next(instream, None)
-        stripline = rawline.strip('\n') if rawline else None
+        stripline = rawline.strip("\n") if rawline else None
 
         # take care of empty lines not at the end of the file separately
         if rawline and (not stripline):

diff --git a/tests/test_dedup.py b/tests/test_dedup.py
@@ -11,6 +11,9 @@
 tmpdir_name = tmpdir.name
 
 mock_pairsam_path_dedup = os.path.join(testdir, "data", "mock.4dedup.pairsam")
+mock_pairsam_path_dedup_diff_colnames = os.path.join(
+    testdir, "data", "mock.4dedup_diffcolnames.pairsam"
+)
 
 dedup_path = os.path.join(tmpdir_name, "dedup.pairsam")
 unmapped_path = os.path.join(tmpdir_name, "unmapped.pairsam")
@@ -66,6 +69,36 @@ def setup_dedup():
                 "max",
             ],
         )
+        subprocess.check_output(
+            [
+                "python",
+                "-m",
+                "pairtools",
+                "dedup",
+                mock_pairsam_path_dedup_diff_colnames,
+                "--mark-dups",
+                "--output",
+                dedup_markdups_path,
+                "--output-dups",
+                dups_markdups_path,
+                "--output-unmapped",
+                unmapped_markdups_path,
+                "--max-mismatch",
+                str(max_mismatch),
+                "--c1",
+                "chr1",
+                "--c2",
+                "chr2",
+                "--p1",
+                "p1",
+                "--p2",
+                "p2",
+                "--s1",
+                "str1",
+                "--s2",
+                "str2",
+            ],
+        )
         subprocess.check_output(
             [
                 "python",