Skip to content

Commit

Permalink
Allow column names in all backends
Browse files Browse the repository at this point in the history
  • Loading branch information
Phlya committed Nov 16, 2022
1 parent caf38db commit e0d0223
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 44 deletions.
65 changes: 38 additions & 27 deletions pairtools/cli/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@

logger = get_logger()

from ..lib import fileio, pairsam_format, headerops
from . import cli, common_io_options

import click

from ..lib import fileio, headerops, pairsam_format
from ..lib.dedup import streaming_dedup, streaming_dedup_cython
from ..lib.stats import PairCounter
from . import cli, common_io_options

UTIL_NAME = "pairtools_dedup"

Expand Down Expand Up @@ -179,45 +178,45 @@
)
@click.option(
"--c1",
type=int,
default=pairsam_format.COL_C1,
help=f"Chrom 1 column; default {pairsam_format.COL_C1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[1],
help=f"Chrom 1 column; default {pairsam_format.COLUMNS_PAIRS[1]}"
"[input format option]",
)
@click.option(
"--c2",
type=int,
default=pairsam_format.COL_C2,
help=f"Chrom 2 column; default {pairsam_format.COL_C2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[3],
help=f"Chrom 2 column; default {pairsam_format.COLUMNS_PAIRS[3]}"
"[input format option]",
)
@click.option(
"--p1",
type=int,
default=pairsam_format.COL_P1,
help=f"Position 1 column; default {pairsam_format.COL_P1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[2],
help=f"Position 1 column; default {pairsam_format.COLUMNS_PAIRS[2]}"
"[input format option]",
)
@click.option(
"--p2",
type=int,
default=pairsam_format.COL_P2,
help=f"Position 2 column; default {pairsam_format.COL_P2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[4],
help=f"Position 2 column; default {pairsam_format.COLUMNS_PAIRS[4]}"
"[input format option]",
)
@click.option(
"--s1",
type=int,
default=pairsam_format.COL_S1,
help=f"Strand 1 column; default {pairsam_format.COL_S1}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[5],
help=f"Strand 1 column; default {pairsam_format.COLUMNS_PAIRS[5]}"
"[input format option]",
)
@click.option(
"--s2",
type=int,
default=pairsam_format.COL_S2,
help=f"Strand 2 column; default {pairsam_format.COL_S2}"
" Only works with '--backend cython'. [input format option]",
type=str,
default=pairsam_format.COLUMNS_PAIRS[6],
help=f"Strand 2 column; default {pairsam_format.COLUMNS_PAIRS[6]}"
"[input format option]",
)
@click.option(
"--unmapped-chrom",
Expand Down Expand Up @@ -516,6 +515,12 @@ def dedup_py(
# )
extra_cols1 = [column_names.index(col) for col in extra_cols1]
extra_cols2 = [column_names.index(col) for col in extra_cols2]
c1 = column_names.index(c1)
c2 = column_names.index(c2)
p1 = column_names.index(p1)
p2 = column_names.index(p2)
s1 = column_names.index(s1)
s2 = column_names.index(s2)
streaming_dedup_cython(
method,
max_mismatch,
Expand Down Expand Up @@ -555,6 +560,12 @@ def dedup_py(
out_stat=out_stat,
backend=backend,
n_proc=n_proc,
c1=c1,
c2=c2,
p1=p1,
p2=p2,
s1=s1,
s2=s2,
)
else:
raise ValueError("Unknown backend")
Expand Down
65 changes: 48 additions & 17 deletions pairtools/lib/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from scipy.sparse.csgraph import connected_components

from . import dedup_cython, pairsam_format
from .stats import PairCounter

from .._logging import get_logger

Expand All @@ -15,7 +14,8 @@

# Ignore pandas future warnings:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

warnings.simplefilter(action="ignore", category=FutureWarning)

# Setting for cython deduplication:
# you don't need to load more than 10k lines at a time b/c you get out of the
Expand All @@ -40,6 +40,12 @@ def streaming_dedup(
out_stat,
backend,
n_proc,
c1="chrom1",
c2="chrom2",
p1="pos1",
p2="pos2",
s1="strand1",
s2="strand2",
):

deduped_chunks = _dedup_stream(
Expand All @@ -54,6 +60,13 @@ def streaming_dedup(
keep_parent_id=keep_parent_id,
backend=backend,
n_proc=n_proc,
c1=c1,
c2=c2,
p1=p1,
p2=p2,
s1=s1,
s2=s2,
unmapped_chrom=unmapped_chrom,
)

t0 = time.time()
Expand All @@ -68,8 +81,8 @@ def streaming_dedup(

# Define masks of unmapped and duplicated reads:
mask_mapped = np.logical_and(
(df_chunk["chrom1"] != unmapped_chrom),
(df_chunk["chrom2"] != unmapped_chrom),
(df_chunk[c1] != unmapped_chrom),
(df_chunk[c2] != unmapped_chrom),
)
mask_duplicates = df_chunk["duplicate"]

Expand Down Expand Up @@ -118,11 +131,16 @@ def _dedup_stream(
keep_parent_id,
backend,
n_proc,
c1,
c2,
p1,
p2,
s1,
s2,
unmapped_chrom,
):
# Stream the input dataframe:
dfs = pd.read_table(
in_stream, comment=None, names=colnames, chunksize=chunksize
)
dfs = pd.read_table(in_stream, comment=None, names=colnames, chunksize=chunksize)

# Set up the carryover dataframe:
df_prev_nodups = pd.DataFrame([])
Expand All @@ -140,6 +158,13 @@ def _dedup_stream(
extra_col_pairs=extra_col_pairs,
backend=backend,
n_proc=n_proc,
c1=c1,
c2=c2,
p1=p1,
p2=p2,
s1=s1,
s2=s2,
unmapped_chrom=unmapped_chrom,
)
df_marked = df_marked.loc[prev_i:, :].reset_index(drop=True)
mask_duplicated = df_marked["duplicate"]
Expand All @@ -163,8 +188,14 @@ def _dedup_chunk(
keep_parent_id,
extra_col_pairs,
backend,
unmapped_chrom="!",
n_proc=1,
n_proc,
c1,
c2,
p1,
p2,
s1,
s2,
unmapped_chrom,
):
"""Mark duplicates in a dataframe of pairs
Expand Down Expand Up @@ -222,7 +253,7 @@ def _dedup_chunk(
df.loc[:, "duplicate"] = False

# Split mapped and unmapped reads:
mask_unmapped = (df["chrom1"] == unmapped_chrom) | (df["chrom2"] == unmapped_chrom)
mask_unmapped = (df[c1] == unmapped_chrom) | (df[c2] == unmapped_chrom)
df_unmapped = df.loc[mask_unmapped, :].copy()
df_mapped = df.loc[~mask_unmapped, :].copy()
N_mapped = df_mapped.shape[0]
Expand All @@ -231,23 +262,23 @@ def _dedup_chunk(
if N_mapped > 0:
if backend == "sklearn":
a = neighbors.radius_neighbors_graph(
df_mapped[["pos1", "pos2"]],
df_mapped[[p1, p2]],
radius=r,
p=p,
n_jobs=n_proc,
)
a0, a1 = a.nonzero()
elif backend == "scipy":
z = scipy.spatial.cKDTree(df_mapped[["pos1", "pos2"]])
z = scipy.spatial.cKDTree(df_mapped[[p1, p2]])
a = z.query_pairs(r=r, p=p, output_type="ndarray")
a0 = a[:, 0]
a1 = a[:, 1]
need_to_match = np.array(
[
("chrom1", "chrom1"),
("chrom2", "chrom2"),
("strand1", "strand1"),
("strand2", "strand2"),
(c1, c1),
(c2, c2),
(s1, s1),
(s2, s2),
]
+ extra_col_pairs
)
Expand Down Expand Up @@ -376,7 +407,7 @@ def streaming_dedup_cython(
read_idx = 0 # read index to mark the parent readID
while True:
rawline = next(instream, None)
stripline = rawline.strip('\n') if rawline else None
stripline = rawline.strip("\n") if rawline else None

# take care of empty lines not at the end of the file separately
if rawline and (not stripline):
Expand Down
33 changes: 33 additions & 0 deletions tests/test_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
tmpdir_name = tmpdir.name

mock_pairsam_path_dedup = os.path.join(testdir, "data", "mock.4dedup.pairsam")
mock_pairsam_path_dedup_diff_colnames = os.path.join(
testdir, "data", "mock.4dedup_diffcolnames.pairsam"
)

dedup_path = os.path.join(tmpdir_name, "dedup.pairsam")
unmapped_path = os.path.join(tmpdir_name, "unmapped.pairsam")
Expand Down Expand Up @@ -66,6 +69,36 @@ def setup_dedup():
"max",
],
)
subprocess.check_output(
[
"python",
"-m",
"pairtools",
"dedup",
mock_pairsam_path_dedup_diff_colnames,
"--mark-dups",
"--output",
dedup_markdups_path,
"--output-dups",
dups_markdups_path,
"--output-unmapped",
unmapped_markdups_path,
"--max-mismatch",
str(max_mismatch),
"--c1",
"chr1",
"--c2",
"chr2",
"--p1",
"p1",
"--p2",
"p2",
"--s1",
"str1",
"--s2",
"str2",
],
)
subprocess.check_output(
[
"python",
Expand Down

0 comments on commit e0d0223

Please sign in to comment.