
Merge branch 'master' into dedup-cols
Phlya authored Dec 19, 2022
2 parents ce888c1 + 806c9ba commit c37a6df
Showing 2 changed files with 19 additions and 7 deletions.
23 changes: 18 additions & 5 deletions pairtools/lib/scaling.py
@@ -133,7 +133,8 @@ def make_empty_cross_region_table(


 def bins_pairs_by_distance(
-    pairs_df, dist_bins, regions=None, chromsizes=None, ignore_trans=False
+    pairs_df, dist_bins, regions=None, chromsizes=None, ignore_trans=False,
+    keep_unassigned=False,
 ):

     dist_bins = np.r_[dist_bins, np.iinfo(np.int64).max]
@@ -149,9 +150,10 @@ def bins_pairs_by_distance(
         region_ends1, region_ends2 = -1, -1

     else:
-        region_starts1, region_starts2 = 0, 0
-        region_ends1 = pairs_df.chrom1.map(chromsizes).fillna(1).astype(np.int64)
-        region_ends2 = pairs_df.chrom2.map(chromsizes).fillna(1).astype(np.int64)
+        region_ends1 = pairs_df.chrom1.map(chromsizes).fillna(-1).astype(np.int64)
+        region_ends2 = pairs_df.chrom2.map(chromsizes).fillna(-1).astype(np.int64)
+        region_starts1 = np.where(region_ends1 > 0, 0, -1)
+        region_starts2 = np.where(region_ends2 > 0, 0, -1)
         regions = pd.DataFrame(
             [
                 {"chrom": chrom, "start": 0, "end": length}
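
A minimal sketch of the sentinel logic added above, assuming a toy chromsizes Series and a pairs frame in which "!" marks an unassigned chromosome (illustration only, not code from this commit):

import numpy as np
import pandas as pd

chromsizes = pd.Series({"chr1": 1000, "chr2": 800})
pairs_df = pd.DataFrame({"chrom1": ["chr1", "!", "chr2"]})

# Chromosomes missing from chromsizes now get end = -1 instead of a dummy size of 1 ...
region_ends1 = pairs_df.chrom1.map(chromsizes).fillna(-1).astype(np.int64)
# ... and start = -1, so unassigned pairs can be recognised and filtered downstream.
region_starts1 = np.where(region_ends1 > 0, 0, -1)

print(region_starts1.tolist(), region_ends1.tolist())  # [0, -1, 0] [1000, -1, 800]
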
@@ -184,6 +186,7 @@ def bins_pairs_by_distance(
         pairs_df.chrom2.values, pairs_df.pos2.values, regions
     ).T

+
     pairs_reduced_df = pd.DataFrame(
         {
             "chrom1": pairs_df.chrom1.values,
@@ -202,6 +205,11 @@ def bins_pairs_by_distance(
         copy=False,
     )

+    if not keep_unassigned:
+        pairs_reduced_df = (pairs_reduced_df
+                            .query('(start1 >= 0) and (end1 > 0) and (start2 >= 0) and (end2 > 0)')
+                            .reset_index(drop=True))
+
     pairs_reduced_df["min_dist"] = np.where(
         pairs_reduced_df["dist_bin_idx"] > 0,
         dist_bins[pairs_reduced_df["dist_bin_idx"] - 1],
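
The new default filter can be illustrated on a toy reduced table; a sketch assuming the same -1 sentinel convention as above (hypothetical data):

import pandas as pd

pairs_reduced_df = pd.DataFrame({
    "start1": [0, -1, 0], "end1": [1000, -1, 800],
    "start2": [0, 0, -1], "end2": [1000, 800, -1],
})

# Rows carrying a -1 sentinel in any region coordinate are dropped
# unless keep_unassigned=True is requested.
filtered = (pairs_reduced_df
            .query('(start1 >= 0) and (end1 > 0) and (start2 >= 0) and (end2 > 0)')
            .reset_index(drop=True))

print(len(filtered))  # 1: only the fully assigned pair survives
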
@@ -324,6 +332,7 @@ def compute_scaling(
     n_dist_bins=8 * 8,
     chunksize=int(1e7),
     ignore_trans=False,
+    keep_unassigned=False,
     filter_f=None,
     nproc_in=1,
     cmd_in=None,
@@ -340,6 +349,7 @@ def compute_scaling(
     n_dist_bins: number of logarithmic bins
     chunksize: size of chunks for calculations
     ignore_trans: bool, ignore trans or not
+    keep_unassigned: bool, keep pairs that are not assigned to any region
     filter_f: filter function that can be applied to each chunk
     nproc_in
     cmd_in
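
A hedged usage sketch of the new option; the input path and the (scalings, trans_counts) return shape are assumptions for illustration, not taken from this diff:

from pairtools.lib import scaling

# keep_unassigned=True keeps pairs whose chromosomes are absent from
# chromsizes/regions (e.g. chrom "!") instead of dropping them, which is the new default.
# The path and the two-value return shape below are assumed.
scalings, trans_counts = scaling.compute_scaling(
    "tests/data/mock.pairsam",  # assumed example input file
    keep_unassigned=True,
)
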
@@ -371,8 +381,10 @@ def compute_scaling(
     header, pairs_body = headerops.get_header(pairs_stream)

     cols = headerops.extract_column_names(header)
+
     if chromsizes is None:
         chromsizes = headerops.extract_chromsizes(header)
+
     pairs_df = pd.read_csv(
         pairs_body,
         header=None,
@@ -396,6 +408,7 @@ def compute_scaling(
             regions=regions,
             chromsizes=chromsizes,
             ignore_trans=ignore_trans,
+            keep_unassigned=keep_unassigned
         )

         sc = sc_chunk if sc is None else sc.add(sc_chunk, fill_value=0)
@@ -416,7 +429,7 @@

     if not ignore_trans:
         trans_counts.reset_index(inplace=True)
-        trans_counts["np_bp2"] = (trans_counts["end1"] - trans_counts["start1"]) * (
+        trans_counts["n_bp2"] = (trans_counts["end1"] - trans_counts["start1"]) * (
             trans_counts["end2"] - trans_counts["start2"]
         )

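For context, the renamed n_bp2 column holds the product of the two region lengths, i.e. the number of possible position pairs between the regions, presumably used to normalise trans counts; a quick arithmetic check:

# A trans pair between a 1000 bp region and an 800 bp region:
n_bp2 = (1000 - 0) * (800 - 0)
assert n_bp2 == 800_000
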
3 changes: 1 addition & 2 deletions tests/test_scaling.py
@@ -22,5 +22,4 @@ def test_scaling():

     output = pd.read_csv(io.StringIO(result), sep="\t", header=0)

-    # All the pairs, even "!" are counted as present because we don't provide regions
-    assert output["n_pairs"].sum() == 8
+    assert output["n_pairs"].sum() == 5
