Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix scaling: take chromsizes from header #239

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
black
Phlya committed May 3, 2024
commit 9c5c77ee14da42dc5e76c875006e9b31786dc7d8
38 changes: 20 additions & 18 deletions pairtools/lib/scaling.py
Original file line number Diff line number Diff line change
@@ -134,7 +134,11 @@ def make_empty_cross_region_table(


def bins_pairs_by_distance(
pairs_df, dist_bins, regions=None, chromsizes=None, ignore_trans=False,
pairs_df,
dist_bins,
regions=None,
chromsizes=None,
ignore_trans=False,
keep_unassigned=False,
):

@@ -187,7 +191,6 @@ def bins_pairs_by_distance(
pairs_df.chrom2.values, pairs_df.pos2.values, regions
).T


pairs_reduced_df = pd.DataFrame(
{
"chrom1": pairs_df.chrom1.values,
@@ -207,10 +210,11 @@ def bins_pairs_by_distance(
)

if not keep_unassigned:
pairs_reduced_df = (pairs_reduced_df
.query('(start1 >= 0) and (start2 >= 0)')
pairs_reduced_df = (
pairs_reduced_df.query("(start1 >= 0) and (start2 >= 0)")
# do not test for end1 and end2, as they can be -1 if regions and not specified
.reset_index(drop=True))
.reset_index(drop=True)
)

pairs_reduced_df["min_dist"] = np.where(
pairs_reduced_df["dist_bin_idx"] > 0,
@@ -219,7 +223,7 @@ def bins_pairs_by_distance(
)

pairs_reduced_df["max_dist"] = np.where(
pairs_reduced_df["dist_bin_idx"] < len(dist_bins)-1,
pairs_reduced_df["dist_bin_idx"] < len(dist_bins) - 1,
dist_bins[pairs_reduced_df["dist_bin_idx"]],
np.iinfo(np.int64).max,
)
@@ -348,7 +352,7 @@ def compute_scaling(
Parameters
----------
pairs : pd.DataFrame or str or file-like object
A table with pairs of genomic coordinates representing contacts.
A table with pairs of genomic coordinates representing contacts.
It can be a pandas DataFrame, a path to a pairs file, or a file-like object.
regions : bioframe viewframe or None, optional
Genomic regions of interest. It can be anything that can serve as input to bioframe.from_any,
@@ -379,9 +383,9 @@ def compute_scaling(
"""

dist_bins = geomspace(
dist_range[0],
dist_range[0],
dist_range[1],
int(np.round(np.log10(dist_range[1]/dist_range[0])*n_dist_bins_decade))
int(np.round(np.log10(dist_range[1] / dist_range[0]) * n_dist_bins_decade)),
)

if isinstance(pairs, pd.DataFrame):
@@ -406,7 +410,7 @@ def compute_scaling(
regions=regions,
chromsizes=chromsizes,
ignore_trans=ignore_trans,
keep_unassigned=keep_unassigned
keep_unassigned=keep_unassigned,
)

sc = sc_chunk if sc is None else sc.add(sc_chunk, fill_value=0)
@@ -417,7 +421,6 @@ def compute_scaling(
else trans_counts.add(trans_counts_chunk, fill_value=0)
)


# if not (isinstance(regions, pd.DataFrame) and
# (set(regions.columns) == set(['chrom', 'start','end']))):
# raise ValueError('regions must be provided as a dict or chrom-indexed Series of chromsizes or as a bedframe.')
@@ -429,10 +432,9 @@ def compute_scaling(

if not ignore_trans:
trans_counts.reset_index(inplace=True)
trans_counts["n_bp2"] = (
(trans_counts["end1"] - trans_counts["start1"]) * (
trans_counts["n_bp2"] = (trans_counts["end1"] - trans_counts["start1"]) * (
trans_counts["end2"] - trans_counts["start2"]
))
)

return sc, trans_counts

@@ -452,12 +454,12 @@ def norm_scaling_factor(bins, cfreqs, norm_window):
"""

lo, hi = np.searchsorted(bins, norm_window)
return cfreqs[lo:hi+1].mean()
return cfreqs[lo : hi + 1].mean()


def norm_scaling(bins, cfreqs, norm_window, log_input=False):
"""
Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency
Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency
in a given window to 1.0.

Args:
@@ -469,14 +471,14 @@ def norm_scaling(bins, cfreqs, norm_window, log_input=False):
Returns:
float or array-like: The normalized contact frequencies.
"""

norm = norm_scaling_factor(bins, cfreqs, norm_window)
if log_input:
return cfreqs - norm
else:
return cfreqs / norm


def unity_norm_scaling(bins, cfreqs, norm_range=(1e4, 1e9)):
bin_lens = np.diff(bins)
bin_mids = np.sqrt(bins[1:] * bins[:-1])
Loading