Skip to content

Fix scaling: take chromsizes from header #239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions pairtools/cli/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@
help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.",
)
@common_io_options
def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
def scaling(
input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs
):
"""Calculate pairs scalings.

INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
Expand All @@ -63,10 +65,14 @@ def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade,

Output is .tsv file with scaling stats (both cis scalings and trans levels).
"""
scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs)
scaling_py(
input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs
)


def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
def scaling_py(
input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs
):

if len(input_path) == 0:
raise ValueError(f"No input paths: {input_path}")
Expand Down
42 changes: 23 additions & 19 deletions pairtools/lib/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,11 @@ def make_empty_cross_region_table(


def bins_pairs_by_distance(
pairs_df, dist_bins, regions=None, chromsizes=None, ignore_trans=False,
pairs_df,
dist_bins,
regions=None,
chromsizes=None,
ignore_trans=False,
keep_unassigned=False,
):

Expand Down Expand Up @@ -187,7 +191,6 @@ def bins_pairs_by_distance(
pairs_df.chrom2.values, pairs_df.pos2.values, regions
).T


pairs_reduced_df = pd.DataFrame(
{
"chrom1": pairs_df.chrom1.values,
Expand All @@ -207,10 +210,11 @@ def bins_pairs_by_distance(
)

if not keep_unassigned:
pairs_reduced_df = (pairs_reduced_df
.query('(start1 >= 0) and (start2 >= 0)')
pairs_reduced_df = (
pairs_reduced_df.query("(start1 >= 0) and (start2 >= 0)")
# do not test for end1 and end2, as they can be -1 if regions and not specified
.reset_index(drop=True))
.reset_index(drop=True)
)

pairs_reduced_df["min_dist"] = np.where(
pairs_reduced_df["dist_bin_idx"] > 0,
Expand All @@ -219,7 +223,7 @@ def bins_pairs_by_distance(
)

pairs_reduced_df["max_dist"] = np.where(
pairs_reduced_df["dist_bin_idx"] < len(dist_bins)-1,
pairs_reduced_df["dist_bin_idx"] < len(dist_bins) - 1,
dist_bins[pairs_reduced_df["dist_bin_idx"]],
np.iinfo(np.int64).max,
)
Expand Down Expand Up @@ -348,7 +352,7 @@ def compute_scaling(
Parameters
----------
pairs : pd.DataFrame or str or file-like object
A table with pairs of genomic coordinates representing contacts.
A table with pairs of genomic coordinates representing contacts.
It can be a pandas DataFrame, a path to a pairs file, or a file-like object.
regions : bioframe viewframe or None, optional
Genomic regions of interest. It can be anything that can serve as input to bioframe.from_any,
Expand Down Expand Up @@ -379,16 +383,18 @@ def compute_scaling(
"""

dist_bins = geomspace(
dist_range[0],
dist_range[0],
dist_range[1],
int(np.round(np.log10(dist_range[1]/dist_range[0])*n_dist_bins_decade))
int(np.round(np.log10(dist_range[1] / dist_range[0]) * n_dist_bins_decade)),
)

if isinstance(pairs, pd.DataFrame):
pairs_df = pairs

elif isinstance(pairs, str) or hasattr(pairs, "buffer") or hasattr(pairs, "peek"):
pairs_df, _, _ = pairsio.read_pairs(pairs, nproc=nproc_in, chunksize=chunksize)
pairs_df, _, chromsizes = pairsio.read_pairs(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the main change

pairs, nproc=nproc_in, chunksize=chunksize
)
else:
raise ValueError(
"pairs must be either a path to a pairs file or a pd.DataFrame"
Expand All @@ -404,7 +410,7 @@ def compute_scaling(
regions=regions,
chromsizes=chromsizes,
ignore_trans=ignore_trans,
keep_unassigned=keep_unassigned
keep_unassigned=keep_unassigned,
)

sc = sc_chunk if sc is None else sc.add(sc_chunk, fill_value=0)
Expand All @@ -415,7 +421,6 @@ def compute_scaling(
else trans_counts.add(trans_counts_chunk, fill_value=0)
)


# if not (isinstance(regions, pd.DataFrame) and
# (set(regions.columns) == set(['chrom', 'start','end']))):
# raise ValueError('regions must be provided as a dict or chrom-indexed Series of chromsizes or as a bedframe.')
Expand All @@ -427,10 +432,9 @@ def compute_scaling(

if not ignore_trans:
trans_counts.reset_index(inplace=True)
trans_counts["n_bp2"] = (
(trans_counts["end1"] - trans_counts["start1"]) * (
trans_counts["n_bp2"] = (trans_counts["end1"] - trans_counts["start1"]) * (
trans_counts["end2"] - trans_counts["start2"]
))
)

return sc, trans_counts

Expand All @@ -450,12 +454,12 @@ def norm_scaling_factor(bins, cfreqs, norm_window):
"""

lo, hi = np.searchsorted(bins, norm_window)
return cfreqs[lo:hi+1].mean()
return cfreqs[lo : hi + 1].mean()


def norm_scaling(bins, cfreqs, norm_window, log_input=False):
"""
Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency
Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency
in a given window to 1.0.

Args:
Expand All @@ -467,14 +471,14 @@ def norm_scaling(bins, cfreqs, norm_window, log_input=False):
Returns:
float or array-like: The normalized contact frequencies.
"""

norm = norm_scaling_factor(bins, cfreqs, norm_window)
if log_input:
return cfreqs - norm
else:
return cfreqs / norm


def unity_norm_scaling(bins, cfreqs, norm_range=(1e4, 1e9)):
bin_lens = np.diff(bins)
bin_mids = np.sqrt(bins[1:] * bins[:-1])
Expand Down
Loading
Loading