Skip to content

Commit bbb024b

Browse files
authored
Merge pull request #161 from macarthur-lab/infer_sex
Updated infer_sex function to use sex chromosome ploidies and return …
2 parents bfdf209 + 07d009d commit bbb024b

File tree

1 file changed

+78
-72
lines changed

1 file changed

+78
-72
lines changed

gnomad_hail/utils/sample_qc.py

Lines changed: 78 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
from .generic import *
33
from .gnomad_functions import logger, filter_to_adj
4+
from .sparse_mt import impute_sex_ploidy
45
from gnomad_hail.utils.relatedness import get_duplicated_samples
56

67

@@ -279,86 +280,91 @@ def assign_platform_from_pcs(
279280
return ht
280281

281282

282-
# TODO: This should be reviewed / merged with work from Kristen
283-
def infer_sex(
284-
x_mt: hl.MatrixTable, # TODO: This feels somewhat unsatisfying to provide two MTs. Maybe just reapply the QC MT filters to both (minus callrate for Y)?
285-
y_mt: hl.MatrixTable,
286-
platform_ht: hl.Table,
287-
male_min_f_stat: float,
288-
female_max_f_stat: float,
289-
min_male_y_sites_called: int = 500,
290-
max_y_female_call_rate: float = 0.15,
291-
min_y_male_call_rate: float = 0.8
283+
def default_annotate_sex(
284+
mt: hl.MatrixTable,
285+
is_sparse: bool = True,
286+
excluded_intervals: Optional[hl.Table] = None,
287+
included_intervals: Optional[hl.Table] = None,
288+
normalization_contig: str = 'chr20',
289+
sites_ht: Optional[hl.Table] = None,
290+
aaf_expr: Optional[str] = None,
291+
gt_expr: str = 'GT',
292+
f_stat_cutoff: float = 0.5,
293+
aaf_threshold: float = 0.001
292294
) -> hl.Table:
293295
"""
294-
Imputes sample sex based on X-chromosome heterozygosity and Y-callrate (if Y calls are present)
295-
296-
:param x_mt:
297-
:param y_mt:
298-
:param platform_ht:
299-
:param male_min_f_stat:
300-
:param female_max_f_stat:
301-
:param min_male_y_sites_called:
302-
:param max_y_female_call_rate:
303-
:param min_y_male_call_rate:
304-
:return:
296+
Imputes sample sex based on X-chromosome heterozygosity and sex chromosome ploidy.
297+
298+
Returns Table with the following fields:
299+
- s (str): Sample
300+
- chr20_mean_dp (float32): Sample's mean coverage over chromosome 20.
301+
- chrX_mean_dp (float32): Sample's mean coverage over chromosome X.
302+
- chrY_mean_dp (float32): Sample's mean coverage over chromosome Y.
303+
- chrX_ploidy (float32): Sample's imputed ploidy over chromosome X.
304+
- chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y.
305+
- f_stat (float64): Sample f-stat. Calculated using hl.impute_sex.
306+
- n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex.
307+
- expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex.
308+
- observed_homs (int64): Expected number of homozygotes. Calculated using hl.impute_sex.
309+
- X_karyotype (str): Sample's chromosome X karyotype.
310+
- Y_karyotype (str): Sample's chromosome Y karyotype.
311+
- sex_karyotype (str): Sample's sex karyotype.
312+
313+
:param mt: Input MatrixTable
314+
:param bool is_sparse: Whether input MatrixTable is in sparse data format
315+
:param excluded_intervals: Optional table of intervals to exclude from the computation.
316+
:param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes.
317+
:param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies.
318+
:param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex,
319+
and pulls alternate allele frequency from this Table.
320+
:param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency.
321+
:param gt_expr: Name of entry field storing the genotype. Default: 'GT'
322+
:param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff.
323+
:param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations.
324+
:return: Table of samples and their imputed sex karyotypes.
305325
"""
306-
logger.info("Imputing samples sex")
307-
308-
x_mt = hl.filter_intervals(x_mt, [hl.parse_locus_interval(x_contig) for x_contig in get_reference_genome(x_mt.locus).x_contigs])
309-
x_ht = hl.impute_sex(x_mt.GT, aaf_threshold=0.05, female_threshold=female_max_f_stat, male_threshold=male_min_f_stat)
310-
y_mt = hl.filter_intervals(y_mt, [hl.parse_locus_interval(y_contig) for y_contig in get_reference_genome(y_mt.locus).y_contigs])
311-
y_mt = y_mt.filter_rows(y_mt.locus.in_y_nonpar())
312-
sex_ht = y_mt.annotate_cols(
313-
qc_platform=platform_ht[y_mt.col_key].qc_platform,
314-
is_female=x_ht[y_mt.col_key].is_female,
315-
y_call_rate=hl.agg.fraction(hl.is_defined(y_mt.GT)),
316-
n_y_sites_called=hl.agg.count_where(hl.is_defined(y_mt.GT)),
317-
**{f'x_{ann}': x_ht[y_mt.col_key][ann] for ann in x_ht.row_value if ann != 'is_female'}
318-
).cols()
319-
320-
mean_male_y_sites_called = sex_ht.aggregate(hl.agg.filter(~sex_ht.is_female, hl.agg.group_by(sex_ht.qc_platform, hl.agg.mean(sex_ht.n_y_sites_called))))
321-
y_call_rate_stats = sex_ht.aggregate(
322-
hl.agg.filter(
323-
hl.is_defined(sex_ht.is_female),
324-
hl.agg.group_by(hl.tuple([sex_ht.qc_platform, sex_ht.is_female]),
325-
hl.agg.stats(sex_ht.y_call_rate)
326-
)
326+
logger.info("Imputing sex chromosome ploidies...")
327+
if is_sparse:
328+
ploidy_ht = impute_sex_ploidy(
329+
mt, excluded_intervals, included_intervals,
330+
normalization_contig
327331
)
328-
)
332+
else:
333+
raise NotImplementedError("Imputing sex ploidy does not exist yet for dense data.")
329334

330-
no_y_call_rate_platforms = set()
331-
for platform, sites_called in mean_male_y_sites_called.items():
332-
if sites_called < min_male_y_sites_called:
333-
logger.warn(f"Mean number of sites in males on Y chromosome for platform {platform} is < {min_male_y_sites_called} ({sites_called} sites found). Y call rate filter will NOT be applied for samples on platform {platform}.")
334-
no_y_call_rate_platforms.add(platform)
335-
336-
sex_ht = sex_ht.annotate_globals(y_call_rate_stats=y_call_rate_stats,
337-
no_y_call_rate_platforms=no_y_call_rate_platforms if no_y_call_rate_platforms else hl.empty_set(platform_ht.qc_platform.dtype))
338-
y_female_stats = sex_ht.y_call_rate_stats[(sex_ht.qc_platform, True)]
339-
y_male_stats = sex_ht.y_call_rate_stats[(sex_ht.qc_platform, False)]
340-
sex_ht = sex_ht.annotate(
341-
is_female=(
342-
hl.case()
343-
.when(sex_ht.no_y_call_rate_platforms.contains(sex_ht.qc_platform), sex_ht.is_female)
344-
.when(sex_ht.is_female & ((sex_ht.y_call_rate - y_female_stats.min) / (y_male_stats.max - y_female_stats.min) < max_y_female_call_rate), True)
345-
.when(~sex_ht.is_female & ((sex_ht.y_call_rate - y_female_stats.min) / (y_male_stats.max - y_female_stats.min) > min_y_male_call_rate), False)
346-
.or_missing()
347-
)
348-
)
349-
sex_ht = sex_ht.annotate_globals(
350-
impute_sex_params=hl.struct(
351-
male_min_f_stat=male_min_f_stat,
352-
female_max_f_stat=female_max_f_stat,
353-
min_male_y_sites_called=min_male_y_sites_called,
354-
max_y_female_call_rate=max_y_female_call_rate,
355-
min_y_male_call_rate=min_y_male_call_rate
335+
x_contigs = get_reference_genome(mt.locus).x_contigs
336+
logger.info(f"Filtering mt to biallelic SNPs in X contigs: {x_contigs}")
337+
if 'was_split' in list(mt.row):
338+
mt = mt.filter_rows((~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]))
339+
else:
340+
mt = mt.filter_rows((hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]))
341+
mt = hl.filter_intervals(mt, [hl.parse_locus_interval(contig) for contig in x_contigs])
342+
343+
if sites_ht is not None:
344+
if aaf_expr == None:
345+
logger.warning("sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'.")
346+
aaf_expr = "AF"
347+
logger.info("Filtering to provided sites")
348+
mt = mt.annotate_rows(**sites_ht[mt.row_key])
349+
mt = mt.filter_rows(hl.is_defined(mt[aaf_expr]))
350+
351+
logger.info("Calculating inbreeding coefficient on chrX")
352+
sex_ht = hl.impute_sex(mt[gt_expr], aaf_threshold=aaf_threshold, male_threshold=f_stat_cutoff, female_threshold=f_stat_cutoff, aaf=aaf_expr)
353+
354+
logger.info("Annotating sex ht with sex chromosome ploidies")
355+
sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key])
356+
357+
logger.info("Inferring sex karyotypes")
358+
x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff)
359+
return sex_ht.annotate(
360+
**get_sex_expr(
361+
sex_ht.chrX_ploidy,
362+
sex_ht.chrY_ploidy,
363+
x_ploidy_cutoffs,
364+
y_ploidy_cutoffs
356365
)
357366
)
358367

359-
sex_ht = sex_ht.drop('qc_platform')
360-
return (sex_ht)
361-
362368

363369
def get_ploidy_cutoffs(
364370
ht: hl.Table,

0 commit comments

Comments
 (0)