|
1 | 1 | import numpy as np |
2 | 2 | from .generic import * |
3 | 3 | from .gnomad_functions import logger, filter_to_adj |
| 4 | +from .sparse_mt import impute_sex_ploidy |
4 | 5 | from gnomad_hail.utils.relatedness import get_duplicated_samples |
5 | 6 |
|
6 | 7 |
|
@@ -279,86 +280,91 @@ def assign_platform_from_pcs( |
279 | 280 | return ht |
280 | 281 |
|
281 | 282 |
|
282 | | -# TODO: This should be reviewed / merged with work from Kristen |
283 | | -def infer_sex( |
284 | | - x_mt: hl.MatrixTable, # TODO: This feels somewhat unsatisfying to provide two MTs. Maybe just reapply the QC MT filters to both (minus callrate for Y)? |
285 | | - y_mt: hl.MatrixTable, |
286 | | - platform_ht: hl.Table, |
287 | | - male_min_f_stat: float, |
288 | | - female_max_f_stat: float, |
289 | | - min_male_y_sites_called: int = 500, |
290 | | - max_y_female_call_rate: float = 0.15, |
291 | | - min_y_male_call_rate: float = 0.8 |
| 283 | +def default_annotate_sex( |
| 284 | + mt: hl.MatrixTable, |
| 285 | + is_sparse: bool = True, |
| 286 | + excluded_intervals: Optional[hl.Table] = None, |
| 287 | + included_intervals: Optional[hl.Table] = None, |
| 288 | + normalization_contig: str = 'chr20', |
| 289 | + sites_ht: Optional[hl.Table] = None, |
| 290 | + aaf_expr: Optional[str] = None, |
| 291 | + gt_expr: str = 'GT', |
| 292 | + f_stat_cutoff: float = 0.5, |
| 293 | + aaf_threshold: float = 0.001 |
292 | 294 | ) -> hl.Table: |
293 | 295 | """ |
294 | | - Imputes sample sex based on X-chromosome heterozygosity and Y-callrate (if Y calls are present) |
295 | | -
|
296 | | - :param x_mt: |
297 | | - :param y_mt: |
298 | | - :param platform_ht: |
299 | | - :param male_min_f_stat: |
300 | | - :param female_max_f_stat: |
301 | | - :param min_male_y_sites_called: |
302 | | - :param max_y_female_call_rate: |
303 | | - :param min_y_male_call_rate: |
304 | | - :return: |
| 296 | + Imputes sample sex based on X-chromosome heterozygosity and sex chromosome ploidy. |
| 297 | + |
| 298 | + Returns Table with the following fields: |
| 299 | + - s (str): Sample |
| 300 | + - chr20_mean_dp (float32): Sample's mean coverage over chromosome 20. |
| 301 | + - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. |
| 302 | + - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. |
| 303 | + - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. |
| 304 | + - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y. |
| 305 | + - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex. |
| 306 | + - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex. |
| 307 | + - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex. |
| 308 | + - observed_homs (int64): Expected number of homozygotes. Calculated using hl.impute_sex. |
| 309 | + - X_karyotype (str): Sample's chromosome X karyotype. |
| 310 | + - Y_karyotype (str): Sample's chromosome Y karyotype. |
| 311 | + - sex_karyotype (str): Sample's sex karyotype. |
| 312 | +
|
| 313 | + :param mt: Input MatrixTable |
| 314 | + :param bool is_sparse: Whether input MatrixTable is in sparse data format |
| 315 | + :param excluded_intervals: Optional table of intervals to exclude from the computation. |
| 316 | + :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. |
| 317 | + :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies. |
| 318 | + :param sites_ht: Optional Table to use. If present, filters input MatrixTable to sites in this Table prior to imputing sex, |
| 319 | + and pulls alternate allele frequency from this Table. |
| 320 | + :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. |
| 321 | + :param gt_expr: Name of entry field storing the genotype. Default: 'GT' |
| 322 | + :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff. |
| 323 | + :param float aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations. |
| 324 | + :return: Table of samples and their imputed sex karyotypes. |
305 | 325 | """ |
306 | | - logger.info("Imputing samples sex") |
307 | | - |
308 | | - x_mt = hl.filter_intervals(x_mt, [hl.parse_locus_interval(x_contig) for x_contig in get_reference_genome(x_mt.locus).x_contigs]) |
309 | | - x_ht = hl.impute_sex(x_mt.GT, aaf_threshold=0.05, female_threshold=female_max_f_stat, male_threshold=male_min_f_stat) |
310 | | - y_mt = hl.filter_intervals(y_mt, [hl.parse_locus_interval(y_contig) for y_contig in get_reference_genome(y_mt.locus).y_contigs]) |
311 | | - y_mt = y_mt.filter_rows(y_mt.locus.in_y_nonpar()) |
312 | | - sex_ht = y_mt.annotate_cols( |
313 | | - qc_platform=platform_ht[y_mt.col_key].qc_platform, |
314 | | - is_female=x_ht[y_mt.col_key].is_female, |
315 | | - y_call_rate=hl.agg.fraction(hl.is_defined(y_mt.GT)), |
316 | | - n_y_sites_called=hl.agg.count_where(hl.is_defined(y_mt.GT)), |
317 | | - **{f'x_{ann}': x_ht[y_mt.col_key][ann] for ann in x_ht.row_value if ann != 'is_female'} |
318 | | - ).cols() |
319 | | - |
320 | | - mean_male_y_sites_called = sex_ht.aggregate(hl.agg.filter(~sex_ht.is_female, hl.agg.group_by(sex_ht.qc_platform, hl.agg.mean(sex_ht.n_y_sites_called)))) |
321 | | - y_call_rate_stats = sex_ht.aggregate( |
322 | | - hl.agg.filter( |
323 | | - hl.is_defined(sex_ht.is_female), |
324 | | - hl.agg.group_by(hl.tuple([sex_ht.qc_platform, sex_ht.is_female]), |
325 | | - hl.agg.stats(sex_ht.y_call_rate) |
326 | | - ) |
| 326 | + logger.info("Imputing sex chromosome ploidies...") |
| 327 | + if is_sparse: |
| 328 | + ploidy_ht = impute_sex_ploidy( |
| 329 | + mt, excluded_intervals, included_intervals, |
| 330 | + normalization_contig |
327 | 331 | ) |
328 | | - ) |
| 332 | + else: |
| 333 | + raise NotImplementedError("Imputing sex ploidy does not exist yet for dense data.") |
329 | 334 |
|
330 | | - no_y_call_rate_platforms = set() |
331 | | - for platform, sites_called in mean_male_y_sites_called.items(): |
332 | | - if sites_called < min_male_y_sites_called: |
333 | | - logger.warn(f"Mean number of sites in males on Y chromosome for platform {platform} is < {min_male_y_sites_called} ({sites_called} sites found). Y call rate filter will NOT be applied for samples on platform {platform}.") |
334 | | - no_y_call_rate_platforms.add(platform) |
335 | | - |
336 | | - sex_ht = sex_ht.annotate_globals(y_call_rate_stats=y_call_rate_stats, |
337 | | - no_y_call_rate_platforms=no_y_call_rate_platforms if no_y_call_rate_platforms else hl.empty_set(platform_ht.qc_platform.dtype)) |
338 | | - y_female_stats = sex_ht.y_call_rate_stats[(sex_ht.qc_platform, True)] |
339 | | - y_male_stats = sex_ht.y_call_rate_stats[(sex_ht.qc_platform, False)] |
340 | | - sex_ht = sex_ht.annotate( |
341 | | - is_female=( |
342 | | - hl.case() |
343 | | - .when(sex_ht.no_y_call_rate_platforms.contains(sex_ht.qc_platform), sex_ht.is_female) |
344 | | - .when(sex_ht.is_female & ((sex_ht.y_call_rate - y_female_stats.min) / (y_male_stats.max - y_female_stats.min) < max_y_female_call_rate), True) |
345 | | - .when(~sex_ht.is_female & ((sex_ht.y_call_rate - y_female_stats.min) / (y_male_stats.max - y_female_stats.min) > min_y_male_call_rate), False) |
346 | | - .or_missing() |
347 | | - ) |
348 | | - ) |
349 | | - sex_ht = sex_ht.annotate_globals( |
350 | | - impute_sex_params=hl.struct( |
351 | | - male_min_f_stat=male_min_f_stat, |
352 | | - female_max_f_stat=female_max_f_stat, |
353 | | - min_male_y_sites_called=min_male_y_sites_called, |
354 | | - max_y_female_call_rate=max_y_female_call_rate, |
355 | | - min_y_male_call_rate=min_y_male_call_rate |
| 335 | + x_contigs = get_reference_genome(mt.locus).x_contigs |
| 336 | + logger.info(f"Filtering mt to biallelic SNPs in X contigs: {x_contigs}") |
| 337 | + if 'was_split' in list(mt.row): |
| 338 | + mt = mt.filter_rows((~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1])) |
| 339 | + else: |
| 340 | + mt = mt.filter_rows((hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1])) |
| 341 | + mt = hl.filter_intervals(mt, [hl.parse_locus_interval(contig) for contig in x_contigs]) |
| 342 | + |
| 343 | + if sites_ht is not None: |
| 344 | + if aaf_expr == None: |
| 345 | + logger.warning("sites_ht was provided, but aaf_expr is missing. Assuming name of field with alternate allele frequency is 'AF'.") |
| 346 | + aaf_expr = "AF" |
| 347 | + logger.info("Filtering to provided sites") |
| 348 | + mt = mt.annotate_rows(**sites_ht[mt.row_key]) |
| 349 | + mt = mt.filter_rows(hl.is_defined(mt[aaf_expr])) |
| 350 | + |
| 351 | + logger.info("Calculating inbreeding coefficient on chrX") |
| 352 | + sex_ht = hl.impute_sex(mt[gt_expr], aaf_threshold=aaf_threshold, male_threshold=f_stat_cutoff, female_threshold=f_stat_cutoff, aaf=aaf_expr) |
| 353 | + |
| 354 | + logger.info("Annotating sex ht with sex chromosome ploidies") |
| 355 | + sex_ht = sex_ht.annotate(**ploidy_ht[sex_ht.key]) |
| 356 | + |
| 357 | + logger.info("Inferring sex karyotypes") |
| 358 | + x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs(sex_ht, f_stat_cutoff) |
| 359 | + return sex_ht.annotate( |
| 360 | + **get_sex_expr( |
| 361 | + sex_ht.chrX_ploidy, |
| 362 | + sex_ht.chrY_ploidy, |
| 363 | + x_ploidy_cutoffs, |
| 364 | + y_ploidy_cutoffs |
356 | 365 | ) |
357 | 366 | ) |
358 | 367 |
|
359 | | - sex_ht = sex_ht.drop('qc_platform') |
360 | | - return (sex_ht) |
361 | | - |
362 | 368 |
|
363 | 369 | def get_ploidy_cutoffs( |
364 | 370 | ht: hl.Table, |
|
0 commit comments