Skip to content

Commit

Permalink
Merge pull request #119 from dpryan79/implement61
Browse files Browse the repository at this point in the history
Implement #61, allow filtering by non-CpG conversion efficiency. Also…
  • Loading branch information
dpryan79 authored Jul 22, 2021
2 parents 74f52da + e52ba66 commit 94e9220
Show file tree
Hide file tree
Showing 12 changed files with 163 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
Version 0.6.0:

* Added the `minConversionEfficiency` option, which allows filtering out incompletely converted alignments on the fly. Note that doing this is generally NOT recommended. (issue #61)
* Fixed various segfaults and other issues related to mappability filtering (courtesy of @Valiec)

Version 0.5.3:

* Fixed an issue with the `perRead` subcommand, wherein the requireFlags option didn't fully work (a read would pass if it had at least one of the required flags set, rather than all of them). (issue #117)
Expand Down
13 changes: 13 additions & 0 deletions MBias.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ void *extractMBias(void *foo) {
fprintf(stderr, "Note that the output will be truncated!\n");
return NULL;
}
data->seq = seq;
data->lseq = seqlen;
data->offset = localPos;

//Start the pileup
iter = bam_mplp_init(1, filter_func, (void **) &data);
Expand Down Expand Up @@ -263,6 +266,11 @@ void mbias_usage() {
" present, or else the alignment is ignored. This is equivalent\n"
" to the -f option in samtools. The default is 0, which\n"
" includes all alignments.\n"
" --minConversionEfficiency The minimum non-CpG conversion efficiency observed\n"
" in a read to include it in the output. The default is 0.0 and\n"
" the maximum is 1.0 (100%% conversion). You are strongly\n"
" encouraged to NOT use this option without an EXTREMELY\n"
" compelling reason!\n"
" --txt Output tab separated metrics to the screen. These can be\n"
" imported into R or another program for manual plotting and\n"
" analysis. Note that coordinates are 1-based.\n"
Expand Down Expand Up @@ -312,6 +320,7 @@ int mbias_main(int argc, char *argv[]) {
config.requireFlags = 0;
config.nThreads = 1;
config.chunkSize = 1000000;
config.minConversionEfficiency = 0.0;
for(i=0; i<16; i++) config.bounds[i] = 0;
for(i=0; i<16; i++) config.absoluteBounds[i] = 0;

Expand All @@ -330,6 +339,7 @@ int mbias_main(int argc, char *argv[]) {
{"nCTOB", 1, NULL, 12},
{"chunkSize", 1, NULL, 13},
{"keepStrand", 0, NULL, 14},
{"minConversionEfficiency", 1, NULL, 15},
{"ignoreFlags", 1, NULL, 'F'},
{"requireFlags", 1, NULL, 'R'},
{"help", 0, NULL, 'h'},
Expand Down Expand Up @@ -400,6 +410,9 @@ int mbias_main(int argc, char *argv[]) {
case 14:
keepStrand = 1;
break;
case 15:
config.minConversionEfficiency = atof(optarg);
break;
case 'F' :
config.ignoreFlags = atoi(optarg);
break;
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ CFLAGS ?= -Wall -g -O3 -pthread
all: MethylDackel

OBJS = common.o bed.o svg.o overlaps.o extract.o MBias.o mergeContext.o perRead.o
VERSION = 0.5.3
VERSION = 0.6.0

version.h:
echo '#define VERSION "$(VERSION)"' > $@
Expand Down
8 changes: 8 additions & 0 deletions MethylDackel.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ typedef struct {
@field output_fp Output file pointers (to CpG, CHG, and CHH, respectively)
@field reg A region specified by -r
@field BAMName The BAM file name
@field minConversionEfficiency The minimum acceptable conversion efficiency in non-CpG positions for read inclusion
@field fp Input file pointer (must be a BAM or CRAM file)
@field bai The index for fp
@field bedName The BED file name
Expand All @@ -96,6 +97,7 @@ typedef struct {
FILE **output_fp;
char *reg;
char *BAMName;
float minConversionEfficiency;
char *BWName;
char *outBBMName;
char *BBMName;
Expand Down Expand Up @@ -128,6 +130,9 @@ typedef struct {
@field iter: The alignment iterator that should be traversed.
@field ohash: A pointer to the hash table needed for overlap detection
@field bedIdx: The last index into the BED file
@field lseq: The length of seq.
@field seq: The sequence for the current region
@field offset: The beginning position of seq on the relevant contig
*/
typedef struct {
Config *config;
Expand All @@ -136,6 +141,9 @@ typedef struct {
hts_itr_t *iter;
void *ohash;
int32_t bedIdx;
int lseq;
char *seq;
uint32_t offset;
} mplp_data;

/*! @function
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ Excluding low-coverage regions

If your downstream analysis requires an absolute minimum coverage (here, defined as the number of methylation calls kept after filtering for MAPQ, phred score, etc.), you can use the `--minDepth` option to achieve this. By default, `MethylDackel extract` will output all methylation metrics as long as the coverage is at least 1. If you use `--minDepth 10`, then only sites covered at least 10x will be output. This works in conjunction with the `--mergeContext` option, above. So if you request per-CpG context output (i.e., with `--mergeContext`) and `--minDepth 10` then only CpGs with a minimum coverage of 10 will be output.

Excluding partially converted reads
===================================

Some users wish to filter out reads that have evidence of incomplete bisulfite conversion. This can be determined by looking for CHH and CHG-context Cytosines in each read and determining their methylation state. Such filtering should generally be avoided, since there are regions in most genomes with at least some CHH and CHG-context Cytosine methylation, which would cause excess filtering in those regions. However, if you absolutely need to filter out only partially-converted alignments, you can do so with the `--minConversionEfficiency` option. The default is 0, meaning that CHH and CHG-context Cytosine conversion status is ignored. The maximum value is 1.0, meaning that 100% of the CHH and CHG-context Cytosines in an alignment must be converted to T. Note that an alignment with no CHH or CHG-context Cytosines will be given a conversion efficiency of 1.0.

Logit, fraction, and counts only output
=======================================

Expand Down
8 changes: 8 additions & 0 deletions azure-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ jobs:
matrix:
htslib111:
htslib_version: '1.11'
htslib112:
htslib_version: '1.12'
htslib113:
htslib_version: '1.13'
maxParallel: 1

steps:
Expand All @@ -27,6 +31,10 @@ jobs:
matrix:
htslib111:
htslib_version: '1.11'
htslib112:
htslib_version: '1.12'
htslib113:
htslib_version: '1.13'
maxParallel: 1

steps:
Expand Down
74 changes: 74 additions & 0 deletions common.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,75 @@ char check_mappability(void *data, bam1_t *b) {
return num_mappable_reads;
}

// This is the same as updateMetrics, 1 on methylation, -1 on unmethylation
int getMethylState(bam1_t *b, int seqPos, Config *config) {
uint8_t base = bam_seqi(bam_get_seq(b), seqPos);
int strand = getStrand(b); //1=OT, 2=OB, 3=CTOT, 4=CTOB

if(strand==0) {
fprintf(stderr, "Can't determine the strand of a read!\n");
assert(strand != 0);
}
//Is the phred score even high enough?
if(bam_get_qual(b)[seqPos] < config->minPhred) return 0;

if(base == 2 && (strand==1 || strand==3)) return 1; //C on an OT/CTOT alignment
else if(base == 8 && (strand==1 || strand==3)) return -1; //T on an OT/CTOT alignment
else if(base == 4 && (strand==2 || strand==4)) return 1; //G on an OB/CTOB alignment
else if(base == 1 && (strand==2 || strand==4)) return -1; //A on an OB/CTOB alignment
return 0;
}

float computeEfficiency(unsigned int nMethyl, unsigned int nUMethyl) {
if(nMethyl + nUMethyl == 0) return 1.0;
return nUMethyl / ((float)(nMethyl + nUMethyl));
}

float computeConversionEfficiency(bam1_t *b, mplp_data *ldata) {
unsigned int nMethyl = 0, nUMethyl = 0;
uint32_t i, j, seqEnd = ldata->offset + ldata->lseq; // 1-base after the end of the sequence
uint32_t *cigar = bam_get_cigar(b), op, opLen;
int direction, type, state;
int pos = b->core.pos, seqPos = 0; //position in the genome and position in the read

for(i=0; i<b->core.n_cigar; i++) {
op = bam_cigar_op(cigar[i]);
opLen = bam_cigar_oplen(cigar[i]);

switch(op) {
case 0:
case 7:
case 8:
// do something
for(j=0; j<opLen; j++, seqPos++) {
if(pos+j >= seqEnd) return computeEfficiency(nMethyl, nUMethyl);
if(isCpG(ldata->seq, pos+j-ldata->offset, ldata->lseq)) {
continue;
} else if((direction = isCHG(ldata->seq, pos+j-ldata->offset, ldata->lseq))) {
state = getMethylState(b, seqPos, ldata->config);
if(state > 0) nMethyl++;
else if(state < 0) nUMethyl++;
} else if((direction = isCHH(ldata->seq, pos+j-ldata->offset, ldata->lseq))) {
state = getMethylState(b, seqPos, ldata->config);
if(state > 0) nMethyl++;
else if(state < 0) nUMethyl++;
}
}
break;
case 1: // I, consume read
case 4: // S, consume read
seqPos += opLen;
break;
case 2: // D, consume seq
case 3: // N, consume seq
pos += opLen;
break;
}
}

return computeEfficiency(nMethyl, nUMethyl);
}

//This will need to be restructured to handle multiple input files
int filter_func(void *data, bam1_t *b) {
int rv, NH, overlap;
Expand Down Expand Up @@ -367,6 +436,11 @@ int filter_func(void *data, bam1_t *b) {
}
}

// This is (A) moderately expensive to compute and (B) not completely correct at chunk boundaries.
if(ldata->config->minConversionEfficiency > 0.0) {
if(computeConversionEfficiency(b, ldata) < ldata->config->minConversionEfficiency) continue;
}

/***********************************************************************
*
* Deal with bounds inclusion (--OT, --OB, etc.)
Expand Down
15 changes: 14 additions & 1 deletion extract.c
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,11 @@ void *extractCalls(void *foo) {
fprintf(stderr, "faidx_fetch_seq returned %i while trying to fetch the sequence for tid %s:%"PRIu32"-%"PRIu32"!\n",\
seqlen, hdr->target_name[localTid], localPos2, localEnd);
fprintf(stderr, "Note that the output will be truncated!\n");
return NULL;
continue;
}
data->seq = seq;
data->offset = localPos2;
data->lseq = seqlen;

//Start the pileup
data->ohash = initOlapHash();
Expand Down Expand Up @@ -646,6 +649,11 @@ void extract_usage() {
" -q apply here as well. Note further that if you use\n"
" --mergeContext that a merged site will be excluded if either\n"
" of its individual Cs would be excluded.\n"
" --minConversionEfficiency The minimum non-CpG conversion efficiency observed\n"
" in a read to include it in the output. The default is 0.0 and\n"
" the maximum is 1.0 (100%% conversion). You are strongly\n"
" encouraged to NOT use this option without an EXTREMELY\n"
" compelling reason!\n"
" --maxVariantFrac The maximum fraction of A/T/C base calls on the strand\n"
" opposite of a C to allow before a position is declared a\n"
" variant (thereby being excluded from output). The default is\n"
Expand Down Expand Up @@ -736,6 +744,7 @@ int extract_main(int argc, char *argv[]) {
config.chunkSize = 1000000;
config.cytosine_report = 0;
config.noBAM = 0;
config.minConversionEfficiency = 0.0;
for(i=0; i<16; i++) config.bounds[i] = 0;
for(i=0; i<16; i++) config.absoluteBounds[i] = 0;

Expand Down Expand Up @@ -766,6 +775,7 @@ int extract_main(int argc, char *argv[]) {
{"chunkSize", 1, NULL, 19},
{"keepStrand", 0, NULL, 20},
{"cytosine_report", 0, NULL, 21},
{"minConversionEfficiency", 1, NULL, 22},
{"ignoreFlags", 1, NULL, 'F'},
{"requireFlags", 1, NULL, 'R'},
{"help", 0, NULL, 'h'},
Expand Down Expand Up @@ -872,6 +882,9 @@ int extract_main(int argc, char *argv[]) {
case 21:
config.cytosine_report = 1;
break;
case 22:
config.minConversionEfficiency = atof(optarg);
break;
case 'M':
config.BWName = optarg;
break;
Expand Down
2 changes: 2 additions & 0 deletions tests/chgchh.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>contig1
CGCTGCTGCTGCTGCTGCTGCTGCTGCTTCTTCTTCTTCG
Binary file added tests/chgchh_aln.bam
Binary file not shown.
Binary file added tests/chgchh_aln.bam.bai
Binary file not shown.
33 changes: 33 additions & 0 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,37 @@ def rm(f):
lines = sum(1 for _ in open('test9_CpG.bedGraph'))
assert lines == 48
rm('test9_CpG.bedGraph')

# Check conversion efficiency. 2 read pairs, one mostly converted
# By default, 1 read is MAPQ filtered, another is kept
rm('test10_CpG.bedGraph')
check_call([MPath, 'extract', '-o', 'test10', 'chgchh.fa', 'chgchh_aln.bam'])
assert op.exists('test10_CpG.bedGraph')
lines = sum(1 for _ in open('test10_CpG.bedGraph'))
assert lines == 2
rm('test10_CpG.bedGraph')

# Ensure 2 reads/positions are covered by changing MAPQ
rm('test11_CpG.bedGraph')
check_call([MPath, 'extract', '-o', 'test11', '-q', '5', 'chgchh.fa', 'chgchh_aln.bam'])
assert op.exists('test11_CpG.bedGraph')
lines = sum(1 for _ in open('test11_CpG.bedGraph'))
assert lines == 3
rm('test11_CpG.bedGraph')

# Only 1 read has a conversion efficiency >=0.9
rm('test12_CpG.bedGraph')
check_call([MPath, 'extract', '-o', 'test12', '-q', '5', '--minConversionEfficiency', '0.9', 'chgchh.fa', 'chgchh_aln.bam'])
assert op.exists('test12_CpG.bedGraph')
lines = sum(1 for _ in open('test12_CpG.bedGraph'))
assert lines == 2
rm('test12_CpG.bedGraph')

# No perfectly converted reads
rm('test13_CpG.bedGraph')
check_call([MPath, 'extract', '-o', 'test13', '-q', '5', '--minConversionEfficiency', '1.0', 'chgchh.fa', 'chgchh_aln.bam'])
assert op.exists('test13_CpG.bedGraph')
lines = sum(1 for _ in open('test13_CpG.bedGraph'))
assert lines == 1
rm('test13_CpG.bedGraph')
print("Finished correctly")

0 comments on commit 94e9220

Please sign in to comment.