Skip to content

Commit 4da6806

Browse files
committed
Support for telomere (POS=0) coordinate
This follows changes in samtools/htslib#1573. See also samtools/htslib#1571.
1 parent fe98a6b commit 4da6806

9 files changed

+63
-51
lines changed

csq.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -3374,7 +3374,7 @@ void vbuf_flush(args_t *args, uint32_t pos)
33743374
i = rbuf_shift(&args->vcf_rbuf);
33753375
assert( i>=0 );
33763376
vbuf = args->vcf_buf[i];
3377-
int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1;
3377+
int pos = vbuf->n ? vbuf->vrec[0]->line->pos : CSI_COOR_EMPTY;
33783378
for (i=0; i<vbuf->n; i++)
33793379
{
33803380
vrec_t *vrec = vbuf->vrec[i];
@@ -3413,7 +3413,7 @@ void vbuf_flush(args_t *args, uint32_t pos)
34133413
bcf_empty(vrec->line);
34143414
vrec->line->pos = save_pos;
34153415
}
3416-
if ( pos!=-1 )
3416+
if ( pos!=CSI_COOR_EMPTY )
34173417
{
34183418
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos);
34193419
if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
@@ -4169,7 +4169,7 @@ static void process(args_t *args, bcf1_t **rec_ptr)
41694169
}
41704170

41714171
bcf1_t *rec = *rec_ptr;
4172-
static int32_t prev_rid = -1, prev_pos = -1;
4172+
static int32_t prev_rid = -1, prev_pos = CSI_COOR_EMPTY;
41734173
if ( prev_rid!=rec->rid )
41744174
{
41754175
prev_rid = rec->rid;

test/telomere.0.out

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
22 0 id0 C G . . .

test/telomere.1.out

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
22 1 id1 C G . . .

test/telomere.vcf

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=22>
3+
#CHROM POS ID REF ALT QUAL FILTER INFO
4+
22 0 id0 C G . . .
5+
22 1 id1 C G . . .

test/test.pl

+2
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
run_test(\&test_tabix,$opts,in=>'merge.a',reg=>'1:3000151-3000151',out=>'tabix.1.3000151.out');
3939
run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20:1-2147483647',out=>'large_chrom_csi_limit.20.1.2147483647.out'); # 2147483647 (1<<31-1) is the current chrom limit for csi. bcf conversion and indexing fail above this
4040
run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20',out=>'large_chrom.20.1.2147483647.out'); # this fails until bug resolved
41+
run_test(\&test_index,$opts,in=>'telomere',reg=>'22:0',out=>'telomere.0.out');
42+
run_test(\&test_index,$opts,in=>'telomere',reg=>'22:1',out=>'telomere.1.out');
4143
run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-s',out=>'idx.out');
4244
run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-n',out=>'idx_count.out');
4345
run_test(\&test_vcf_idxstats,$opts,in=>'empty',args=>'-s',out=>'empty.idx.out');

vcfconcat.c

+20-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
22
3-
Copyright (C) 2013-2021 Genome Research Ltd.
3+
Copyright (C) 2013-2023 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -39,6 +39,8 @@ THE SOFTWARE. */
3939
#include <sys/time.h>
4040
#include "bcftools.h"
4141

42+
#define EMPTY_FILE -3   // start_pos[] marker: the initial bcf_read() on the file returned non-zero (treated as an empty file)
43+
4244
typedef struct _args_t
4345
{
4446
bcf_srs_t *files;
@@ -95,11 +97,11 @@ static void init_data(args_t *args)
9597
if ( args->phased_concat )
9698
{
9799
int ret = bcf_read(fp, hdr, line);
98-
if ( ret!=0 ) args->start_pos[i] = -2; // empty file
100+
if ( ret!=0 ) args->start_pos[i] = EMPTY_FILE;
99101
else
100102
{
101103
int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
102-
args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
104+
args->start_pos[i] = chrid==prev_chrid ? line->pos : CSI_COOR_EMPTY;
103105
prev_chrid = chrid;
104106
}
105107
}
@@ -171,11 +173,11 @@ static void init_data(args_t *args)
171173
int nok = 0;
172174
while (1)
173175
{
174-
while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
176+
while ( nok<args->nfnames && args->start_pos[nok]!=EMPTY_FILE ) nok++;
175177
if ( nok==args->nfnames ) break;
176178

177179
i = nok;
178-
while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
180+
while ( i<args->nfnames && args->start_pos[i]==EMPTY_FILE ) i++;
179181
if ( i==args->nfnames ) break;
180182

181183
int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
@@ -185,7 +187,7 @@ static void init_data(args_t *args)
185187
args->nfnames = nok;
186188

187189
for (i=1; i<args->nfnames; i++)
188-
if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
190+
if ( args->start_pos[i-1]!=CSI_COOR_EMPTY && args->start_pos[i]!=CSI_COOR_EMPTY && args->start_pos[i]<args->start_pos[i-1] )
189191
error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);
190192

191193
args->prev_chr = -1;
@@ -264,7 +266,7 @@ static void phased_flush(args_t *args)
264266
bcf1_t *brec = args->buf[i+1];
265267

266268
int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
267-
if ( nGTs < 0 )
269+
if ( nGTs < 0 )
268270
{
269271
if ( !gt_absent_warned )
270272
{
@@ -359,7 +361,7 @@ static void phased_flush(args_t *args)
359361
bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
360362
PQ_printed = 1;
361363
for (j=0; j<nsmpl; j++)
362-
if ( args->phase_qual[j] < args->min_PQ )
364+
if ( args->phase_qual[j] < args->min_PQ )
363365
{
364366
args->phase_set[j] = rec->pos+1;
365367
args->phase_set_changed = 1;
@@ -404,7 +406,7 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap
404406
if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec));
405407
args->seen_seq[chr_id] = 1;
406408
args->prev_chr = chr_id;
407-
args->prev_pos_check = -1;
409+
args->prev_pos_check = CSI_COOR_EMPTY;
408410
}
409411

410412
if ( !is_overlap )
@@ -463,12 +465,12 @@ static void concat(args_t *args)
463465
new_file = 1;
464466

465467
args->ifname++;
466-
if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open
467-
if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
468+
if ( args->start_pos[args->ifname-1]==CSI_COOR_EMPTY ) break; // new chromosome, start with only one file open
469+
if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==CSI_COOR_EMPTY ) break; // next file starts on a different chromosome
468470
}
469471

470472
// is there a line from the previous run? Seek the newly opened reader to that position
471-
int seek_pos = -1;
473+
int seek_pos = CSI_COOR_EMPTY;
472474
int seek_chr = -1;
473475
if ( bcf_sr_has_line(args->files,0) )
474476
{
@@ -521,11 +523,12 @@ static void concat(args_t *args)
521523

522524
// This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
523525
if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue;
524-
seek_pos = seek_chr = -1;
526+
seek_pos = CSI_COOR_EMPTY;
527+
seek_chr = -1;
525528

526529
// Check if the position overlaps with the next, yet unopened, reader
527530
int must_seek = 0;
528-
while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
531+
while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=CSI_COOR_EMPTY && line->pos >= args->start_pos[args->ifname] )
529532
{
530533
must_seek = 1;
531534
if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
@@ -618,7 +621,7 @@ static void concat(args_t *args)
618621
if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
619622
if ( prev_chr_id!=chr_id )
620623
{
621-
prev_pos = -1;
624+
prev_pos = CSI_COOR_EMPTY;
622625
if ( args->seen_seq[chr_id] )
623626
error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
624627
}
@@ -643,7 +646,7 @@ static void concat(args_t *args)
643646

644647
if ( prev_chr_id!=line->rid )
645648
{
646-
prev_pos = -1;
649+
prev_pos = CSI_COOR_EMPTY;
647650
if ( args->seen_seq[line->rid] )
648651
error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
649652
}
@@ -980,7 +983,7 @@ int main_vcfconcat(int argc, char *argv[])
980983
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
981984
case 'd': args->remove_dups = optarg; break;
982985
case 'D': args->remove_dups = "exact"; break;
983-
case 'q':
986+
case 'q':
984987
args->min_PQ = strtol(optarg,&tmp,10);
985988
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
986989
break;

vcfconvert.c

+24-24
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* vcfconvert.c -- convert between VCF/BCF and related formats.
22
3-
Copyright (C) 2013-2021 Genome Research Ltd.
3+
Copyright (C) 2013-2023 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -59,7 +59,7 @@ struct _args_t
5959
bcf_hdr_t *header;
6060
void (*convert_func)(struct _args_t *);
6161
struct {
62-
int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
62+
int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
6363
} n;
6464
kstring_t str;
6565
int32_t *gts;
@@ -160,7 +160,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
160160
// REF,ALT
161161
args->str.l = 0;
162162
se = ++ss;
163-
while ( se < tsv->se && *se!='_' ) se++;
163+
while ( se < tsv->se && *se!='_' ) se++;
164164
if ( *se!='_' ) return -1;
165165
kputsn(ss,se-ss,&args->str);
166166
ss = ++se;
@@ -202,14 +202,14 @@ static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr
202202
{
203203
args_t *args = (args_t*)usr;
204204
if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0;
205-
rec->pos = -1; // mark the record as unset
205+
rec->pos = CSI_COOR_EMPTY; // mark the record as unset
206206
if ( !args->output_vcf_ids) return 0;
207207
return tsv_setter_id(tsv,rec,usr);
208208
}
209209
static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr)
210210
{
211211
args_t *args = (args_t*)usr;
212-
if ( rec->pos!=-1 )
212+
if ( rec->pos!=CSI_COOR_EMPTY )
213213
{
214214
if ( !args->output_vcf_ids ) return 0;
215215
return tsv_setter_id(tsv,rec,usr);
@@ -269,12 +269,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
269269
if ( aa >= ab )
270270
{
271271
if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
272-
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
272+
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
273273
}
274-
else if ( ab >= bb )
274+
else if ( ab >= bb )
275275
{
276276
args->gts[2*i+0] = bcf_gt_unphased(0);
277-
args->gts[2*i+1] = bcf_gt_unphased(1);
277+
args->gts[2*i+1] = bcf_gt_unphased(1);
278278
}
279279
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
280280
}
@@ -293,7 +293,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
293293
else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }
294294

295295
// up is short for "unphased"
296-
int nup = 0;
296+
int nup = 0;
297297
for (i=0; i<nsamples; i++)
298298
{
299299
char *ss = tsv->ss + 4*i + nup;
@@ -324,11 +324,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
324324
break;
325325
default :
326326
fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
327-
return -1;
327+
return -1;
328328
}
329329
if( ss[all*2+up+1]=='*' ) up = up + 1;
330330
}
331-
331+
332332
if(up && up != 2)
333333
{
334334
fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
@@ -356,13 +356,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
356356
static void gensample_to_vcf(args_t *args)
357357
{
358358
/*
359-
* Inpute: IMPUTE2 output (indentation changed here for clarity):
359+
* Inpute: IMPUTE2 output (indentation changed here for clarity):
360360
*
361361
* 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
362362
* --- 20:62116698_C_A 62116698 C A 1 0 0 ...
363363
*
364364
* Second column is expected in the form of CHROM:POS_REF_ALT. We use second
365-
* column because the first can be empty ("--") when filling sites from reference
365+
* column because the first can be empty ("--") when filling sites from reference
366366
* panel. When the option --vcf-ids is given, the first column is used to set the
367367
* VCF ID.
368368
*
@@ -784,7 +784,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
784784
}
785785
for (i=0; i<nlines; i++) free(lines[i]);
786786
free(lines);
787-
for (i=0; i<bcf_hdr_nsamples(hdr); i++)
787+
for (i=0; i<bcf_hdr_nsamples(hdr); i++)
788788
if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
789789
return sample2sex;
790790
}
@@ -847,7 +847,7 @@ static void vcf_to_gensample(args_t *args)
847847
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
848848

849849
// write samples file
850-
if (sample_fname)
850+
if (sample_fname)
851851
{
852852
char *sample2sex = NULL;
853853
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
@@ -877,7 +877,7 @@ static void vcf_to_gensample(args_t *args)
877877
return;
878878
}
879879

880-
int prev_rid = -1, prev_pos = -1;
880+
int prev_rid = -1, prev_pos = CSI_COOR_EMPTY;
881881
int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
882882
BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
883883
while ( bcf_sr_next_line(args->files) )
@@ -915,7 +915,7 @@ static void vcf_to_gensample(args_t *args)
915915
nok++;
916916
}
917917
}
918-
fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
918+
fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
919919
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);
920920

921921
if ( str.m ) free(str.s);
@@ -976,7 +976,7 @@ static void vcf_to_haplegendsample(args_t *args)
976976
{
977977
char *sample2sex = NULL;
978978
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
979-
979+
980980
int i;
981981
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
982982
str.l = 0;
@@ -1078,7 +1078,7 @@ static void vcf_to_hapsample(args_t *args)
10781078
kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
10791079
else
10801080
kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);
1081-
1081+
10821082
if ( args->hap2dip )
10831083
kputs("%_GT_TO_HAP2\n", &str);
10841084
else
@@ -1229,7 +1229,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
12291229
if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
12301230
if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;
12311231

1232-
gts[0] = bcf_gt_unphased(alleles[a0]);
1232+
gts[0] = bcf_gt_unphased(alleles[a0]);
12331233
gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;
12341234

12351235
if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
@@ -1265,7 +1265,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
12651265
}
12661266
ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
12671267
if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
1268-
if ( ret==-2 )
1268+
if ( ret==-2 )
12691269
{
12701270
// something else than a SNP
12711271
free(ref);
@@ -1275,7 +1275,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
12751275

12761276
args->str.l = 0;
12771277
kputc(ref[0], &args->str);
1278-
for (i=0; i<5; i++)
1278+
for (i=0; i<5; i++)
12791279
{
12801280
if ( alleles[i]>0 )
12811281
{
@@ -1419,7 +1419,7 @@ static void gvcf_to_vcf(args_t *args)
14191419
{
14201420
int pass = filter_test(args->filter, line, NULL);
14211421
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
1422-
if ( !pass )
1422+
if ( !pass )
14231423
{
14241424
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
14251425
continue;
@@ -1667,7 +1667,7 @@ int main_vcfconvert(int argc, char *argv[])
16671667
else args->infname = argv[optind];
16681668
}
16691669
if ( !args->infname ) usage();
1670-
1670+
16711671
if ( args->convert_func ) args->convert_func(args);
16721672
else vcf_to_vcf(args);
16731673

0 commit comments

Comments
 (0)