Skip to content

Commit 1c22ac5

Browse files
authored
feat: Allow for non-diploid genotypes (#476)
* feat: allow for non-diploid genotypes
1 parent 8741513 commit 1c22ac5

File tree

5 files changed

+226
-28
lines changed

5 files changed

+226
-28
lines changed

src/bam/record.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -358,9 +358,9 @@ impl Record {
358358

359359
let orig_aux_offset = self.qname_capacity()
360360
+ 4 * self.cigar_len()
361-
+ (self.seq_len() + 1) / 2
361+
+ self.seq_len().div_ceil(2)
362362
+ self.seq_len();
363-
let new_aux_offset = q_len + extranul + cigar_width + (seq.len() + 1) / 2 + qual.len();
363+
let new_aux_offset = q_len + extranul + cigar_width + seq.len().div_ceil(2) + qual.len();
364364
assert!(orig_aux_offset <= self.inner.l_data as usize);
365365
let aux_len = self.inner.l_data as usize - orig_aux_offset;
366366
self.inner_mut().l_data = (new_aux_offset + aux_len) as i32;
@@ -416,7 +416,7 @@ impl Record {
416416
});
417417
}
418418
self.inner_mut().core.l_qseq = seq.len() as i32;
419-
i += (seq.len() + 1) / 2;
419+
i += seq.len().div_ceil(2);
420420
}
421421

422422
// qual
@@ -564,7 +564,7 @@ impl Record {
564564

565565
fn seq_data(&self) -> &[u8] {
566566
let offset = self.qname_capacity() + self.cigar_len() * 4;
567-
&self.data()[offset..][..(self.seq_len() + 1) / 2]
567+
&self.data()[offset..][..self.seq_len().div_ceil(2)]
568568
}
569569

570570
/// Get read sequence. Complexity: O(1).
@@ -579,7 +579,7 @@ impl Record {
579579
/// This does not entail any offsets, hence the qualities can be used directly without
580580
/// e.g. subtracting 33. Complexity: O(1).
581581
pub fn qual(&self) -> &[u8] {
582-
&self.data()[self.qname_capacity() + self.cigar_len() * 4 + (self.seq_len() + 1) / 2..]
582+
&self.data()[self.qname_capacity() + self.cigar_len() * 4 + self.seq_len().div_ceil(2)..]
583583
[..self.seq_len()]
584584
}
585585

@@ -787,7 +787,7 @@ impl Record {
787787
// CIGAR (uint32_t):
788788
+ self.cigar_len() * std::mem::size_of::<u32>()
789789
// Read sequence (4-bit encoded):
790-
+ (self.seq_len() + 1) / 2
790+
+ self.seq_len().div_ceil(2)
791791
// Base qualities (char):
792792
+ self.seq_len()..],
793793
}

src/bcf/mod.rs

Lines changed: 127 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ impl IndexedReader {
318318
/// # Arguments
319319
///
320320
/// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving
321-
/// contig name to ID.
321+
/// contig name to ID.
322322
/// * `start` - `0`-based **inclusive** start coordinate of region on reference.
323323
/// * `end` - Optional `0`-based **inclusive** end coordinate of region on reference. If `None`
324324
/// is given, records are fetched from `start` until the end of the contig.
@@ -605,7 +605,7 @@ pub mod synced {
605605
/// # Arguments
606606
///
607607
/// * `rid` - numeric ID of the reference to jump to; use `HeaderView::name2rid` for resolving
608-
/// contig name to ID.
608+
/// contig name to ID.
609609
/// * `start` - `0`-based start coordinate of region on reference.
610610
/// * `end` - `0`-based end coordinate of region on reference.
611611
pub fn fetch(&mut self, rid: u32, start: u64, end: u64) -> Result<()> {
@@ -835,6 +835,8 @@ fn bcf_open(target: &[u8], mode: &[u8]) -> Result<*mut htslib::htsFile> {
835835

836836
#[cfg(test)]
837837
mod tests {
838+
use tempfile::NamedTempFile;
839+
838840
use super::record::Buffer;
839841
use super::*;
840842
use crate::bcf::header::Id;
@@ -1090,6 +1092,129 @@ mod tests {
10901092
}
10911093
}
10921094

1095+
#[test]
1096+
fn test_genotypes_read_mixed_ploidy() {
1097+
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");
1098+
1099+
// Expected genotypes for comparison
1100+
let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]];
1101+
1102+
for (rec, exp_gts) in vcf.records().zip(expected.iter()) {
1103+
let rec = rec.expect("Error reading record.");
1104+
1105+
// Get the genotypes from the record
1106+
let genotypes = rec.genotypes().expect("Error reading genotypes");
1107+
1108+
// Compare each genotype with the expected value
1109+
for (sample, exp_gt) in exp_gts.iter().enumerate() {
1110+
assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt);
1111+
}
1112+
}
1113+
}
1114+
1115+
#[test]
1116+
fn test_genotypes_write_and_read_mixed_ploidy() {
1117+
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");
1118+
1119+
// Create a temporary file to write the modified VCF data
1120+
let tmp = NamedTempFile::new().unwrap();
1121+
let path = tmp.path();
1122+
1123+
{
1124+
// Create a VCF writer with the same header as the input VCF
1125+
let mut writer = Writer::from_path(
1126+
path,
1127+
&Header::from_template(vcf.header()),
1128+
true,
1129+
Format::Vcf,
1130+
)
1131+
.unwrap();
1132+
1133+
// Modify record template by adding different genotypes and write the to the temp file.
1134+
let mut rec_tpl = vcf.records().next().unwrap().unwrap();
1135+
rec_tpl
1136+
.push_genotype_structured(
1137+
&[
1138+
vec![GenotypeAllele::Unphased(0)],
1139+
vec![GenotypeAllele::Unphased(1)],
1140+
],
1141+
3,
1142+
)
1143+
.unwrap();
1144+
writer.write(&rec_tpl).unwrap();
1145+
rec_tpl
1146+
.push_genotype_structured(
1147+
&[
1148+
vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
1149+
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
1150+
],
1151+
3,
1152+
)
1153+
.unwrap();
1154+
writer.write(&rec_tpl).unwrap();
1155+
rec_tpl
1156+
.push_genotype_structured(
1157+
&[
1158+
vec![GenotypeAllele::Unphased(1), GenotypeAllele::Phased(0)],
1159+
vec![
1160+
GenotypeAllele::Unphased(1),
1161+
GenotypeAllele::Unphased(1),
1162+
GenotypeAllele::Phased(0),
1163+
],
1164+
],
1165+
3,
1166+
)
1167+
.unwrap();
1168+
writer.write(&rec_tpl).unwrap();
1169+
}
1170+
1171+
// Read back the temporary file with the modified VCF data
1172+
let mut reader = Reader::from_path(path).unwrap();
1173+
1174+
// Expected genotypes for validation
1175+
let expected = [vec!["0", "1"], vec!["0/1", "1/1"], vec!["1|0", "1/1|0"]];
1176+
1177+
// Iterate over the records in the temporary file and validate the genotypes
1178+
for (rec, exp_gts) in reader.records().zip(expected.iter()) {
1179+
let rec = rec.expect("Error reading record");
1180+
let genotypes = rec.genotypes().expect("Error reading genotypes");
1181+
1182+
// Compare each genotype with the expected value
1183+
for (sample, exp_gt) in exp_gts.iter().enumerate() {
1184+
assert_eq!(&format!("{}", genotypes.get(sample)), exp_gt);
1185+
}
1186+
}
1187+
}
1188+
1189+
#[test]
1190+
fn test_genotypes_wrong_max_ploidy() {
1191+
let mut vcf = Reader::from_path("test/test_non_diploid.vcf").expect("Error opening file.");
1192+
1193+
// Modify record template by adding different genotypes and write the to the temp file.
1194+
let mut rec_tpl = vcf.records().next().unwrap().unwrap();
1195+
let err = rec_tpl
1196+
.push_genotype_structured(
1197+
&[
1198+
vec![
1199+
GenotypeAllele::Unphased(0),
1200+
GenotypeAllele::Unphased(1),
1201+
GenotypeAllele::Unphased(0),
1202+
],
1203+
vec![
1204+
GenotypeAllele::Unphased(1),
1205+
GenotypeAllele::Unphased(0),
1206+
GenotypeAllele::Unphased(1),
1207+
GenotypeAllele::Unphased(0),
1208+
],
1209+
],
1210+
3,
1211+
)
1212+
.expect_err(
1213+
"This should fail since there are more alleles specified (4 for second sample) than max_ploidy (3) suggests",
1214+
);
1215+
assert_eq!(err, crate::errors::Error::BcfSetValues);
1216+
}
1217+
10931218
#[test]
10941219
fn test_header_ids() {
10951220
let vcf = Reader::from_path("test/test_string.vcf").expect("Error opening file.");

src/bcf/record.rs

Lines changed: 83 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
// except according to those terms.
55

66
use std::borrow::{Borrow, BorrowMut};
7-
use std::ffi;
87
use std::fmt;
98
use std::marker::PhantomData;
109
use std::ops::Deref;
@@ -13,6 +12,7 @@ use std::ptr;
1312
use std::rc::Rc;
1413
use std::slice;
1514
use std::str;
15+
use std::{ffi, iter};
1616

1717
use bio_types::genome;
1818
use derive_new::new;
@@ -659,7 +659,7 @@ impl Record {
659659
/// # Arguments
660660
///
661661
/// - `genotypes` - a flattened, two-dimensional array of GenotypeAllele,
662-
/// the first dimension contains one array for each sample.
662+
/// the first dimension contains one array for each sample.
663663
///
664664
/// # Errors
665665
///
@@ -692,6 +692,76 @@ impl Record {
692692
self.push_format_integer(b"GT", &encoded)
693693
}
694694

695+
/// Add/replace genotypes in FORMAT GT tag by providing a list of genotypes.
696+
///
697+
/// # Arguments
698+
///
699+
/// - `genotypes` - a two-dimensional array of GenotypeAllele
700+
/// - `max_ploidy` - the maximum number of alleles allowed for any genotype on any sample.
701+
///
702+
/// # Errors
703+
///
704+
/// Returns an error if any genotype has more allelles than `max_ploidy` or if the GT tag is not present in the header.
705+
///
706+
/// # Example
707+
///
708+
/// Example assumes we have a Record `record` from a VCF with a `GT` `FORMAT` tag and three samples.
709+
/// See [module documentation](../index.html#example-writing) for how to set up
710+
/// VCF, header, and record.
711+
///
712+
/// ```
713+
/// # use rust_htslib::bcf::{Format, Writer};
714+
/// # use rust_htslib::bcf::header::Header;
715+
/// # use rust_htslib::bcf::record::GenotypeAllele;
716+
/// # use std::iter;
717+
/// # let mut header = Header::new();
718+
/// # let header_contig_line = r#"##contig=<ID=1,length=10>"#;
719+
/// # header.push_record(header_contig_line.as_bytes());
720+
/// # let header_gt_line = r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#;
721+
/// # header.push_record(header_gt_line.as_bytes());
722+
/// # header.push_sample("first_sample".as_bytes());
723+
/// # header.push_sample("second_sample".as_bytes());
724+
/// # header.push_sample("third_sample".as_bytes());
725+
/// # let mut vcf = Writer::from_stdout(&header, true, Format::Vcf)?;
726+
/// # let mut record = vcf.empty_record();
727+
/// let alleles = vec![
728+
/// vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
729+
/// vec![GenotypeAllele::Unphased(0), GenotypeAllele::Phased(1)],
730+
/// vec![GenotypeAllele::Unphased(0)],
731+
/// ];
732+
/// record.push_genotype_structured(&alleles, 2);
733+
/// let gts = record.genotypes()?;
734+
/// assert_eq!("1/1", &format!("{}", gts.get(0)));
735+
/// assert_eq!("0|1", &format!("{}", gts.get(1)));
736+
/// assert_eq!("0", &format!("{}", gts.get(2)));
737+
/// # Ok::<(), rust_htslib::errors::Error>(())
738+
/// ```
739+
pub fn push_genotype_structured<GT>(
740+
&mut self,
741+
genotypes: &[GT],
742+
max_ploidy: usize,
743+
) -> Result<()>
744+
where
745+
GT: AsRef<[GenotypeAllele]>,
746+
{
747+
let mut data = Vec::with_capacity(max_ploidy * genotypes.len());
748+
for gt in genotypes {
749+
if gt.as_ref().len() > max_ploidy {
750+
return Err(Error::BcfSetValues);
751+
}
752+
data.extend(
753+
gt.as_ref()
754+
.iter()
755+
.map(|gta| i32::from(*gta))
756+
.chain(iter::repeat_n(
757+
VECTOR_END_INTEGER,
758+
max_ploidy - gt.as_ref().len(),
759+
)),
760+
);
761+
}
762+
self.push_format_integer(b"GT", &data)
763+
}
764+
695765
/// Get genotypes as vector of one `Genotype` per sample.
696766
///
697767
/// # Example
@@ -771,7 +841,7 @@ impl Record {
771841
///
772842
/// - `tag` - The tag's string.
773843
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
774-
/// for each sample.
844+
/// for each sample.
775845
///
776846
/// # Errors
777847
///
@@ -786,7 +856,7 @@ impl Record {
786856
///
787857
/// - `tag` - The tag's string.
788858
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
789-
/// for each sample.
859+
/// for each sample.
790860
///
791861
/// # Errors
792862
///
@@ -823,7 +893,7 @@ impl Record {
823893
///
824894
/// - `tag` - The tag's string.
825895
/// - `data` - a flattened, two-dimensional array, the first dimension contains one array
826-
/// for each sample.
896+
/// for each sample.
827897
///
828898
/// # Errors
829899
///
@@ -864,7 +934,7 @@ impl Record {
864934
///
865935
/// - `tag` - The tag's string.
866936
/// - `data` - a two-dimensional array, the first dimension contains one array
867-
/// for each sample. Must be non-empty.
937+
/// for each sample. Must be non-empty.
868938
///
869939
/// # Errors
870940
///
@@ -1249,14 +1319,19 @@ where
12491319
}
12501320

12511321
impl<'a, B: Borrow<Buffer> + 'a> Genotypes<'a, B> {
1252-
/// Get genotype of ith sample. So far, only supports diploid genotypes.
1322+
/// Get genotype of ith sample.
12531323
///
12541324
/// Note that the result complies with the BCF spec. This means that the
12551325
/// first allele will always be marked as `Unphased`. That is, if you have 1|1 in the VCF,
12561326
/// this method will return `[Unphased(1), Phased(1)]`.
12571327
pub fn get(&self, i: usize) -> Genotype {
12581328
let igt = self.encoded[i];
1259-
Genotype(igt.iter().map(|&e| GenotypeAllele::from(e)).collect())
1329+
let allelles = igt
1330+
.iter()
1331+
.take_while(|&&i| i != VECTOR_END_INTEGER)
1332+
.map(|&i| GenotypeAllele::from(i))
1333+
.collect();
1334+
Genotype(allelles)
12601335
}
12611336
}
12621337

0 commit comments

Comments
 (0)