From 8129bd8d772128764e147b7825d4df8cdddd1433 Mon Sep 17 00:00:00 2001 From: James Pirruccello Date: Tue, 10 Jul 2018 16:58:48 -0400 Subject: [PATCH] Large commit that creates allele and variantreader types. Enables iterating through variants as per the spec, based on the layout and compression type. Does not yet handle genotype data (either compressed or uncompressed). --- allele.go | 7 ++ compression.go | 14 +++ example/limix/example.go | 14 +++ layout.go | 12 ++ variant.go | 6 + variantreader.go | 235 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 288 insertions(+) create mode 100644 allele.go create mode 100644 variantreader.go diff --git a/allele.go b/allele.go new file mode 100644 index 0000000..26d6753 --- /dev/null +++ b/allele.go @@ -0,0 +1,7 @@ +package bgen + +type Allele string + +func (a Allele) String() string { + return string(a) +} diff --git a/compression.go b/compression.go index 7ddb3ff..eea0823 100644 --- a/compression.go +++ b/compression.go @@ -8,3 +8,17 @@ const ( CompressionZLIB CompressionZStandard ) + +func (c Compression) String() string { + switch c { + case CompressionDisabled: + return "CompressionDisabled" + case CompressionZLIB: + return "CompressionZLIB" + case CompressionZStandard: + return "CompressionZStandard" + + default: + return "Illegal selection" + } +} diff --git a/example/limix/example.go b/example/limix/example.go index 5dee7d7..f4ca897 100644 --- a/example/limix/example.go +++ b/example/limix/example.go @@ -47,4 +47,18 @@ func main() { log.Println("Iterated over", i, "samples") } + + vr := bg.NewVariantReader() + for i := 1; ; i++ { + v := vr.Read() + if v == nil { + break + } + + log.Println(i, v) + } + + if vr.Error() != nil { + log.Println("VR error:", err.Error()) + } } diff --git a/layout.go b/layout.go index 548b686..cce5adf 100644 --- a/layout.go +++ b/layout.go @@ -7,3 +7,15 @@ const ( Layout1 Layout = iota Layout2 ) + +func (l Layout) String() string { + switch l { + case Layout1: + return "Layout1" + case Layout2: + return "Layout2" + + default: + return "Illegal selection" + } +} diff --git a/variant.go b/variant.go index 68f9f78..8d14e1b 100644 --- a/variant.go +++ b/variant.go @@ -1,6 +1,12 @@ package bgen type Variant struct { + ID string + RSID string + Chromosome string + Position uint32 + NAlleles uint16 + Alleles []Allele } //func NewVariantReader() // <- iterate over variants sequentially, possibly to build an index diff --git a/variantreader.go b/variantreader.go new file mode 100644 index 0000000..5daa3fe --- /dev/null +++ b/variantreader.go @@ -0,0 +1,235 @@ +package bgen + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/carbocation/pfx" +) + +type VariantReader struct { + VariantsSeen uint32 + b *BGEN + currentOffset uint32 + err error + + // Cached values + buffer []byte +} + +func (b *BGEN) NewVariantReader() *VariantReader { + vr := &VariantReader{ + currentOffset: b.VariantsStart, + b: b, + } + + return vr +} + +func (vr *VariantReader) Error() error { + return vr.err +} + +func (vr *VariantReader) Read() *Variant { + v, newOffset, err := vr.parseVariantAtOffset(int64(vr.currentOffset)) + if err != nil { + if err == io.EOF { + return nil + } + vr.err = pfx.Err(err) + } + + vr.VariantsSeen++ + vr.currentOffset = uint32(newOffset) + + return v +} + +// parseVariantAtOffset does not mutate the VariantReader. +// TODO: Offer an option with a reusable buffer to reduce allocations +func (vr *VariantReader) parseVariantAtOffset(offset int64) (*Variant, int64, error) { + v := &Variant{} + var err error + +VariantLoop: + for { + // ID: + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize := int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.ID = string(vr.buffer[:stringSize]) + offset += int64(stringSize) + + // RSID + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.RSID = string(vr.buffer[:stringSize]) + offset += int64(stringSize) + + // Chrom + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if stringSize != 2 { + err = fmt.Errorf("Chromosome field size is %d bytes; expected 2", stringSize) + break + } + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.Chromosome = Chromosome(binary.LittleEndian.Uint16(vr.buffer[:stringSize])) + offset += int64(stringSize) + + // Position + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + v.Position = binary.LittleEndian.Uint32(vr.buffer[:4]) + + // NAlleles + if vr.b.FlagLayout == Layout1 { + // Assumed to be 2 in Layout1 + v.NAlleles = 2 + } else if vr.b.FlagLayout == Layout2 { + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + v.NAlleles = binary.LittleEndian.Uint16(vr.buffer[:2]) + } + + // Allele slice + var alleleLength int + for i := uint16(0); i < v.NAlleles; i++ { + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break VariantLoop + } + offset += 4 + alleleLength = int(binary.LittleEndian.Uint32(vr.buffer[:4])) + + if err = vr.readNBytesAtOffset(alleleLength, offset); err != nil { + break VariantLoop + } + offset += int64(alleleLength) + v.Alleles = append(v.Alleles, Allele(string(vr.buffer[:alleleLength]))) + } + + // Genotype data + if vr.b.FlagLayout == Layout1 { + // From the spec: "If CompressedSNPBlocks=0 this field is omitted + // and the length of the uncompressed data is C=6N." + if comp := vr.b.FlagCompression; comp == CompressionDisabled { + uncompressedDataBlockSize := int64(6 * vr.b.NSamples) + if err = vr.readNBytesAtOffset(int(uncompressedDataBlockSize), offset); err != nil { + break + } + offset += uncompressedDataBlockSize + // TODO: Handle the uncompressed genotype data + _ = vr.buffer[:uncompressedDataBlockSize] + + } else if comp == CompressionZLIB { + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + genoBlockLength := binary.LittleEndian.Uint32(vr.buffer[:4]) + + if err = vr.readNBytesAtOffset(int(genoBlockLength), offset); err != nil { + break + } + offset += int64(genoBlockLength) + // TODO: Handle the ZLIB compressed genotype data + _ = vr.buffer[:genoBlockLength] + } else { + err = fmt.Errorf("Compression choice %s is not compatible with Layout %s", vr.b.FlagCompression, vr.b.FlagLayout) + break + } + + } else if vr.b.FlagLayout == Layout2 { + // The genotype layout data block for Layout2 is guaranteed to have + // a 4 byte chunk that indicates how much data is left for this + // block (skipping ahead by this much will bring you to the next + // chunk). + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + nextDataOffset := binary.LittleEndian.Uint32(vr.buffer[:4]) + + if vr.b.FlagCompression == CompressionDisabled { + // If compression is disabled, it will not have the second 4 + // byte chunk that indicates how large the data chunk is after + // decompression. + + if err = vr.readNBytesAtOffset(int(nextDataOffset), offset); err != nil { + break + } + // TODO: Handle the uncompressed genotype data + _ = vr.buffer[:nextDataOffset] + + offset += int64(nextDataOffset) + + } else { + // If compression is enabled, there will be a second 4 byte + // chunk that indicates how large the data chunk is after + // decompression. + + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + decompressedDataLength := binary.LittleEndian.Uint32(vr.buffer[:4]) + // TODO: Is this a checksum or actually useful? + _ = decompressedDataLength // ??? + + // From the spec: "If CompressedSNPBlocks is nonzero, this is + // C-4 bytes which can be uncompressed to form D bytes in the + // format described below." For us, "C" is nextDataOffset. + genoBlockDataSizeToDecompress := nextDataOffset - 4 + // Compressed geno data + if err = vr.readNBytesAtOffset(int(genoBlockDataSizeToDecompress), offset); err != nil { + break + } + // TODO: Handle the compressed genotype data + _ = vr.buffer[:genoBlockDataSizeToDecompress] + + offset += int64(genoBlockDataSizeToDecompress) + } + } + + // TODO: actually interpret the genotype data based on which layout + // version is being used. + + break + } + // log.Println(vr.b.FlagLayout) + // log.Println(vr.b.FlagCompression) + // panic("yah") + // log.Println("New offset:", offset) + + return v, offset, err +} + +func (vr *VariantReader) readNBytesAtOffset(N int, offset int64) error { + if vr.buffer == nil || len(vr.buffer) < N { + vr.buffer = make([]byte, N) + } + + _, err := vr.b.File.ReadAt(vr.buffer[:N], offset) + return err +}