Skip to content

Commit

Permalink
Large commit that creates allele and variantreader types. Enables
Browse files Browse the repository at this point in the history
iterating through variants as per the spec, based on the layout
and compression type. Does not yet handle genotype data (either
compressed or uncompressed).
  • Loading branch information
carbocation committed Jul 10, 2018
1 parent 4cfb44c commit 8129bd8
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 0 deletions.
7 changes: 7 additions & 0 deletions allele.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package bgen

type Allele string

func (a Allele) String() string {
return string(a)
}
14 changes: 14 additions & 0 deletions compression.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,17 @@ const (
CompressionZLIB
CompressionZStandard
)

func (c Compression) String() string {
switch c {
case CompressionDisabled:
return "CompressionDisabled"
case CompressionZLIB:
return "CompressionZLIB"
case CompressionZStandard:
return "CompressionZStandard"

default:
return "Illegal selection"
}
}
14 changes: 14 additions & 0 deletions example/limix/example.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,18 @@ func main() {

log.Println("Iterated over", i, "samples")
}

vr := bg.NewVariantReader()
for i := 1; ; i++ {
v := vr.Read()
if v == nil {
break
}

log.Println(i, v)
}

if vr.Error() != nil {
log.Println("VR error:", err.Error())
}
}
12 changes: 12 additions & 0 deletions layout.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,15 @@ const (
Layout1 Layout = iota
Layout2
)

func (l Layout) String() string {
switch l {
case Layout1:
return "Layout1"
case Layout2:
return "Layout2"

default:
return "Illegal selection"
}
}
6 changes: 6 additions & 0 deletions variant.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
package bgen

type Variant struct {
ID string
RSID string
Chromosome string
Position uint32
NAlleles uint16
Alleles []Allele
}

//func NewVariantReader() // <- iterate over variants sequentially, possibly to build an index
Expand Down
235 changes: 235 additions & 0 deletions variantreader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
package bgen

import (
"encoding/binary"
"fmt"
"io"

"github.com/carbocation/pfx"
)

type VariantReader struct {
VariantsSeen uint32
b *BGEN
currentOffset uint32
err error

// Cached values
buffer []byte
}

func (b *BGEN) NewVariantReader() *VariantReader {
vr := &VariantReader{
currentOffset: b.VariantsStart,
b: b,
}

return vr
}

func (vr *VariantReader) Error() error {
return vr.err
}

func (vr *VariantReader) Read() *Variant {
v, newOffset, err := vr.parseVariantAtOffset(int64(vr.currentOffset))
if err != nil {
if err == io.EOF {
return nil
}
vr.err = pfx.Err(err)
}

vr.VariantsSeen++
vr.currentOffset = uint32(newOffset)

return v
}

// parseVariantAtOffset does not mutate the VariantReader.
// TODO: Offer an option with a reusable buffer to reduce allocations
func (vr *VariantReader) parseVariantAtOffset(offset int64) (*Variant, int64, error) {
v := &Variant{}
var err error

VariantLoop:
for {
// ID:
if err = vr.readNBytesAtOffset(2, offset); err != nil {
break
}
offset += 2
stringSize := int(binary.LittleEndian.Uint16(vr.buffer[:2]))
if err = vr.readNBytesAtOffset(stringSize, offset); err != nil {
break
}
v.ID = string(vr.buffer[:stringSize])
offset += int64(stringSize)

// RSID
if err = vr.readNBytesAtOffset(2, offset); err != nil {
break
}
offset += 2
stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2]))
if err = vr.readNBytesAtOffset(stringSize, offset); err != nil {
break
}
v.RSID = string(vr.buffer[:stringSize])
offset += int64(stringSize)

// Chrom
if err = vr.readNBytesAtOffset(2, offset); err != nil {
break
}
offset += 2
stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2]))
if stringSize != 2 {
err = fmt.Errorf("Chromosome field size is %d bytes; expected 2", stringSize)
break
}
if err = vr.readNBytesAtOffset(stringSize, offset); err != nil {
break
}
v.Chromosome = Chromosome(binary.LittleEndian.Uint16(vr.buffer[:stringSize]))
offset += int64(stringSize)

// Position
if err = vr.readNBytesAtOffset(4, offset); err != nil {
break
}
offset += 4
v.Position = binary.LittleEndian.Uint32(vr.buffer[:4])

// NAlleles
if vr.b.FlagLayout == Layout1 {
// Assumed to be 2 in Layout1
v.NAlleles = 2
} else if vr.b.FlagLayout == Layout2 {
if err = vr.readNBytesAtOffset(2, offset); err != nil {
break
}
offset += 2
v.NAlleles = binary.LittleEndian.Uint16(vr.buffer[:2])
}

// Allele slice
var alleleLength int
for i := uint16(0); i < v.NAlleles; i++ {
if err = vr.readNBytesAtOffset(4, offset); err != nil {
break VariantLoop
}
offset += 4
alleleLength = int(binary.LittleEndian.Uint32(vr.buffer[:4]))

if err = vr.readNBytesAtOffset(alleleLength, offset); err != nil {
break VariantLoop
}
offset += int64(alleleLength)
v.Alleles = append(v.Alleles, Allele(string(vr.buffer[:alleleLength])))
}

// Genotype data
if vr.b.FlagLayout == Layout1 {
// From the spec: "If CompressedSNPBlocks=0 this field is omitted
// and the length of the uncompressed data is C=6N."
if comp := vr.b.FlagCompression; comp == CompressionDisabled {
uncompressedDataBlockSize := int64(6 * vr.b.NSamples)
if err = vr.readNBytesAtOffset(int(uncompressedDataBlockSize), offset); err != nil {
break
}
offset += uncompressedDataBlockSize
// TODO: Handle the uncompressed genotype data
_ = vr.buffer[:uncompressedDataBlockSize]

} else if comp == CompressionZLIB {
if err = vr.readNBytesAtOffset(4, offset); err != nil {
break
}
offset += 4
genoBlockLength := binary.LittleEndian.Uint32(vr.buffer[:4])

if err = vr.readNBytesAtOffset(int(genoBlockLength), offset); err != nil {
break
}
offset += int64(genoBlockLength)
// TODO: Handle the ZLIB compressed genotype data
_ = vr.buffer[:genoBlockLength]
} else {
err = fmt.Errorf("Compression choice %s is not compatible with Layout %s", vr.b.FlagCompression, vr.b.FlagLayout)
break
}

} else if vr.b.FlagLayout == Layout2 {
// The genotype layout data block for Layout2 is guaranteed to have
// a 4 byte chunk that indicates how much data is left for this
// block (skipping ahead by this much will bring you to the next
// chunk).
if err = vr.readNBytesAtOffset(4, offset); err != nil {
break
}
offset += 4
nextDataOffset := binary.LittleEndian.Uint32(vr.buffer[:4])

if vr.b.FlagCompression == CompressionDisabled {
// If compression is disabled, it will not have the second 4
// byte chunk that indicates how large the data chunk is after
// decompression.

if err = vr.readNBytesAtOffset(int(nextDataOffset), offset); err != nil {
break
}
// TODO: Handle the uncompressed genotype data
_ = vr.buffer[:nextDataOffset]

offset += int64(nextDataOffset)

} else {
// If compression is enabled, there will be a second 4 byte
// chunk that indicates how large the data chunk is after
// decompression.

if err = vr.readNBytesAtOffset(4, offset); err != nil {
break
}
offset += 4
decompressedDataLength := binary.LittleEndian.Uint32(vr.buffer[:4])
// TODO: Is this a checksum or actually useful?
_ = decompressedDataLength // ???

// From the spec: "If CompressedSNPBlocks is nonzero, this is
// C-4 bytes which can be uncompressed to form D bytes in the
// format described below." For us, "C" is nextDataOffset.
genoBlockDataSizeToDecompress := nextDataOffset - 4
// Compressed geno data
if err = vr.readNBytesAtOffset(int(genoBlockDataSizeToDecompress), offset); err != nil {
break
}
// TODO: Handle the compressed genotype data
_ = vr.buffer[:genoBlockDataSizeToDecompress]

offset += int64(genoBlockDataSizeToDecompress)
}
}

// TODO: actually interpret the genotype data based on which layout
// version is being used.

break
}
// log.Println(vr.b.FlagLayout)
// log.Println(vr.b.FlagCompression)
// panic("yah")
// log.Println("New offset:", offset)

return v, offset, err
}

func (vr *VariantReader) readNBytesAtOffset(N int, offset int64) error {
if vr.buffer == nil || len(vr.buffer) < N {
vr.buffer = make([]byte, N)
}

_, err := vr.b.File.ReadAt(vr.buffer[:N], offset)
return err
}

0 comments on commit 8129bd8

Please sign in to comment.