diff --git a/allele.go b/allele.go new file mode 100644 index 0000000..26d6753 --- /dev/null +++ b/allele.go @@ -0,0 +1,7 @@ +package bgen + +type Allele string + +func (a Allele) String() string { + return string(a) +} diff --git a/compression.go b/compression.go index 7ddb3ff..eea0823 100644 --- a/compression.go +++ b/compression.go @@ -8,3 +8,17 @@ const ( CompressionZLIB CompressionZStandard ) + +func (c Compression) String() string { + switch c { + case CompressionDisabled: + return "CompressionDisabled" + case CompressionZLIB: + return "CompressionZLIB" + case CompressionZStandard: + return "CompressionZStandard" + + default: + return "Illegal selection" + } +} diff --git a/example/limix/example.go b/example/limix/example.go index 5dee7d7..f4ca897 100644 --- a/example/limix/example.go +++ b/example/limix/example.go @@ -47,4 +47,18 @@ func main() { log.Println("Iterated over", i, "samples") } + + vr := bg.NewVariantReader() + for i := 1; ; i++ { + v := vr.Read() + if v == nil { + break + } + + log.Println(i, v) + } + + if vr.Error() != nil { + log.Println("VR error:", err.Error()) + } } diff --git a/layout.go b/layout.go index 548b686..cce5adf 100644 --- a/layout.go +++ b/layout.go @@ -7,3 +7,15 @@ const ( Layout1 Layout = iota Layout2 ) + +func (l Layout) String() string { + switch l { + case Layout1: + return "Layout1" + case Layout2: + return "Layout2" + + default: + return "Illegal selection" + } +} diff --git a/variant.go b/variant.go index 68f9f78..8d14e1b 100644 --- a/variant.go +++ b/variant.go @@ -1,6 +1,12 @@ package bgen type Variant struct { + ID string + RSID string + Chromosome string + Position uint32 + NAlleles uint16 + Alleles []Allele } //func NewVariantReader() // <- iterate over variants sequentially, possibly to build an index diff --git a/variantreader.go b/variantreader.go new file mode 100644 index 0000000..5daa3fe --- /dev/null +++ b/variantreader.go @@ -0,0 +1,235 @@ +package bgen + +import ( + "encoding/binary" + "fmt" + "io" + + "github.com/carbocation/pfx" +) + +type VariantReader struct { + VariantsSeen uint32 + b *BGEN + currentOffset uint32 + err error + + // Cached values + buffer []byte +} + +func (b *BGEN) NewVariantReader() *VariantReader { + vr := &VariantReader{ + currentOffset: b.VariantsStart, + b: b, + } + + return vr +} + +func (vr *VariantReader) Error() error { + return vr.err +} + +func (vr *VariantReader) Read() *Variant { + v, newOffset, err := vr.parseVariantAtOffset(int64(vr.currentOffset)) + if err != nil { + if err == io.EOF { + return nil + } + vr.err = pfx.Err(err) + } + + vr.VariantsSeen++ + vr.currentOffset = uint32(newOffset) + + return v +} + +// parseVariantAtOffset does not mutate the VariantReader. +// TODO: Offer an option with a reusable buffer to reduce allocations +func (vr *VariantReader) parseVariantAtOffset(offset int64) (*Variant, int64, error) { + v := &Variant{} + var err error + +VariantLoop: + for { + // ID: + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize := int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.ID = string(vr.buffer[:stringSize]) + offset += int64(stringSize) + + // RSID + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.RSID = string(vr.buffer[:stringSize]) + offset += int64(stringSize) + + // Chrom + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + stringSize = int(binary.LittleEndian.Uint16(vr.buffer[:2])) + if stringSize != 2 { + err = fmt.Errorf("Chromosome field size is %d bytes; expected 2", stringSize) + break + } + if err = vr.readNBytesAtOffset(stringSize, offset); err != nil { + break + } + v.Chromosome = Chromosome(binary.LittleEndian.Uint16(vr.buffer[:stringSize])) + offset += int64(stringSize) + + // Position + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + v.Position = binary.LittleEndian.Uint32(vr.buffer[:4]) + + // NAlleles + if vr.b.FlagLayout == Layout1 { + // Assumed to be 2 in Layout1 + v.NAlleles = 2 + } else if vr.b.FlagLayout == Layout2 { + if err = vr.readNBytesAtOffset(2, offset); err != nil { + break + } + offset += 2 + v.NAlleles = binary.LittleEndian.Uint16(vr.buffer[:2]) + } + + // Allele slice + var alleleLength int + for i := uint16(0); i < v.NAlleles; i++ { + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break VariantLoop + } + offset += 4 + alleleLength = int(binary.LittleEndian.Uint32(vr.buffer[:4])) + + if err = vr.readNBytesAtOffset(alleleLength, offset); err != nil { + break VariantLoop + } + offset += int64(alleleLength) + v.Alleles = append(v.Alleles, Allele(string(vr.buffer[:alleleLength]))) + } + + // Genotype data + if vr.b.FlagLayout == Layout1 { + // From the spec: "If CompressedSNPBlocks=0 this field is omitted + // and the length of the uncompressed data is C=6N." + if comp := vr.b.FlagCompression; comp == CompressionDisabled { + uncompressedDataBlockSize := int64(6 * vr.b.NSamples) + if err = vr.readNBytesAtOffset(int(uncompressedDataBlockSize), offset); err != nil { + break + } + offset += uncompressedDataBlockSize + // TODO: Handle the uncompressed genotype data + _ = vr.buffer[:uncompressedDataBlockSize] + + } else if comp == CompressionZLIB { + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + genoBlockLength := binary.LittleEndian.Uint32(vr.buffer[:4]) + + if err = vr.readNBytesAtOffset(int(genoBlockLength), offset); err != nil { + break + } + offset += int64(genoBlockLength) + // TODO: Handle the ZLIB compressed genotype data + _ = vr.buffer[:genoBlockLength] + } else { + err = fmt.Errorf("Compression choice %s is not compatible with Layout %s", vr.b.FlagCompression, vr.b.FlagLayout) + break + } + + } else if vr.b.FlagLayout == Layout2 { + // The genotype layout data block for Layout2 is guaranteed to have + // a 4 byte chunk that indicates how much data is left for this + // block (skipping ahead by this much will bring you to the next + // chunk). + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + nextDataOffset := binary.LittleEndian.Uint32(vr.buffer[:4]) + + if vr.b.FlagCompression == CompressionDisabled { + // If compression is disabled, it will not have the second 4 + // byte chunk that indicates how large the data chunk is after + // decompression. + + if err = vr.readNBytesAtOffset(int(nextDataOffset), offset); err != nil { + break + } + // TODO: Handle the uncompressed genotype data + _ = vr.buffer[:nextDataOffset] + + offset += int64(nextDataOffset) + + } else { + // If compression is enabled, there will be a second 4 byte + // chunk that indicates how large the data chunk is after + // decompression. + + if err = vr.readNBytesAtOffset(4, offset); err != nil { + break + } + offset += 4 + decompressedDataLength := binary.LittleEndian.Uint32(vr.buffer[:4]) + // TODO: Is this a checksum or actually useful? + _ = decompressedDataLength // ??? + + // From the spec: "If CompressedSNPBlocks is nonzero, this is + // C-4 bytes which can be uncompressed to form D bytes in the + // format described below." For us, "C" is nextDataOffset. + genoBlockDataSizeToDecompress := nextDataOffset - 4 + // Compressed geno data + if err = vr.readNBytesAtOffset(int(genoBlockDataSizeToDecompress), offset); err != nil { + break + } + // TODO: Handle the compressed genotype data + _ = vr.buffer[:genoBlockDataSizeToDecompress] + + offset += int64(genoBlockDataSizeToDecompress) + } + } + + // TODO: actually interpret the genotype data based on which layout + // version is being used. + + break + } + // log.Println(vr.b.FlagLayout) + // log.Println(vr.b.FlagCompression) + // panic("yah") + // log.Println("New offset:", offset) + + return v, offset, err +} + +func (vr *VariantReader) readNBytesAtOffset(N int, offset int64) error { + if vr.buffer == nil || len(vr.buffer) < N { + vr.buffer = make([]byte, N) + } + + _, err := vr.b.File.ReadAt(vr.buffer[:N], offset) + return err +}