Skip to content

Commit

Permalink
Merge pull request ga4gh#268 from dcolligan/246_reads_search
Browse files Browse the repository at this point in the history
246 reads search
  • Loading branch information
jeromekelleher committed Mar 27, 2015
2 parents 14a97b5 + d1b8f26 commit b1809e3
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 66 deletions.
78 changes: 77 additions & 1 deletion ga4gh/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ def __init__(self):
self._referenceSetIds = []
self._readGroupSetIdMap = {}
self._readGroupSetIds = []
self._readGroupIds = []
self._readGroupIdMap = {}
self._requestValidation = False
self._responseValidation = False
self._defaultPageSize = 100
Expand Down Expand Up @@ -205,6 +207,68 @@ def variantSetsGenerator(self, request):
return self._topLevelObjectGenerator(
request, self._variantSetIdMap, self._variantSetIds)

def readsGenerator(self, request):
# Local utility functions to save some typing
def getPosition(readAlignment):
return readAlignment.alignment.position.position

def getEndPosition(readAlignment):
return getPosition(readAlignment) + \
len(readAlignment.alignedSequence)

if len(request.readGroupIds) != 1:
raise exceptions.NotImplementedException(
"Read search over multiple readGroups not supported")
readGroupId = request.readGroupIds[0]
try:
readGroup = self._readGroupIdMap[request.readGroupIds[0]]
except KeyError:
raise exceptions.ReadGroupNotFoundException(readGroupId)
startPosition = request.start
equalPositionsToSkip = 0
if request.pageToken is not None:
startPosition, equalPositionsToSkip = self.parsePageToken(
request.pageToken, 2)
iterator = readGroup.getReadAlignments(
request.referenceName, request.referenceId, startPosition,
request.end)
readAlignment = next(iterator, None)
if request.pageToken is not None:
# First, skip any reads with position < startPosition
# or endPosition < request.start
while (getPosition(readAlignment) < startPosition or
getEndPosition(readAlignment) < request.start):
readAlignment = next(iterator, None)
if readAlignment is None:
raise exceptions.BadPageTokenException(
"Inconsistent page token provided")
# Now, skip equalPositionsToSkip records which have position
# == startPosition
equalPositionsSkipped = 0
while equalPositionsSkipped < equalPositionsToSkip:
if getPosition(readAlignment) != startPosition:
raise exceptions.BadPageTokenException(
"Inconsistent page token provided")
equalPositionsSkipped += 1
readAlignment = next(iterator, None)
if readAlignment is None:
raise exceptions.BadPageTokenException(
"Inconsistent page token provided")
# The iterator is now positioned at the correct place.
while readAlignment is not None:
nextReadAlignment = next(iterator, None)
nextPageToken = None
if nextReadAlignment is not None:
if getPosition(readAlignment) == \
getPosition(nextReadAlignment):
equalPositionsToSkip += 1
else:
equalPositionsToSkip = 0
nextPageToken = "{}:{}".format(
getPosition(nextReadAlignment), equalPositionsToSkip)
yield readAlignment, nextPageToken
readAlignment = nextReadAlignment

def variantsGenerator(self, request):
"""
Returns a generator over the (variant, nextPageToken) pairs defined by
Expand Down Expand Up @@ -332,6 +396,15 @@ def __init__(self, randomSeed=0, numCalls=1, variantDensity=0.5,
self._variantSetIdMap[variantSetId] = variantSet
self._variantSetIds = sorted(self._variantSetIdMap.keys())

# Reads
readGroupSetId = "aReadGroupSet"
readGroupSet = reads.SimulatedReadGroupSet(readGroupSetId)
self._readGroupSetIdMap[readGroupSetId] = readGroupSet
for readGroup in readGroupSet.getReadGroups():
self._readGroupIdMap[readGroup.getId()] = readGroup
self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
self._readGroupIds = sorted(self._readGroupIdMap.keys())


class FileSystemBackend(AbstractBackend):
"""
Expand Down Expand Up @@ -366,7 +439,10 @@ def __init__(self, dataDir):
for readGroupSetId in os.listdir(readGroupSetDir):
relativePath = os.path.join(readGroupSetDir, readGroupSetId)
if os.path.isdir(relativePath):
readGroupSet = reads.ReadGroupSet(
readGroupSet = reads.HtslibReadGroupSet(
readGroupSetId, relativePath)
self._readGroupSetIdMap[readGroupSetId] = readGroupSet
for readGroup in readGroupSet.getReadGroups():
self._readGroupIdMap[readGroup.getId()] = readGroup
self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
self._readGroupIds = sorted(self._readGroupIdMap.keys())
119 changes: 94 additions & 25 deletions ga4gh/datamodel/reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,13 @@ def setFlag(flagAttr, flag):
flagAttr |= flag


class ReadGroupSet(object):
class AbstractReadGroupSet(object):
"""
Class representing a logical collection ReadGroups.
The base class of a read group set
"""
def __init__(self, id_, dataDir):
def __init__(self, id_):
self._id = id_
self._dataDir = dataDir
self._readGroups = []
# we only support BAM files right now;
# SAM files (which can't have index files) would be too slow
pattern = "*.bam"
for path in glob.glob(os.path.join(self._dataDir, pattern)):
filename = os.path.basename(path)
localId = os.path.splitext(filename)[0]
readGroupId = "{}:{}".format(self._id, localId)
readGroup = ReadGroup(readGroupId, path)
self._readGroups.append(readGroup)

def getReadGroups(self):
"""
Expand All @@ -104,29 +94,50 @@ def toProtocolElement(self):
return readGroupSet


class ReadGroup(object):
class SimulatedReadGroupSet(AbstractReadGroupSet):
"""
A simulated read group set
"""
def __init__(self, id_):
super(SimulatedReadGroupSet, self).__init__(id_)
readGroupId = "{}:one".format(id_)
readGroup = SimulatedReadGroup(readGroupId)
self._readGroups.append(readGroup)


class HtslibReadGroupSet(AbstractReadGroupSet):
"""
Class representing a logical collection ReadGroups.
"""
def __init__(self, id_, dataDir):
super(HtslibReadGroupSet, self).__init__(id_)
self._dataDir = dataDir
# we only support BAM files right now;
# SAM files (which can't have index files) would be too slow
pattern = "*.bam"
for path in glob.glob(os.path.join(self._dataDir, pattern)):
filename = os.path.split(path)[1]
localId = os.path.splitext(filename)[0]
readGroupId = "{}:{}".format(self._id, localId)
readGroup = HtslibReadGroup(readGroupId, path)
self._readGroups.append(readGroup)


class AbstractReadGroup(object):
"""
Class representing a ReadGroup. A ReadGroup is all the data that's
processed the same way by the sequencer. There are typically 1-10
ReadGroups in a ReadGroupSet.
"""
def __init__(self, id_, dataFile):
def __init__(self, id_):
self._id = id_
self._samFilePath = dataFile
self._samFile = pysam.AlignmentFile(dataFile)

def getId(self):
"""
Returns the id of the read group
"""
return self._id

def getSamFilePath(self):
"""
Returns the file path of the sam file
"""
return self._samFilePath

def toProtocolElement(self):
"""
Returns the GA4GH protocol representation of this ReadGroup.
Expand All @@ -149,6 +160,64 @@ def toProtocolElement(self):
readGroup.sampleId = None
return readGroup


class SimulatedReadGroup(AbstractReadGroup):
"""
A simulated readgroup
"""
def __init__(self, id_):
super(SimulatedReadGroup, self).__init__(id_)

def getReadAlignments(self, referenceName=None, referenceId=None,
start=None, end=None):
for i in range(2):
alignment = self._createReadAlignment(i)
yield alignment

def _createReadAlignment(self, i):
# TODO fill out a bit more
id_ = "{}:simulated{}".format(self._id, i)
alignment = protocol.GAReadAlignment()
alignment.alignedQuality = [1, 2, 3]
alignment.alignedSequence = "ACT"
gaPosition = protocol.GAPosition()
gaPosition.position = 0
gaPosition.referenceName = "whatevs"
gaPosition.reverseStrand = False
gaLinearAlignment = protocol.GALinearAlignment()
gaLinearAlignment.position = gaPosition
alignment.alignment = gaLinearAlignment
alignment.duplicateFragment = False
alignment.failedVendorQualityChecks = False
alignment.fragmentLength = 3
alignment.fragmentName = id_
alignment.id = id_
alignment.info = {}
alignment.nextMatePosition = None
alignment.numberReads = None
alignment.properPlacement = False
alignment.readGroupId = self._id
alignment.readNumber = None
alignment.secondaryAlignment = False
alignment.supplementaryAlignment = False
return alignment


class HtslibReadGroup(AbstractReadGroup):
"""
A readgroup based on htslib's reading of a given file
"""
def __init__(self, id_, dataFile):
super(HtslibReadGroup, self).__init__(id_)
self._samFilePath = dataFile
self._samFile = pysam.AlignmentFile(dataFile)

def getSamFilePath(self):
"""
Returns the file path of the sam file
"""
return self._samFilePath

def getReadAlignments(self, referenceName=None, referenceId=None,
start=None, end=None):
"""
Expand All @@ -170,7 +239,7 @@ def convertReadAlignment(self, read):
# TODO fill out remaining fields
# TODO refine in tandem with code in converters module
ret = protocol.GAReadAlignment()
ret.alignedQuality = read.query_qualities
ret.alignedQuality = list(read.query_qualities)
ret.alignedSequence = read.query_sequence
ret.alignment = protocol.GALinearAlignment()
ret.alignment.mappingQuality = read.mapping_quality
Expand All @@ -190,7 +259,7 @@ def convertReadAlignment(self, read):
read.flag, SamFlags.FAILED_VENDOR_QUALITY_CHECKS)
ret.fragmentLength = read.template_length
ret.fragmentName = read.query_name
ret.id = None # TODO
ret.id = "{}:{}".format(self._id, read.query_name)
ret.info = dict(read.tags)
ret.nextMatePosition = protocol.GAPosition()
ret.nextMatePosition.position = read.next_reference_start
Expand Down
5 changes: 5 additions & 0 deletions ga4gh/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ def __init__(self, variantSetId):
variantSetId)


class ReadGroupNotFoundException(ObjectNotFoundException):
def __init__(self, readGroupId):
self.message = "readGroupId '{}' not found".format(readGroupId)


class UnsupportedMediaTypeException(BaseServerException):
httpStatus = 415
message = "Unsupported media type"
Expand Down
8 changes: 4 additions & 4 deletions ga4gh/frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from __future__ import unicode_literals

import os
import traceback
import datetime

import flask
Expand Down Expand Up @@ -213,7 +212,7 @@ def handleException(exception):
a request.
"""
if app.config['DEBUG']:
print(traceback.format_exc(exception))
app.log_exception(exception)
serverException = exception
if not isinstance(exception, exceptions.BaseServerException):
serverException = exceptions.getServerError(exception)
Expand Down Expand Up @@ -270,9 +269,10 @@ def searchReadGroupSets(version):
version, flask.request, app.backend.searchReadGroupSets)


@app.route('/<version>/reads/search', methods=['POST'])
@app.route('/<version>/reads/search', methods=['POST', 'OPTIONS'])
def searchReads(version):
raise exceptions.PathNotFoundException()
return handleFlaskPostRequest(
version, flask.request, app.backend.searchReads)


@app.route('/<version>/referencesets/search', methods=['POST'])
Expand Down
8 changes: 5 additions & 3 deletions tests/datadriven/test_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _readSam(self, readGroupSetId, samFileName):
self._readGroupInfos[samFileName] = readGroupInfo

def getDataModelClass(self):
return reads.ReadGroupSet
return reads.HtslibReadGroupSet

def getProtocolClass(self):
return protocol.GAReadGroupSet
Expand Down Expand Up @@ -167,7 +167,7 @@ def assertAlignmentsEqual(self, gaAlignment, pysamAlignment,
readGroupInfo):
self.assertEqual(
gaAlignment.alignedQuality,
pysamAlignment.query_qualities)
list(pysamAlignment.query_qualities))
self.assertEqual(
gaAlignment.alignedSequence,
pysamAlignment.query_sequence)
Expand Down Expand Up @@ -195,7 +195,9 @@ def assertAlignmentsEqual(self, gaAlignment, pysamAlignment,
self.assertEqual(
gaAlignment.fragmentName,
pysamAlignment.query_name)
self.assertIsNone(gaAlignment.id)
self.assertEqual(
gaAlignment.id,
"{}:{}".format(readGroupInfo.id, pysamAlignment.query_name))
self.assertEqual(
gaAlignment.info,
dict(pysamAlignment.tags))
Expand Down
6 changes: 3 additions & 3 deletions tests/datadriven/test_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ def _assertEquivalentGaVCFValues(gaValue, pyvcfValue):
if isinstance(value, list):
self.assertEqual(len(gaObjectInfo[key]), len(value))
for gaValue, pyvcfValue in zip(
gaObjectInfo[key], pyvcfInfo[key]):
gaObjectInfo[key], pyvcfInfo[key]):
_assertEquivalentGaVCFValues(gaValue, pyvcfValue)
else:
self.assertEqual(len(gaObjectInfo[key]), 1)

def _verifyVariantCallEqual(self, gaCall, pyvcfCall):
genotype, phaseset = variants.convertVCFGenotype(
pyvcfCall.data.GT, pyvcfCall.phased)
pyvcfCall.data.GT, pyvcfCall.phased)
self.assertEqual(gaCall.callSetId, pyvcfCall.site.ID)
self.assertEqual(gaCall.callSetName, pyvcfCall.sample)
self.assertEqual(gaCall.genotype, genotype)
Expand Down Expand Up @@ -214,7 +214,7 @@ def _assertEmptyVariant(self, referenceName, startPosition, endPosition):
referenceName, startPosition, endPosition, None, []))
self.assertEqual(len(gaVariants), 0)
pyvcfVariants = self._getPyvcfVariants(
referenceName, startPosition, endPosition)
referenceName, startPosition, endPosition)
self.assertEqual(len(pyvcfVariants), 0)

def _assertVariantsEqualInRange(
Expand Down
Loading

0 comments on commit b1809e3

Please sign in to comment.