Use ORM for datarepo (ga4gh#1485)

* Use peewee models for datarepo
* Adding other data types
* Fix exception name for ontology
* Move models to their own file
* Add models to test imports
* Flake fixes
* Remove commits from prepare compliance data
* Adjust visual indent
* Add future imports
  Write out short variable names, thanks @dcolligan
* Biosample refactor
  Better exception reporting
* Remove short names
* Change which exception is expected
  Flake fixes. Fix requirements
* Lengthen variable names
* Add URL to peewee API
* Use updated client
* Set constraints back
* New species field replaces ncbitaxonid
  Missed some biosample refactor
ejacox · Feb 2, 2017 · 2afdfdb · 2afdfdb
1 parent f7a9990
commit 2afdfdb
Show file tree

Hide file tree

Showing 15 changed files with 756 additions and 746 deletions.
diff --git a/ga4gh/server/datamodel/bio_metadata.py b/ga4gh/server/datamodel/bio_metadata.py
@@ -66,14 +66,14 @@ def populateFromJson(self, jsonString):
             self._info[key] = {"values": protocol.toJsonDict(parsed.info[key])}
         return self
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, biosampleRecord):
         # TODO coerce to types
-        self._created = row[b'created']
-        self._updated = row[b'updated']
-        self._description = row[b'description']
-        self._disease = json.loads(row[b'disease'])
-        self._individualId = row[b'individualId']
-        self._info = json.loads(row[b'info'])
+        self._created = biosampleRecord.created
+        self._updated = biosampleRecord.updated
+        self._description = biosampleRecord.description
+        self._disease = json.loads(biosampleRecord.disease)
+        self._individualId = biosampleRecord.individualid
+        self._info = json.loads(biosampleRecord.info)
         return self
 
     def setIndividualId(self, individualId):
@@ -146,15 +146,15 @@ def toProtocolElement(self):
                 gaIndividual.info[key].values.add().string_value = value
         return gaIndividual
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, individualRecord):
         # TODO coerce to types
-        self._name = row[b'name']
-        self._created = row[b'created']
-        self._updated = row[b'updated']
-        self._description = row[b'description']
-        self._species = json.loads(row[b'species'])
-        self._sex = json.loads(row[b'sex'])
-        self._info = json.loads(row[b'info'])
+        self._name = individualRecord.name
+        self._created = individualRecord.created
+        self._updated = individualRecord.updated
+        self._description = individualRecord.description
+        self._species = json.loads(individualRecord.species)
+        self._sex = json.loads(individualRecord.sex)
+        self._info = json.loads(individualRecord.info)
         return self
 
     def populateFromJson(self, jsonString):

diff --git a/ga4gh/server/datamodel/datasets.py b/ga4gh/server/datamodel/datasets.py
@@ -60,13 +60,13 @@ def __init__(self, localId):
         self._rnaQuantificationSetNameMap = {}
         self._info = {}
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, dataset):
         """
         Populates the instance variables of this Dataset from the
         specified database row.
         """
-        self._description = row[b'description']
-        self._info = json.loads(row[b'info'])
+        self._description = dataset.description
+        self._info = json.loads(dataset.info)
 
     def setDescription(self, description):
         """

diff --git a/ga4gh/server/datamodel/genotype_phenotype_featureset.py b/ga4gh/server/datamodel/genotype_phenotype_featureset.py
@@ -44,12 +44,12 @@ def __init__(self, parentContainer, localId):
             parentContainer, localId)
 
     # mimic featureset
-    def populateFromRow(self, row):
+    def populateFromRow(self, featureSetRecord):
         """
         Populates the instance variables of this FeatureSet from the specified
         DB row.
         """
-        self._dbFilePath = row[b'dataUrl']
+        self._dbFilePath = featureSetRecord.dataurl
         self.populateFromFile(self._dbFilePath)
 
     def populateFromFile(self, dataUrl):

diff --git a/ga4gh/server/datamodel/ontologies.py b/ga4gh/server/datamodel/ontologies.py
@@ -74,12 +74,12 @@ def populateFromFile(self, dataUrl):
         self._dataUrl = dataUrl
         self._readFile()
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, ontologyRecord):
         """
         Populates this Ontology using values in the specified DB row.
         """
-        self._id = row[b'id']
-        self._dataUrl = row[b'dataUrl']
+        self._id = ontologyRecord.id
+        self._dataUrl = ontologyRecord.dataurl
         self._readFile()
         # TODO sanity check the stored values against what we have just read.
 

diff --git a/ga4gh/server/datamodel/reads.py b/ga4gh/server/datamodel/reads.py
@@ -382,19 +382,19 @@ def getBamHeaderReferenceSetName(self):
         """
         return self._bamHeaderReferenceSetName
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, readGroupSetRecord):
         """
         Populates the instance variables of this ReadGroupSet from the
         specified database row.
         """
-        self._dataUrl = row[b'dataUrl']
-        self._indexFile = row[b'indexFile']
+        self._dataUrl = readGroupSetRecord.dataurl
+        self._indexFile = readGroupSetRecord.indexfile
         self._programs = []
-        for jsonDict in json.loads(row[b'programs']):
+        for jsonDict in json.loads(readGroupSetRecord.programs):
             program = protocol.fromJson(json.dumps(jsonDict),
                                         protocol.Program)
             self._programs.append(program)
-        stats = protocol.fromJson(row[b'stats'], protocol.ReadStats)
+        stats = protocol.fromJson(readGroupSetRecord.stats, protocol.ReadStats)
         self._numAlignedReads = stats.aligned_read_count
         self._numUnalignedReads = stats.unaligned_read_count
 
@@ -747,18 +747,19 @@ def populateFromHeader(self, readGroupHeader):
         self._platformUnit = readGroupHeader.get('PU', None)
         self._runTime = readGroupHeader.get('DT', None)
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, readGroupRecord):
         """
         Populate the instance variables using the specified DB row.
         """
-        self._sampleName = row[b'sampleName']
-        self._biosampleId = row[b'biosampleId']
-        self._description = row[b'description']
-        self._predictedInsertSize = row[b'predictedInsertSize']
-        stats = protocol.fromJson(row[b'stats'], protocol.ReadStats)
+        self._sampleName = readGroupRecord.samplename
+        self._biosampleId = readGroupRecord.biosampleid
+        self._description = readGroupRecord.description
+        self._predictedInsertSize = readGroupRecord.predictedinsertsize
+        stats = protocol.fromJson(readGroupRecord.stats, protocol.ReadStats)
         self._numAlignedReads = stats.aligned_read_count
         self._numUnalignedReads = stats.unaligned_read_count
-        experiment = protocol.fromJson(row[b'experiment'], protocol.Experiment)
+        experiment = protocol.fromJson(
+            readGroupRecord.experiment, protocol.Experiment)
         self._instrumentModel = experiment.instrument_model
         self._sequencingCenter = experiment.sequencing_center
         self._experimentDescription = experiment.description

diff --git a/ga4gh/server/datamodel/references.py b/ga4gh/server/datamodel/references.py
@@ -481,19 +481,21 @@ def populateFromFile(self, dataUrl):
             reference.setLength(len(bases))
             self.addReference(reference)
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, referenceSetRecord):
         """
         Populates this reference set from the values in the specified DB
         row.
         """
-        self._dataUrl = row[b'dataUrl']
-        self._description = row[b'description']
-        self._assemblyId = row[b'assemblyId']
-        self._isDerived = bool(row[b'isDerived'])
-        self._md5checksum = row[b'md5checksum']
-        self._species = json.loads(row[b'species'])
-        self._sourceAccessions = json.loads(row[b'sourceAccessions'])
-        self._sourceUri = row[b'sourceUri']
+        self._dataUrl = referenceSetRecord.dataurl
+        self._description = referenceSetRecord.description
+        self._assemblyId = referenceSetRecord.assemblyid
+        self._isDerived = bool(referenceSetRecord.isderived)
+        self._md5checksum = referenceSetRecord.md5checksum
+        if referenceSetRecord.species is not None:
+            self.setSpeciesFromJson(referenceSetRecord.species)
+        self._sourceAccessions = json.loads(
+            referenceSetRecord.sourceaccessions)
+        self._sourceUri = referenceSetRecord.sourceuri
 
     def getDataUrl(self):
         """
@@ -519,17 +521,18 @@ class HtslibReference(datamodel.PysamDatamodelMixin, AbstractReference):
     def __init__(self, parentContainer, localId):
         super(HtslibReference, self).__init__(parentContainer, localId)
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, referenceRecord):
         """
         Populates this reference from the values in the specified DB row.
         """
-        self._length = row[b'length']
-        self._isDerived = bool(row[b'isDerived'])
-        self._md5checksum = row[b'md5checksum']
-        self._species = json.loads(row[b'species'])
-        self._sourceAccessions = json.loads(row[b'sourceAccessions'])
-        self._sourceDivergence = row[b'sourceDivergence']
-        self._sourceUri = row[b'sourceUri']
+        self._length = referenceRecord.length
+        self._isDerived = bool(referenceRecord.isderived)
+        self._md5checksum = referenceRecord.md5checksum
+        if referenceRecord.species is not None:
+            self.setSpeciesFromJson(referenceRecord.species)
+        self._sourceAccessions = json.loads(referenceRecord.sourceaccessions)
+        self._sourceDivergence = referenceRecord.sourcedivergence
+        self._sourceUri = referenceRecord.sourceuri
 
     def getBases(self, start, end):
         self.checkQueryRange(start, end)

diff --git a/ga4gh/server/datamodel/rna_quantification.py b/ga4gh/server/datamodel/rna_quantification.py
@@ -179,12 +179,12 @@ def populateFromFile(self, dataUrl):
         self._db = SqliteRnaBackend(self._dbFilePath)
         self.addRnaQuants()
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, quantificationSetRecord):
         """
         Populates the instance variables of this RnaQuantificationSet from the
         specified DB row.
         """
-        self._dbFilePath = row[b'dataUrl']
+        self._dbFilePath = quantificationSetRecord.dataurl
         self._db = SqliteRnaBackend(self._dbFilePath)
         self.addRnaQuants()
 

diff --git a/ga4gh/server/datamodel/sequence_annotations.py b/ga4gh/server/datamodel/sequence_annotations.py
@@ -340,12 +340,12 @@ def populateFromFile(self, dataUrl):
         self._dbFilePath = dataUrl
         self._db = Gff3DbBackend(self._dbFilePath)
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, featureSetRecord):
         """
         Populates the instance variables of this FeatureSet from the specified
         DB row.
         """
-        self._dbFilePath = row[b'dataUrl']
+        self._dbFilePath = featureSetRecord.dataurl
         self._db = Gff3DbBackend(self._dbFilePath)
 
     def getDataUrl(self):

diff --git a/ga4gh/server/datamodel/variants.py b/ga4gh/server/datamodel/variants.py
@@ -49,12 +49,11 @@ def __init__(self, parentContainer, localId):
         self._info = {}
         self._biosampleId = None
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, callSetRecord):
         """
         Populates this CallSet from the specified DB row.
         """
-        # currently a noop
-        self._biosampleId = row[b'biosampleId']
+        self._biosampleId = callSetRecord.biosampleid
 
     def toProtocolElement(self):
         """
@@ -459,19 +458,19 @@ def getDataUrlIndexPairs(self):
         """
         return set(self._chromFileMap.values())
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, variantSetRecord):
         """
         Populates this VariantSet from the specified DB row.
         """
-        self._created = row[b'created']
-        self._updated = row[b'updated']
+        self._created = variantSetRecord.created
+        self._updated = variantSetRecord.updated
         self._chromFileMap = {}
         # We can't load directly as we want tuples to be stored
         # rather than lists.
-        for key, value in json.loads(row[b'dataUrlIndexMap']).items():
+        for key, value in json.loads(variantSetRecord.dataurlindexmap).items():
             self._chromFileMap[key] = tuple(value)
         self._metadata = []
-        for jsonDict in json.loads(row[b'metadata']):
+        for jsonDict in json.loads(variantSetRecord.metadata):
             metadata = protocol.fromJson(json.dumps(jsonDict),
                                          protocol.VariantSetMetadata)
             self._metadata.append(metadata)
@@ -1048,14 +1047,15 @@ def populateFromFile(self, varFile, annotationType):
         self._creationTime = self._analysis.created
         self._updatedTime = datetime.datetime.now().isoformat() + "Z"
 
-    def populateFromRow(self, row):
+    def populateFromRow(self, annotationSetRecord):
         """
         Populates this VariantAnnotationSet from the specified DB row.
         """
-        self._annotationType = row[b'annotationType']
-        self._analysis = protocol.fromJson(row[b'analysis'], protocol.Analysis)
-        self._creationTime = row[b'created']
-        self._updatedTime = row[b'updated']
+        self._annotationType = annotationSetRecord.annotationtype
+        self._analysis = protocol.fromJson(
+            annotationSetRecord.analysis, protocol.Analysis)
+        self._creationTime = annotationSetRecord.created
+        self._updatedTime = annotationSetRecord.updated
 
     def getAnnotationType(self):
         """