-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathKBaseGenomes.spec
1012 lines (908 loc) · 38.2 KB
/
KBaseGenomes.spec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
@author chenry, jayrbolton
*/
module KBaseGenomes {
/*
Workspace reference to the ContigSet object that holds the contigs of a genome
@id ws KBaseGenomes.ContigSet
*/
typedef string ContigSet_ref;
/*
Workspace reference to the ProteinSet object that holds the proteins of a genome
@id ws KBaseGenomes.ProteinSet
*/
typedef string ProteinSet_ref;
/*
Workspace reference to the TranscriptSet object that holds the transcripts of a genome
@id ws KBaseGenomes.TranscriptSet
*/
typedef string TranscriptSet_ref;
/*
Reference to a single Feature inside a Genome object in the workspace
(resolved against the `id` of an entry in the genome's `features` list)
@id subws KBaseGenomes.Genome.features.[*].id
*/
typedef string Feature_ref;
/*
Reference to a Genome object in the workspace.
The annotation below accepts either a KBaseGenomes.Genome or a
KBaseGenomeAnnotations.GenomeAnnotation object.
@id ws KBaseGenomes.Genome KBaseGenomeAnnotations.GenomeAnnotation
*/
typedef string Genome_ref;
/*
Reference to an Assembly object in the workspace
@id ws KBaseGenomeAnnotations.Assembly
*/
typedef string Assembly_ref;
/*
Reference to a Pangenome object in the workspace
@id ws KBaseGenomes.Pangenome
*/
typedef string Pangenome_ref;
/*
Reference to a Proteome Comparison object in the workspace
@id ws GenomeComparison.ProteomeComparison
*/
typedef string Protcomp_ref;
/*
Identifier assigned to the data by its external source
(used for the `source_id` field of ContigSet, Genome and ProteinSet)
@id external
*/
typedef string source_id;
/*
KBase legacy data ID
@id kb
*/
typedef string Genome_id;
/*
KBase Reaction ID
@id external
*/
typedef string Reaction_id;
/*
KBase Feature ID
@id external
*/
typedef string Feature_id;
/*
KBase ProteinSet ID
@id kb
*/
typedef string ProteinSet_id;
/*
ProbabilisticAnnotation ID
@id kb
*/
typedef string ProbabilisticAnnotation_id;
/*
Genome protein ID
@id external
*/
typedef string Protein_id;
/*
Reference to an individual contig in a ContigSet object
NOTE(review): the annotation below names the type as `KBase.ContigSet`, while
ContigSet_ref uses `KBaseGenomes.ContigSet` - confirm which module name is intended.
@id subws KBase.ContigSet.contigs.[*].id
*/
typedef string Contig_ref;
/*
ContigSet contig ID
@id external
*/
typedef string Contig_id;
/*
KBase contig set ID
@id kb
*/
typedef string ContigSet_id;
/*
Reference to a reads file in shock
@id shock
*/
typedef string Reads_ref;
/*
Reference to a fasta file in shock
@id shock
*/
typedef string Fasta_ref;
/*
Type label of a feature, stored as a plain string.
NOTE(review): no structure visible in this part of the spec uses this typedef
(NonCodingFeature declares its `type` field as `string`) - confirm it is still needed.
*/
typedef string Feature_type;
/*
Boolean flag stored as an int (used for Contig.complete and Genome.suspect).
NOTE(review): presumably 0 = false and 1 = true - confirm against the producers.
*/
typedef int Bool;
/*
Type spec for a "Contig" subobject in the "ContigSet" object
Only `id` and `sequence` are required; every other field is listed in the
optional annotation below.
Fields:
id - Contig_id - ID of contig in contigset
length - int - TODO docs (presumably the length of `sequence` - confirm)
md5 - string - unique hash of contig sequence
sequence - string - sequence of the contig
genetic_code - int - TODO docs
cell_compartment - string - TODO docs
replicon_type - string - TODO docs
replicon_geometry - string - circular / linear
name - string - TODO docs
description - string - Description of the contig (e.g. everything after the ID in a FASTA file)
complete - Bool - TODO docs
@optional length md5 genetic_code cell_compartment replicon_geometry replicon_type name description complete
*/
typedef structure {
Contig_id id;
int length;
string md5;
string sequence;/*using "sequence" instead of "dna"*/
int genetic_code;
string cell_compartment;
string replicon_type;
/* circular / linear */
string replicon_geometry;
string name;
string description;
Bool complete;
} Contig;
/*
Type spec for the "ContigSet" object
Required fields: id, md5, source_id, source, contigs.
Fields:
id - ContigSet_id - unique kbase ID of the contig set
name - string - name of the contig set
md5 - string - TODO docs (presumably a checksum over the contig set - confirm how it is computed)
type - string - type of the contig set (values are: Genome,Transcripts,Environment,Collection)
source_id - source_id - source ID of the contig set
source - string - source of the contig set
contigs - list<Contig> - list of contigs in the contig set
reads_ref - Reads_ref - reference to the shocknode with the raw reads from which contigs
were assembled
fasta_ref - Fasta_ref - reference to the fasta file from which the contig set was read
@optional name type reads_ref fasta_ref
@metadata ws type as Type
@metadata ws source_id as Source ID
@metadata ws source as Source
@metadata ws name as Name
@metadata ws length(contigs) as Number contigs
*/
typedef structure {
ContigSet_id id;
string name;
string md5;
source_id source_id;
string source;
string type;
Reads_ref reads_ref;
Fasta_ref fasta_ref;
list<Contig> contigs;
} ContigSet;
/*
Structure for a publication
Elements:
(0) pubmedid - float - PubMed identifier (declared as a float in this tuple)
(1) source - string - (ex. Pubmed)
(2) title - string
(3) url - string - web address of the publication
(4) year - string - publication year
(5) authors - string
(6) journal - string
*/
typedef tuple<float pubmedid,string source,string title, string url,string year,string authors, string journal> publication;
/*
KBase CDS ID (type of CDS.id and of mRNA.cds)
@id external
*/
typedef string cds_id;
/*
KBase mRNA ID (type of mRNA.id and of CDS.parent_mrna)
@id external
*/
typedef string mrna_id;
/*
Type spec for the "InferenceInfo" object.
TODO docs
Found in the `inference_data` fields of Feature, NonCodingFeature, CDS and mRNA.
All three fields are required.
Fields:
category - string - TODO
type - string - TODO
evidence - string - TODO
*/
typedef structure {
string category;
string type;
string evidence;
} InferenceInfo;
/*
Structure for a single CDS encoding "gene" of a genome
ONLY PUT GENES THAT HAVE A CORRESPONDING CDS IN THIS ARRAY
NOTE: Sequence is optional. Ideally we can keep it in here, but
recognize that, due to space constraints, another solution may be needed.
We may want to add additional fields for other CDM functions
(e.g., atomic regulons, coexpressed fids, co_occurring fids,...)
protein_translation_length and protein_translation are
for the longest coded protein (representative protein for splice variants)
NOTE: The new aliases field definitely breaks compatibility,
as does the functions field.
flags are flag fields in GenBank format. This will be a controlled vocabulary.
Initially acceptable values are pseudo, ribosomal_slippage, and trans_splicing
md5 is the md5 of dna_sequence.
Required fields: id, location, md5, protein_translation_length, cdss,
dna_sequence_length. Everything else is listed in the optional annotations below.
Each `location` entry is a (contig id, start index, strand, length) tuple; the
indexing rules are spelled out in the CDS documentation.
@optional functions ontology_terms note protein_translation mrnas flags warnings
@optional inference_data dna_sequence aliases db_xrefs children functional_descriptions
*/
typedef structure {
Feature_id id;
list<tuple<Contig_id,int,string,int>> location;
list<string> functions;
list<string> functional_descriptions;
mapping<string ontology_namespace,mapping<string ontology_id,list<int> evidence_events>> ontology_terms;
string note;
string md5;
string protein_translation;
int protein_translation_length;
list<string> cdss;
list<string> mrnas;
list<string> children;
list<string> flags;
list<string> warnings;
list <InferenceInfo> inference_data;
string dna_sequence;
int dna_sequence_length;
list<tuple<string fieldname,string alias>> aliases;
list<tuple<string db_source,string db_identifier>> db_xrefs;
} Feature;
/*
Structure for a single feature that is NOT one of the following:
- Protein encoding gene (gene that has a corresponding CDS)
- mRNA
- CDS
Note that pseudo-genes and non protein encoding genes are put into this structure.
flags are flag fields in GenBank format. This will be a controlled vocabulary.
Initially acceptable values are pseudo, ribosomal_slippage, and trans_splicing
md5 is the md5 of dna_sequence.
Required fields: id, location, type, md5, dna_sequence_length. Everything else
is listed in the optional annotations below.
Each `location` entry is a (contig id, start index, strand, length) tuple; the
indexing rules are spelled out in the CDS documentation.
NOTE(review): the innermost list of `ontology_terms` is labelled `evidence_event`
here but `evidence_events` in Feature, CDS and mRNA - confirm whether the
difference is intentional.
@optional functions ontology_terms note flags warnings functional_descriptions
@optional inference_data dna_sequence aliases db_xrefs children parent_gene
*/
typedef structure {
Feature_id id;
list<tuple<Contig_id,int,string,int>> location;
string type;
list<string> functions;
list<string> functional_descriptions;
mapping<string ontology_namespace,mapping<string ontology_id,list<int> evidence_event>> ontology_terms;
string note;
string md5;
string parent_gene;
list<string> children;
list<string> flags;
list<string> warnings;
list <InferenceInfo> inference_data;
string dna_sequence;
int dna_sequence_length;
list<tuple<string fieldname,string alias>> aliases;
list<tuple<string db_source,string db_identifier>> db_xrefs;
} NonCodingFeature;
/*
Structure for a single coding sequence.
Coding sequences are the sections of a feature's sequence that are translated
to a protein (minus introns and UTRs).
Required fields: id, location, md5, protein_md5, protein_translation,
protein_translation_length, dna_sequence_length. Everything else is listed in
the optional annotations below.
Fields:
id - cds_id - identifier of the coding sequence, such as "b0001_CDS_1"
location - list<tuple<Contig_id, int, string, int>> - list of
locations from where this sequence originates in the original assembly.
Each sub-sequence in the list constitutes a section of the resulting
CDS. The first element in the tuple corresponds to the "contig_id",
such as "NC_000913.3". The second element in the tuple is an index in
the contig of where the sequence starts. The third element is either a
plus or minus sign indicating whether it is on the 5' to 3' leading
strand ("+") or on the 3' to 5' lagging strand ("-"). The last element
is the length of the sub-sequence.
For a location on the leading strand (denoted by "+"), the index is
of the leftmost base, and the sequence extends to the right. For a
location on the lagging strand (denoted by "-"), the index is of
the rightmost base, and the sequence extends to the left.
NOTE: the last element in each tuple is the *length* of each
sub-sequence. If you have a location such as ("xyz", 100, "+", 50),
then your sequence will go from index 100 to index 149 (this has a
length of 50). It *does not* go from index 100 to index 150, as
that would have a length of 51.
Likewise, if you have the location ("xyz", 100, "-", 50), then the
sequence extends from 100 down to 51, which has a length of 50
bases. It does not go from index 100 to 50, as that would have a
length of 51.
md5 - string - md5 of the dna sequence - TODO clarification
protein_md5 - string - hash of the protein sequence that this CDS encodes
parent_gene - Feature_id - gene (feature) that this CDS comes from; the gene
still includes the introns and UTRs that were removed to create this CDS.
parent_mrna - mrna_id - mRNA sequence from which this sequence is derived,
including UTRs but not introns.
note - string - TODO
functions - list<string> - list of protein products or chemical
processes that this sequence creates, facilitates, or influences.
functional_descriptions - list<string> - TODO list of protein products or chemical
processes that this sequence creates, facilitates, or influences.
ontology_terms - mapping<string, mapping<string, list<int>>> - a mapping
of ontology source id (eg. "GO") to a mapping of term IDs (eg "GO:16209")
to a list of indexes into the ontology_events data (found in the top
level of the genome object). The index into an ontology event indicates
what service and method created this term assignment.
flags - list<string> - (controlled vocab) fields from the genbank source. A
common example is "pseudo" for pseudo-genes that do not encode proteins,
which shows up as "/pseudo" in the genbank.
Values can be: "pseudo", "ribosomal_slippage", "trans_splicing"
warnings - list<string> - TODO
inference_data - list<InferenceInfo> - TODO
protein_translation - string - amino acid sequence that this CDS gets translated into.
protein_translation_length - int - length of protein_translation
aliases - list<(string, string)> - alternative list of names or identifiers,
each given as a (fieldname, alias) pair
eg: [["gene", "thrA"], ["locus_tag", "b0002"]]
db_xrefs - list<(string, string)> - Identifiers from other databases (database cross-references)
The first string is the database name, the second is the database identifier.
eg: [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]]
dna_sequence - string - sequence of exons from the genome that constitute this protein encoding sequence.
dna_sequence_length - int - length of dna_sequence
@optional parent_gene parent_mrna functions ontology_terms note flags warnings
@optional inference_data dna_sequence aliases db_xrefs functional_descriptions
*/
typedef structure {
cds_id id;
list<tuple<Contig_id, int, string, int>> location;
string md5;
string protein_md5;
Feature_id parent_gene;
mrna_id parent_mrna;
string note;
list<string> functions;
list<string> functional_descriptions;
mapping<string ontology_namespace, mapping<string ontology_id, list<int> evidence_events>> ontology_terms;
list<string> flags;
list<string> warnings;
list<InferenceInfo> inference_data;
string protein_translation;
int protein_translation_length;
list<tuple<string fieldname, string alias>> aliases;
list<tuple<string db_source, string db_identifier>> db_xrefs;
string dna_sequence;
int dna_sequence_length;
} CDS;
/*
The mRNA is the transcribed sequence from the original feature, minus the
introns, but including the UTRs.
Required fields: id, location, md5, dna_sequence_length. Everything else is
listed in the optional annotations below.
Fields:
id - mrna_id - identifying string for the mRNA
location - list<tuple<Contig_id, int, string, int>> - list of
locations from where this sequence originates in the original assembly.
Each sub-sequence in the list constitutes a section of the resulting
mRNA. The first element in the tuple corresponds to the "contig_id",
such as "NC_000913.3". The second element in the tuple is an index in
the contig of where the sequence starts. The third element is either a
plus or minus sign indicating whether it is on the 5' to 3' leading
strand ("+") or on the 3' to 5' lagging strand ("-"). The last element
is the length of the sub-sequence.
For a location on the leading strand (denoted by "+"), the index is
of the leftmost base, and the sequence extends to the right. For a
location on the lagging strand (denoted by "-"), the index is of
the rightmost base, and the sequence extends to the left.
NOTE: the last element in each tuple is the *length* of each
sub-sequence. If you have a location such as ("xyz", 100, "+", 50),
then your sequence will go from index 100 to index 149 (this has a
length of 50). It *does not* go from index 100 to index 150, as
that would have a length of 51.
Likewise, if you have the location ("xyz", 100, "-", 50), then the
sequence extends from 100 down to 51, which has a length of 50
bases. It does not go from index 100 to 50, as that would have a
length of 51.
md5 - string - md5 of the dna sequence - TODO clarification
parent_gene - Feature_id - corresponding feature for this sequence, including introns and UTRs
cds - cds_id - corresponding coding sequence for this mRNA (the sequence minus UTRs)
dna_sequence - string - sequence of UTRs and exons from the genome that constitute this mRNA
dna_sequence_length - int - length of dna_sequence
note - string - TODO
functions - list<string> - TODO list of protein products or chemical
processes that this sequence creates, facilitates, or influences.
functional_descriptions - list<string> - TODO list of protein products or chemical
processes that this sequence creates, facilitates, or influences.
ontology_terms - mapping<string, mapping<string, list<int>>> - a mapping
of ontology source id (eg. "GO") to a mapping of term IDs (eg "GO:16209")
to a list of indexes into the ontology_events data (found in the top
level of the genome object). The index into an ontology event indicates
what service and method created this term assignment.
flags - list<string> - controlled vocab - fields from the genbank source. A
common example is "pseudo" for pseudo-genes that do not encode proteins,
which shows up as "/pseudo" in the genbank.
Values can be: "pseudo", "ribosomal_slippage", "trans_splicing"
warnings - list<string> - TODO
inference_data - list<InferenceInfo> - TODO
aliases - list<(string, string)> - alternative list of names or identifiers,
each given as a (fieldname, alias) pair
eg: [["gene", "thrA"], ["locus_tag", "b0002"]]
db_xrefs - list<(string, string)> - Identifiers from other databases (database cross-references).
The first string is the database name, the second is the database identifier.
eg: [["ASAP", "ABE-0000006"], ["EcoGene", "EG11277"]]
@optional parent_gene cds functions ontology_terms note flags warnings
@optional inference_data dna_sequence aliases db_xrefs functional_descriptions
*/
typedef structure {
mrna_id id;
list<tuple<Contig_id, int, string, int>> location;
string md5;
Feature_id parent_gene;
cds_id cds;
string dna_sequence;
int dna_sequence_length;
string note;
list<string> functions;
list<string> functional_descriptions;
mapping<string ontology_namespace, mapping<string ontology_id, list<int> evidence_events>> ontology_terms;
list<string> flags;
list<string> warnings;
list<InferenceInfo> inference_data;
list<tuple<string fieldname, string alias>> aliases;
list<tuple<string db_source, string db_identifier>> db_xrefs;
} mRNA;
/*
Reference to a taxon object in the workspace
@id ws KBaseGenomeAnnotations.Taxon
*/
typedef string Taxon_ref;
/*
Reference to a handle to the Genbank file on shock
@id handle
*/
typedef string genbank_handle_ref;
/*
Reference to a handle to the GFF file on shock
@id handle
*/
typedef string gff_handle_ref;
/*
Reference to an ontology dictionary object in the workspace
@id ws KBaseOntology.OntologyDictionary
*/
typedef string Ontology_ref;
/*
Reference to a report object in the workspace
@id ws KBaseReport.Report
*/
typedef string Method_report_ref;
/*
One entry of a genome's `ontology_events` list: a record of the service and
method used for a set of ontology assignments on the genome. The `ontology_terms`
mappings of the feature types point at these entries by list index.
Required fields: id, method, timestamp.
Fields:
id - string - TODO docs
ontology_ref - Ontology_ref - workspace reference to the ontology dictionary object
method - string - TODO docs
method_version - string - TODO docs
timestamp - string - TODO docs (format not specified here)
eco - string - TODO docs (presumably an Evidence and Conclusion Ontology term - confirm)
@optional ontology_ref method_version eco
*/
typedef structure {
string id;
Ontology_ref ontology_ref;
string method;
string method_version;
string timestamp;
string eco;
} Ontology_event;
/*
Genome quality score
Required fields: method, score, score_interpretation, timestamp.
Fields:
method - string - TODO
method_report_ref - Method_report_ref - TODO
method_version - string - TODO
score - string - TODO
score_interpretation - string - controlled vocabulary managed by API
(example value: fraction_complete)
timestamp - string - TODO
@optional method_report_ref method_version
*/
typedef structure {
string method;
Method_report_ref method_report_ref;
string method_version;
string score;
string score_interpretation;
string timestamp;
} GenomeQualityScore;
/*
Genome type -- annotated and assembled genome data.
Required fields: id, scientific_name, domain, genome_tiers, feature_counts,
genetic_code, dna_size, num_contigs, molecule_type, source, md5, gc_content,
features, cdss. Everything else is listed in the optional annotations below.
Field descriptions:
id - Genome_id - KBase legacy data ID
scientific_name - string - human readable species name
domain - string - human readable phylogenetic domain name (eg. "Bacteria")
warnings - list of string - genome-level warnings generated in the annotation process
genome_tiers - list of string - controlled vocabulary (based on app input and checked by GenomeFileUtil)
A list of labels describing the data source for this genome.
Allowed values - Representative, Reference, ExternalDB, User
Tier assignments based on genome source:
* All phytozome - Representative and ExternalDB
* Phytozome flagship genomes - Reference, Representative and ExternalDB
* Ensembl - Representative and ExternalDB
* RefSeq Reference - Reference, Representative and ExternalDB
* RefSeq Representative - Representative and ExternalDB
* RefSeq Latest or All Assemblies folder - ExternalDB
* User Data - User tagged
feature_counts - map of string to integer - total counts of each type of feature
keys are a controlled vocabulary of - "CDS", "gene", "misc_feature",
"misc_recomb", "mobile_element", "ncRNA" - 72, "non_coding_features",
"non_coding_genes", "protein_encoding_gene", "rRNA", "rep_origin",
"repeat_region", "tRNA"
NOTE(review): the "- 72" after "ncRNA" looks like a stray artifact - confirm and remove.
genetic_code - int - An NCBI-assigned taxonomic category for the organism
See here - https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
dna_size - integer - total number of nucleotides
num_contigs - integer - total number of contigs in the genome
molecule_type - string - controlled vocab - the type of molecule sequenced
Possible values are "Unknown", "DNA", "RNA", "genomic DNA", "genomic RNA",
"mRNA", "tRNA", "rRNA", "other RNA", "other DNA", "transcribed RNA",
"viral cRNA", "unassigned DNA", "unassigned RNA"
contig_lengths - list of int - nucleotide length of each contig in the genome
Indexes in this list correspond to indexes in the `contig_ids` list.
contig_ids - list of str - external database identifiers for each contig (eg. "NC_000913.3")
source - str - controlled vocab - descriptor of where this data came from (eg. "RefSeq")
Allowed entries RefSeq, Ensembl, Phytozome, RAST, Prokka, User_upload
source_id - source_id - identifier of this genome from the source database (eg. the RefSeq ID such as "NC_000913")
md5 - string - checksum of the underlying assembly sequence
taxonomy - string - semicolon-delimited taxonomy lineage, in order of parent to child
taxon_assignments - mapping of taxonomy namespace to taxon ID.
example - {"ncbi": "286", "gtdb": "s__staphylococcus_devriesei"}
gc_content - float - ratio of GC count to AT in the genome
publications - list of publication tuples (pubmedid, source, title, url, year, authors, journal).
See the publication typedef.
ontology_events - list of Ontology_event - A record of the service and method used for a set of
ontology assignments on the genome.
ontologies_present - a mapping of ontology source id (eg. "GO") to a mapping
of term IDs (eg "GO:16209") to term names (eg. "histidine biosynthetic process").
features - array of Feature - protein coding genes (see the separate Feature spec)
cdss - array of CDS - protein-coding sequences
mrnas - array of mRNA - transcribed messenger RNA sequences (equal to cdss plus 5' and 3' UTRs)
non_coding_features - array of NonCodingFeature - features that are not mRNAs, CDSs,
or protein-encoding genes
assembly_ref - workspace reference to an assembly object from which this annotated genome was derived.
taxon_ref - workspace reference to a taxon object that classifies the species or strain of this genome.
genbank_handle_ref - file server handle reference to the source genbank file for this genome.
gff_handle_ref - file server handle reference to the source GFF file for this genome.
external_source_origination_date - string - TODO look at GFU for this
release - string - User-supplied release or version of the source data. This
most likely will come from an input field in the import app.
original_source_file_name - string - filename from which this genome was derived (eg. genbank or gff filename).
notes - string - TODO
quality_scores - list of GenomeQualityScore - TODO
suspect - Bool - flag of whether this annotation is problematic due to some warning
genome_type - string - controlled vocab - One of "draft isolate",
"finished isolate", "mag", "sag", "virus", "plasmid", "construct"
Features vs. coding sequences: a feature is a sequence in the DNA that codes
for a protein, including non-transcribed introns. A coding sequence (stored as
`cdss`) includes **only** the sections of the feature that code for a protein,
minus introns and UTRs.
@optional warnings contig_lengths contig_ids source_id taxonomy publications
@optional ontology_events ontologies_present non_coding_features mrnas genome_type
@optional genbank_handle_ref gff_handle_ref external_source_origination_date
@optional release original_source_file_name notes quality_scores suspect assembly_ref
@optional taxon_ref taxon_assignments
@metadata ws gc_content as GC content
@metadata ws taxonomy as Taxonomy
@metadata ws md5 as MD5
@metadata ws dna_size as Size
@metadata ws genetic_code as Genetic code
@metadata ws domain as Domain
@metadata ws source_id as Source ID
@metadata ws source as Source
@metadata ws scientific_name as Name
@metadata ws genome_type as Type
@metadata ws length(features) as Number of Protein Encoding Genes
@metadata ws length(cdss) as Number of CDS
@metadata ws assembly_ref as Assembly Object
@metadata ws num_contigs as Number contigs
@metadata ws length(warnings) as Number of Genome Level Warnings
@metadata ws suspect as Suspect Genome
*/
typedef structure {
Genome_id id;
string scientific_name;
string domain;
list<string> warnings;
list<string> genome_tiers;
mapping<string type, int count> feature_counts;
int genetic_code;
int dna_size;
int num_contigs;
string molecule_type;
list<int> contig_lengths;
list<string> contig_ids;
string source;
source_id source_id;
string md5;
string taxonomy;
mapping<string, string> taxon_assignments;
float gc_content;
list<publication> publications;
list<Ontology_event> ontology_events;
mapping<string ontology_namespace, mapping<string ontology_id, string termname>> ontologies_present;
list<Feature> features;
list<NonCodingFeature> non_coding_features;
list<CDS> cdss;
list<mRNA> mrnas;
Assembly_ref assembly_ref;
Taxon_ref taxon_ref;
genbank_handle_ref genbank_handle_ref;
gff_handle_ref gff_handle_ref;
string external_source_origination_date;
string release;
string original_source_file_name;
string notes;
list<GenomeQualityScore> quality_scores;
Bool suspect;
string genome_type;
} Genome;
/*
Structure for a protein family (used for Protein.protein_families)
Required fields: id, subject_db. Everything else is listed in the optional
annotation below.
Fields:
id - string - TODO docs
subject_db - string - TODO docs
release_version - string - TODO docs
subject_description - string - TODO docs
query_begin, query_end - int - TODO docs
subject_begin, subject_end - int - TODO docs
score - float - TODO docs
evalue - float - TODO docs
NOTE(review): the query/subject/score/evalue names suggest an alignment hit of
the protein against a family database entry - confirm before documenting further.
@optional query_begin query_end subject_begin subject_end score evalue subject_description release_version
*/
typedef structure {
string id;
string subject_db;
string release_version;
string subject_description;
int query_begin;
int query_end;
int subject_begin;
int subject_end;
float score;
float evalue;
} ProteinFamily;
/*
Curator annotation on a protein (element type of Protein.annotations)
Elements:
(0) comment - string - TODO docs
(1) annotator - string - TODO docs
(2) annotation_time - float - TODO docs (time unit and epoch are not specified here)
*/
typedef tuple<string comment, string annotator, float annotation_time> annotation;
/*
Type spec for the "Protein" object
All fields except `function` are required.
Fields:
id - Protein_id - unique external ID of protein
function - string - annotated function for protein
md5 - string - md5 hash of protein sequence
sequence - string - amino acid sequence of protein
length - int - length of protein
protein_families - list<ProteinFamily> - families to which the protein belongs
aliases - list<string> - aliases for the protein
annotations - list<annotation> - curator annotations on protein
NOTE(review): earlier documentation also listed a `subsystem_data` field, but the
structure below declares no such field - confirm whether it was removed or is missing.
@optional function
@searchable ws_subset id md5 function length aliases
*/
typedef structure {
Protein_id id;
string function;
string md5;
string sequence;
int length;
list<ProteinFamily> protein_families;
list<string> aliases;
list<annotation> annotations;
} Protein;
/*
Type spec for the "ProteinSet" object
Required fields: id, md5, source_id, source, proteins.
Fields:
id - ProteinSet_id - unique kbase ID of the protein set
name - string - name of the protein set
md5 - string - TODO docs (presumably a checksum over the protein set - confirm how it is computed)
type - string - type of the protein set (values are: Organism,Environment,Collection)
source_id - source_id - source ID of the protein set
source - string - source of the protein set
proteins - list<Protein> - list of proteins in the protein set
fasta_ref - Fasta_ref - reference to the fasta file from which the protein set was read
@optional name type fasta_ref
@searchable ws_subset proteins.[*].(id,md5,function,length,aliases) md5 id name source_id source type
*/
typedef structure {
ProteinSet_id id;
string name;
string md5;
source_id source_id;
string source;
string type;
Fasta_ref fasta_ref;
list<Protein> proteins;
} ProteinSet;
/*
A function_probability is an (annotation, probability) pair associated with a gene.
Elements:
(0) annotation - string - a "///"-delimited list of roles that could be associated with that gene
(1) probability - float - probability attached to that annotation
*/
typedef tuple<string annotation, float probability> function_probability;
/*
Object to carry alternative functions and probabilities for genes in a genome
All fields are required.
Fields:
id - ProbabilisticAnnotation_id - ID of the probabilistic annotation object
genome_ref - Genome_ref - reference to the genome the probabilistic annotation was built for
roleset_probabilities - mapping<Feature_id, list<function_probability>> - mapping of
feature IDs to the list of alternative function_probability tuples
skipped_features - list<Feature_id> - list of features in the genome with no probability
@searchable ws_subset id genome_ref skipped_features
*/
typedef structure {
ProbabilisticAnnotation_id id;
Genome_ref genome_ref;
mapping<Feature_id,list<function_probability>> roleset_probabilities;
list<Feature_id> skipped_features;
} ProbabilisticAnnotation;
/*
Structure for the "MetagenomeAnnotationOTUFunction" object
All fields are required.
Fields:
id - string - kbase ID of OTU function in metagenome (earlier documentation called this `kbid`)
reference_genes - list<string> - list of genes associated with hit
functional_role - string - annotated function
abundance - int - number of hits with associated role and OTU
confidence - float - confidence of functional role hit
NOTE(review): earlier documentation also listed a `confidence_type` field (type of
functional role hit); the structure below does not declare it. A field of that
name exists on MetagenomeAnnotation.
@searchable ws_subset id abundance confidence functional_role
*/
typedef structure {
string id;
list<string> reference_genes;
string functional_role;
int abundance;
float confidence;
} MetagenomeAnnotationOTUFunction;
/*
Structure for the "MetagenomeAnnotationOTU" object
All fields are required.
Fields:
ave_confidence - float - TODO docs
ave_coverage - float - TODO docs
id - string - KBase ID of OTU of metagenome object (earlier documentation called this `kbid`)
name - string - name of metagenome OTU
source_id - string - ID used for OTU in metagenome source
source - string - source OTU ID
functions - list<MetagenomeAnnotationOTUFunction> - list of functions in OTU
@searchable ws_subset id name source_id source functions.[*].(id,abundance,confidence,functional_role)
*/
typedef structure {
float ave_confidence;
float ave_coverage;
string id;
string name;
string source_id;
string source;
list<MetagenomeAnnotationOTUFunction> functions;
} MetagenomeAnnotationOTU;
/*
Structure for the "MetagenomeAnnotation" object
All fields are required.
Fields:
type - string - type of metagenome object
name - string - name of metagenome object
id - string - KBase ID of metagenome object (earlier documentation called this `kbid`)
source_id - string - ID used in metagenome source
source - string - source of metagenome data
confidence_type - string - type of confidence score
otus - list<MetagenomeAnnotationOTU> - list of otus in metagenome
@searchable ws_subset type name id source_id source confidence_type otus.[*].(id,name,source_id,source,functions.[*].(id,abundance,confidence,functional_role))
@metadata ws type as Type
@metadata ws name as Name
@metadata ws source_id as Source ID
@metadata ws source as Source
@metadata ws length(otus) as Number OTUs
*/
typedef structure {
string type;
string name;
string id;
string source_id;
string source;
string confidence_type;
list<MetagenomeAnnotationOTU> otus;
} MetagenomeAnnotation;
/*
Domain - a subobject holding information on a single protein domain
Fields:
id - string - numerical ID assigned by KBase
source_id - string - accession ID from CDD database
type - string - type of CDD, possible values are cd, pfam, smart, COG, PRK, CHL
name - string - name of CDD
description - string - description of CDD
*/
typedef structure {
string id;
string source_id;
string type;
string name;
string description;
} Domain;
/*
FeatureDomainData - a subobject holding information on how domains appear in a gene
Fields:
id - string - numerical ID assigned by KBase
feature_id - string - ID of the feature (gene) this entry describes
feature_ref - string - reference to the feature (optional; reference format not stated here, TODO confirm)
function - string - function of the feature (presumably its annotated function, TODO confirm)
feature_length - int - length of the feature (units not stated here, TODO confirm)
domains - list of tuples (optional), each tuple holding:
(0) string domain_ref
(1) int identity
(2) int alignment_length
(3) int mismatches
(4) int gaps
(5) float protein_start
(6) float protein_end
(7) float domain_start
(8) float domain_end
(9) float evalue
(10) float bit_score
@optional feature_ref domains
*/
typedef structure {
string id;
string feature_id;
string feature_ref;
string function;
int feature_length;
list<tuple<string domain_ref,int identity,int alignment_length,int mismatches,int gaps,float protein_start,float protein_end,float domain_start,float domain_end,float evalue,float bit_score>> domains;
} FeatureDomainData;
/*
GenomeDomainData object: this object holds all data regarding protein domains in a genome in
KBase
Fields:
id - string - ID of this object
genome_id - Genome_id - ID of the genome the domain data belongs to
scientific_name - string - scientific name (presumably of the genome's organism, TODO confirm)
genome_ref - Genome_ref - reference to the genome object (optional)
num_domains - int - number of domains (presumably the length of the domains list, TODO confirm)
num_features - int - number of features (presumably the length of the featuredomains list, TODO confirm)
domains - list<Domain> - list of protein domains
featuredomains - list<FeatureDomainData> - list of per-feature domain data
@optional genome_ref
@searchable ws_subset id genome_id scientific_name genome_ref num_domains num_features
*/
typedef structure {
string id;
Genome_id genome_id;
string scientific_name;
Genome_ref genome_ref;
int num_domains;
int num_features;
list<Domain> domains;
list<FeatureDomainData> featuredomains;
} GenomeDomainData;
/*
OrthologFamily object: this object holds all data for a single ortholog family in a metagenome.
Instances of this structure are the elements of Pangenome.orthologs.
Fields:
id - string - group identifier
type - string - type of the family (allowed values are not specified here)
function - string - function as described in KBaseGenomes.Genome
md5 - string - md5 of protein_translation
protein_translation - string - protein translation string
orthologs - list<tuple<string,float,string>> - list of tuples of:
(0) string - gene identifier (ID in gff file)
(1) float - numerical order in gff file OR gene order in BLAST
(2) string - genome workspace reference
@optional type function md5 protein_translation
*/
typedef structure {
string id;
string type;
string function;
string md5;
string protein_translation;
list<tuple<string,float,string>> orthologs;
} OrthologFamily;
/*
Pangenome object: this object holds all data regarding a pangenome
Fields:
id - string - ID of the pangenome
name - string - name of the pangenome
type - string - type of the pangenome (allowed values are not specified here)
genome_refs - list<Genome_ref> - references to the genomes in the pangenome
orthologs - list<OrthologFamily> - ortholog families of the pangenome
@searchable ws_subset id name
@metadata ws type as Type
@metadata ws name as Name
@metadata ws length(orthologs) as Number orthologs
@metadata ws length(genome_refs) as Number genomes
*/
typedef structure {
string id;
string name;
string type;
list<Genome_ref> genome_refs;
list<OrthologFamily> orthologs;
} Pangenome;
/*
GenomeComparisonGenome object: this object holds information about a genome in a genome
comparison.
Fields:
id - string - ID of the genome
genome_ref - Genome_ref - reference to the genome object
genome_similarity - mapping keyed by genome_id; each value is a tuple of:
(0) int commonfamilies
(1) int commonfunctions
name - string - name of the genome
taxonomy - string - taxonomy of the genome
features - int - presumably the number of features in the genome (TODO confirm)
families - int - presumably the number of families in the genome (TODO confirm)
functions - int - presumably the number of functions in the genome (TODO confirm)
*/
typedef structure {
string id;
Genome_ref genome_ref;
mapping<string genome_id,tuple<int commonfamilies,int commonfunctions> > genome_similarity;
string name;
string taxonomy;
int features;
int families;
int functions;
} GenomeComparisonGenome;
/*
GenomeComparisonFunction object: this object holds information about a function
across all genomes.
Fields:
core - int - presumably a flag marking the function as core, i.e. shared by the compared genomes (TODO confirm encoding)
genome_features - mapping keyed by genome_id; each value is a list of tuples of:
(0) Feature_id
(1) int famindex
(2) float score
id - string - ID of the function
reactions - list of tuples of:
(0) Reaction_id
(1) string equation
subsystem - string - subsystem of the function
primclass - string - presumably the primary class of the subsystem (TODO confirm)
subclass - string - presumably the subclass of the subsystem (TODO confirm)
number_genomes - int - presumably the number of genomes having the function (TODO confirm)
fraction_genomes - float - presumably the fraction of genomes having the function (TODO confirm)
fraction_consistent_families - float - fraction of consistent families (exact definition not stated here, TODO confirm)
most_consistent_family - string - most consistent family (exact definition not stated here, TODO confirm)
*/
typedef structure {
int core;
mapping<string genome_id,list<tuple<Feature_id,int famindex,float score> > > genome_features;
string id;
list<tuple<Reaction_id, string equation>> reactions;
string subsystem;
string primclass;
string subclass;
int number_genomes;
float fraction_genomes;
float fraction_consistent_families;
string most_consistent_family;
} GenomeComparisonFunction;
/*
GenomeComparisonFamily object: this object holds information about a protein family across a set
of genomes.
Fields:
core - int - presumably a flag marking the family as core, i.e. shared by the compared genomes (TODO confirm encoding)
genome_features - mapping keyed by genome_id; each value is a list of tuples of:
(0) Feature_id
(1) list<int> funcindecies (name spelled as declared in the tuple)
(2) float score
id - string - ID of the family
type - string - type of the family (allowed values are not specified here)
protein_translation - string - protein translation string
number_genomes - int - presumably the number of genomes having the family (TODO confirm)
fraction_genomes - float - presumably the fraction of genomes having the family (TODO confirm)
fraction_consistent_annotations - float - fraction of consistent annotations (exact definition not stated here, TODO confirm)
most_consistent_role - string - most consistent role (exact definition not stated here, TODO confirm)
*/
typedef structure {
int core;
mapping<string genome_id,list< tuple<Feature_id,list<int> funcindecies,float score > > > genome_features;
string id;
string type;
string protein_translation;
int number_genomes;
float fraction_genomes;
float fraction_consistent_annotations;
string most_consistent_role;
} GenomeComparisonFamily;
/*
GenomeComparisonData object: this object holds information about a multigenome comparison.
@optional protcomp_ref pangenome_ref
@metadata ws core_functions as Core functions
@metadata ws core_families as Core families
@metadata ws name as Name
@metadata ws length(genomes) as Number genomes
*/